From 36b56886974eae4f9c5ebc96befd3e7bfe5de338 Mon Sep 17 00:00:00 2001 From: Stephen Hines Date: Wed, 23 Apr 2014 16:57:46 -0700 Subject: Update to LLVM 3.5a. Change-Id: Ifadecab779f128e62e430c2b4f6ddd84953ed617 --- test/CodeGen/AArch64/128bit_load_store.ll | 53 + test/CodeGen/AArch64/adc.ll | 33 +- test/CodeGen/AArch64/assertion-rc-mismatch.ll | 24 + test/CodeGen/AArch64/atomic-ops.ll | 89 +- test/CodeGen/AArch64/concatvector-bugs.ll | 68 + test/CodeGen/AArch64/cpus.ll | 13 + test/CodeGen/AArch64/fcvt-int.ll | 2 +- test/CodeGen/AArch64/fp-dp3.ll | 54 +- test/CodeGen/AArch64/func-argpassing.ll | 13 +- test/CodeGen/AArch64/func-calls.ll | 8 +- test/CodeGen/AArch64/i128-shift.ll | 43 + test/CodeGen/AArch64/init-array.ll | 1 + test/CodeGen/AArch64/inline-asm-constraints.ll | 2 +- test/CodeGen/AArch64/inline-asm-modifiers.ll | 2 +- test/CodeGen/AArch64/jump-table.ll | 16 + test/CodeGen/AArch64/mature-mc-support.ll | 12 + test/CodeGen/AArch64/misched-basic-A53.ll | 112 ++ test/CodeGen/AArch64/mul-lohi.ll | 19 + test/CodeGen/AArch64/neon-2velem.ll | 303 ++++ test/CodeGen/AArch64/neon-3vdiff.ll | 27 + test/CodeGen/AArch64/neon-across.ll | 20 +- test/CodeGen/AArch64/neon-add-pairwise.ll | 9 + test/CodeGen/AArch64/neon-add-sub.ll | 84 +- test/CodeGen/AArch64/neon-bitcast.ll | 12 +- test/CodeGen/AArch64/neon-bitwise-instructions.ll | 723 ++++++++-- test/CodeGen/AArch64/neon-bsl.ll | 13 + test/CodeGen/AArch64/neon-copy.ll | 887 +++++++++++- test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll | 47 + test/CodeGen/AArch64/neon-crypto.ll | 63 +- test/CodeGen/AArch64/neon-extract.ll | 32 + test/CodeGen/AArch64/neon-facge-facgt.ll | 24 +- test/CodeGen/AArch64/neon-fma.ll | 50 +- test/CodeGen/AArch64/neon-fpround_f128.ll | 18 + test/CodeGen/AArch64/neon-load-store-v1i32.ll | 29 + test/CodeGen/AArch64/neon-max-min-pairwise.ll | 36 + test/CodeGen/AArch64/neon-misc.ll | 399 ++++-- test/CodeGen/AArch64/neon-mla-mls.ll | 24 +- test/CodeGen/AArch64/neon-mov.ll | 82 +- 
test/CodeGen/AArch64/neon-mul-div.ll | 597 +++++++- test/CodeGen/AArch64/neon-or-combine.ll | 29 + test/CodeGen/AArch64/neon-perm.ll | 1441 ++++++++++++++++++++ test/CodeGen/AArch64/neon-scalar-add-sub.ll | 12 +- test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll | 24 +- test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll | 28 +- test/CodeGen/AArch64/neon-scalar-compare.ll | 28 +- test/CodeGen/AArch64/neon-scalar-copy.ll | 43 +- test/CodeGen/AArch64/neon-scalar-cvt.ll | 52 +- test/CodeGen/AArch64/neon-scalar-ext.ll | 113 ++ test/CodeGen/AArch64/neon-scalar-fabd.ll | 14 +- test/CodeGen/AArch64/neon-scalar-fcvt.ll | 108 +- test/CodeGen/AArch64/neon-scalar-fp-compare.ll | 254 ++-- test/CodeGen/AArch64/neon-scalar-recip.ll | 72 +- .../CodeGen/AArch64/neon-scalar-reduce-pairwise.ll | 212 ++- test/CodeGen/AArch64/neon-scalar-rounding-shift.ll | 8 +- .../AArch64/neon-scalar-saturating-add-sub.ll | 32 +- .../neon-scalar-saturating-rounding-shift.ll | 20 +- .../AArch64/neon-scalar-saturating-shift.ll | 20 +- test/CodeGen/AArch64/neon-scalar-shift.ll | 206 ++- test/CodeGen/AArch64/neon-select_cc.ll | 202 +++ test/CodeGen/AArch64/neon-shift-left-long.ll | 10 + test/CodeGen/AArch64/neon-shl-ashr-lshr.ll | 333 +++++ test/CodeGen/AArch64/neon-simd-ldst-one.ll | 188 ++- test/CodeGen/AArch64/neon-simd-tbl.ll | 176 +-- test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll | 30 + test/CodeGen/AArch64/neon-truncStore-extLoad.ll | 57 + test/CodeGen/AArch64/neon-v1i1-setcc.ll | 68 + test/CodeGen/AArch64/neon-vector-list-spill.ll | 175 +++ test/CodeGen/AArch64/pic-eh-stubs.ll | 5 +- test/CodeGen/AArch64/ragreedy-csr.ll | 297 ++++ test/CodeGen/AArch64/sext_inreg.ll | 198 +++ test/CodeGen/AArch64/sincospow-vector-expansion.ll | 96 ++ test/CodeGen/AArch64/variadic.ll | 65 +- 72 files changed, 7567 insertions(+), 1092 deletions(-) create mode 100644 test/CodeGen/AArch64/128bit_load_store.ll create mode 100644 test/CodeGen/AArch64/assertion-rc-mismatch.ll create mode 100644 
test/CodeGen/AArch64/concatvector-bugs.ll create mode 100644 test/CodeGen/AArch64/cpus.ll create mode 100644 test/CodeGen/AArch64/i128-shift.ll create mode 100644 test/CodeGen/AArch64/mature-mc-support.ll create mode 100644 test/CodeGen/AArch64/misched-basic-A53.ll create mode 100644 test/CodeGen/AArch64/mul-lohi.ll create mode 100644 test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll create mode 100644 test/CodeGen/AArch64/neon-fpround_f128.ll create mode 100644 test/CodeGen/AArch64/neon-load-store-v1i32.ll create mode 100644 test/CodeGen/AArch64/neon-or-combine.ll create mode 100644 test/CodeGen/AArch64/neon-scalar-ext.ll create mode 100644 test/CodeGen/AArch64/neon-select_cc.ll create mode 100644 test/CodeGen/AArch64/neon-shl-ashr-lshr.ll create mode 100644 test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll create mode 100644 test/CodeGen/AArch64/neon-truncStore-extLoad.ll create mode 100644 test/CodeGen/AArch64/neon-v1i1-setcc.ll create mode 100644 test/CodeGen/AArch64/neon-vector-list-spill.ll create mode 100644 test/CodeGen/AArch64/ragreedy-csr.ll create mode 100644 test/CodeGen/AArch64/sext_inreg.ll create mode 100644 test/CodeGen/AArch64/sincospow-vector-expansion.ll (limited to 'test/CodeGen/AArch64') diff --git a/test/CodeGen/AArch64/128bit_load_store.ll b/test/CodeGen/AArch64/128bit_load_store.ll new file mode 100644 index 0000000..502fd70 --- /dev/null +++ b/test/CodeGen/AArch64/128bit_load_store.ll @@ -0,0 +1,53 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s + +define void @test_store_f128(fp128* %ptr, fp128 %val) #0 { +; CHECK: test_store_f128 +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + store fp128 %val, fp128* %ptr, align 16 + ret void +} + +define fp128 @test_load_f128(fp128* readonly %ptr) #2 { +; CHECK: test_load_f128 +; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + %0 = load fp128* %ptr, align 16 + ret fp128 %0 +} + +define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 { +; CHECK: 
test_vstrq_p128 +; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #8] +; CHECK-NEXT: str {{x[0-9]+}}, [{{x[0-9]+}}] +entry: + %0 = bitcast i128* %ptr to fp128* + %1 = bitcast i128 %val to fp128 + store fp128 %1, fp128* %0, align 16 + ret void +} + +define i128 @test_vldrq_p128(i128* readonly %ptr) #2 { +; CHECK: test_vldrq_p128 +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}] +; CHECK-NEXT: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #8] +entry: + %0 = bitcast i128* %ptr to fp128* + %1 = load fp128* %0, align 16 + %2 = bitcast fp128 %1 to i128 + ret i128 %2 +} + +define void @test_ld_st_p128(i128* nocapture %ptr) #0 { +; CHECK: test_ld_st_p128 +; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}] +; CHECK-NEXT: str {{q[0-9]+}}, [{{x[0-9]+}}, #16] +entry: + %0 = bitcast i128* %ptr to fp128* + %1 = load fp128* %0, align 16 + %add.ptr = getelementptr inbounds i128* %ptr, i64 1 + %2 = bitcast i128* %add.ptr to fp128* + store fp128 %1, fp128* %2, align 16 + ret void +} + diff --git a/test/CodeGen/AArch64/adc.ll b/test/CodeGen/AArch64/adc.ll index 26fd3e6..29637d3 100644 --- a/test/CodeGen/AArch64/adc.ll +++ b/test/CodeGen/AArch64/adc.ll @@ -1,15 +1,20 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s define i128 @test_simple(i128 %a, i128 %b, i128 %c) { ; CHECK-LABEL: test_simple: %valadd = add i128 %a, %b -; CHECK: adds [[ADDLO:x[0-9]+]], x0, x2 -; CHECK-NEXT: adcs [[ADDHI:x[0-9]+]], x1, x3 +; CHECK-LE: adds [[ADDLO:x[0-9]+]], x0, x2 +; CHECK-LE-NEXT: adcs [[ADDHI:x[0-9]+]], x1, x3 +; CHECK-BE: adds [[ADDLO:x[0-9]+]], x1, x3 +; CHECK-BE-NEXT: adcs [[ADDHI:x[0-9]+]], x0, x2 %valsub = sub i128 %valadd, %c -; CHECK: subs x0, [[ADDLO]], x4 -; CHECK: sbcs x1, [[ADDHI]], x5 +; CHECK-LE: subs x0, [[ADDLO]], x4 +; 
CHECK-LE: sbcs x1, [[ADDHI]], x5 +; CHECK-BE: subs x1, [[ADDLO]], x5 +; CHECK-BE: sbcs x0, [[ADDHI]], x4 ret i128 %valsub ; CHECK: ret @@ -19,8 +24,10 @@ define i128 @test_imm(i128 %a) { ; CHECK-LABEL: test_imm: %val = add i128 %a, 12 -; CHECK: adds x0, x0, #12 -; CHECK: adcs x1, x1, {{x[0-9]|xzr}} +; CHECK-LE: adds x0, x0, #12 +; CHECK-LE: adcs x1, x1, {{x[0-9]|xzr}} +; CHECK-BE: adds x1, x1, #12 +; CHECK-BE: adcs x0, x0, {{x[0-9]|xzr}} ret i128 %val ; CHECK: ret @@ -32,8 +39,10 @@ define i128 @test_shifted(i128 %a, i128 %b) { %rhs = shl i128 %b, 45 %val = add i128 %a, %rhs -; CHECK: adds x0, x0, x2, lsl #45 -; CHECK: adcs x1, x1, {{x[0-9]}} +; CHECK-LE: adds x0, x0, x2, lsl #45 +; CHECK-LE: adcs x1, x1, {{x[0-9]}} +; CHECK-BE: adds x1, x1, x3, lsl #45 +; CHECK-BE: adcs x0, x0, {{x[0-9]}} ret i128 %val ; CHECK: ret @@ -46,8 +55,10 @@ define i128 @test_extended(i128 %a, i16 %b) { %rhs = shl i128 %ext, 3 %val = add i128 %a, %rhs -; CHECK: adds x0, x0, w2, sxth #3 -; CHECK: adcs x1, x1, {{x[0-9]}} +; CHECK-LE: adds x0, x0, w2, sxth #3 +; CHECK-LE: adcs x1, x1, {{x[0-9]}} +; CHECK-BE: adds x1, x1, w2, sxth #3 +; CHECK-BE: adcs x0, x0, {{x[0-9]}} ret i128 %val ; CHECK: ret diff --git a/test/CodeGen/AArch64/assertion-rc-mismatch.ll b/test/CodeGen/AArch64/assertion-rc-mismatch.ll new file mode 100644 index 0000000..02b0c0e --- /dev/null +++ b/test/CodeGen/AArch64/assertion-rc-mismatch.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; Test case related to <rdar://problem/15633429>. 
+ +; CHECK-LABEL: small +define i64 @small(i64 %encodedBase) { +cmp: + %lnot.i.i = icmp eq i64 %encodedBase, 0 + br i1 %lnot.i.i, label %if, label %else +if: + %tmp1 = call i8* @llvm.returnaddress(i32 0) + br label %end +else: + %tmp3 = call i8* @llvm.returnaddress(i32 0) + %ptr = getelementptr inbounds i8* %tmp3, i64 -16 + %ld = load i8* %ptr, align 4 + %tmp2 = inttoptr i8 %ld to i8* + br label %end +end: + %tmp = phi i8* [ %tmp1, %if ], [ %tmp2, %else ] + %coerce.val.pi56 = ptrtoint i8* %tmp to i64 + ret i64 %coerce.val.pi56 +} + +declare i8* @llvm.returnaddress(i32) diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll index de84ff4..5fe2936 100644 --- a/test/CodeGen/AArch64/atomic-ops.ll +++ b/test/CodeGen/AArch64/atomic-ops.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-REG %s @var8 = global i8 0 @var16 = global i16 0 @@ -17,6 +18,8 @@ define i8 @test_atomic_load_add_i8(i8 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: add w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -37,6 +40,8 @@ define i16 @test_atomic_load_add_i16(i16 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. 
; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: add w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -57,6 +62,8 @@ define i32 @test_atomic_load_add_i32(i32 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: add w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -77,6 +84,8 @@ define i64 @test_atomic_load_add_i64(i64 %offset) nounwind { ; x0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: add [[NEW:x[0-9]+]], x[[OLD]], x0 +; CHECK-REG: add x[[NEW:[0-9]+]], x{{[0-9]+}}, x0 +; CHECK-REG-NOT: stxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -97,6 +106,8 @@ define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: sub w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -117,6 +128,8 @@ define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. 
; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: sub w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stlxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -137,6 +150,8 @@ define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: sub w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -157,6 +172,8 @@ define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind { ; x0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: sub [[NEW:x[0-9]+]], x[[OLD]], x0 +; CHECK-REG: sub x[[NEW:[0-9]+]], x{{[0-9]+}}, x0 +; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -177,6 +194,8 @@ define i8 @test_atomic_load_and_i8(i8 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: and w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -197,6 +216,8 @@ define i16 @test_atomic_load_and_i16(i16 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. 
; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: and w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -217,6 +238,8 @@ define i32 @test_atomic_load_and_i32(i32 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: and w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -237,6 +260,8 @@ define i64 @test_atomic_load_and_i64(i64 %offset) nounwind { ; x0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: and [[NEW:x[0-9]+]], x[[OLD]], x0 +; CHECK-REG: and x[[NEW:[0-9]+]], x{{[0-9]+}}, x0 +; CHECK-REG-NOT: stxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -257,6 +282,8 @@ define i8 @test_atomic_load_or_i8(i8 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: orr w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -277,6 +304,8 @@ define i16 @test_atomic_load_or_i16(i16 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. 
; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: orr w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -297,6 +326,8 @@ define i32 @test_atomic_load_or_i32(i32 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: orr w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -317,6 +348,8 @@ define i64 @test_atomic_load_or_i64(i64 %offset) nounwind { ; x0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: orr [[NEW:x[0-9]+]], x[[OLD]], x0 +; CHECK-REG: orr x[[NEW:[0-9]+]], x{{[0-9]+}}, x0 +; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -337,6 +370,8 @@ define i8 @test_atomic_load_xor_i8(i8 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: eor w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -357,6 +392,8 @@ define i16 @test_atomic_load_xor_i16(i16 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. 
; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: eor w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -377,6 +414,8 @@ define i32 @test_atomic_load_xor_i32(i32 %offset) nounwind { ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-REG: eor w[[NEW:[0-9]+]], w{{[0-9]+}}, w0 +; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -397,6 +436,8 @@ define i64 @test_atomic_load_xor_i64(i64 %offset) nounwind { ; x0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: eor [[NEW:x[0-9]+]], x[[OLD]], x0 +; CHECK-REG: eor x[[NEW:[0-9]+]], x{{[0-9]+}}, x0 +; CHECK-REG-NOT: stxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -416,6 +457,7 @@ define i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind { ; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. +; CHECK-REG-NOT: stxrb w0, w0, [x{{[0-9]+}}] ; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], w0, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -435,6 +477,7 @@ define i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind { ; CHECK-NEXT: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. 
+; CHECK-REG-NOT: stlxrh w0, w0, [x{{[0-9]+}}] ; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], w0, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -454,6 +497,7 @@ define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind { ; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. +; CHECK-REG-NOT: stlxr w0, w0, [x{{[0-9]+}}] ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w0, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -473,6 +517,7 @@ define i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind { ; CHECK-NEXT: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]] ; x0 below is a reasonable guess but could change: it certainly comes into the ; function there. +; CHECK-REG-NOT: stxr w0, x0, [x{{[0-9]+}}] ; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], x0, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -495,6 +540,8 @@ define i8 @test_atomic_load_min_i8(i8 %offset) nounwind { ; function there. ; CHECK-NEXT: cmp w0, w[[OLD]], sxtb ; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt +; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, gt +; CHECK-REG-NOT: stxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -516,6 +563,8 @@ define i16 @test_atomic_load_min_i16(i16 %offset) nounwind { ; function there. ; CHECK-NEXT: cmp w0, w[[OLD]], sxth ; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt +; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, gt +; CHECK-REG-NOT: stlxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -537,6 +586,8 @@ define i32 @test_atomic_load_min_i32(i32 %offset) nounwind { ; function there. 
; CHECK-NEXT: cmp w0, w[[OLD]] ; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt +; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, gt +; CHECK-REG-NOT: stxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -558,6 +609,8 @@ define i64 @test_atomic_load_min_i64(i64 %offset) nounwind { ; function there. ; CHECK-NEXT: cmp x0, x[[OLD]] ; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, gt +; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, gt +; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -579,6 +632,8 @@ define i8 @test_atomic_load_max_i8(i8 %offset) nounwind { ; function there. ; CHECK-NEXT: cmp w0, w[[OLD]], sxtb ; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt +; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lt +; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -600,6 +655,8 @@ define i16 @test_atomic_load_max_i16(i16 %offset) nounwind { ; function there. ; CHECK-NEXT: cmp w0, w[[OLD]], sxth ; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt +; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lt +; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -621,6 +678,8 @@ define i32 @test_atomic_load_max_i32(i32 %offset) nounwind { ; function there. 
; CHECK-NEXT: cmp w0, w[[OLD]] ; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt +; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lt +; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -642,6 +701,8 @@ define i64 @test_atomic_load_max_i64(i64 %offset) nounwind { ; function there. ; CHECK-NEXT: cmp x0, x[[OLD]] ; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, lt +; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, lt +; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -663,6 +724,8 @@ define i8 @test_atomic_load_umin_i8(i8 %offset) nounwind { ; function there. ; CHECK-NEXT: cmp w0, w[[OLD]], uxtb ; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi +; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, hi +; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -684,6 +747,8 @@ define i16 @test_atomic_load_umin_i16(i16 %offset) nounwind { ; function there. ; CHECK-NEXT: cmp w0, w[[OLD]], uxth ; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi +; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, hi +; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -705,6 +770,8 @@ define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind { ; function there. 
; CHECK-NEXT: cmp w0, w[[OLD]] ; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi +; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, hi +; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -726,6 +793,8 @@ define i64 @test_atomic_load_umin_i64(i64 %offset) nounwind { ; function there. ; CHECK-NEXT: cmp x0, x[[OLD]] ; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, hi +; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, hi +; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -747,6 +816,8 @@ define i8 @test_atomic_load_umax_i8(i8 %offset) nounwind { ; function there. ; CHECK-NEXT: cmp w0, w[[OLD]], uxtb ; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo +; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lo +; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -768,6 +839,8 @@ define i16 @test_atomic_load_umax_i16(i16 %offset) nounwind { ; function there. ; CHECK-NEXT: cmp w0, w[[OLD]], uxth ; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo +; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lo +; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -789,6 +862,8 @@ define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind { ; function there. 
; CHECK-NEXT: cmp w0, w[[OLD]] ; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo +; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lo +; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -810,6 +885,8 @@ define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind { ; function there. ; CHECK-NEXT: cmp x0, x[[OLD]] ; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, lo +; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, lo +; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}] ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 ; CHECK-NOT: dmb @@ -820,7 +897,7 @@ define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind { define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind { ; CHECK-LABEL: test_atomic_cmpxchg_i8: - %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire + %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8 @@ -832,6 +909,7 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind { ; CHECK-NEXT: cmp w[[OLD]], w0 ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] ; As above, w1 is a reasonable guess. 
+; CHECK-REG-NOT: stxrb w1, w1, [x{{[0-9]+}}] ; CHECK: stxrb [[STATUS:w[0-9]+]], w1, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] ; CHECK-NOT: dmb @@ -842,7 +920,7 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind { define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind { ; CHECK-LABEL: test_atomic_cmpxchg_i16: - %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst + %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16 @@ -854,6 +932,7 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind { ; CHECK-NEXT: cmp w[[OLD]], w0 ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] ; As above, w1 is a reasonable guess. +; CHECK-REG-NOT: stlxrh w1, w1, [x{{[0-9]+}}] ; CHECK: stlxrh [[STATUS:w[0-9]+]], w1, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] ; CHECK-NOT: dmb @@ -864,7 +943,7 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind { define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { ; CHECK-LABEL: test_atomic_cmpxchg_i32: - %old = cmpxchg i32* @var32, i32 %wanted, i32 %new release + %old = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32 @@ -876,6 +955,7 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { ; CHECK-NEXT: cmp w[[OLD]], w0 ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] ; As above, w1 is a reasonable guess. 
+; CHECK-REG-NOT: stlxr w1, w1, [x{{[0-9]+}}] ; CHECK: stlxr [[STATUS:w[0-9]+]], w1, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] ; CHECK-NOT: dmb @@ -886,7 +966,7 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ; CHECK-LABEL: test_atomic_cmpxchg_i64: - %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic + %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64 @@ -898,6 +978,7 @@ define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ; CHECK-NEXT: cmp x[[OLD]], x0 ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] ; As above, w1 is a reasonable guess. +; CHECK-REG-NOT: stxr w1, x1, [x{{[0-9]+}}] ; CHECK: stxr [[STATUS:w[0-9]+]], x1, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] ; CHECK-NOT: dmb diff --git a/test/CodeGen/AArch64/concatvector-bugs.ll b/test/CodeGen/AArch64/concatvector-bugs.ll new file mode 100644 index 0000000..5889e22 --- /dev/null +++ b/test/CodeGen/AArch64/concatvector-bugs.ll @@ -0,0 +1,68 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon +; Bug: i8 type in FPR8 register but not registering with register class causes segmentation fault. +; Fix: Removed i8 type from FPR8 register class. 
+ +define void @test_concatvector_v8i8() { +entry.split: + br i1 undef, label %if.then, label %if.end + +if.then: ; preds = %entry.split + unreachable + +if.end: ; preds = %entry.split + br i1 undef, label %if.then9, label %if.end18 + +if.then9: ; preds = %if.end + unreachable + +if.end18: ; preds = %if.end + br label %for.body + +for.body: ; preds = %for.inc, %if.end18 + br i1 false, label %if.then30, label %for.inc + +if.then30: ; preds = %for.body + unreachable + +for.inc: ; preds = %for.body + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.inc + br label %for.body77 + +for.body77: ; preds = %for.body77, %for.end + br i1 undef, label %for.end106, label %for.body77 + +for.end106: ; preds = %for.body77 + br i1 undef, label %for.body130.us.us, label %stmt.for.body130.us.us + +stmt.for.body130.us.us: ; preds = %stmt.for.body130.us.us, %for.end106 + %_p_splat.us = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer + store <8 x i8> %_p_splat.us, <8 x i8>* undef, align 1 + br label %stmt.for.body130.us.us + +for.body130.us.us: ; preds = %for.body130.us.us, %for.end106 + br label %for.body130.us.us +} + +declare <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32>, i32) + +define <8 x i16> @test_splat(i32 %l) nounwind { +; CHECK-LABEL: test_splat: +; CHECK: ret + %lhs = insertelement <1 x i32> undef, i32 %l, i32 0 + %shift = tail call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %lhs, i32 11) + %vec = shufflevector <1 x i16> %shift, <1 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %vec +} + + +define <8 x i16> @test_notsplat(<8 x i16> %a, <8 x i16> %b, i32 %l) nounwind { +; CHECK-LABEL: test_notsplat: +; CHECK: ret +entry: + %lhs = insertelement <1 x i32> undef, i32 %l, i32 0 + %shift = tail call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %lhs, i32 11) + %vec = shufflevector <1 x i16> %shift, <1 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0> + ret <8 x i16> %vec +} diff --git a/test/CodeGen/AArch64/cpus.ll 
b/test/CodeGen/AArch64/cpus.ll new file mode 100644 index 0000000..f0b60f0 --- /dev/null +++ b/test/CodeGen/AArch64/cpus.ll @@ -0,0 +1,13 @@ +; This tests that llc accepts all valid AArch64 CPUs + +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID + +; CHECK-NOT: {{.*}} is not a recognized processor for this target +; INVALID: {{.*}} is not a recognized processor for this target + +define i32 @f(i64 %z) { + ret i32 0 +} diff --git a/test/CodeGen/AArch64/fcvt-int.ll b/test/CodeGen/AArch64/fcvt-int.ll index b28eb3e..97427a7 100644 --- a/test/CodeGen/AArch64/fcvt-int.ll +++ b/test/CodeGen/AArch64/fcvt-int.ll @@ -69,7 +69,7 @@ define float @test_i32tofloat(i32 %in) { ; CHECK-DAG: scvtf [[SIG:s[0-9]+]], {{w[0-9]+}} %res = fsub float %signed, %unsigned -; CHECL: fsub {{s[0-9]+}}, [[SIG]], [[UNSIG]] +; CHECK: fsub {{s[0-9]+}}, [[SIG]], [[UNSIG]] ret float %res ; CHECK: ret } diff --git a/test/CodeGen/AArch64/fp-dp3.ll b/test/CodeGen/AArch64/fp-dp3.ll index 590557f..2a6790e 100644 --- a/test/CodeGen/AArch64/fp-dp3.ll +++ b/test/CodeGen/AArch64/fp-dp3.ll @@ -26,8 +26,9 @@ define float @test_fmsub(float %a, float %b, float %c) { define float @test_fnmadd(float %a, float %b, float %c) { ; CHECK-LABEL: test_fnmadd: ; CHECK-NOFAST-LABEL: test_fnmadd: + %nega = fsub float -0.0, %a %negc = fsub float -0.0, %c - %val = call float @llvm.fma.f32(float %a, float %b, float %negc) + %val = call float @llvm.fma.f32(float %nega, float %b, float %negc) ; CHECK: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ; CHECK-NOFAST: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret float %val @@ -36,9 +37,8 @@ define float @test_fnmadd(float %a, float %b, float %c) 
{ define float @test_fnmsub(float %a, float %b, float %c) { ; CHECK-LABEL: test_fnmsub: ; CHECK-NOFAST-LABEL: test_fnmsub: - %nega = fsub float -0.0, %a %negc = fsub float -0.0, %c - %val = call float @llvm.fma.f32(float %nega, float %b, float %negc) + %val = call float @llvm.fma.f32(float %a, float %b, float %negc) ; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ; CHECK-NOFAST: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret float %val @@ -66,8 +66,9 @@ define double @testd_fmsub(double %a, double %b, double %c) { define double @testd_fnmadd(double %a, double %b, double %c) { ; CHECK-LABEL: testd_fnmadd: ; CHECK-NOFAST-LABEL: testd_fnmadd: + %nega = fsub double -0.0, %a %negc = fsub double -0.0, %c - %val = call double @llvm.fma.f64(double %a, double %b, double %negc) + %val = call double @llvm.fma.f64(double %nega, double %b, double %negc) ; CHECK: fnmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ; CHECK-NOFAST: fnmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret double %val @@ -76,9 +77,8 @@ define double @testd_fnmadd(double %a, double %b, double %c) { define double @testd_fnmsub(double %a, double %b, double %c) { ; CHECK-LABEL: testd_fnmsub: ; CHECK-NOFAST-LABEL: testd_fnmsub: - %nega = fsub double -0.0, %a %negc = fsub double -0.0, %c - %val = call double @llvm.fma.f64(double %nega, double %b, double %negc) + %val = call double @llvm.fma.f64(double %a, double %b, double %negc) ; CHECK: fnmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ; CHECK-NOFAST: fnmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret double %val @@ -113,12 +113,13 @@ define float @test_fnmadd_unfused(float %a, float %b, float %c) { ; CHECK-NOFAST-LABEL: test_fnmadd_unfused: %nega = fsub float -0.0, %a %prod = fmul float %b, %c - %sum = fadd float %nega, %prod + %diff = fsub float %nega, %prod ; CHECK: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ; CHECK-NOFAST-NOT: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, 
{{s[0-9]+}} ; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - ret float %sum +; CHECK-NOFAST: ret + ret float %diff } define float @test_fnmsub_unfused(float %a, float %b, float %c) { @@ -126,12 +127,37 @@ define float @test_fnmsub_unfused(float %a, float %b, float %c) { ; CHECK-NOFAST-LABEL: test_fnmsub_unfused: %nega = fsub float -0.0, %a %prod = fmul float %b, %c - %diff = fsub float %nega, %prod + %sum = fadd float %nega, %prod ; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ; CHECK-NOFAST-NOT: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -; CHECK-NOFAST-DAG: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -; CHECK-NOFAST-DAG: fneg {{s[0-9]+}}, {{s[0-9]+}} -; CHECK-NOFAST-DAG: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -; CHECK-NOFAST: ret - ret float %diff +; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret float %sum } + +; Another set of tests that check for multiply single use + +define float @test_fmadd_unfused_su(float %a, float %b, float %c) { +; CHECK-LABEL: test_fmadd_unfused_su: + %prod = fmul float %b, %c + %sum = fadd float %a, %prod + %res = fadd float %sum, %prod +; CHECK-NOT: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK: fadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK: fadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret float %res +} + +define float @test_fmsub_unfused_su(float %a, float %b, float %c) { +; CHECK-LABEL: test_fmsub_unfused_su: + %prod = fmul float %b, %c + %diff = fsub float %a, %prod + %res = fsub float %diff, %prod +; CHECK-NOT: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret float %res +} + diff --git 
a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll index 430d77f..f307686 100644 --- a/test/CodeGen/AArch64/func-argpassing.ll +++ b/test/CodeGen/AArch64/func-argpassing.ll @@ -1,5 +1,7 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s %myStruct = type { i64 , i8, i32 } @@ -146,7 +148,8 @@ define i32 @struct_on_stack(i8 %var0, i16 %var1, i32 %var2, i64 %var3, i128 %var %retval = load volatile i32* %stacked ret i32 %retval -; CHECK: ldr w0, [sp, #16] +; CHECK-LE: ldr w0, [sp, #16] +; CHECK-BE: ldr w0, [sp, #20] } define void @stacked_fpu(float %var0, double %var1, float %var2, float %var3, @@ -180,8 +183,10 @@ define void @check_i128_stackalign(i32 %val0, i32 %val1, i32 %val2, i32 %val3, ; CHECK: check_i128_stackalign store i128 %stack2, i128* @var128 ; Nothing local on stack in current codegen, so first stack is 16 away -; CHECK: add x[[REG:[0-9]+]], sp, #16 -; CHECK: ldr {{x[0-9]+}}, [x[[REG]], #8] +; CHECK-LE: add x[[REG:[0-9]+]], sp, #16 +; CHECK-LE: ldr {{x[0-9]+}}, [x[[REG]], #8] +; CHECK-BE: ldr {{x[0-9]+}}, [sp, #24] + ; Important point is that we address sp+24 for second dword ; CHECK: ldr {{x[0-9]+}}, [sp, #16] ret void diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll index ac188bb..f029bf2 100644 --- a/test/CodeGen/AArch64/func-calls.ll +++ b/test/CodeGen/AArch64/func-calls.ll @@ -1,5 +1,7 @@ ; RUN: llc -verify-machineinstrs < %s 
-mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-BE --check-prefix=CHECK-NOFP %s %myStruct = type { i64 , i8, i32 } @@ -126,8 +128,10 @@ define void @check_i128_align() { call void @check_i128_regalign(i32 0, i128 42) ; CHECK-NOT: mov x1 -; CHECK: movz x2, #42 -; CHECK: mov x3, xzr +; CHECK-LE: movz x2, #42 +; CHECK-LE: mov x3, xzr +; CHECK-BE: movz x3, #42 +; CHECK-BE: mov x2, xzr ; CHECK: bl check_i128_regalign ret void diff --git a/test/CodeGen/AArch64/i128-shift.ll b/test/CodeGen/AArch64/i128-shift.ll new file mode 100644 index 0000000..d786d44 --- /dev/null +++ b/test/CodeGen/AArch64/i128-shift.ll @@ -0,0 +1,43 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +define i128 @test_i128_lsl(i128 %a, i32 %shift) { +; CHECK-LABEL: test_i128_lsl: + + %sh_prom = zext i32 %shift to i128 + %shl = shl i128 %a, %sh_prom + +; CHECK: movz [[SIXTYFOUR:x[0-9]+]], #64 +; CHECK-NEXT: sub [[REVSHAMT:x[0-9]+]], [[SIXTYFOUR]], [[SHAMT_32:w[0-9]+]], uxtw +; CHECK-NEXT: lsr [[TMP1:x[0-9]+]], [[LO:x[0-9]+]], [[REVSHAMT]] +; CHECK: lsl [[TMP2:x[0-9]+]], [[HI:x[0-9]+]], [[SHAMT:x[0-9]+]] +; CHECK-NEXT: orr [[FALSEVAL:x[0-9]+]], [[TMP1]], [[TMP2]] +; CHECK-NEXT: sub [[EXTRASHAMT:x[0-9]+]], [[SHAMT]], #64 +; CHECK-NEXT: lsl [[TMP3:x[0-9]+]], [[LO]], [[EXTRASHAMT]] +; CHECK-NEXT: cmp [[EXTRASHAMT]], #0 +; CHECK-NEXT: csel [[RESULTHI:x[0-9]+]], [[TMP3]], [[FALSEVAL]], ge +; CHECK-NEXT: lsl [[TMP4:x[0-9]+]], [[LO]], [[SHAMT]] +; CHECK-NEXT: csel [[RESULTLO:x[0-9]+]], xzr, [[TMP4]], ge + + ret i128 %shl +} + +define i128 @test_i128_shr(i128 %a, i32 %shift) { +; CHECK-LABEL: 
test_i128_shr: + + %sh_prom = zext i32 %shift to i128 + %shr = lshr i128 %a, %sh_prom + +; CHECK: movz [[SIXTYFOUR]], #64 +; CHECK-NEXT: sub [[REVSHAMT:x[0-9]+]], [[SIXTYFOUR]], [[SHAMT_32:w[0-9]+]], uxtw +; CHECK-NEXT: lsl [[TMP2:x[0-9]+]], [[HI:x[0-9]+]], [[REVSHAMT]] +; CHECK: lsr [[TMP1:x[0-9]+]], [[LO:x[0-9]+]], [[SHAMT:x[0-9]+]] +; CHECK-NEXT: orr [[FALSEVAL:x[0-9]+]], [[TMP1]], [[TMP2]] +; CHECK-NEXT: sub [[EXTRASHAMT:x[0-9]+]], [[SHAMT]], #64 +; CHECK-NEXT: lsr [[TRUEVAL:x[0-9]+]], [[HI]], [[EXTRASHAMT]] +; CHECK-NEXT: cmp [[EXTRASHAMT]], #0 +; CHECK-NEXT: csel [[RESULTLO:x[0-9]+]], [[TRUEVAL]], [[FALSEVAL]], ge +; CHECK-NEXT: lsr [[TMP3:x[0-9]+]], [[HI]], [[SHAMT]] +; CHECK-NEXT: csel [[RESULTHI:x[0-9]+]], xzr, [[TMP3]], ge + + ret i128 %shr +} diff --git a/test/CodeGen/AArch64/init-array.ll b/test/CodeGen/AArch64/init-array.ll index 3ff1c1a..076ae27 100644 --- a/test/CodeGen/AArch64/init-array.ll +++ b/test/CodeGen/AArch64/init-array.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -use-init-array < %s | FileCheck %s define internal void @_GLOBAL__I_a() section ".text.startup" { ret void diff --git a/test/CodeGen/AArch64/inline-asm-constraints.ll b/test/CodeGen/AArch64/inline-asm-constraints.ll index 18a3b37..365453c 100644 --- a/test/CodeGen/AArch64/inline-asm-constraints.ll +++ b/test/CodeGen/AArch64/inline-asm-constraints.ll @@ -1,4 +1,4 @@ -;RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s +;RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -no-integrated-as < %s | FileCheck %s define i64 @test_inline_constraint_r(i64 %base, i32 %offset) { ; CHECK-LABEL: test_inline_constraint_r: diff --git a/test/CodeGen/AArch64/inline-asm-modifiers.ll b/test/CodeGen/AArch64/inline-asm-modifiers.ll index b7f4d3c..cb66335 100644 --- a/test/CodeGen/AArch64/inline-asm-modifiers.ll +++ 
b/test/CodeGen/AArch64/inline-asm-modifiers.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -no-integrated-as < %s | FileCheck %s @var_simple = hidden global i32 0 @var_got = global i32 0 diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll index 4bb0942..94717f5 100644 --- a/test/CodeGen/AArch64/jump-table.ll +++ b/test/CodeGen/AArch64/jump-table.ll @@ -1,5 +1,6 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -code-model=large -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic <%s | FileCheck --check-prefix=CHECK-PIC %s define i32 @test_jumptable(i32 %in) { ; CHECK: test_jumptable @@ -22,6 +23,12 @@ define i32 @test_jumptable(i32 %in) { ; CHECK-LARGE: ldr [[DEST:x[0-9]+]], [x[[JTADDR]], {{x[0-9]+}}, lsl #3] ; CHECK-LARGE: br [[DEST]] +; CHECK-PIC: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0 +; CHECK-PIC: add x[[JT:[0-9]+]], [[JTPAGE]], #:lo12:.LJTI0_0 +; CHECK-PIC: ldrsw [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #2] +; CHECK-PIC: add [[TABLE:x[0-9]+]], [[DEST]], x[[JT]] +; CHECK-PIC: br [[TABLE]] + def: ret i32 0 @@ -47,3 +54,12 @@ lbl4: ; CHECK-NEXT: .xword ; CHECK-NEXT: .xword ; CHECK-NEXT: .xword + +; CHECK-PIC-NOT: .data_region +; CHECK-PIC: .LJTI0_0: +; CHECK-PIC-NEXT: .word +; CHECK-PIC-NEXT: .word +; CHECK-PIC-NEXT: .word +; CHECK-PIC-NEXT: .word +; CHECK-PIC-NEXT: .word +; CHECK-PIC-NOT: .end_data_region diff --git a/test/CodeGen/AArch64/mature-mc-support.ll b/test/CodeGen/AArch64/mature-mc-support.ll new file mode 100644 index 0000000..06e3cc7 --- /dev/null +++ b/test/CodeGen/AArch64/mature-mc-support.ll @@ -0,0 +1,12 @@ +; Test that inline assembly is parsed by the MC layer when MC support is mature +; (even when the output 
is assembly). + +; RUN: not llc -mtriple=aarch64-pc-linux < %s > /dev/null 2> %t1 +; RUN: FileCheck %s < %t1 + +; RUN: not llc -mtriple=aarch64-pc-linux -filetype=obj < %s > /dev/null 2> %t2 +; RUN: FileCheck %s < %t2 + +module asm " .this_directive_is_very_unlikely_to_exist" + +; CHECK: LLVM ERROR: Error parsing inline asm diff --git a/test/CodeGen/AArch64/misched-basic-A53.ll b/test/CodeGen/AArch64/misched-basic-A53.ll new file mode 100644 index 0000000..1555c48 --- /dev/null +++ b/test/CodeGen/AArch64/misched-basic-A53.ll @@ -0,0 +1,112 @@ +; REQUIRES: asserts +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; +; The Cortex-A53 machine model will cause the MADD instruction to be scheduled +; much higher than the ADD instructions in order to hide latency. When not +; specifying a subtarget, the MADD will remain near the end of the block. +; +; CHECK: ********** MI Scheduling ********** +; CHECK: main +; CHECK: *** Final schedule for BB#2 *** +; CHECK: SU(13) +; CHECK: MADDwwww +; CHECK: SU(4) +; CHECK: ADDwwi_lsl0_s +; CHECK: ********** INTERVALS ********** +@main.x = private unnamed_addr constant [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 4 +@main.y = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2], align 4 + +; Function Attrs: nounwind +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %x = alloca [8 x i32], align 4 + %y = alloca [8 x i32], align 4 + %i = alloca i32, align 4 + %xx = alloca i32, align 4 + %yy = alloca i32, align 4 + store i32 0, i32* %retval + %0 = bitcast [8 x i32]* %x to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([8 x i32]* @main.x to i8*), i64 32, i32 4, i1 false) + %1 = bitcast [8 x i32]* %y to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ([8 x i32]* @main.y to i8*), i64 32, i32 4, 
i1 false) + store i32 0, i32* %xx, align 4 + store i32 0, i32* %yy, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %2 = load i32* %i, align 4 + %cmp = icmp slt i32 %2, 8 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %3 = load i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds [8 x i32]* %x, i32 0, i64 %idxprom + %4 = load i32* %arrayidx, align 4 + %add = add nsw i32 %4, 1 + store i32 %add, i32* %xx, align 4 + %5 = load i32* %xx, align 4 + %add1 = add nsw i32 %5, 12 + store i32 %add1, i32* %xx, align 4 + %6 = load i32* %xx, align 4 + %add2 = add nsw i32 %6, 23 + store i32 %add2, i32* %xx, align 4 + %7 = load i32* %xx, align 4 + %add3 = add nsw i32 %7, 34 + store i32 %add3, i32* %xx, align 4 + %8 = load i32* %i, align 4 + %idxprom4 = sext i32 %8 to i64 + %arrayidx5 = getelementptr inbounds [8 x i32]* %y, i32 0, i64 %idxprom4 + %9 = load i32* %arrayidx5, align 4 + %10 = load i32* %yy, align 4 + %mul = mul nsw i32 %10, %9 + store i32 %mul, i32* %yy, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %11 = load i32* %i, align 4 + %inc = add nsw i32 %11, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %12 = load i32* %xx, align 4 + %13 = load i32* %yy, align 4 + %add6 = add nsw i32 %12, %13 + ret i32 %add6 +} + + +; The Cortex-A53 machine model will cause the FDIVvvv_42 to be raised to +; hide latency. Whereas normally there would only be a single FADDvvv_4s +; after it, this test checks to make sure there are more than one. 
+; +; CHECK: ********** MI Scheduling ********** +; CHECK: neon4xfloat:BB#0 +; CHECK: *** Final schedule for BB#0 *** +; CHECK: FDIVvvv_4S +; CHECK: FADDvvv_4S +; CHECK: FADDvvv_4S +; CHECK: ********** INTERVALS ********** +define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) { + %tmp1 = fadd <4 x float> %A, %B; + %tmp2 = fadd <4 x float> %A, %tmp1; + %tmp3 = fadd <4 x float> %A, %tmp2; + %tmp4 = fadd <4 x float> %A, %tmp3; + %tmp5 = fadd <4 x float> %A, %tmp4; + %tmp6 = fadd <4 x float> %A, %tmp5; + %tmp7 = fadd <4 x float> %A, %tmp6; + %tmp8 = fadd <4 x float> %A, %tmp7; + %tmp9 = fdiv <4 x float> %A, %B; + %tmp10 = fadd <4 x float> %tmp8, %tmp9; + + ret <4 x float> %tmp10 +} + +; Function Attrs: nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AArch64/mul-lohi.ll b/test/CodeGen/AArch64/mul-lohi.ll new file mode 100644 index 0000000..f58c598 --- /dev/null +++ b/test/CodeGen/AArch64/mul-lohi.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s + +define i128 @test_128bitmul(i128 %lhs, i128 %rhs) { +; CHECK-LABEL: test_128bitmul: +; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2 +; CHECK-DAG: madd [[PART1:x[0-9]+]], x0, x3, [[CARRY]] +; CHECK: madd x1, x1, x2, [[PART1]] +; CHECK: mul x0, x0, x2 + +; CHECK-BE-LABEL: test_128bitmul: +; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3 +; CHECK-BE-DAG: madd [[PART1:x[0-9]+]], x1, x2, [[CARRY]] +; CHECK-BE: madd x0, x0, x3, [[PART1]] +; CHECK-BE: mul x1, x1, x3 + + %prod = mul i128 %lhs, %rhs + ret i128 %prod +} diff --git 
a/test/CodeGen/AArch64/neon-2velem.ll b/test/CodeGen/AArch64/neon-2velem.ll index 9d61842..acffb14 100644 --- a/test/CodeGen/AArch64/neon-2velem.ll +++ b/test/CodeGen/AArch64/neon-2velem.ll @@ -45,6 +45,7 @@ declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmla_lane_s16: ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %b @@ -55,6 +56,7 @@ entry: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlaq_lane_s16: ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %b @@ -65,6 +67,7 @@ entry: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmla_lane_s32: ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %b @@ -75,6 +78,7 @@ entry: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlaq_lane_s32: ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %b @@ -85,6 +89,7 @@ entry: define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmla_laneq_s16: ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %b @@ -95,6 +100,7 @@ entry: define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK: 
test_vmlaq_laneq_s16: ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %b @@ -105,6 +111,7 @@ entry: define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmla_laneq_s32: ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %b @@ -115,6 +122,7 @@ entry: define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlaq_laneq_s32: ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %b @@ -125,6 +133,7 @@ entry: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmls_lane_s16: ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %b @@ -135,6 +144,7 @@ entry: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlsq_lane_s16: ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %b @@ -145,6 +155,7 @@ entry: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmls_lane_s32: ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %b @@ -155,6 +166,7 @@ entry: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlsq_lane_s32: ; CHECK: mls 
{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %b @@ -165,6 +177,7 @@ entry: define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmls_laneq_s16: ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %b @@ -175,6 +188,7 @@ entry: define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlsq_laneq_s16: ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %b @@ -185,6 +199,7 @@ entry: define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmls_laneq_s32: ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %b @@ -195,6 +210,7 @@ entry: define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlsq_laneq_s32: ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %b @@ -205,6 +221,7 @@ entry: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmul_lane_s16: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %a @@ -214,6 +231,7 @@ entry: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmulq_lane_s16: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: 
ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %a @@ -223,6 +241,7 @@ entry: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmul_lane_s32: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %a @@ -232,6 +251,7 @@ entry: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmulq_lane_s32: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %a @@ -241,6 +261,7 @@ entry: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmul_lane_u16: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %a @@ -250,6 +271,7 @@ entry: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmulq_lane_u16: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %a @@ -259,6 +281,7 @@ entry: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmul_lane_u32: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %a @@ -268,6 +291,7 @@ entry: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmulq_lane_u32: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %a @@ -277,6 +301,7 
@@ entry: define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmul_laneq_s16: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %a @@ -286,6 +311,7 @@ entry: define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmulq_laneq_s16: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %a @@ -295,6 +321,7 @@ entry: define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmul_laneq_s32: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %a @@ -304,6 +331,7 @@ entry: define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmulq_laneq_s32: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %a @@ -313,6 +341,7 @@ entry: define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmul_laneq_u16: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %a @@ -322,6 +351,7 @@ entry: define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmulq_laneq_u16: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %a @@ -331,6 +361,7 @@ entry: define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmul_laneq_u32: ; 
CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %a @@ -340,6 +371,7 @@ entry: define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmulq_laneq_u32: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %a @@ -349,6 +381,7 @@ entry: define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; CHECK: test_vfma_lane_f32: ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -360,6 +393,7 @@ declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; CHECK: test_vfmaq_lane_f32: ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -371,6 +405,7 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; CHECK: test_vfma_laneq_f32: ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -380,6 +415,7 @@ entry: define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; CHECK: test_vfmaq_laneq_f32: ; CHECK: fmla 
{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -389,6 +425,7 @@ entry: define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; CHECK: test_vfms_lane_f32: ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> @@ -399,6 +436,7 @@ entry: define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; CHECK: test_vfmsq_lane_f32: ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> @@ -409,6 +447,7 @@ entry: define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; CHECK: test_vfms_laneq_f32: ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> @@ -419,6 +458,7 @@ entry: define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; CHECK: test_vfmsq_laneq_f32: ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> @@ -429,6 +469,7 @@ entry: define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { ; CHECK: test_vfmaq_lane_f64: ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; CHECK-NEXT: ret entry: %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ 
-440,6 +481,7 @@ declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; CHECK: test_vfmaq_laneq_f64: ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +; CHECK-NEXT: ret entry: %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -449,6 +491,7 @@ entry: define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { ; CHECK: test_vfmsq_lane_f64: ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; CHECK-NEXT: ret entry: %sub = fsub <1 x double> , %v %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer @@ -459,6 +502,7 @@ entry: define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; CHECK: test_vfmsq_laneq_f64: ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +; CHECK-NEXT: ret entry: %sub = fsub <2 x double> , %v %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> @@ -466,9 +510,57 @@ entry: ret <2 x double> %0 } +define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) { +; CHECK-LABEL: test_vfmas_laneq_f32 +; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret +entry: + %extract = extractelement <4 x float> %v, i32 3 + %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a) + ret float %0 +} + +declare float @llvm.fma.f32(float, float, float) + +define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) { +; CHECK-LABEL: test_vfmsd_lane_f64 +; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] +; CHECK-NEXT: ret +entry: + %extract.rhs = extractelement <1 x double> %v, i32 0 + %extract = fsub double -0.000000e+00, %extract.rhs + %0 = tail call double @llvm.fma.f64(double %b, double %extract, double 
%a) + ret double %0 +} + +declare double @llvm.fma.f64(double, double, double) + +define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) { +; CHECK: test_vfmss_laneq_f32 +; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret +entry: + %extract.rhs = extractelement <4 x float> %v, i32 3 + %extract = fsub float -0.000000e+00, %extract.rhs + %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a) + ret float %0 +} + +define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) { +; CHECK-LABEL: test_vfmsd_laneq_f64 +; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] +; CHECK-NEXT: ret +entry: + %extract.rhs = extractelement <2 x double> %v, i32 1 + %extract = fsub double -0.000000e+00, %extract.rhs + %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a) + ret double %0 +} + define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlal_lane_s16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -479,6 +571,7 @@ entry: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlal_lane_s32: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -489,6 +582,7 @@ entry: define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlal_laneq_s16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x 
i16> %shuffle) @@ -499,6 +593,7 @@ entry: define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlal_laneq_s32: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -509,6 +604,7 @@ entry: define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlal_high_lane_s16: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -520,6 +616,7 @@ entry: define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlal_high_lane_s32: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -531,6 +628,7 @@ entry: define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlal_high_laneq_s16: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -542,6 +640,7 @@ entry: define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlal_high_laneq_s32: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> @@ -553,6 +652,7 @@ entry: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x 
i16> %v) { ; CHECK: test_vmlsl_lane_s16: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -563,6 +663,7 @@ entry: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlsl_lane_s32: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -573,6 +674,7 @@ entry: define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlsl_laneq_s16: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -583,6 +685,7 @@ entry: define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlsl_laneq_s32: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -593,6 +696,7 @@ entry: define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlsl_high_lane_s16: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -604,6 +708,7 @@ entry: define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlsl_high_lane_s32: ; CHECK: mlsl2 
{{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -615,6 +720,7 @@ entry: define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlsl_high_laneq_s16: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -626,6 +732,7 @@ entry: define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlsl_high_laneq_s32: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> @@ -637,6 +744,7 @@ entry: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlal_lane_u16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -647,6 +755,7 @@ entry: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlal_lane_u32: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -657,6 +766,7 @@ entry: define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlal_laneq_u16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> 
%v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -667,6 +777,7 @@ entry: define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlal_laneq_u32: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -677,6 +788,7 @@ entry: define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlal_high_lane_u16: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -688,6 +800,7 @@ entry: define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlal_high_lane_u32: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -699,6 +812,7 @@ entry: define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlal_high_laneq_u16: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -710,6 +824,7 @@ entry: define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlal_high_laneq_u32: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> 
undef, <2 x i32> @@ -721,6 +836,7 @@ entry: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlsl_lane_u16: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -731,6 +847,7 @@ entry: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlsl_lane_u32: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -741,6 +858,7 @@ entry: define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlsl_laneq_u16: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -751,6 +869,7 @@ entry: define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlsl_laneq_u32: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -761,6 +880,7 @@ entry: define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlsl_high_lane_u16: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -772,6 +892,7 @@ entry: define <2 x i64> 
@test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlsl_high_lane_u32: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -783,6 +904,7 @@ entry: define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlsl_high_laneq_u16: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -794,6 +916,7 @@ entry: define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlsl_high_laneq_u32: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> @@ -805,6 +928,7 @@ entry: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmull_lane_s16: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -814,6 +938,7 @@ entry: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmull_lane_s32: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -823,6 +948,7 @@ entry: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmull_lane_u16: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, 
{{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -832,6 +958,7 @@ entry: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmull_lane_u32: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -841,6 +968,7 @@ entry: define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmull_high_lane_s16: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -851,6 +979,7 @@ entry: define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmull_high_lane_s32: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -861,6 +990,7 @@ entry: define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmull_high_lane_u16: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -871,6 +1001,7 @@ entry: define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmull_high_lane_u32: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> 
undef, <2 x i32> @@ -881,6 +1012,7 @@ entry: define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmull_laneq_s16: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -890,6 +1022,7 @@ entry: define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmull_laneq_s32: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -899,6 +1032,7 @@ entry: define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmull_laneq_u16: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -908,6 +1042,7 @@ entry: define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmull_laneq_u32: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -917,6 +1052,7 @@ entry: define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmull_high_laneq_s16: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -927,6 +1063,7 @@ entry: define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; CHECK: 
test_vmull_high_laneq_s32: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> @@ -937,6 +1074,7 @@ entry: define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmull_high_laneq_u16: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -947,6 +1085,7 @@ entry: define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmull_high_laneq_u32: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> @@ -957,6 +1096,7 @@ entry: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vqdmlal_lane_s16: ; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -967,6 +1107,7 @@ entry: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vqdmlal_lane_s32: ; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -977,6 +1118,7 @@ entry: define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vqdmlal_high_lane_s16: ; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; 
CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -988,6 +1130,7 @@ entry: define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vqdmlal_high_lane_s32: ; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -999,6 +1142,7 @@ entry: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vqdmlsl_lane_s16: ; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1009,6 +1153,7 @@ entry: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vqdmlsl_lane_s32: ; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1019,6 +1164,7 @@ entry: define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vqdmlsl_high_lane_s16: ; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -1030,6 +1176,7 @@ entry: define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vqdmlsl_high_lane_s32: ; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> 
%b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -1041,6 +1188,7 @@ entry: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK: test_vqdmull_lane_s16: ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1050,6 +1198,7 @@ entry: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK: test_vqdmull_lane_s32: ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1059,6 +1208,7 @@ entry: define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; CHECK: test_vqdmull_laneq_s16: ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1068,6 +1218,7 @@ entry: define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; CHECK: test_vqdmull_laneq_s32: ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1077,6 +1228,7 @@ entry: define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK: test_vqdmull_high_lane_s16: ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, 
<4 x i32> @@ -1087,6 +1239,7 @@ entry: define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vqdmull_high_lane_s32: ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -1097,6 +1250,7 @@ entry: define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; CHECK: test_vqdmull_high_laneq_s16: ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -1107,6 +1261,7 @@ entry: define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; CHECK: test_vqdmull_high_laneq_s32: ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> @@ -1117,6 +1272,7 @@ entry: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK: test_vqdmulh_lane_s16: ; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -1126,6 +1282,7 @@ entry: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK: test_vqdmulhq_lane_s16: ; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -1135,6 +1292,7 @@ entry: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK: 
test_vqdmulh_lane_s32: ; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -1144,6 +1302,7 @@ entry: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vqdmulhq_lane_s32: ; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -1153,6 +1312,7 @@ entry: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK: test_vqrdmulh_lane_s16: ; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -1162,6 +1322,7 @@ entry: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK: test_vqrdmulhq_lane_s16: ; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -1171,6 +1332,7 @@ entry: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK: test_vqrdmulh_lane_s32: ; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -1180,6 +1342,7 @@ entry: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vqrdmulhq_lane_s32: ; CHECK: qrdmulh 
{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -1189,6 +1352,7 @@ entry: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) { ; CHECK: test_vmul_lane_f32: ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> %mul = fmul <2 x float> %shuffle, %a @@ -1198,6 +1362,7 @@ entry: define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) { ; CHECK: test_vmul_lane_f64: ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] +; CHECK-NEXT: ret entry: %0 = bitcast <1 x double> %a to <8 x i8> %1 = bitcast <8 x i8> %0 to double @@ -1210,6 +1375,7 @@ entry: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) { ; CHECK: test_vmulq_lane_f32: ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> %mul = fmul <4 x float> %shuffle, %a @@ -1219,6 +1385,7 @@ entry: define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) { ; CHECK: test_vmulq_lane_f64: ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %mul = fmul <2 x double> %shuffle, %a @@ -1228,6 +1395,7 @@ entry: define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) { ; CHECK: test_vmul_laneq_f32: ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %mul = fmul <2 x float> %shuffle, %a @@ -1237,6 +1405,7 @@ entry: define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) { ; CHECK: 
test_vmul_laneq_f64: ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] +; CHECK-NEXT: ret entry: %0 = bitcast <1 x double> %a to <8 x i8> %1 = bitcast <8 x i8> %0 to double @@ -1249,6 +1418,7 @@ entry: define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) { ; CHECK: test_vmulq_laneq_f32: ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %mul = fmul <4 x float> %shuffle, %a @@ -1258,6 +1428,7 @@ entry: define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) { ; CHECK: test_vmulq_laneq_f64: ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %mul = fmul <2 x double> %shuffle, %a @@ -1267,6 +1438,7 @@ entry: define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) { ; CHECK: test_vmulx_lane_f32: ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -1276,6 +1448,7 @@ entry: define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) { ; CHECK: test_vmulxq_lane_f32: ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -1285,6 +1458,7 @@ entry: define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) { ; CHECK: test_vmulxq_lane_f64: ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> 
@llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -1294,6 +1468,7 @@ entry: define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) { ; CHECK: test_vmulx_laneq_f32: ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -1303,6 +1478,7 @@ entry: define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) { ; CHECK: test_vmulxq_laneq_f32: ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -1312,6 +1488,7 @@ entry: define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) { ; CHECK: test_vmulxq_laneq_f64: ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -1321,6 +1498,7 @@ entry: define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmla_lane_s16_0: ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -1331,6 +1509,7 @@ entry: define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlaq_lane_s16_0: ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %b @@ -1341,6 +1520,7 @@ entry: 
define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmla_lane_s32_0: ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -1351,6 +1531,7 @@ entry: define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlaq_lane_s32_0: ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %b @@ -1361,6 +1542,7 @@ entry: define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmla_laneq_s16_0: ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -1371,6 +1553,7 @@ entry: define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlaq_laneq_s16_0: ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %b @@ -1381,6 +1564,7 @@ entry: define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmla_laneq_s32_0: ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -1391,6 +1575,7 @@ entry: define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlaq_laneq_s32_0: ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> 
undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %b @@ -1401,6 +1586,7 @@ entry: define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmls_lane_s16_0: ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -1411,6 +1597,7 @@ entry: define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlsq_lane_s16_0: ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %b @@ -1421,6 +1608,7 @@ entry: define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmls_lane_s32_0: ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -1431,6 +1619,7 @@ entry: define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlsq_lane_s32_0: ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %b @@ -1441,6 +1630,7 @@ entry: define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmls_laneq_s16_0: ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -1451,6 +1641,7 @@ entry: define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlsq_laneq_s16_0: ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, 
{{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %b @@ -1461,6 +1652,7 @@ entry: define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmls_laneq_s32_0: ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -1471,6 +1663,7 @@ entry: define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlsq_laneq_s32_0: ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %b @@ -1481,6 +1674,7 @@ entry: define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmul_lane_s16_0: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -1490,6 +1684,7 @@ entry: define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmulq_lane_s16_0: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %a @@ -1499,6 +1694,7 @@ entry: define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmul_lane_s32_0: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -1508,6 +1704,7 @@ entry: define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmulq_lane_s32_0: ; CHECK: 
mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %a @@ -1517,6 +1714,7 @@ entry: define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmul_lane_u16_0: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -1526,6 +1724,7 @@ entry: define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmulq_lane_u16_0: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %a @@ -1535,6 +1734,7 @@ entry: define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmul_lane_u32_0: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -1544,6 +1744,7 @@ entry: define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmulq_lane_u32_0: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %a @@ -1553,6 +1754,7 @@ entry: define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmul_laneq_s16_0: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -1562,6 +1764,7 @@ entry: define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmulq_laneq_s16_0: 
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %a @@ -1571,6 +1774,7 @@ entry: define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmul_laneq_s32_0: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -1580,6 +1784,7 @@ entry: define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmulq_laneq_s32_0: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %a @@ -1589,6 +1794,7 @@ entry: define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmul_laneq_u16_0: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -1598,6 +1804,7 @@ entry: define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmulq_laneq_u16_0: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %a @@ -1607,6 +1814,7 @@ entry: define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmul_laneq_u32_0: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -1616,6 +1824,7 @@ entry: define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK: 
test_vmulq_laneq_u32_0: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %a @@ -1625,6 +1834,7 @@ entry: define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; CHECK: test_vfma_lane_f32_0: ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -1634,6 +1844,7 @@ entry: define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; CHECK: test_vfmaq_lane_f32_0: ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -1643,6 +1854,7 @@ entry: define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; CHECK: test_vfma_laneq_f32_0: ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -1652,6 +1864,7 @@ entry: define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; CHECK: test_vfmaq_laneq_f32_0: ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -1661,6 +1874,7 @@ entry: define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x 
float> %b, <2 x float> %v) { ; CHECK: test_vfms_lane_f32_0: ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer @@ -1671,6 +1885,7 @@ entry: define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; CHECK: test_vfmsq_lane_f32_0: ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer @@ -1681,6 +1896,7 @@ entry: define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; CHECK: test_vfms_laneq_f32_0: ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer @@ -1691,6 +1907,7 @@ entry: define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; CHECK: test_vfmsq_laneq_f32_0: ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer @@ -1701,6 +1918,7 @@ entry: define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; CHECK: test_vfmaq_laneq_f64_0: ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; CHECK-NEXT: ret entry: %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -1710,6 +1928,7 @@ entry: define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; CHECK: test_vfmsq_laneq_f64_0: ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; 
CHECK-NEXT: ret entry: %sub = fsub <2 x double> , %v %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer @@ -1720,6 +1939,7 @@ entry: define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlal_lane_s16_0: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1730,6 +1950,7 @@ entry: define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlal_lane_s32_0: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1740,6 +1961,7 @@ entry: define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlal_laneq_s16_0: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1750,6 +1972,7 @@ entry: define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlal_laneq_s32_0: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1760,6 +1983,7 @@ entry: define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlal_high_lane_s16_0: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, 
{{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -1771,6 +1995,7 @@ entry: define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlal_high_lane_s32_0: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -1782,6 +2007,7 @@ entry: define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlal_high_laneq_s16_0: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -1793,6 +2019,7 @@ entry: define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlal_high_laneq_s32_0: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -1804,6 +2031,7 @@ entry: define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlsl_lane_s16_0: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1814,6 +2042,7 @@ entry: define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlsl_lane_s32_0: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] 
+; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1824,6 +2053,7 @@ entry: define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlsl_laneq_s16_0: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1834,6 +2064,7 @@ entry: define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlsl_laneq_s32_0: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1844,6 +2075,7 @@ entry: define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlsl_high_lane_s16_0: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -1855,6 +2087,7 @@ entry: define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlsl_high_lane_s32_0: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -1866,6 +2099,7 @@ entry: define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlsl_high_laneq_s16_0: ; CHECK: mlsl2 
{{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -1877,6 +2111,7 @@ entry: define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlsl_high_laneq_s32_0: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -1888,6 +2123,7 @@ entry: define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlal_lane_u16_0: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1898,6 +2134,7 @@ entry: define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlal_lane_u32_0: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1908,6 +2145,7 @@ entry: define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlal_laneq_u16_0: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1918,6 +2156,7 @@ entry: define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK: 
test_vmlal_laneq_u32_0: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1928,6 +2167,7 @@ entry: define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlal_high_lane_u16_0: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -1939,6 +2179,7 @@ entry: define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlal_high_lane_u32_0: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -1950,6 +2191,7 @@ entry: define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlal_high_laneq_u16_0: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -1961,6 +2203,7 @@ entry: define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlal_high_laneq_u32_0: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -1972,6 +2215,7 @@ entry: define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: 
test_vmlsl_lane_u16_0: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1982,6 +2226,7 @@ entry: define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlsl_lane_u32_0: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1992,6 +2237,7 @@ entry: define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlsl_laneq_u16_0: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2002,6 +2248,7 @@ entry: define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlsl_laneq_u32_0: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2012,6 +2259,7 @@ entry: define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vmlsl_high_lane_u16_0: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2023,6 +2271,7 @@ entry: define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x 
i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vmlsl_high_lane_u32_0: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2034,6 +2283,7 @@ entry: define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK: test_vmlsl_high_laneq_u16_0: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2045,6 +2295,7 @@ entry: define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK: test_vmlsl_high_laneq_u32_0: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2056,6 +2307,7 @@ entry: define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmull_lane_s16_0: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2065,6 +2317,7 @@ entry: define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmull_lane_s32_0: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2074,6 +2327,7 @@ entry: define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) 
{ ; CHECK: test_vmull_lane_u16_0: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2083,6 +2337,7 @@ entry: define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmull_lane_u32_0: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2092,6 +2347,7 @@ entry: define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmull_high_lane_s16_0: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2102,6 +2358,7 @@ entry: define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmull_high_lane_s32_0: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2112,6 +2369,7 @@ entry: define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK: test_vmull_high_lane_u16_0: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2122,6 +2380,7 @@ entry: define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vmull_high_lane_u32_0: ; CHECK: mull2 
{{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2132,6 +2391,7 @@ entry: define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmull_laneq_s16_0: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2141,6 +2401,7 @@ entry: define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmull_laneq_s32_0: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2150,6 +2411,7 @@ entry: define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmull_laneq_u16_0: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2159,6 +2421,7 @@ entry: define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmull_laneq_u32_0: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2168,6 +2431,7 @@ entry: define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmull_high_laneq_s16_0: ; CHECK: mull2 {{v[0-9]+}}.4s, 
{{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2178,6 +2442,7 @@ entry: define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmull_high_laneq_s32_0: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2188,6 +2453,7 @@ entry: define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK: test_vmull_high_laneq_u16_0: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2198,6 +2464,7 @@ entry: define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK: test_vmull_high_laneq_u32_0: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2208,6 +2475,7 @@ entry: define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vqdmlal_lane_s16_0: ; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2218,6 +2486,7 @@ entry: define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vqdmlal_lane_s32_0: ; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; 
CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2228,6 +2497,7 @@ entry: define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: test_vqdmlal_high_lane_s16_0: ; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2239,6 +2509,7 @@ entry: define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vqdmlal_high_lane_s32_0: ; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2250,6 +2521,7 @@ entry: define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK: test_vqdmlsl_lane_s16_0: ; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2260,6 +2532,7 @@ entry: define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK: test_vqdmlsl_lane_s32_0: ; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2270,6 +2543,7 @@ entry: define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK: 
test_vqdmlsl_high_lane_s16_0: ; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2281,6 +2555,7 @@ entry: define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK: test_vqdmlsl_high_lane_s32_0: ; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2292,6 +2567,7 @@ entry: define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK: test_vqdmull_lane_s16_0: ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2301,6 +2577,7 @@ entry: define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK: test_vqdmull_lane_s32_0: ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2310,6 +2587,7 @@ entry: define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK: test_vqdmull_laneq_s16_0: ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2319,6 +2597,7 @@ entry: define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { ; 
CHECK: test_vqdmull_laneq_s32_0: ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2328,6 +2607,7 @@ entry: define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK: test_vqdmull_high_lane_s16_0: ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2338,6 +2618,7 @@ entry: define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vqdmull_high_lane_s32_0: ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2348,6 +2629,7 @@ entry: define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK: test_vqdmull_high_laneq_s16_0: ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2358,6 +2640,7 @@ entry: define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK: test_vqdmull_high_laneq_s32_0: ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2368,6 +2651,7 @@ entry: define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK: test_vqdmulh_lane_s16_0: ; CHECK: 
qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -2377,6 +2661,7 @@ entry: define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK: test_vqdmulhq_lane_s16_0: ; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -2386,6 +2671,7 @@ entry: define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK: test_vqdmulh_lane_s32_0: ; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -2395,6 +2681,7 @@ entry: define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vqdmulhq_lane_s32_0: ; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -2404,6 +2691,7 @@ entry: define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK: test_vqrdmulh_lane_s16_0: ; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -2413,6 +2701,7 @@ entry: define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; 
CHECK: test_vqrdmulhq_lane_s16_0: ; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -2422,6 +2711,7 @@ entry: define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK: test_vqrdmulh_lane_s32_0: ; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -2431,6 +2721,7 @@ entry: define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK: test_vqrdmulhq_lane_s32_0: ; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -2440,6 +2731,7 @@ entry: define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) { ; CHECK: test_vmul_lane_f32_0: ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %mul = fmul <2 x float> %shuffle, %a @@ -2449,6 +2741,7 @@ entry: define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) { ; CHECK: test_vmulq_lane_f32_0: ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %mul = fmul <4 x float> %shuffle, %a @@ -2458,6 +2751,7 @@ entry: define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) { ; CHECK: test_vmul_laneq_f32_0: ; CHECK: fmul {{v[0-9]+}}.2s, 
{{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %mul = fmul <2 x float> %shuffle, %a @@ -2467,6 +2761,7 @@ entry: define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) { ; CHECK: test_vmul_laneq_f64_0: ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] +; CHECK-NEXT: ret entry: %0 = bitcast <1 x double> %a to <8 x i8> %1 = bitcast <8 x i8> %0 to double @@ -2479,6 +2774,7 @@ entry: define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) { ; CHECK: test_vmulq_laneq_f32_0: ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %mul = fmul <4 x float> %shuffle, %a @@ -2488,6 +2784,7 @@ entry: define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { ; CHECK: test_vmulq_laneq_f64_0: ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %mul = fmul <2 x double> %shuffle, %a @@ -2497,6 +2794,7 @@ entry: define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) { ; CHECK: test_vmulx_lane_f32_0: ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -2506,6 +2804,7 @@ entry: define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) { ; CHECK: test_vmulxq_lane_f32_0: ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, 
<4 x float> %shuffle) @@ -2515,6 +2814,7 @@ entry: define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) { ; CHECK: test_vmulxq_lane_f64_0: ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -2524,6 +2824,7 @@ entry: define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) { ; CHECK: test_vmulx_laneq_f32_0: ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -2533,6 +2834,7 @@ entry: define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) { ; CHECK: test_vmulxq_laneq_f32_0: ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -2542,6 +2844,7 @@ entry: define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { ; CHECK: test_vmulxq_laneq_f64_0: ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) diff --git a/test/CodeGen/AArch64/neon-3vdiff.ll b/test/CodeGen/AArch64/neon-3vdiff.ll index 171e2b2..96400eb 100644 --- a/test/CodeGen/AArch64/neon-3vdiff.ll +++ b/test/CodeGen/AArch64/neon-3vdiff.ll @@ -1804,3 +1804,30 @@ entry: ret <8 x i16> %vmull.i.i } +define i128 
@test_vmull_p64(i64 %a, i64 %b) #4 { +; CHECK: test_vmull_p64 +; CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d +entry: + %vmull.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vmull1.i = insertelement <1 x i64> undef, i64 %b, i32 0 + %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i, <1 x i64> %vmull1.i) #1 + %vmull3.i = bitcast <16 x i8> %vmull2.i to i128 + ret i128 %vmull3.i +} + +define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 { +; CHECK: test_vmull_high_p64 +; CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %0 = extractelement <2 x i64> %a, i32 1 + %1 = extractelement <2 x i64> %b, i32 1 + %vmull.i.i = insertelement <1 x i64> undef, i64 %0, i32 0 + %vmull1.i.i = insertelement <1 x i64> undef, i64 %1, i32 0 + %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i.i, <1 x i64> %vmull1.i.i) #1 + %vmull3.i.i = bitcast <16 x i8> %vmull2.i.i to i128 + ret i128 %vmull3.i.i +} + +declare <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64>, <1 x i64>) #5 + + diff --git a/test/CodeGen/AArch64/neon-across.ll b/test/CodeGen/AArch64/neon-across.ll index 733db97..6d30c95 100644 --- a/test/CodeGen/AArch64/neon-across.ll +++ b/test/CodeGen/AArch64/neon-across.ll @@ -1,12 +1,12 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -declare <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v4f32(<4 x float>) +declare float @llvm.aarch64.neon.vminnmv(<4 x float>) -declare <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v4f32(<4 x float>) +declare float @llvm.aarch64.neon.vmaxnmv(<4 x float>) -declare <1 x float> @llvm.aarch64.neon.vminv.v1f32.v4f32(<4 x float>) +declare float @llvm.aarch64.neon.vminv(<4 x float>) -declare <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v4f32(<4 x float>) +declare float @llvm.aarch64.neon.vmaxv(<4 x float>) declare <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32>) @@ -442,8 +442,7 @@ define float 
@test_vmaxvq_f32(<4 x float> %a) { ; CHECK: test_vmaxvq_f32: ; CHECK: fmaxv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %vmaxv.i = tail call <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v4f32(<4 x float> %a) - %0 = extractelement <1 x float> %vmaxv.i, i32 0 + %0 = call float @llvm.aarch64.neon.vmaxv(<4 x float> %a) ret float %0 } @@ -451,8 +450,7 @@ define float @test_vminvq_f32(<4 x float> %a) { ; CHECK: test_vminvq_f32: ; CHECK: fminv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %vminv.i = tail call <1 x float> @llvm.aarch64.neon.vminv.v1f32.v4f32(<4 x float> %a) - %0 = extractelement <1 x float> %vminv.i, i32 0 + %0 = call float @llvm.aarch64.neon.vminv(<4 x float> %a) ret float %0 } @@ -460,8 +458,7 @@ define float @test_vmaxnmvq_f32(<4 x float> %a) { ; CHECK: test_vmaxnmvq_f32: ; CHECK: fmaxnmv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %vmaxnmv.i = tail call <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v4f32(<4 x float> %a) - %0 = extractelement <1 x float> %vmaxnmv.i, i32 0 + %0 = call float @llvm.aarch64.neon.vmaxnmv(<4 x float> %a) ret float %0 } @@ -469,8 +466,7 @@ define float @test_vminnmvq_f32(<4 x float> %a) { ; CHECK: test_vminnmvq_f32: ; CHECK: fminnmv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %vminnmv.i = tail call <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v4f32(<4 x float> %a) - %0 = extractelement <1 x float> %vminnmv.i, i32 0 + %0 = call float @llvm.aarch64.neon.vminnmv(<4 x float> %a) ret float %0 } diff --git a/test/CodeGen/AArch64/neon-add-pairwise.ll b/test/CodeGen/AArch64/neon-add-pairwise.ll index 1abfed3..32d8222 100644 --- a/test/CodeGen/AArch64/neon-add-pairwise.ll +++ b/test/CodeGen/AArch64/neon-add-pairwise.ll @@ -90,3 +90,12 @@ define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { ret <2 x double> %val } +define i32 @test_vaddv.v2i32(<2 x i32> %a) { +; CHECK-LABEL: test_vaddv.v2i32 +; CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %1 = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v2i32(<2 x i32> %a) + %2 = 
extractelement <1 x i32> %1, i32 0 + ret i32 %2 +} + +declare <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v2i32(<2 x i32>) \ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-add-sub.ll b/test/CodeGen/AArch64/neon-add-sub.ll index 078ba14..9015237 100644 --- a/test/CodeGen/AArch64/neon-add-sub.ll +++ b/test/CodeGen/AArch64/neon-add-sub.ll @@ -1,119 +1,119 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s define <8 x i8> @add8xi8(<8 x i8> %A, <8 x i8> %B) { -;CHECK: add {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp3 = add <8 x i8> %A, %B; ret <8 x i8> %tmp3 } define <16 x i8> @add16xi8(<16 x i8> %A, <16 x i8> %B) { -;CHECK: add {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: add {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp3 = add <16 x i8> %A, %B; ret <16 x i8> %tmp3 } define <4 x i16> @add4xi16(<4 x i16> %A, <4 x i16> %B) { -;CHECK: add {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h +;CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h %tmp3 = add <4 x i16> %A, %B; ret <4 x i16> %tmp3 } define <8 x i16> @add8xi16(<8 x i16> %A, <8 x i16> %B) { -;CHECK: add {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h +;CHECK: add {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h %tmp3 = add <8 x i16> %A, %B; ret <8 x i16> %tmp3 } define <2 x i32> @add2xi32(<2 x i32> %A, <2 x i32> %B) { -;CHECK: add {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %tmp3 = add <2 x i32> %A, %B; ret <2 x i32> %tmp3 } define <4 x i32> @add4x32(<4 x i32> %A, <4 x i32> %B) { -;CHECK: add {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %tmp3 = add <4 x i32> %A, %B; ret <4 x i32> %tmp3 } define <2 x i64> @add2xi64(<2 x i64> %A, <2 x i64> %B) { -;CHECK: add {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d +;CHECK: add 
{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d %tmp3 = add <2 x i64> %A, %B; ret <2 x i64> %tmp3 } define <2 x float> @add2xfloat(<2 x float> %A, <2 x float> %B) { -;CHECK: fadd {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %tmp3 = fadd <2 x float> %A, %B; ret <2 x float> %tmp3 } define <4 x float> @add4xfloat(<4 x float> %A, <4 x float> %B) { -;CHECK: fadd {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %tmp3 = fadd <4 x float> %A, %B; ret <4 x float> %tmp3 } define <2 x double> @add2xdouble(<2 x double> %A, <2 x double> %B) { -;CHECK: add {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d +;CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d %tmp3 = fadd <2 x double> %A, %B; ret <2 x double> %tmp3 } define <8 x i8> @sub8xi8(<8 x i8> %A, <8 x i8> %B) { -;CHECK: sub {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp3 = sub <8 x i8> %A, %B; ret <8 x i8> %tmp3 } define <16 x i8> @sub16xi8(<16 x i8> %A, <16 x i8> %B) { -;CHECK: sub {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: sub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp3 = sub <16 x i8> %A, %B; ret <16 x i8> %tmp3 } define <4 x i16> @sub4xi16(<4 x i16> %A, <4 x i16> %B) { -;CHECK: sub {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h +;CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h %tmp3 = sub <4 x i16> %A, %B; ret <4 x i16> %tmp3 } define <8 x i16> @sub8xi16(<8 x i16> %A, <8 x i16> %B) { -;CHECK: sub {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h +;CHECK: sub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h %tmp3 = sub <8 x i16> %A, %B; ret <8 x i16> %tmp3 } define <2 x i32> @sub2xi32(<2 x i32> %A, <2 x i32> %B) { -;CHECK: sub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %tmp3 = sub <2 x i32> %A, %B; ret <2 x i32> 
%tmp3 } define <4 x i32> @sub4x32(<4 x i32> %A, <4 x i32> %B) { -;CHECK: sub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: sub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %tmp3 = sub <4 x i32> %A, %B; ret <4 x i32> %tmp3 } define <2 x i64> @sub2xi64(<2 x i64> %A, <2 x i64> %B) { -;CHECK: sub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d +;CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d %tmp3 = sub <2 x i64> %A, %B; ret <2 x i64> %tmp3 } define <2 x float> @sub2xfloat(<2 x float> %A, <2 x float> %B) { -;CHECK: fsub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %tmp3 = fsub <2 x float> %A, %B; ret <2 x float> %tmp3 } define <4 x float> @sub4xfloat(<4 x float> %A, <4 x float> %B) { -;CHECK: fsub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %tmp3 = fsub <4 x float> %A, %B; ret <4 x float> %tmp3 } define <2 x double> @sub2xdouble(<2 x double> %A, <2 x double> %B) { -;CHECK: sub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d +;CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d %tmp3 = fsub <2 x double> %A, %B; ret <2 x double> %tmp3 } @@ -234,4 +234,46 @@ declare <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x double>, <1 x double>) declare <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double>, <1 x double>) declare <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double>, <1 x double>) declare <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double>, <1 x double>) -declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>) \ No newline at end of file +declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>) + +define <1 x i8> @test_add_v1i8(<1 x i8> %a, <1 x i8> %b) { +;CHECK-LABEL: test_add_v1i8: +;CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %c = add <1 x i8> %a, %b + ret <1 x i8> %c +} + +define <1 x i16> @test_add_v1i16(<1 x i16> %a, <1 x i16> %b) { 
+;CHECK-LABEL: test_add_v1i16: +;CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %c = add <1 x i16> %a, %b + ret <1 x i16> %c +} + +define <1 x i32> @test_add_v1i32(<1 x i32> %a, <1 x i32> %b) { +;CHECK-LABEL: test_add_v1i32: +;CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %c = add <1 x i32> %a, %b + ret <1 x i32> %c +} + +define <1 x i8> @test_sub_v1i8(<1 x i8> %a, <1 x i8> %b) { +;CHECK-LABEL: test_sub_v1i8: +;CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %c = sub <1 x i8> %a, %b + ret <1 x i8> %c +} + +define <1 x i16> @test_sub_v1i16(<1 x i16> %a, <1 x i16> %b) { +;CHECK-LABEL: test_sub_v1i16: +;CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %c = sub <1 x i16> %a, %b + ret <1 x i16> %c +} + +define <1 x i32> @test_sub_v1i32(<1 x i32> %a, <1 x i32> %b) { +;CHECK-LABEL: test_sub_v1i32: +;CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %c = sub <1 x i32> %a, %b + ret <1 x i32> %c +} \ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-bitcast.ll b/test/CodeGen/AArch64/neon-bitcast.ll index f9ec704..61099d4 100644 --- a/test/CodeGen/AArch64/neon-bitcast.ll +++ b/test/CodeGen/AArch64/neon-bitcast.ll @@ -20,8 +20,8 @@ define <2 x i32> @test_v8i8_to_v2i32(<8 x i8> %in) nounwind { ret <2 x i32> %val } -define <2 x float> @test_v8i8_to_v1f32(<8 x i8> %in) nounwind{ -; CHECK: test_v8i8_to_v1f32: +define <2 x float> @test_v8i8_to_v2f32(<8 x i8> %in) nounwind{ +; CHECK: test_v8i8_to_v2f32: ; CHECK-NEXT: // BB#0: ; CHECK-NEXT: ret @@ -67,8 +67,8 @@ define <2 x i32> @test_v4i16_to_v2i32(<4 x i16> %in) nounwind { ret <2 x i32> %val } -define <2 x float> @test_v4i16_to_v1f32(<4 x i16> %in) nounwind{ -; CHECK: test_v4i16_to_v1f32: +define <2 x float> @test_v4i16_to_v2f32(<4 x i16> %in) nounwind{ +; CHECK: test_v4i16_to_v2f32: ; CHECK-NEXT: // BB#0: ; CHECK-NEXT: ret @@ -114,8 +114,8 @@ define <2 x i32> @test_v2i32_to_v2i32(<2 x i32> %in) nounwind { ret <2 x i32> %val } -define <2 x float> 
@test_v2i32_to_v1f32(<2 x i32> %in) nounwind{ -; CHECK: test_v2i32_to_v1f32: +define <2 x float> @test_v2i32_to_v2f32(<2 x i32> %in) nounwind{ +; CHECK: test_v2i32_to_v2f32: ; CHECK-NEXT: // BB#0: ; CHECK-NEXT: ret diff --git a/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/test/CodeGen/AArch64/neon-bitwise-instructions.ll index 1c43b97..7e5b693 100644 --- a/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -1,502 +1,502 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s - define <8 x i8> @and8xi8(<8 x i8> %a, <8 x i8> %b) { -;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = and <8 x i8> %a, %b; ret <8 x i8> %tmp1 } define <16 x i8> @and16xi8(<16 x i8> %a, <16 x i8> %b) { -;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = and <16 x i8> %a, %b; ret <16 x i8> %tmp1 } define <8 x i8> @orr8xi8(<8 x i8> %a, <8 x i8> %b) { -;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = or <8 x i8> %a, %b; ret <8 x i8> %tmp1 } define <16 x i8> @orr16xi8(<16 x i8> %a, <16 x i8> %b) { -;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = or <16 x i8> %a, %b; ret <16 x i8> %tmp1 } define <8 x i8> @xor8xi8(<8 x i8> %a, <8 x i8> %b) { -;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = xor <8 x i8> %a, %b; ret <8 x i8> %tmp1 } define <16 x i8> @xor16xi8(<16 x i8> %a, <16 x i8> %b) { -;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = xor <16 x i8> %a, %b; ret <16 x i8> %tmp1 } define <8 x i8> 
@bsl8xi8_const(<8 x i8> %a, <8 x i8> %b) { -;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b - %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > - %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 > +;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0 > + %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1 > %tmp3 = or <8 x i8> %tmp1, %tmp2 ret <8 x i8> %tmp3 } define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) { -;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b - %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > - %tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 > +;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0 > + %tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1 > %tmp3 = or <16 x i8> %tmp1, %tmp2 ret <16 x i8> %tmp3 } define <8 x i8> @orn8xi8(<8 x i8> %a, <8 x i8> %b) { -;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > %tmp2 = or <8 x i8> %a, %tmp1 ret <8 x i8> %tmp2 } define <16 x i8> @orn16xi8(<16 x i8> %a, <16 x i8> %b) { -;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > %tmp2 = or <16 x i8> %a, 
%tmp1 ret <16 x i8> %tmp2 } define <8 x i8> @bic8xi8(<8 x i8> %a, <8 x i8> %b) { -;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > %tmp2 = and <8 x i8> %a, %tmp1 ret <8 x i8> %tmp2 } define <16 x i8> @bic16xi8(<16 x i8> %a, <16 x i8> %b) { -;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > %tmp2 = and <16 x i8> %a, %tmp1 ret <16 x i8> %tmp2 } define <2 x i32> @orrimm2s_lsl0(<2 x i32> %a) { -;CHECK: orr {{v[0-31]+}}.2s, #0xff +;CHECK: orr {{v[0-9]+}}.2s, #0xff %tmp1 = or <2 x i32> %a, < i32 255, i32 255> ret <2 x i32> %tmp1 } define <2 x i32> @orrimm2s_lsl8(<2 x i32> %a) { -;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #8 +;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8 %tmp1 = or <2 x i32> %a, < i32 65280, i32 65280> ret <2 x i32> %tmp1 } define <2 x i32> @orrimm2s_lsl16(<2 x i32> %a) { -;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #16 +;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16 %tmp1 = or <2 x i32> %a, < i32 16711680, i32 16711680> ret <2 x i32> %tmp1 } define <2 x i32> @orrimm2s_lsl24(<2 x i32> %a) { -;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #24 +;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24 %tmp1 = or <2 x i32> %a, < i32 4278190080, i32 4278190080> ret <2 x i32> %tmp1 } define <4 x i32> @orrimm4s_lsl0(<4 x i32> %a) { -;CHECK: orr {{v[0-31]+}}.4s, #0xff +;CHECK: orr {{v[0-9]+}}.4s, #0xff %tmp1 = or <4 x i32> %a, < i32 255, i32 255, i32 255, i32 255> ret <4 x i32> %tmp1 } define <4 x i32> @orrimm4s_lsl8(<4 x i32> %a) { -;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #8 +;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8 %tmp1 = or <4 x i32> %a, < i32 65280, i32 65280, i32 65280, i32 65280> ret <4 x i32> %tmp1 } define <4 x i32> 
@orrimm4s_lsl16(<4 x i32> %a) { -;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #16 +;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16 %tmp1 = or <4 x i32> %a, < i32 16711680, i32 16711680, i32 16711680, i32 16711680> ret <4 x i32> %tmp1 } define <4 x i32> @orrimm4s_lsl24(<4 x i32> %a) { -;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #24 +;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24 %tmp1 = or <4 x i32> %a, < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080> ret <4 x i32> %tmp1 } define <4 x i16> @orrimm4h_lsl0(<4 x i16> %a) { -;CHECK: orr {{v[0-31]+}}.4h, #0xff +;CHECK: orr {{v[0-9]+}}.4h, #0xff %tmp1 = or <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255 > ret <4 x i16> %tmp1 } define <4 x i16> @orrimm4h_lsl8(<4 x i16> %a) { -;CHECK: orr {{v[0-31]+}}.4h, #0xff, lsl #8 +;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8 %tmp1 = or <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 > ret <4 x i16> %tmp1 } define <8 x i16> @orrimm8h_lsl0(<8 x i16> %a) { -;CHECK: orr {{v[0-31]+}}.8h, #0xff +;CHECK: orr {{v[0-9]+}}.8h, #0xff %tmp1 = or <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 > ret <8 x i16> %tmp1 } define <8 x i16> @orrimm8h_lsl8(<8 x i16> %a) { -;CHECK: orr {{v[0-31]+}}.8h, #0xff, lsl #8 +;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8 %tmp1 = or <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 > ret <8 x i16> %tmp1 } define <2 x i32> @bicimm2s_lsl0(<2 x i32> %a) { -;CHECK: bic {{v[0-31]+}}.2s, #0x10 +;CHECK: bic {{v[0-9]+}}.2s, #0x10 %tmp1 = and <2 x i32> %a, < i32 4294967279, i32 4294967279 > ret <2 x i32> %tmp1 } define <2 x i32> @bicimm2s_lsl8(<2 x i32> %a) { -;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #8 - %tmp1 = and <2 x i32> %a, < i32 18446744073709547519, i32 18446744073709547519 > +;CHECK: bic {{v[0-9]+}}.2s, #0x10, lsl #8 + %tmp1 = and <2 x i32> %a, < i32 4294963199, i32 4294963199 > ret <2 x i32> %tmp1 } define <2 x i32> @bicimm2s_lsl16(<2 x i32> %a) { -;CHECK: bic 
{{v[0-31]+}}.2s, #0x10, lsl #16 - %tmp1 = and <2 x i32> %a, < i32 18446744073708503039, i32 18446744073708503039 > +;CHECK: bic {{v[0-9]+}}.2s, #0x10, lsl #16 + %tmp1 = and <2 x i32> %a, < i32 4293918719, i32 4293918719 > ret <2 x i32> %tmp1 } define <2 x i32> @bicimm2s_lsl124(<2 x i32> %a) { -;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #24 - %tmp1 = and <2 x i32> %a, < i32 18446744073441116159, i32 18446744073441116159> +;CHECK: bic {{v[0-9]+}}.2s, #0x10, lsl #24 + %tmp1 = and <2 x i32> %a, < i32 4026531839, i32 4026531839> ret <2 x i32> %tmp1 } define <4 x i32> @bicimm4s_lsl0(<4 x i32> %a) { -;CHECK: bic {{v[0-31]+}}.4s, #0x10 +;CHECK: bic {{v[0-9]+}}.4s, #0x10 %tmp1 = and <4 x i32> %a, < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 > ret <4 x i32> %tmp1 } define <4 x i32> @bicimm4s_lsl8(<4 x i32> %a) { -;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #8 - %tmp1 = and <4 x i32> %a, < i32 18446744073709547519, i32 18446744073709547519, i32 18446744073709547519, i32 18446744073709547519 > +;CHECK: bic {{v[0-9]+}}.4s, #0x10, lsl #8 + %tmp1 = and <4 x i32> %a, < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 > ret <4 x i32> %tmp1 } define <4 x i32> @bicimm4s_lsl16(<4 x i32> %a) { -;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #16 - %tmp1 = and <4 x i32> %a, < i32 18446744073708503039, i32 18446744073708503039, i32 18446744073708503039, i32 18446744073708503039 > +;CHECK: bic {{v[0-9]+}}.4s, #0x10, lsl #16 + %tmp1 = and <4 x i32> %a, < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 > ret <4 x i32> %tmp1 } define <4 x i32> @bicimm4s_lsl124(<4 x i32> %a) { -;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #24 - %tmp1 = and <4 x i32> %a, < i32 18446744073441116159, i32 18446744073441116159, i32 18446744073441116159, i32 18446744073441116159> +;CHECK: bic {{v[0-9]+}}.4s, #0x10, lsl #24 + %tmp1 = and <4 x i32> %a, < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839> ret <4 x i32> %tmp1 } define <4 x i16> @bicimm4h_lsl0_a(<4 x i16> %a) { 
-;CHECK: bic {{v[0-31]+}}.4h, #0x10 - %tmp1 = and <4 x i16> %a, < i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599 > +;CHECK: bic {{v[0-9]+}}.4h, #0x10 + %tmp1 = and <4 x i16> %a, < i16 4294967279, i16 4294967279, i16 4294967279, i16 4294967279 > ret <4 x i16> %tmp1 } define <4 x i16> @bicimm4h_lsl0_b(<4 x i16> %a) { -;CHECK: bic {{v[0-31]+}}.4h, #0x0 +;CHECK: bic {{v[0-9]+}}.4h, #0xff %tmp1 = and <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 > ret <4 x i16> %tmp1 } define <4 x i16> @bicimm4h_lsl8_a(<4 x i16> %a) { -;CHECK: bic {{v[0-31]+}}.4h, #0x10, lsl #8 - %tmp1 = and <4 x i16> %a, < i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519> +;CHECK: bic {{v[0-9]+}}.4h, #0x10, lsl #8 + %tmp1 = and <4 x i16> %a, < i16 4294963199, i16 4294963199, i16 4294963199, i16 4294963199> ret <4 x i16> %tmp1 } define <4 x i16> @bicimm4h_lsl8_b(<4 x i16> %a) { -;CHECK: bic {{v[0-31]+}}.4h, #0x0, lsl #8 +;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8 %tmp1 = and <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255> ret <4 x i16> %tmp1 } define <8 x i16> @bicimm8h_lsl0_a(<8 x i16> %a) { -;CHECK: bic {{v[0-31]+}}.8h, #0x10 - %tmp1 = and <8 x i16> %a, < i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, - i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599 > +;CHECK: bic {{v[0-9]+}}.8h, #0x10 + %tmp1 = and <8 x i16> %a, < i16 4294967279, i16 4294967279, i16 4294967279, i16 4294967279, + i16 4294967279, i16 4294967279, i16 4294967279, i16 4294967279 > ret <8 x i16> %tmp1 } define <8 x i16> @bicimm8h_lsl0_b(<8 x i16> %a) { -;CHECK: bic {{v[0-31]+}}.8h, #0x0 +;CHECK: bic {{v[0-9]+}}.8h, #0xff %tmp1 = and <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 > ret <8 x i16> %tmp1 } define <8 x i16> @bicimm8h_lsl8_a(<8 x i16> %a) 
{ -;CHECK: bic {{v[0-31]+}}.8h, #0x10, lsl #8 - %tmp1 = and <8 x i16> %a, < i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, - i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519> +;CHECK: bic {{v[0-9]+}}.8h, #0x10, lsl #8 + %tmp1 = and <8 x i16> %a, < i16 4294963199, i16 4294963199, i16 4294963199, i16 4294963199, + i16 4294963199, i16 4294963199, i16 4294963199, i16 4294963199> ret <8 x i16> %tmp1 } define <8 x i16> @bicimm8h_lsl8_b(<8 x i16> %a) { -;CHECK: bic {{v[0-31]+}}.8h, #0x0, lsl #8 +;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8 %tmp1 = and <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> ret <8 x i16> %tmp1 } define <2 x i32> @and2xi32(<2 x i32> %a, <2 x i32> %b) { -;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = and <2 x i32> %a, %b; ret <2 x i32> %tmp1 } define <4 x i16> @and4xi16(<4 x i16> %a, <4 x i16> %b) { -;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = and <4 x i16> %a, %b; ret <4 x i16> %tmp1 } define <1 x i64> @and1xi64(<1 x i64> %a, <1 x i64> %b) { -;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = and <1 x i64> %a, %b; ret <1 x i64> %tmp1 } define <4 x i32> @and4xi32(<4 x i32> %a, <4 x i32> %b) { -;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = and <4 x i32> %a, %b; ret <4 x i32> %tmp1 } define <8 x i16> @and8xi16(<8 x i16> %a, <8 x i16> %b) { -;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = and <8 x i16> %a, %b; ret <8 x i16> %tmp1 } define <2 x i64> @and2xi64(<2 x i64> %a, <2 x i64> %b) { 
-;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = and <2 x i64> %a, %b; ret <2 x i64> %tmp1 } define <2 x i32> @orr2xi32(<2 x i32> %a, <2 x i32> %b) { -;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = or <2 x i32> %a, %b; ret <2 x i32> %tmp1 } define <4 x i16> @orr4xi16(<4 x i16> %a, <4 x i16> %b) { -;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = or <4 x i16> %a, %b; ret <4 x i16> %tmp1 } define <1 x i64> @orr1xi64(<1 x i64> %a, <1 x i64> %b) { -;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = or <1 x i64> %a, %b; ret <1 x i64> %tmp1 } define <4 x i32> @orr4xi32(<4 x i32> %a, <4 x i32> %b) { -;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = or <4 x i32> %a, %b; ret <4 x i32> %tmp1 } define <8 x i16> @orr8xi16(<8 x i16> %a, <8 x i16> %b) { -;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = or <8 x i16> %a, %b; ret <8 x i16> %tmp1 } define <2 x i64> @orr2xi64(<2 x i64> %a, <2 x i64> %b) { -;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = or <2 x i64> %a, %b; ret <2 x i64> %tmp1 } define <2 x i32> @eor2xi32(<2 x i32> %a, <2 x i32> %b) { -;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = xor <2 x i32> %a, %b; ret <2 x i32> %tmp1 } define <4 x i16> @eor4xi16(<4 x i16> %a, <4 x i16> %b) { -;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b 
%tmp1 = xor <4 x i16> %a, %b; ret <4 x i16> %tmp1 } define <1 x i64> @eor1xi64(<1 x i64> %a, <1 x i64> %b) { -;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = xor <1 x i64> %a, %b; ret <1 x i64> %tmp1 } define <4 x i32> @eor4xi32(<4 x i32> %a, <4 x i32> %b) { -;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = xor <4 x i32> %a, %b; ret <4 x i32> %tmp1 } define <8 x i16> @eor8xi16(<8 x i16> %a, <8 x i16> %b) { -;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = xor <8 x i16> %a, %b; ret <8 x i16> %tmp1 } define <2 x i64> @eor2xi64(<2 x i64> %a, <2 x i64> %b) { -;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = xor <2 x i64> %a, %b; ret <2 x i64> %tmp1 } define <2 x i32> @bic2xi32(<2 x i32> %a, <2 x i32> %b) { -;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 > %tmp2 = and <2 x i32> %a, %tmp1 ret <2 x i32> %tmp2 } define <4 x i16> @bic4xi16(<4 x i16> %a, <4 x i16> %b) { -;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1 > %tmp2 = and <4 x i16> %a, %tmp1 ret <4 x i16> %tmp2 } define <1 x i64> @bic1xi64(<1 x i64> %a, <1 x i64> %b) { -;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = xor <1 x i64> %b, < i64 -1> %tmp2 = and <1 x i64> %a, %tmp1 ret <1 x i64> %tmp2 } define <4 x i32> @bic4xi32(<4 x i32> %a, <4 x i32> %b) { -;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: bic {{v[0-9]+}}.16b, 
{{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1> %tmp2 = and <4 x i32> %a, %tmp1 ret <4 x i32> %tmp2 } define <8 x i16> @bic8xi16(<8 x i16> %a, <8 x i16> %b) { -;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1, i16 -1, i16 -1, i16 -1, i16 -1 > %tmp2 = and <8 x i16> %a, %tmp1 ret <8 x i16> %tmp2 } define <2 x i64> @bic2xi64(<2 x i64> %a, <2 x i64> %b) { -;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1> %tmp2 = and <2 x i64> %a, %tmp1 ret <2 x i64> %tmp2 } define <2 x i32> @orn2xi32(<2 x i32> %a, <2 x i32> %b) { -;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 > %tmp2 = or <2 x i32> %a, %tmp1 ret <2 x i32> %tmp2 } define <4 x i16> @orn4xi16(<4 x i16> %a, <4 x i16> %b) { -;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1 > %tmp2 = or <4 x i16> %a, %tmp1 ret <4 x i16> %tmp2 } define <1 x i64> @orn1xi64(<1 x i64> %a, <1 x i64> %b) { -;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = xor <1 x i64> %b, < i64 -1> %tmp2 = or <1 x i64> %a, %tmp1 ret <1 x i64> %tmp2 } define <4 x i32> @orn4xi32(<4 x i32> %a, <4 x i32> %b) { -;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1> %tmp2 = or <4 x i32> %a, %tmp1 ret <4 x i32> %tmp2 } define <8 x i16> @orn8xi16(<8 x i16> %a, <8 x i16> %b) { -;CHECK: orn {{v[0-31]+}}.16b, 
{{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1, i16 -1, i16 -1, i16 -1, i16 -1 > %tmp2 = or <8 x i16> %a, %tmp1 ret <8 x i16> %tmp2 } define <2 x i64> @orn2xi64(<2 x i64> %a, <2 x i64> %b) { -;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1> %tmp2 = or <2 x i64> %a, %tmp1 ret <2 x i64> %tmp2 } + define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) { -;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b - %tmp1 = and <2 x i32> %a, < i32 -1, i32 -1 > - %tmp2 = and <2 x i32> %b, < i32 0, i32 0 > +;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp1 = and <2 x i32> %a, < i32 -1, i32 0 > + %tmp2 = and <2 x i32> %b, < i32 0, i32 -1 > %tmp3 = or <2 x i32> %tmp1, %tmp2 ret <2 x i32> %tmp3 } define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b) { -;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b - %tmp1 = and <4 x i16> %a, < i16 -1, i16 -1, i16 -1,i16 -1 > - %tmp2 = and <4 x i16> %b, < i16 0, i16 0,i16 0, i16 0 > +;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp1 = and <4 x i16> %a, < i16 -1, i16 0, i16 -1,i16 0 > + %tmp2 = and <4 x i16> %b, < i16 0, i16 -1,i16 0, i16 -1 > %tmp3 = or <4 x i16> %tmp1, %tmp2 ret <4 x i16> %tmp3 } define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) { -;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b - %tmp1 = and <1 x i64> %a, < i64 -1 > - %tmp2 = and <1 x i64> %b, < i64 0 > +;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp1 = and <1 x i64> %a, < i64 -16 > + %tmp2 = and <1 x i64> %b, < i64 15 > %tmp3 = or <1 x i64> %tmp1, %tmp2 ret <1 x i64> %tmp3 } define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b) { -;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b - %tmp1 = and <4 x i32> %a, < i32 
-1, i32 -1, i32 -1, i32 -1 > - %tmp2 = and <4 x i32> %b, < i32 0, i32 0, i32 0, i32 0 > +;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp1 = and <4 x i32> %a, < i32 -1, i32 0, i32 -1, i32 0 > + %tmp2 = and <4 x i32> %b, < i32 0, i32 -1, i32 0, i32 -1 > %tmp3 = or <4 x i32> %tmp1, %tmp2 ret <4 x i32> %tmp3 } define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b) { -;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b - %tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 -1,i16 -1, i16 -1, i16 -1, i16 -1,i16 -1 > - %tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0 > +;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 0,i16 0, i16 -1, i16 -1, i16 0,i16 0 > + %tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 -1, i16 -1, i16 0, i16 0, i16 -1, i16 -1 > %tmp3 = or <8 x i16> %tmp1, %tmp2 ret <8 x i16> %tmp3 } define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) { -;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b - %tmp1 = and <2 x i64> %a, < i64 -1, i64 -1 > - %tmp2 = and <2 x i64> %b, < i64 0, i64 0 > +;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp1 = and <2 x i64> %a, < i64 -1, i64 0 > + %tmp2 = and <2 x i64> %b, < i64 0, i64 -1 > %tmp3 = or <2 x i64> %tmp1, %tmp2 ret <2 x i64> %tmp3 } define <8 x i8> @bsl8xi8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { -;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %1 = and <8 x i8> %v1, %v2 %2 = xor <8 x i8> %v1, %3 = and <8 x i8> %2, %v3 @@ -505,7 +505,7 @@ define <8 x i8> @bsl8xi8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { } define <4 x i16> @bsl4xi16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) { -;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %1 = and <4 x i16> %v1, %v2 %2 = xor <4 x i16> %v1, %3 = and <4 x i16> %2, 
%v3 @@ -514,7 +514,7 @@ define <4 x i16> @bsl4xi16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) { } define <2 x i32> @bsl2xi32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { -;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %1 = and <2 x i32> %v1, %v2 %2 = xor <2 x i32> %v1, %3 = and <2 x i32> %2, %v3 @@ -523,7 +523,7 @@ define <2 x i32> @bsl2xi32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { } define <1 x i64> @bsl1xi64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) { -;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %1 = and <1 x i64> %v1, %v2 %2 = xor <1 x i64> %v1, %3 = and <1 x i64> %2, %v3 @@ -532,7 +532,7 @@ define <1 x i64> @bsl1xi64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) { } define <16 x i8> @bsl16xi8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { -;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %1 = and <16 x i8> %v1, %v2 %2 = xor <16 x i8> %v1, %3 = and <16 x i8> %2, %v3 @@ -541,7 +541,7 @@ define <16 x i8> @bsl16xi8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { } define <8 x i16> @bsl8xi16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) { -;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %1 = and <8 x i16> %v1, %v2 %2 = xor <8 x i16> %v1, %3 = and <8 x i16> %2, %v3 @@ -550,7 +550,7 @@ define <8 x i16> @bsl8xi16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) { } define <4 x i32> @bsl4xi32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { -;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %1 = and <4 x i32> %v1, %v2 %2 = xor <4 x i32> %v1, %3 = and <4 x i32> %2, %v3 @@ -558,8 +558,65 @@ define <4 x i32> @bsl4xi32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { ret <4 x i32> %4 } 
+define <8 x i8> @vselect_v8i8(<8 x i8> %a) { +;CHECK: movi {{d[0-9]+}}, #0xffff +;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %b = select <8 x i1> , <8 x i8> %a, <8 x i8> + ret <8 x i8> %b +} + +define <4 x i16> @vselect_v4i16(<4 x i16> %a) { +;CHECK: movi {{d[0-9]+}}, #0xffff +;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %b = select <4 x i1> , <4 x i16> %a, <4 x i16> + ret <4 x i16> %b +} + +define <8 x i8> @vselect_cmp_ne(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %cmp = icmp ne <8 x i8> %a, %b + %d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c + ret <8 x i8> %d +} + +define <8 x i8> @vselect_cmp_eq(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %cmp = icmp eq <8 x i8> %a, %b + %d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c + ret <8 x i8> %d +} + +define <8 x i8> @vselect_cmpz_ne(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0 +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %cmp = icmp ne <8 x i8> %a, zeroinitializer + %d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c + ret <8 x i8> %d +} + +define <8 x i8> @vselect_cmpz_eq(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0 +;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %cmp = icmp eq <8 x i8> %a, zeroinitializer + %d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c + ret <8 x i8> %d +} + +define <8 x i8> @vselect_tst(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +;CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = and <8 x 
i8> %a, %b + %tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer + %d = select <8 x i1> %tmp4, <8 x i8> %b, <8 x i8> %c + ret <8 x i8> %d +} + define <2 x i64> @bsl2xi64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) { -;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %1 = and <2 x i64> %v1, %v2 %2 = xor <2 x i64> %v1, %3 = and <2 x i64> %2, %v3 @@ -568,27 +625,459 @@ define <2 x i64> @bsl2xi64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) { } define <8 x i8> @orrimm8b_as_orrimm4h_lsl0(<8 x i8> %a) { -;CHECK: orr {{v[0-31]+}}.4h, #0xff +;CHECK: orr {{v[0-9]+}}.4h, #0xff %val = or <8 x i8> %a, ret <8 x i8> %val } define <8 x i8> @orrimm8b_as_orimm4h_lsl8(<8 x i8> %a) { -;CHECK: orr {{v[0-31]+}}.4h, #0xff, lsl #8 +;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8 %val = or <8 x i8> %a, ret <8 x i8> %val } define <16 x i8> @orimm16b_as_orrimm8h_lsl0(<16 x i8> %a) { -;CHECK: orr {{v[0-31]+}}.8h, #0xff +;CHECK: orr {{v[0-9]+}}.8h, #0xff %val = or <16 x i8> %a, ret <16 x i8> %val } define <16 x i8> @orimm16b_as_orrimm8h_lsl8(<16 x i8> %a) { -;CHECK: orr {{v[0-31]+}}.8h, #0xff, lsl #8 +;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8 %val = or <16 x i8> %a, ret <16 x i8> %val } +define <8 x i8> @and8imm2s_lsl0(<8 x i8> %a) { +;CHECK: bic {{v[0-9]+}}.2s, #0xff + %tmp1 = and <8 x i8> %a, < i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255> + ret <8 x i8> %tmp1 +} + +define <8 x i8> @and8imm2s_lsl8(<8 x i8> %a) { +;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #8 + %tmp1 = and <8 x i8> %a, < i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255> + ret <8 x i8> %tmp1 +} + +define <8 x i8> @and8imm2s_lsl16(<8 x i8> %a) { +;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #16 + %tmp1 = and <8 x i8> %a, < i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255> + ret <8 x i8> %tmp1 +} + +define <8 x i8> @and8imm2s_lsl24(<8 x i8> %a) { +;CHECK: bic {{v[0-9]+}}.2s, #0xfe, lsl #24 + %tmp1 = and <8 x i8> %a, < i8 255, i8 255, i8 
255, i8 1, i8 255, i8 255, i8 255, i8 1> + ret <8 x i8> %tmp1 +} + +define <4 x i16> @and16imm2s_lsl0(<4 x i16> %a) { +;CHECK: bic {{v[0-9]+}}.2s, #0xff + %tmp1 = and <4 x i16> %a, < i16 65280, i16 65535, i16 65280, i16 65535> + ret <4 x i16> %tmp1 +} + +define <4 x i16> @and16imm2s_lsl8(<4 x i16> %a) { +;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #8 + %tmp1 = and <4 x i16> %a, < i16 255, i16 65535, i16 255, i16 65535> + ret <4 x i16> %tmp1 +} + +define <4 x i16> @and16imm2s_lsl16(<4 x i16> %a) { +;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #16 + %tmp1 = and <4 x i16> %a, < i16 65535, i16 65280, i16 65535, i16 65280> + ret <4 x i16> %tmp1 +} + +define <4 x i16> @and16imm2s_lsl24(<4 x i16> %a) { +;CHECK: bic {{v[0-9]+}}.2s, #0xfe, lsl #24 + %tmp1 = and <4 x i16> %a, < i16 65535, i16 511, i16 65535, i16 511> + ret <4 x i16> %tmp1 +} + + +define <1 x i64> @and64imm2s_lsl0(<1 x i64> %a) { +;CHECK: bic {{v[0-9]+}}.2s, #0xff + %tmp1 = and <1 x i64> %a, < i64 -1095216660736> + ret <1 x i64> %tmp1 +} + +define <1 x i64> @and64imm2s_lsl8(<1 x i64> %a) { +;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #8 + %tmp1 = and <1 x i64> %a, < i64 -280375465148161> + ret <1 x i64> %tmp1 +} + +define <1 x i64> @and64imm2s_lsl16(<1 x i64> %a) { +;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #16 + %tmp1 = and <1 x i64> %a, < i64 -71776119077928961> + ret <1 x i64> %tmp1 +} + +define <1 x i64> @and64imm2s_lsl24(<1 x i64> %a) { +;CHECK: bic {{v[0-9]+}}.2s, #0xfe, lsl #24 + %tmp1 = and <1 x i64> %a, < i64 144115183814443007> + ret <1 x i64> %tmp1 +} + +define <16 x i8> @and8imm4s_lsl0(<16 x i8> %a) { +;CHECK: bic {{v[0-9]+}}.4s, #0xff + %tmp1 = and <16 x i8> %a, < i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255> + ret <16 x i8> %tmp1 +} + +define <16 x i8> @and8imm4s_lsl8(<16 x i8> %a) { +;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #8 + %tmp1 = and <16 x i8> %a, < i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 
255, i8 255, i8 0, i8 255, i8 255> + ret <16 x i8> %tmp1 +} + +define <16 x i8> @and8imm4s_lsl16(<16 x i8> %a) { +;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #16 + %tmp1 = and <16 x i8> %a, < i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255> + ret <16 x i8> %tmp1 +} + +define <16 x i8> @and8imm4s_lsl24(<16 x i8> %a) { +;CHECK: bic {{v[0-9]+}}.4s, #0xfe, lsl #24 + %tmp1 = and <16 x i8> %a, < i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1> + ret <16 x i8> %tmp1 +} + +define <8 x i16> @and16imm4s_lsl0(<8 x i16> %a) { +;CHECK: bic {{v[0-9]+}}.4s, #0xff + %tmp1 = and <8 x i16> %a, < i16 65280, i16 65535, i16 65280, i16 65535, i16 65280, i16 65535, i16 65280, i16 65535> + ret <8 x i16> %tmp1 +} + +define <8 x i16> @and16imm4s_lsl8(<8 x i16> %a) { +;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #8 + %tmp1 = and <8 x i16> %a, < i16 255, i16 65535, i16 255, i16 65535, i16 255, i16 65535, i16 255, i16 65535> + ret <8 x i16> %tmp1 +} + +define <8 x i16> @and16imm4s_lsl16(<8 x i16> %a) { +;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #16 + %tmp1 = and <8 x i16> %a, < i16 65535, i16 65280, i16 65535, i16 65280, i16 65535, i16 65280, i16 65535, i16 65280> + ret <8 x i16> %tmp1 +} + +define <8 x i16> @and16imm4s_lsl24(<8 x i16> %a) { +;CHECK: bic {{v[0-9]+}}.4s, #0xfe, lsl #24 + %tmp1 = and <8 x i16> %a, < i16 65535, i16 511, i16 65535, i16 511, i16 65535, i16 511, i16 65535, i16 511> + ret <8 x i16> %tmp1 +} + +define <2 x i64> @and64imm4s_lsl0(<2 x i64> %a) { +;CHECK: bic {{v[0-9]+}}.4s, #0xff + %tmp1 = and <2 x i64> %a, < i64 -1095216660736, i64 -1095216660736> + ret <2 x i64> %tmp1 +} + +define <2 x i64> @and64imm4s_lsl8(<2 x i64> %a) { +;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #8 + %tmp1 = and <2 x i64> %a, < i64 -280375465148161, i64 -280375465148161> + ret <2 x i64> %tmp1 +} + +define <2 x i64> @and64imm4s_lsl16(<2 x i64> %a) { +;CHECK: bic 
{{v[0-9]+}}.4s, #0xff, lsl #16 + %tmp1 = and <2 x i64> %a, < i64 -71776119077928961, i64 -71776119077928961> + ret <2 x i64> %tmp1 +} + +define <2 x i64> @and64imm4s_lsl24(<2 x i64> %a) { +;CHECK: bic {{v[0-9]+}}.4s, #0xfe, lsl #24 + %tmp1 = and <2 x i64> %a, < i64 144115183814443007, i64 144115183814443007> + ret <2 x i64> %tmp1 +} + +define <8 x i8> @and8imm4h_lsl0(<8 x i8> %a) { +;CHECK: bic {{v[0-9]+}}.4h, #0xff + %tmp1 = and <8 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255> + ret <8 x i8> %tmp1 +} + +define <8 x i8> @and8imm4h_lsl8(<8 x i8> %a) { +;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8 + %tmp1 = and <8 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0> + ret <8 x i8> %tmp1 +} + +define <2 x i32> @and16imm4h_lsl0(<2 x i32> %a) { +;CHECK: bic {{v[0-9]+}}.4h, #0xff + %tmp1 = and <2 x i32> %a, < i32 4278255360, i32 4278255360> + ret <2 x i32> %tmp1 +} + +define <2 x i32> @and16imm4h_lsl8(<2 x i32> %a) { +;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8 + %tmp1 = and <2 x i32> %a, < i32 16711935, i32 16711935> + ret <2 x i32> %tmp1 +} + +define <1 x i64> @and64imm4h_lsl0(<1 x i64> %a) { +;CHECK: bic {{v[0-9]+}}.4h, #0xff + %tmp1 = and <1 x i64> %a, < i64 -71777214294589696> + ret <1 x i64> %tmp1 +} + +define <1 x i64> @and64imm4h_lsl8(<1 x i64> %a) { +;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8 + %tmp1 = and <1 x i64> %a, < i64 71777214294589695> + ret <1 x i64> %tmp1 +} + +define <16 x i8> @and8imm8h_lsl0(<16 x i8> %a) { +;CHECK: bic {{v[0-9]+}}.8h, #0xff + %tmp1 = and <16 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255 > + ret <16 x i8> %tmp1 +} + +define <16 x i8> @and8imm8h_lsl8(<16 x i8> %a) { +;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8 + %tmp1 = and <16 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0 > + ret <16 x i8> %tmp1 +} + +define <4 x i32> @and16imm8h_lsl0(<4 x i32> %a) { +;CHECK: bic {{v[0-9]+}}.8h, #0xff + %tmp1 = and <4 x i32> %a, < i32 4278255360, i32 4278255360, i32 4278255360, i32
4278255360> + ret <4 x i32> %tmp1 +} + +define <4 x i32> @and16imm8h_lsl8(<4 x i32> %a) { +;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8 + %tmp1 = and <4 x i32> %a, < i32 16711935, i32 16711935, i32 16711935, i32 16711935> + ret <4 x i32> %tmp1 +} + +define <2 x i64> @and64imm8h_lsl0(<2 x i64> %a) { +;CHECK: bic {{v[0-9]+}}.8h, #0xff + %tmp1 = and <2 x i64> %a, < i64 -71777214294589696, i64 -71777214294589696> + ret <2 x i64> %tmp1 +} + +define <2 x i64> @and64imm8h_lsl8(<2 x i64> %a) { +;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8 + %tmp1 = and <2 x i64> %a, < i64 71777214294589695, i64 71777214294589695> + ret <2 x i64> %tmp1 +} + +define <8 x i8> @orr8imm2s_lsl0(<8 x i8> %a) { +;CHECK: orr {{v[0-9]+}}.2s, #0xff + %tmp1 = or <8 x i8> %a, < i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0> + ret <8 x i8> %tmp1 +} + +define <8 x i8> @orr8imm2s_lsl8(<8 x i8> %a) { +;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8 + %tmp1 = or <8 x i8> %a, < i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0> + ret <8 x i8> %tmp1 +} + +define <8 x i8> @orr8imm2s_lsl16(<8 x i8> %a) { +;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16 + %tmp1 = or <8 x i8> %a, < i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0> + ret <8 x i8> %tmp1 +} + +define <8 x i8> @orr8imm2s_lsl24(<8 x i8> %a) { +;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24 + %tmp1 = or <8 x i8> %a, < i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255> + ret <8 x i8> %tmp1 +} + +define <4 x i16> @orr16imm2s_lsl0(<4 x i16> %a) { +;CHECK: orr {{v[0-9]+}}.2s, #0xff + %tmp1 = or <4 x i16> %a, < i16 255, i16 0, i16 255, i16 0> + ret <4 x i16> %tmp1 +} + +define <4 x i16> @orr16imm2s_lsl8(<4 x i16> %a) { +;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8 + %tmp1 = or <4 x i16> %a, < i16 65280, i16 0, i16 65280, i16 0> + ret <4 x i16> %tmp1 +} + +define <4 x i16> @orr16imm2s_lsl16(<4 x i16> %a) { +;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16 + %tmp1 = or <4 x i16> %a, < i16 0, i16 255, i16 0, i16 255> + ret <4 x i16> %tmp1 +} + +define <4 x i16> 
@orr16imm2s_lsl24(<4 x i16> %a) { +;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24 + %tmp1 = or <4 x i16> %a, < i16 0, i16 65280, i16 0, i16 65280> + ret <4 x i16> %tmp1 +} + +define <1 x i64> @orr64imm2s_lsl0(<1 x i64> %a) { +;CHECK: orr {{v[0-9]+}}.2s, #0xff + %tmp1 = or <1 x i64> %a, < i64 1095216660735> + ret <1 x i64> %tmp1 +} + +define <1 x i64> @orr64imm2s_lsl8(<1 x i64> %a) { +;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8 + %tmp1 = or <1 x i64> %a, < i64 280375465148160> + ret <1 x i64> %tmp1 +} + +define <1 x i64> @orr64imm2s_lsl16(<1 x i64> %a) { +;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16 + %tmp1 = or <1 x i64> %a, < i64 71776119077928960> + ret <1 x i64> %tmp1 +} + +define <1 x i64> @orr64imm2s_lsl24(<1 x i64> %a) { +;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24 + %tmp1 = or <1 x i64> %a, < i64 -72057589759737856> + ret <1 x i64> %tmp1 +} + +define <16 x i8> @orr8imm4s_lsl0(<16 x i8> %a) { +;CHECK: orr {{v[0-9]+}}.4s, #0xff + %tmp1 = or <16 x i8> %a, < i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0> + ret <16 x i8> %tmp1 +} + +define <16 x i8> @orr8imm4s_lsl8(<16 x i8> %a) { +;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8 + %tmp1 = or <16 x i8> %a, < i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0> + ret <16 x i8> %tmp1 +} + +define <16 x i8> @orr8imm4s_lsl16(<16 x i8> %a) { +;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16 + %tmp1 = or <16 x i8> %a, < i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0> + ret <16 x i8> %tmp1 +} + +define <16 x i8> @orr8imm4s_lsl24(<16 x i8> %a) { +;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24 + %tmp1 = or <16 x i8> %a, < i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255> + ret <16 x i8> %tmp1 +} + +define <8 x i16> @orr16imm4s_lsl0(<8 x i16> %a) { +;CHECK: orr {{v[0-9]+}}.4s, #0xff + %tmp1 = or <8 x i16> %a, < i16 255, i16 0, i16 
255, i16 0, i16 255, i16 0, i16 255, i16 0> + ret <8 x i16> %tmp1 +} + +define <8 x i16> @orr16imm4s_lsl8(<8 x i16> %a) { +;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8 + %tmp1 = or <8 x i16> %a, < i16 65280, i16 0, i16 65280, i16 0, i16 65280, i16 0, i16 65280, i16 0> + ret <8 x i16> %tmp1 +} + +define <8 x i16> @orr16imm4s_lsl16(<8 x i16> %a) { +;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16 + %tmp1 = or <8 x i16> %a, < i16 0, i16 255, i16 0, i16 255, i16 0, i16 255, i16 0, i16 255> + ret <8 x i16> %tmp1 +} + +define <8 x i16> @orr16imm4s_lsl24(<8 x i16> %a) { +;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24 + %tmp1 = or <8 x i16> %a, < i16 0, i16 65280, i16 0, i16 65280, i16 0, i16 65280, i16 0, i16 65280> + ret <8 x i16> %tmp1 +} + +define <2 x i64> @orr64imm4s_lsl0(<2 x i64> %a) { +;CHECK: orr {{v[0-9]+}}.4s, #0xff + %tmp1 = or <2 x i64> %a, < i64 1095216660735, i64 1095216660735> + ret <2 x i64> %tmp1 +} + +define <2 x i64> @orr64imm4s_lsl8(<2 x i64> %a) { +;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8 + %tmp1 = or <2 x i64> %a, < i64 280375465148160, i64 280375465148160> + ret <2 x i64> %tmp1 +} + +define <2 x i64> @orr64imm4s_lsl16(<2 x i64> %a) { +;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16 + %tmp1 = or <2 x i64> %a, < i64 71776119077928960, i64 71776119077928960> + ret <2 x i64> %tmp1 +} + +define <2 x i64> @orr64imm4s_lsl24(<2 x i64> %a) { +;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24 + %tmp1 = or <2 x i64> %a, < i64 -72057589759737856, i64 -72057589759737856> + ret <2 x i64> %tmp1 +} + +define <8 x i8> @orr8imm4h_lsl0(<8 x i8> %a) { +;CHECK: orr {{v[0-9]+}}.4h, #0xff + %tmp1 = or <8 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0> + ret <8 x i8> %tmp1 +} + +define <8 x i8> @orr8imm4h_lsl8(<8 x i8> %a) { +;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8 + %tmp1 = or <8 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255> + ret <8 x i8> %tmp1 +} + +define <2 x i32> @orr16imm4h_lsl0(<2 x i32> %a) { +;CHECK: orr {{v[0-9]+}}.4h, #0xff + %tmp1 = or <2 
x i32> %a, < i32 16711935, i32 16711935> + ret <2 x i32> %tmp1 +} + +define <2 x i32> @orr16imm4h_lsl8(<2 x i32> %a) { +;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8 + %tmp1 = or <2 x i32> %a, < i32 4278255360, i32 4278255360> + ret <2 x i32> %tmp1 +} + +define <1 x i64> @orr64imm4h_lsl0(<1 x i64> %a) { +;CHECK: orr {{v[0-9]+}}.4h, #0xff + %tmp1 = or <1 x i64> %a, < i64 71777214294589695> + ret <1 x i64> %tmp1 +} + +define <1 x i64> @orr64imm4h_lsl8(<1 x i64> %a) { +;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8 + %tmp1 = or <1 x i64> %a, < i64 -71777214294589696> + ret <1 x i64> %tmp1 +} + +define <16 x i8> @orr8imm8h_lsl0(<16 x i8> %a) { +;CHECK: orr {{v[0-9]+}}.8h, #0xff + %tmp1 = or <16 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0> + ret <16 x i8> %tmp1 +} + +define <16 x i8> @orr8imm8h_lsl8(<16 x i8> %a) { +;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8 + %tmp1 = or <16 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255> + ret <16 x i8> %tmp1 +} + +define <4 x i32> @orr16imm8h_lsl0(<4 x i32> %a) { +;CHECK: orr {{v[0-9]+}}.8h, #0xff + %tmp1 = or <4 x i32> %a, < i32 16711935, i32 16711935, i32 16711935, i32 16711935> + ret <4 x i32> %tmp1 +} + +define <4 x i32> @orr16imm8h_lsl8(<4 x i32> %a) { +;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8 + %tmp1 = or <4 x i32> %a, < i32 4278255360, i32 4278255360, i32 4278255360, i32 4278255360> + ret <4 x i32> %tmp1 +} + +define <2 x i64> @orr64imm8h_lsl0(<2 x i64> %a) { +;CHECK: orr {{v[0-9]+}}.8h, #0xff + %tmp1 = or <2 x i64> %a, < i64 71777214294589695, i64 71777214294589695> + ret <2 x i64> %tmp1 +} + +define <2 x i64> @orr64imm8h_lsl8(<2 x i64> %a) { +;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8 + %tmp1 = or <2 x i64> %a, < i64 -71777214294589696, i64 -71777214294589696> + ret <2 x i64> %tmp1 +} diff --git a/test/CodeGen/AArch64/neon-bsl.ll b/test/CodeGen/AArch64/neon-bsl.ll index 
6bd923d..c55fd01 100644 --- a/test/CodeGen/AArch64/neon-bsl.ll +++ b/test/CodeGen/AArch64/neon-bsl.ll @@ -220,3 +220,16 @@ entry: ret <2 x double> %vbsl3.i } +define <2 x double> @test_bsl_v2f64(<2 x i1> %v1, <2 x double> %v2, <2 x double> %v3) { +; CHECK-LABEL: test_bsl_v2f64: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %1 = select <2 x i1> %v1, <2 x double> %v2, <2 x double> %v3 + ret <2 x double> %1 +} + +define <4 x float> @test_bsl_v4f32(<4 x i1> %v1, <4 x float> %v2, <4 x float> %v3) { +; CHECK-LABEL: test_bsl_v4f32: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %1 = select <4 x i1> %v1, <4 x float> %v2, <4 x float> %v3 + ret <4 x float> %1 +} diff --git a/test/CodeGen/AArch64/neon-copy.ll b/test/CodeGen/AArch64/neon-copy.ll index e18530e..b4d55df 100644 --- a/test/CodeGen/AArch64/neon-copy.ll +++ b/test/CodeGen/AArch64/neon-copy.ll @@ -2,269 +2,269 @@ define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) { -;CHECK: ins {{v[0-31]+}}.b[15], {{w[0-31]+}} +;CHECK: ins {{v[0-9]+}}.b[15], {{w[0-9]+}} %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 15 ret <16 x i8> %tmp3 } define <8 x i16> @ins8hw(<8 x i16> %tmp1, i16 %tmp2) { -;CHECK: ins {{v[0-31]+}}.h[6], {{w[0-31]+}} +;CHECK: ins {{v[0-9]+}}.h[6], {{w[0-9]+}} %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 6 ret <8 x i16> %tmp3 } define <4 x i32> @ins4sw(<4 x i32> %tmp1, i32 %tmp2) { -;CHECK: ins {{v[0-31]+}}.s[2], {{w[0-31]+}} +;CHECK: ins {{v[0-9]+}}.s[2], {{w[0-9]+}} %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 2 ret <4 x i32> %tmp3 } define <2 x i64> @ins2dw(<2 x i64> %tmp1, i64 %tmp2) { -;CHECK: ins {{v[0-31]+}}.d[1], {{x[0-31]+}} +;CHECK: ins {{v[0-9]+}}.d[1], {{x[0-9]+}} %tmp3 = insertelement <2 x i64> %tmp1, i64 %tmp2, i32 1 ret <2 x i64> %tmp3 } define <8 x i8> @ins8bw(<8 x i8> %tmp1, i8 %tmp2) { -;CHECK: ins {{v[0-31]+}}.b[5], {{w[0-31]+}} +;CHECK: ins {{v[0-9]+}}.b[5], {{w[0-9]+}} %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 5 ret <8 
x i8> %tmp3 } define <4 x i16> @ins4hw(<4 x i16> %tmp1, i16 %tmp2) { -;CHECK: ins {{v[0-31]+}}.h[3], {{w[0-31]+}} +;CHECK: ins {{v[0-9]+}}.h[3], {{w[0-9]+}} %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 3 ret <4 x i16> %tmp3 } define <2 x i32> @ins2sw(<2 x i32> %tmp1, i32 %tmp2) { -;CHECK: ins {{v[0-31]+}}.s[1], {{w[0-31]+}} +;CHECK: ins {{v[0-9]+}}.s[1], {{w[0-9]+}} %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1 ret <2 x i32> %tmp3 } define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) { -;CHECK: ins {{v[0-31]+}}.b[15], {{v[0-31]+}}.b[2] +;CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2] %tmp3 = extractelement <16 x i8> %tmp1, i32 2 %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15 ret <16 x i8> %tmp4 } define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) { -;CHECK: ins {{v[0-31]+}}.h[7], {{v[0-31]+}}.h[2] +;CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2] %tmp3 = extractelement <8 x i16> %tmp1, i32 2 %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7 ret <8 x i16> %tmp4 } define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) { -;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2] +;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2] %tmp3 = extractelement <4 x i32> %tmp1, i32 2 %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1 ret <4 x i32> %tmp4 } define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) { -;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0] +;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] %tmp3 = extractelement <2 x i64> %tmp1, i32 0 %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1 ret <2 x i64> %tmp4 } define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) { -;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2] +;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2] %tmp3 = extractelement <4 x float> %tmp1, i32 2 %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1 ret <4 x float> %tmp4 } define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) { -;CHECK: ins {{v[0-31]+}}.d[1], 
{{v[0-31]+}}.d[0] +;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] %tmp3 = extractelement <2 x double> %tmp1, i32 0 %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1 ret <2 x double> %tmp4 } define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) { -;CHECK: ins {{v[0-31]+}}.b[15], {{v[0-31]+}}.b[2] +;CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2] %tmp3 = extractelement <8 x i8> %tmp1, i32 2 %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15 ret <16 x i8> %tmp4 } define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) { -;CHECK: ins {{v[0-31]+}}.h[7], {{v[0-31]+}}.h[2] +;CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2] %tmp3 = extractelement <4 x i16> %tmp1, i32 2 %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7 ret <8 x i16> %tmp4 } define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) { -;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[1] +;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1] %tmp3 = extractelement <2 x i32> %tmp1, i32 1 %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1 ret <4 x i32> %tmp4 } define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) { -;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0] +;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] %tmp3 = extractelement <1 x i64> %tmp1, i32 0 %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1 ret <2 x i64> %tmp4 } define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) { -;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[1] +;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1] %tmp3 = extractelement <2 x float> %tmp1, i32 1 %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1 ret <4 x float> %tmp4 } define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) { -;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0] +;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] %tmp3 = extractelement <1 x double> %tmp1, i32 0 %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1 ret <2 x double> %tmp4 } define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) { 
-;CHECK: ins {{v[0-31]+}}.b[7], {{v[0-31]+}}.b[2] +;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[2] %tmp3 = extractelement <16 x i8> %tmp1, i32 2 %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7 ret <8 x i8> %tmp4 } define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) { -;CHECK: ins {{v[0-31]+}}.h[3], {{v[0-31]+}}.h[2] +;CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2] %tmp3 = extractelement <8 x i16> %tmp1, i32 2 %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3 ret <4 x i16> %tmp4 } define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) { -;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2] +;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2] %tmp3 = extractelement <4 x i32> %tmp1, i32 2 %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1 ret <2 x i32> %tmp4 } define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) { -;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0] +;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0] %tmp3 = extractelement <2 x i64> %tmp1, i32 0 %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0 ret <1 x i64> %tmp4 } define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) { -;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2] +;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2] %tmp3 = extractelement <4 x float> %tmp1, i32 2 %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1 ret <2 x float> %tmp4 } define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) { -;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0] +;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0] %tmp3 = extractelement <2 x double> %tmp1, i32 0 %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0 ret <1 x double> %tmp4 } define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) { -;CHECK: ins {{v[0-31]+}}.b[4], {{v[0-31]+}}.b[2] +;CHECK: ins {{v[0-9]+}}.b[4], {{v[0-9]+}}.b[2] %tmp3 = extractelement <8 x i8> %tmp1, i32 2 %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 4 ret <8 x i8> %tmp4 } define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> 
%tmp2) { -;CHECK: ins {{v[0-31]+}}.h[3], {{v[0-31]+}}.h[2] +;CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2] %tmp3 = extractelement <4 x i16> %tmp1, i32 2 %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3 ret <4 x i16> %tmp4 } define <2 x i32> @ins2s2(<2 x i32> %tmp1, <2 x i32> %tmp2) { -;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[0] +;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] %tmp3 = extractelement <2 x i32> %tmp1, i32 0 %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1 ret <2 x i32> %tmp4 } define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) { -;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0] +;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0] %tmp3 = extractelement <1 x i64> %tmp1, i32 0 %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0 ret <1 x i64> %tmp4 } define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) { -;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[0] +;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] %tmp3 = extractelement <2 x float> %tmp1, i32 0 %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1 ret <2 x float> %tmp4 } define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) { -;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0] +;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0] %tmp3 = extractelement <1 x double> %tmp1, i32 0 %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0 ret <1 x double> %tmp4 } define i32 @umovw16b(<16 x i8> %tmp1) { -;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.b[8] +;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[8] %tmp3 = extractelement <16 x i8> %tmp1, i32 8 %tmp4 = zext i8 %tmp3 to i32 ret i32 %tmp4 } define i32 @umovw8h(<8 x i16> %tmp1) { -;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.h[2] +;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2] %tmp3 = extractelement <8 x i16> %tmp1, i32 2 %tmp4 = zext i16 %tmp3 to i32 ret i32 %tmp4 } define i32 @umovw4s(<4 x i32> %tmp1) { -;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.s[2] +;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.s[2] %tmp3 = extractelement <4 x 
i32> %tmp1, i32 2 ret i32 %tmp3 } define i64 @umovx2d(<2 x i64> %tmp1) { -;CHECK: umov {{x[0-31]+}}, {{v[0-31]+}}.d[0] +;CHECK: umov {{x[0-9]+}}, {{v[0-9]+}}.d[0] %tmp3 = extractelement <2 x i64> %tmp1, i32 0 ret i64 %tmp3 } define i32 @umovw8b(<8 x i8> %tmp1) { -;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.b[7] +;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[7] %tmp3 = extractelement <8 x i8> %tmp1, i32 7 %tmp4 = zext i8 %tmp3 to i32 ret i32 %tmp4 } define i32 @umovw4h(<4 x i16> %tmp1) { -;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.h[2] +;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2] %tmp3 = extractelement <4 x i16> %tmp1, i32 2 %tmp4 = zext i16 %tmp3 to i32 ret i32 %tmp4 } define i32 @umovw2s(<2 x i32> %tmp1) { -;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.s[1] +;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.s[1] %tmp3 = extractelement <2 x i32> %tmp1, i32 1 ret i32 %tmp3 } define i64 @umovx1d(<1 x i64> %tmp1) { -;CHECK: fmov {{x[0-31]+}}, {{d[0-31]+}} +;CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} %tmp3 = extractelement <1 x i64> %tmp1, i32 0 ret i64 %tmp3 } define i32 @smovw16b(<16 x i8> %tmp1) { -;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.b[8] +;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[8] %tmp3 = extractelement <16 x i8> %tmp1, i32 8 %tmp4 = sext i8 %tmp3 to i32 %tmp5 = add i32 5, %tmp4 @@ -272,7 +272,7 @@ define i32 @smovw16b(<16 x i8> %tmp1) { } define i32 @smovw8h(<8 x i16> %tmp1) { -;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.h[2] +;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2] %tmp3 = extractelement <8 x i16> %tmp1, i32 2 %tmp4 = sext i16 %tmp3 to i32 %tmp5 = add i32 5, %tmp4 @@ -280,28 +280,28 @@ define i32 @smovw8h(<8 x i16> %tmp1) { } define i32 @smovx16b(<16 x i8> %tmp1) { -;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.b[8] +;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[8] %tmp3 = extractelement <16 x i8> %tmp1, i32 8 %tmp4 = sext i8 %tmp3 to i32 ret i32 %tmp4 } define i32 @smovx8h(<8 x i16> %tmp1) { -;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.h[2] +;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2] %tmp3 = extractelement <8 x 
i16> %tmp1, i32 2 %tmp4 = sext i16 %tmp3 to i32 ret i32 %tmp4 } define i64 @smovx4s(<4 x i32> %tmp1) { -;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.s[2] +;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[2] %tmp3 = extractelement <4 x i32> %tmp1, i32 2 %tmp4 = sext i32 %tmp3 to i64 ret i64 %tmp4 } define i32 @smovw8b(<8 x i8> %tmp1) { -;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.b[4] +;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[4] %tmp3 = extractelement <8 x i8> %tmp1, i32 4 %tmp4 = sext i8 %tmp3 to i32 %tmp5 = add i32 5, %tmp4 @@ -309,7 +309,7 @@ define i32 @smovw8b(<8 x i8> %tmp1) { } define i32 @smovw4h(<4 x i16> %tmp1) { -;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.h[2] +;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2] %tmp3 = extractelement <4 x i16> %tmp1, i32 2 %tmp4 = sext i16 %tmp3 to i32 %tmp5 = add i32 5, %tmp4 @@ -317,21 +317,21 @@ define i32 @smovw4h(<4 x i16> %tmp1) { } define i32 @smovx8b(<8 x i8> %tmp1) { -;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.b[6] +;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[6] %tmp3 = extractelement <8 x i8> %tmp1, i32 6 %tmp4 = sext i8 %tmp3 to i32 ret i32 %tmp4 } define i32 @smovx4h(<4 x i16> %tmp1) { -;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.h[2] +;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2] %tmp3 = extractelement <4 x i16> %tmp1, i32 2 %tmp4 = sext i16 %tmp3 to i32 ret i32 %tmp4 } define i64 @smovx2s(<2 x i32> %tmp1) { -;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.s[1] +;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[1] %tmp3 = extractelement <2 x i32> %tmp1, i32 1 %tmp4 = sext i32 %tmp3 to i64 ret i64 %tmp4 @@ -612,4 +612,791 @@ define <1 x double> @test_bitcasti64tov1f64(i64 %in) { %res = bitcast i64 %in to <1 x double> ; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} ret <1 x double> %res -} \ No newline at end of file +} + +define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 { +; CHECK-LABEL: test_bitcastv8i8tov1f64: +; CHECK: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}} + %sub.i = sub <8 x i8> zeroinitializer, %a + %1 = bitcast <8 x i8> 
%sub.i to <1 x double> + %vcvt.i = fptosi <1 x double> %1 to <1 x i64> + ret <1 x i64> %vcvt.i +} + +define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 { +; CHECK-LABEL: test_bitcastv4i16tov1f64: +; CHECK: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}} + %sub.i = sub <4 x i16> zeroinitializer, %a + %1 = bitcast <4 x i16> %sub.i to <1 x double> + %vcvt.i = fptosi <1 x double> %1 to <1 x i64> + ret <1 x i64> %vcvt.i +} + +define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 { +; CHECK-LABEL: test_bitcastv2i32tov1f64: +; CHECK: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}} + %sub.i = sub <2 x i32> zeroinitializer, %a + %1 = bitcast <2 x i32> %sub.i to <1 x double> + %vcvt.i = fptosi <1 x double> %1 to <1 x i64> + ret <1 x i64> %vcvt.i +} + +define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 { +; CHECK-LABEL: test_bitcastv1i64tov1f64: +; CHECK: neg {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}} + %sub.i = sub <1 x i64> zeroinitializer, %a + %1 = bitcast <1 x i64> %sub.i to <1 x double> + %vcvt.i = fptosi <1 x double> %1 to <1 x i64> + ret <1 x i64> %vcvt.i +} + +define <1 x i64> @test_bitcastv2f32tov1f64(<2 x float> %a) #0 { +; CHECK-LABEL: test_bitcastv2f32tov1f64: +; CHECK: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}} + %sub.i = fsub <2 x float> < float -0.000000e+00, float -0.000000e+00>, %a + %1 = bitcast <2 x float> %sub.i to <1 x double> + %vcvt.i = fptosi <1 x double> %1 to <1 x i64> + ret <1 x i64> %vcvt.i +} + +define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 { +; CHECK-LABEL: test_bitcastv1f64tov8i8: +; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-NEXT: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %vcvt.i = sitofp <1 x i64> %a to <1 x double> + %1 = bitcast <1 x double> %vcvt.i to <8 x i8> + %sub.i = sub <8 x i8> zeroinitializer, %1 + ret <8 x i8> %sub.i +} + +define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 { +; CHECK-LABEL:
test_bitcastv1f64tov4i16: +; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-NEXT: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %vcvt.i = sitofp <1 x i64> %a to <1 x double> + %1 = bitcast <1 x double> %vcvt.i to <4 x i16> + %sub.i = sub <4 x i16> zeroinitializer, %1 + ret <4 x i16> %sub.i +} + +define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 { +; CHECK-LABEL: test_bitcastv1f64tov2i32: +; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-NEXT: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %vcvt.i = sitofp <1 x i64> %a to <1 x double> + %1 = bitcast <1 x double> %vcvt.i to <2 x i32> + %sub.i = sub <2 x i32> zeroinitializer, %1 + ret <2 x i32> %sub.i +} + +define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 { +; CHECK-LABEL: test_bitcastv1f64tov1i64: +; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-NEXT: neg {{d[0-9]+}}, {{d[0-9]+}} + %vcvt.i = sitofp <1 x i64> %a to <1 x double> + %1 = bitcast <1 x double> %vcvt.i to <1 x i64> + %sub.i = sub <1 x i64> zeroinitializer, %1 + ret <1 x i64> %sub.i +} + +define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 { +; CHECK-LABEL: test_bitcastv1f64tov2f32: +; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-NEXT: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %vcvt.i = sitofp <1 x i64> %a to <1 x double> + %1 = bitcast <1 x double> %vcvt.i to <2 x float> + %sub.i = fsub <2 x float> < float -0.000000e+00, float -0.000000e+00>, %1 + ret <2 x float> %sub.i +} + +; Test insert element into an undef vector +define <8 x i8> @scalar_to_vector.v8i8(i8 %a) { +; CHECK-LABEL: scalar_to_vector.v8i8: +; CHECK: ins {{v[0-9]+}}.b[0], {{w[0-9]+}} + %b = insertelement <8 x i8> undef, i8 %a, i32 0 + ret <8 x i8> %b +} + +define <16 x i8> @scalar_to_vector.v16i8(i8 %a) { +; CHECK-LABEL: scalar_to_vector.v16i8: +; CHECK: ins {{v[0-9]+}}.b[0], {{w[0-9]+}} + %b = insertelement <16 x i8> undef, i8 %a, i32 0 + ret <16 x i8> %b +} + +define <4 x i16> @scalar_to_vector.v4i16(i16 %a) { +; CHECK-LABEL: scalar_to_vector.v4i16: +; CHECK: ins {{v[0-9]+}}.h[0], {{w[0-9]+}} + %b = insertelement <4 x i16>
undef, i16 %a, i32 0 + ret <4 x i16> %b +} + +define <8 x i16> @scalar_to_vector.v8i16(i16 %a) { +; CHECK-LABEL: scalar_to_vector.v8i16: +; CHECK: ins {{v[0-9]+}}.h[0], {{w[0-9]+}} + %b = insertelement <8 x i16> undef, i16 %a, i32 0 + ret <8 x i16> %b +} + +define <2 x i32> @scalar_to_vector.v2i32(i32 %a) { +; CHECK-LABEL: scalar_to_vector.v2i32: +; CHECK: ins {{v[0-9]+}}.s[0], {{w[0-9]+}} + %b = insertelement <2 x i32> undef, i32 %a, i32 0 + ret <2 x i32> %b +} + +define <4 x i32> @scalar_to_vector.v4i32(i32 %a) { +; CHECK-LABEL: scalar_to_vector.v4i32: +; CHECK: ins {{v[0-9]+}}.s[0], {{w[0-9]+}} + %b = insertelement <4 x i32> undef, i32 %a, i32 0 + ret <4 x i32> %b +} + +define <2 x i64> @scalar_to_vector.v2i64(i64 %a) { +; CHECK-LABEL: scalar_to_vector.v2i64: +; CHECK: ins {{v[0-9]+}}.d[0], {{x[0-9]+}} + %b = insertelement <2 x i64> undef, i64 %a, i32 0 + ret <2 x i64> %b +} + +define <8 x i8> @testDUP.v1i8(<1 x i8> %a) { +; CHECK-LABEL: testDUP.v1i8: +; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}} + %b = extractelement <1 x i8> %a, i32 0 + %c = insertelement <8 x i8> undef, i8 %b, i32 0 + %d = insertelement <8 x i8> %c, i8 %b, i32 1 + %e = insertelement <8 x i8> %d, i8 %b, i32 2 + %f = insertelement <8 x i8> %e, i8 %b, i32 3 + %g = insertelement <8 x i8> %f, i8 %b, i32 4 + %h = insertelement <8 x i8> %g, i8 %b, i32 5 + %i = insertelement <8 x i8> %h, i8 %b, i32 6 + %j = insertelement <8 x i8> %i, i8 %b, i32 7 + ret <8 x i8> %j +} + +define <8 x i16> @testDUP.v1i16(<1 x i16> %a) { +; CHECK-LABEL: testDUP.v1i16: +; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}} + %b = extractelement <1 x i16> %a, i32 0 + %c = insertelement <8 x i16> undef, i16 %b, i32 0 + %d = insertelement <8 x i16> %c, i16 %b, i32 1 + %e = insertelement <8 x i16> %d, i16 %b, i32 2 + %f = insertelement <8 x i16> %e, i16 %b, i32 3 + %g = insertelement <8 x i16> %f, i16 %b, i32 4 + %h = insertelement <8 x i16> %g, i16 %b, i32 5 + %i = insertelement <8 x i16> %h, i16 %b, i32 6 + %j = insertelement <8 x i16> %i, 
i16 %b, i32 7 + ret <8 x i16> %j +} + +define <4 x i32> @testDUP.v1i32(<1 x i32> %a) { +; CHECK-LABEL: testDUP.v1i32: +; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}} + %b = extractelement <1 x i32> %a, i32 0 + %c = insertelement <4 x i32> undef, i32 %b, i32 0 + %d = insertelement <4 x i32> %c, i32 %b, i32 1 + %e = insertelement <4 x i32> %d, i32 %b, i32 2 + %f = insertelement <4 x i32> %e, i32 %b, i32 3 + ret <4 x i32> %f +} + +define <8 x i8> @getl(<16 x i8> %x) #0 { +; CHECK-LABEL: getl: +; CHECK: ret + %vecext = extractelement <16 x i8> %x, i32 0 + %vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0 + %vecext1 = extractelement <16 x i8> %x, i32 1 + %vecinit2 = insertelement <8 x i8> %vecinit, i8 %vecext1, i32 1 + %vecext3 = extractelement <16 x i8> %x, i32 2 + %vecinit4 = insertelement <8 x i8> %vecinit2, i8 %vecext3, i32 2 + %vecext5 = extractelement <16 x i8> %x, i32 3 + %vecinit6 = insertelement <8 x i8> %vecinit4, i8 %vecext5, i32 3 + %vecext7 = extractelement <16 x i8> %x, i32 4 + %vecinit8 = insertelement <8 x i8> %vecinit6, i8 %vecext7, i32 4 + %vecext9 = extractelement <16 x i8> %x, i32 5 + %vecinit10 = insertelement <8 x i8> %vecinit8, i8 %vecext9, i32 5 + %vecext11 = extractelement <16 x i8> %x, i32 6 + %vecinit12 = insertelement <8 x i8> %vecinit10, i8 %vecext11, i32 6 + %vecext13 = extractelement <16 x i8> %x, i32 7 + %vecinit14 = insertelement <8 x i8> %vecinit12, i8 %vecext13, i32 7 + ret <8 x i8> %vecinit14 +} + +define <4 x i16> @test_dup_v2i32_v4i16(<2 x i32> %a) { +; CHECK-LABEL: test_dup_v2i32_v4i16: +; CHECK: dup v0.4h, v0.h[2] +entry: + %x = extractelement <2 x i32> %a, i32 1 + %vget_lane = trunc i32 %x to i16 + %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0 + %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1 + %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2 + %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3 + ret <4 x i16> %vecinit3.i +} + +define <8 
x i16> @test_dup_v4i32_v8i16(<4 x i32> %a) { +; CHECK-LABEL: test_dup_v4i32_v8i16: +; CHECK: dup v0.8h, v0.h[6] +entry: + %x = extractelement <4 x i32> %a, i32 3 + %vget_lane = trunc i32 %x to i16 + %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0 + %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1 + %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2 + %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3 + %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4 + %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5 + %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6 + %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7 + ret <8 x i16> %vecinit7.i +} + +define <4 x i16> @test_dup_v1i64_v4i16(<1 x i64> %a) { +; CHECK-LABEL: test_dup_v1i64_v4i16: +; CHECK: dup v0.4h, v0.h[0] +entry: + %x = extractelement <1 x i64> %a, i32 0 + %vget_lane = trunc i64 %x to i16 + %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0 + %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1 + %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2 + %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3 + ret <4 x i16> %vecinit3.i +} + +define <2 x i32> @test_dup_v1i64_v2i32(<1 x i64> %a) { +; CHECK-LABEL: test_dup_v1i64_v2i32: +; CHECK: dup v0.2s, v0.s[0] +entry: + %x = extractelement <1 x i64> %a, i32 0 + %vget_lane = trunc i64 %x to i32 + %vecinit.i = insertelement <2 x i32> undef, i32 %vget_lane, i32 0 + %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %vget_lane, i32 1 + ret <2 x i32> %vecinit1.i +} + +define <8 x i16> @test_dup_v2i64_v8i16(<2 x i64> %a) { +; CHECK-LABEL: test_dup_v2i64_v8i16: +; CHECK: dup v0.8h, v0.h[4] +entry: + %x = extractelement <2 x i64> %a, i32 1 + %vget_lane = trunc i64 %x to i16 + %vecinit.i = insertelement <8 x i16> undef, i16 
%vget_lane, i32 0 + %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1 + %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2 + %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3 + %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4 + %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5 + %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6 + %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7 + ret <8 x i16> %vecinit7.i +} + +define <4 x i32> @test_dup_v2i64_v4i32(<2 x i64> %a) { +; CHECK-LABEL: test_dup_v2i64_v4i32: +; CHECK: dup v0.4s, v0.s[2] +entry: + %x = extractelement <2 x i64> %a, i32 1 + %vget_lane = trunc i64 %x to i32 + %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0 + %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1 + %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2 + %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3 + ret <4 x i32> %vecinit3.i +} + +define <4 x i16> @test_dup_v4i32_v4i16(<4 x i32> %a) { +; CHECK-LABEL: test_dup_v4i32_v4i16: +; CHECK: dup v0.4h, v0.h[2] +entry: + %x = extractelement <4 x i32> %a, i32 1 + %vget_lane = trunc i32 %x to i16 + %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0 + %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1 + %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2 + %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3 + ret <4 x i16> %vecinit3.i +} + +define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) { +; CHECK-LABEL: test_dup_v2i64_v4i16: +; CHECK: dup v0.4h, v0.h[0] +entry: + %x = extractelement <2 x i64> %a, i32 0 + %vget_lane = trunc i64 %x to i16 + %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0 + %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1 + 
%vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2 + %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3 + ret <4 x i16> %vecinit3.i +} + +define <2 x i32> @test_dup_v2i64_v2i32(<2 x i64> %a) { +; CHECK-LABEL: test_dup_v2i64_v2i32: +; CHECK: dup v0.2s, v0.s[0] +entry: + %x = extractelement <2 x i64> %a, i32 0 + %vget_lane = trunc i64 %x to i32 + %vecinit.i = insertelement <2 x i32> undef, i32 %vget_lane, i32 0 + %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %vget_lane, i32 1 + ret <2 x i32> %vecinit1.i +} + + +define <2 x float> @test_scalar_to_vector_f32_to_v2f32(<2 x float> %a) { +; CHECK-LABEL: test_scalar_to_vector_f32_to_v2f32: +; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s +; CHECK-NEXT: ret +entry: + %0 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a) + %1 = insertelement <1 x float> undef, float %0, i32 0 + %2 = extractelement <1 x float> %1, i32 0 + %vecinit1.i = insertelement <2 x float> undef, float %2, i32 0 + ret <2 x float> %vecinit1.i +} + +define <4 x float> @test_scalar_to_vector_f32_to_v4f32(<2 x float> %a) { +; CHECK-LABEL: test_scalar_to_vector_f32_to_v4f32: +; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s +; CHECK-NEXT: ret +entry: + %0 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a) + %1 = insertelement <1 x float> undef, float %0, i32 0 + %2 = extractelement <1 x float> %1, i32 0 + %vecinit1.i = insertelement <4 x float> undef, float %2, i32 0 + ret <4 x float> %vecinit1.i +} + +declare float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float>) + +define <2 x i32> @test_concat_undef_v1i32(<1 x i32> %a) { +; CHECK-LABEL: test_concat_undef_v1i32: +; CHECK: ins v{{[0-9]+}}.s[1], v{{[0-9]+}}.s[0] +entry: + %0 = extractelement <1 x i32> %a, i32 0 + %vecinit1.i = insertelement <2 x i32> undef, i32 %0, i32 1 + ret <2 x i32> %vecinit1.i +} + +declare <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32>) #4 + +define <2 x i32> @test_concat_v1i32_undef(<1 x i32> %a) { +; CHECK-LABEL: 
test_concat_v1i32_undef: +; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}} +; CHECK-NEXT: ret +entry: + %b = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %a) + %0 = extractelement <1 x i32> %b, i32 0 + %vecinit.i432 = insertelement <2 x i32> undef, i32 %0, i32 0 + ret <2 x i32> %vecinit.i432 +} + +define <2 x i32> @test_concat_same_v1i32_v1i32(<1 x i32> %a) { +; CHECK-LABEL: test_concat_same_v1i32_v1i32: +; CHECK: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0] +entry: + %0 = extractelement <1 x i32> %a, i32 0 + %vecinit.i = insertelement <2 x i32> undef, i32 %0, i32 0 + %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %0, i32 1 + ret <2 x i32> %vecinit1.i +} + +define <2 x i32> @test_concat_diff_v1i32_v1i32(<1 x i32> %a, <1 x i32> %b) { +; CHECK-LABEL: test_concat_diff_v1i32_v1i32: +; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}} +; CHECK-NEXT: sqabs s{{[0-9]+}}, s{{[0-9]+}} +; CHECK-NEXT: ins v0.s[1], v1.s[0] +entry: + %c = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %a) + %d = extractelement <1 x i32> %c, i32 0 + %e = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %b) + %f = extractelement <1 x i32> %e, i32 0 + %h = shufflevector <1 x i32> %c, <1 x i32> %e, <2 x i32> + ret <2 x i32> %h +} + +define <16 x i8> @test_concat_v16i8_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) #0 { +; CHECK-LABEL: test_concat_v16i8_v16i8_v16i8: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecinit30 = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> + ret <16 x i8> %vecinit30 +} + +define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { +; CHECK-LABEL: test_concat_v16i8_v8i8_v16i8: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecext = extractelement <8 x i8> %x, i32 0 + %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 + %vecext1 = extractelement <8 x i8> %x, i32 1 + %vecinit2 = insertelement <16 x i8> %vecinit, i8 %vecext1, i32 1 + %vecext3 = extractelement <8 x i8> %x, i32 2 + %vecinit4 = insertelement <16 x 
i8> %vecinit2, i8 %vecext3, i32 2 + %vecext5 = extractelement <8 x i8> %x, i32 3 + %vecinit6 = insertelement <16 x i8> %vecinit4, i8 %vecext5, i32 3 + %vecext7 = extractelement <8 x i8> %x, i32 4 + %vecinit8 = insertelement <16 x i8> %vecinit6, i8 %vecext7, i32 4 + %vecext9 = extractelement <8 x i8> %x, i32 5 + %vecinit10 = insertelement <16 x i8> %vecinit8, i8 %vecext9, i32 5 + %vecext11 = extractelement <8 x i8> %x, i32 6 + %vecinit12 = insertelement <16 x i8> %vecinit10, i8 %vecext11, i32 6 + %vecext13 = extractelement <8 x i8> %x, i32 7 + %vecinit14 = insertelement <16 x i8> %vecinit12, i8 %vecext13, i32 7 + %vecinit30 = shufflevector <16 x i8> %vecinit14, <16 x i8> %y, <16 x i32> + ret <16 x i8> %vecinit30 +} + +define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 { +; CHECK-LABEL: test_concat_v16i8_v16i8_v8i8: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecext = extractelement <16 x i8> %x, i32 0 + %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 + %vecext1 = extractelement <16 x i8> %x, i32 1 + %vecinit2 = insertelement <16 x i8> %vecinit, i8 %vecext1, i32 1 + %vecext3 = extractelement <16 x i8> %x, i32 2 + %vecinit4 = insertelement <16 x i8> %vecinit2, i8 %vecext3, i32 2 + %vecext5 = extractelement <16 x i8> %x, i32 3 + %vecinit6 = insertelement <16 x i8> %vecinit4, i8 %vecext5, i32 3 + %vecext7 = extractelement <16 x i8> %x, i32 4 + %vecinit8 = insertelement <16 x i8> %vecinit6, i8 %vecext7, i32 4 + %vecext9 = extractelement <16 x i8> %x, i32 5 + %vecinit10 = insertelement <16 x i8> %vecinit8, i8 %vecext9, i32 5 + %vecext11 = extractelement <16 x i8> %x, i32 6 + %vecinit12 = insertelement <16 x i8> %vecinit10, i8 %vecext11, i32 6 + %vecext13 = extractelement <16 x i8> %x, i32 7 + %vecinit14 = insertelement <16 x i8> %vecinit12, i8 %vecext13, i32 7 + %vecext15 = extractelement <8 x i8> %y, i32 0 + %vecinit16 = insertelement <16 x i8> %vecinit14, i8 %vecext15, i32 8 + %vecext17 = extractelement <8 x i8> %y, 
i32 1 + %vecinit18 = insertelement <16 x i8> %vecinit16, i8 %vecext17, i32 9 + %vecext19 = extractelement <8 x i8> %y, i32 2 + %vecinit20 = insertelement <16 x i8> %vecinit18, i8 %vecext19, i32 10 + %vecext21 = extractelement <8 x i8> %y, i32 3 + %vecinit22 = insertelement <16 x i8> %vecinit20, i8 %vecext21, i32 11 + %vecext23 = extractelement <8 x i8> %y, i32 4 + %vecinit24 = insertelement <16 x i8> %vecinit22, i8 %vecext23, i32 12 + %vecext25 = extractelement <8 x i8> %y, i32 5 + %vecinit26 = insertelement <16 x i8> %vecinit24, i8 %vecext25, i32 13 + %vecext27 = extractelement <8 x i8> %y, i32 6 + %vecinit28 = insertelement <16 x i8> %vecinit26, i8 %vecext27, i32 14 + %vecext29 = extractelement <8 x i8> %y, i32 7 + %vecinit30 = insertelement <16 x i8> %vecinit28, i8 %vecext29, i32 15 + ret <16 x i8> %vecinit30 +} + +define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 { +; CHECK-LABEL: test_concat_v16i8_v8i8_v8i8: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecext = extractelement <8 x i8> %x, i32 0 + %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 + %vecext1 = extractelement <8 x i8> %x, i32 1 + %vecinit2 = insertelement <16 x i8> %vecinit, i8 %vecext1, i32 1 + %vecext3 = extractelement <8 x i8> %x, i32 2 + %vecinit4 = insertelement <16 x i8> %vecinit2, i8 %vecext3, i32 2 + %vecext5 = extractelement <8 x i8> %x, i32 3 + %vecinit6 = insertelement <16 x i8> %vecinit4, i8 %vecext5, i32 3 + %vecext7 = extractelement <8 x i8> %x, i32 4 + %vecinit8 = insertelement <16 x i8> %vecinit6, i8 %vecext7, i32 4 + %vecext9 = extractelement <8 x i8> %x, i32 5 + %vecinit10 = insertelement <16 x i8> %vecinit8, i8 %vecext9, i32 5 + %vecext11 = extractelement <8 x i8> %x, i32 6 + %vecinit12 = insertelement <16 x i8> %vecinit10, i8 %vecext11, i32 6 + %vecext13 = extractelement <8 x i8> %x, i32 7 + %vecinit14 = insertelement <16 x i8> %vecinit12, i8 %vecext13, i32 7 + %vecext15 = extractelement <8 x i8> %y, i32 0 + %vecinit16 = 
insertelement <16 x i8> %vecinit14, i8 %vecext15, i32 8 + %vecext17 = extractelement <8 x i8> %y, i32 1 + %vecinit18 = insertelement <16 x i8> %vecinit16, i8 %vecext17, i32 9 + %vecext19 = extractelement <8 x i8> %y, i32 2 + %vecinit20 = insertelement <16 x i8> %vecinit18, i8 %vecext19, i32 10 + %vecext21 = extractelement <8 x i8> %y, i32 3 + %vecinit22 = insertelement <16 x i8> %vecinit20, i8 %vecext21, i32 11 + %vecext23 = extractelement <8 x i8> %y, i32 4 + %vecinit24 = insertelement <16 x i8> %vecinit22, i8 %vecext23, i32 12 + %vecext25 = extractelement <8 x i8> %y, i32 5 + %vecinit26 = insertelement <16 x i8> %vecinit24, i8 %vecext25, i32 13 + %vecext27 = extractelement <8 x i8> %y, i32 6 + %vecinit28 = insertelement <16 x i8> %vecinit26, i8 %vecext27, i32 14 + %vecext29 = extractelement <8 x i8> %y, i32 7 + %vecinit30 = insertelement <16 x i8> %vecinit28, i8 %vecext29, i32 15 + ret <16 x i8> %vecinit30 +} + +define <8 x i16> @test_concat_v8i16_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) #0 { +; CHECK-LABEL: test_concat_v8i16_v8i16_v8i16: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecinit14 = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> + ret <8 x i16> %vecinit14 +} + +define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 { +; CHECK-LABEL: test_concat_v8i16_v4i16_v8i16: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecext = extractelement <4 x i16> %x, i32 0 + %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0 + %vecext1 = extractelement <4 x i16> %x, i32 1 + %vecinit2 = insertelement <8 x i16> %vecinit, i16 %vecext1, i32 1 + %vecext3 = extractelement <4 x i16> %x, i32 2 + %vecinit4 = insertelement <8 x i16> %vecinit2, i16 %vecext3, i32 2 + %vecext5 = extractelement <4 x i16> %x, i32 3 + %vecinit6 = insertelement <8 x i16> %vecinit4, i16 %vecext5, i32 3 + %vecinit14 = shufflevector <8 x i16> %vecinit6, <8 x i16> %y, <8 x i32> + ret <8 x i16> %vecinit14 +} + +define <8 x i16> 
@test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 { +; CHECK-LABEL: test_concat_v8i16_v8i16_v4i16: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecext = extractelement <8 x i16> %x, i32 0 + %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0 + %vecext1 = extractelement <8 x i16> %x, i32 1 + %vecinit2 = insertelement <8 x i16> %vecinit, i16 %vecext1, i32 1 + %vecext3 = extractelement <8 x i16> %x, i32 2 + %vecinit4 = insertelement <8 x i16> %vecinit2, i16 %vecext3, i32 2 + %vecext5 = extractelement <8 x i16> %x, i32 3 + %vecinit6 = insertelement <8 x i16> %vecinit4, i16 %vecext5, i32 3 + %vecext7 = extractelement <4 x i16> %y, i32 0 + %vecinit8 = insertelement <8 x i16> %vecinit6, i16 %vecext7, i32 4 + %vecext9 = extractelement <4 x i16> %y, i32 1 + %vecinit10 = insertelement <8 x i16> %vecinit8, i16 %vecext9, i32 5 + %vecext11 = extractelement <4 x i16> %y, i32 2 + %vecinit12 = insertelement <8 x i16> %vecinit10, i16 %vecext11, i32 6 + %vecext13 = extractelement <4 x i16> %y, i32 3 + %vecinit14 = insertelement <8 x i16> %vecinit12, i16 %vecext13, i32 7 + ret <8 x i16> %vecinit14 +} + +define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 { +; CHECK-LABEL: test_concat_v8i16_v4i16_v4i16: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecext = extractelement <4 x i16> %x, i32 0 + %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0 + %vecext1 = extractelement <4 x i16> %x, i32 1 + %vecinit2 = insertelement <8 x i16> %vecinit, i16 %vecext1, i32 1 + %vecext3 = extractelement <4 x i16> %x, i32 2 + %vecinit4 = insertelement <8 x i16> %vecinit2, i16 %vecext3, i32 2 + %vecext5 = extractelement <4 x i16> %x, i32 3 + %vecinit6 = insertelement <8 x i16> %vecinit4, i16 %vecext5, i32 3 + %vecext7 = extractelement <4 x i16> %y, i32 0 + %vecinit8 = insertelement <8 x i16> %vecinit6, i16 %vecext7, i32 4 + %vecext9 = extractelement <4 x i16> %y, i32 1 + %vecinit10 = insertelement <8 x i16> %vecinit8, 
i16 %vecext9, i32 5 + %vecext11 = extractelement <4 x i16> %y, i32 2 + %vecinit12 = insertelement <8 x i16> %vecinit10, i16 %vecext11, i32 6 + %vecext13 = extractelement <4 x i16> %y, i32 3 + %vecinit14 = insertelement <8 x i16> %vecinit12, i16 %vecext13, i32 7 + ret <8 x i16> %vecinit14 +} + +define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 { +; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> + ret <4 x i32> %vecinit6 +} + +define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 { +; CHECK-LABEL: test_concat_v4i32_v2i32_v4i32: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecext = extractelement <2 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecext1 = extractelement <2 x i32> %x, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 + %vecinit6 = shufflevector <4 x i32> %vecinit2, <4 x i32> %y, <4 x i32> + ret <4 x i32> %vecinit6 +} + +define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 { +; CHECK-LABEL: test_concat_v4i32_v4i32_v2i32: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecext1 = extractelement <4 x i32> %x, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 + %vecext3 = extractelement <2 x i32> %y, i32 0 + %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2 + %vecext5 = extractelement <2 x i32> %y, i32 1 + %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %vecext5, i32 3 + ret <4 x i32> %vecinit6 +} + +define <4 x i32> @test_concat_v4i32_v2i32_v2i32(<2 x i32> %x, <2 x i32> %y) #0 { +; CHECK-LABEL: test_concat_v4i32_v2i32_v2i32: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecext = extractelement <2 x i32> %x, 
i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecext1 = extractelement <2 x i32> %x, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 + %vecext3 = extractelement <2 x i32> %y, i32 0 + %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2 + %vecext5 = extractelement <2 x i32> %y, i32 1 + %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %vecext5, i32 3 + ret <4 x i32> %vecinit6 +} + +define <2 x i64> @test_concat_v2i64_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) #0 { +; CHECK-LABEL: test_concat_v2i64_v2i64_v2i64: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecinit2 = shufflevector <2 x i64> %x, <2 x i64> %y, <2 x i32> + ret <2 x i64> %vecinit2 +} + +define <2 x i64> @test_concat_v2i64_v1i64_v2i64(<1 x i64> %x, <2 x i64> %y) #0 { +; CHECK-LABEL: test_concat_v2i64_v1i64_v2i64: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecext = extractelement <1 x i64> %x, i32 0 + %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0 + %vecinit2 = shufflevector <2 x i64> %vecinit, <2 x i64> %y, <2 x i32> + ret <2 x i64> %vecinit2 +} + +define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 { +; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecext = extractelement <2 x i64> %x, i32 0 + %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0 + %vecext1 = extractelement <1 x i64> %y, i32 0 + %vecinit2 = insertelement <2 x i64> %vecinit, i64 %vecext1, i32 1 + ret <2 x i64> %vecinit2 +} + +define <2 x i64> @test_concat_v2i64_v1i64_v1i64(<1 x i64> %x, <1 x i64> %y) #0 { +; CHECK-LABEL: test_concat_v2i64_v1i64_v1i64: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %vecext = extractelement <1 x i64> %x, i32 0 + %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0 + %vecext1 = extractelement <1 x i64> %y, i32 0 + %vecinit2 = insertelement <2 x i64> %vecinit, i64 %vecext1, i32 1 + ret <2 x 
i64> %vecinit2 +} + +declare <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8>, <1 x i8>) + +; This case tests the copy of two FPR8 registers, which is implemented by fmov +; of two FPR32 registers. +define <1 x i8> @test_copy_FPR8_FPR8(<1 x i8> %a, <1 x i8> %b) { +; CHECK-LABEL: test_copy_FPR8_FPR8: +; CHECK: usqadd b1, b0 +; CHECK-NEXT: fmov s0, s1 +entry: + %vsqadd2.i = call <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8> %b, <1 x i8> %a) + ret <1 x i8> %vsqadd2.i +} + +declare <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16>, <1 x i16>) + +define <1 x i16> @test_copy_FPR16_FPR16(<1 x i16> %a, <1 x i16> %b) { +; CHECK-LABEL: test_copy_FPR16_FPR16: +; CHECK: usqadd h1, h0 +; CHECK-NEXT: fmov s0, s1 +entry: + %vsqadd2.i = call <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16> %b, <1 x i16> %a) + ret <1 x i16> %vsqadd2.i +} + +define <4 x i16> @concat_vector_v4i16_const() { +; CHECK-LABEL: concat_vector_v4i16_const: +; CHECK: dup {{v[0-9]+}}.4h, wzr + %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %r +} + +define <4 x i16> @concat_vector_v4i16_const_one() { +; CHECK-LABEL: concat_vector_v4i16_const_one: +; CHECK: movz {{w[0-9]+}}, #1 +; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}} + %r = shufflevector <1 x i16> , <1 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %r +} + +define <4 x i32> @concat_vector_v4i32_const() { +; CHECK-LABEL: concat_vector_v4i32_const: +; CHECK: dup {{v[0-9]+}}.4s, wzr + %r = shufflevector <1 x i32> zeroinitializer, <1 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %r +} + +define <8 x i8> @concat_vector_v8i8_const() { +; CHECK-LABEL: concat_vector_v8i8_const: +; CHECK: dup {{v[0-9]+}}.8b, wzr + %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer + ret <8 x i8> %r +} + +define <8 x i16> @concat_vector_v8i16_const() { +; CHECK-LABEL: concat_vector_v8i16_const: +; CHECK: dup {{v[0-9]+}}.8h, wzr + %r = shufflevector <1 x i16> 
zeroinitializer, <1 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %r +} + +define <8 x i16> @concat_vector_v8i16_const_one() { +; CHECK-LABEL: concat_vector_v8i16_const_one: +; CHECK: movz {{w[0-9]+}}, #1 +; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}} + %r = shufflevector <1 x i16> , <1 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %r +} + +define <16 x i8> @concat_vector_v16i8_const() { +; CHECK-LABEL: concat_vector_v16i8_const: +; CHECK: dup {{v[0-9]+}}.16b, wzr + %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <16 x i32> zeroinitializer + ret <16 x i8> %r +} + +define <4 x i16> @concat_vector_v4i16(<1 x i16> %a) { +; CHECK-LABEL: concat_vector_v4i16: +; CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] + %r = shufflevector <1 x i16> %a, <1 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %r +} + +define <4 x i32> @concat_vector_v4i32(<1 x i32> %a) { +; CHECK-LABEL: concat_vector_v4i32: +; CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] + %r = shufflevector <1 x i32> %a, <1 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %r +} + +define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) { +; CHECK-LABEL: concat_vector_v8i8: +; CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[0] + %r = shufflevector <1 x i8> %a, <1 x i8> undef, <8 x i32> zeroinitializer + ret <8 x i8> %r +} + +define <8 x i16> @concat_vector_v8i16(<1 x i16> %a) { +; CHECK-LABEL: concat_vector_v8i16: +; CHECK: dup {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] + %r = shufflevector <1 x i16> %a, <1 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %r +} + +define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) { +; CHECK-LABEL: concat_vector_v16i8: +; CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[0] + %r = shufflevector <1 x i8> %a, <1 x i8> undef, <16 x i32> zeroinitializer + ret <16 x i8> %r +} diff --git a/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll b/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll new file mode 100644 index 0000000..4dffcd1 --- /dev/null +++ 
b/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll @@ -0,0 +1,47 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <4 x i32> @copyTuple.QPair(i8* %a, i8* %b) { +; CHECK-LABEL: copyTuple.QPair: +; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}] +entry: + %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %a, <4 x i32> , <4 x i32> , i32 0, i32 4) + %extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0 + %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> , i32 1, i32 4) + %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 0 + ret <4 x i32> %vld1.fca.0.extract +} + +define <4 x i32> @copyTuple.QTriple(i8* %a, i8* %b, <4 x i32> %c) { +; CHECK-LABEL: copyTuple.QTriple: +; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}] +entry: + %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %a, <4 x i32> , <4 x i32> %c, <4 x i32> %c, i32 0, i32 4) + %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0 + %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> , <4 x i32> %c, i32 1, i32 4) + %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0 + ret <4 x i32> %vld1.fca.0.extract +} + +define <4 x i32> @copyTuple.QQuad(i8* %a, i8* %b, <4 x i32> %c) { +; CHECK-LABEL: copyTuple.QQuad: +; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b 
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}] +entry: + %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %a, <4 x i32> , <4 x i32> %c, <4 x i32> %c, <4 x i32> %c, i32 0, i32 4) + %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0 + %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> , <4 x i32> %c, <4 x i32> %c, i32 1, i32 4) + %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0 + ret <4 x i32> %vld1.fca.0.extract +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) \ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-crypto.ll b/test/CodeGen/AArch64/neon-crypto.ll index 0283e0e..c0014fa 100644 --- a/test/CodeGen/AArch64/neon-crypto.ll +++ b/test/CodeGen/AArch64/neon-crypto.ll @@ -1,40 +1,40 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -mattr=+crypto | FileCheck %s ; RUN: not llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s -declare <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.neon.sha256su1(<4 x i32>, <4 x i32>, <4 x i32>) #1 -declare <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.neon.sha256h2(<4 x i32>, <4 x i32>, <4 x 
i32>) #1 -declare <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.neon.sha256h(<4 x i32>, <4 x i32>, <4 x i32>) #1 -declare <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.neon.sha1su0(<4 x i32>, <4 x i32>, <4 x i32>) #1 -declare <4 x i32> @llvm.aarch64.neon.sha1m(<4 x i32>, <1 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.neon.sha1m(<4 x i32>, i32, <4 x i32>) #1 -declare <4 x i32> @llvm.aarch64.neon.sha1p(<4 x i32>, <1 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.neon.sha1p(<4 x i32>, i32, <4 x i32>) #1 -declare <4 x i32> @llvm.aarch64.neon.sha1c(<4 x i32>, <1 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.neon.sha1c(<4 x i32>, i32, <4 x i32>) #1 -declare <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.neon.sha256su0(<4 x i32>, <4 x i32>) #1 -declare <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32>, <4 x i32>) #1 +declare <4 x i32> @llvm.arm.neon.sha1su1(<4 x i32>, <4 x i32>) #1 -declare <1 x i32> @llvm.arm.neon.sha1h.v1i32(<1 x i32>) #1 +declare i32 @llvm.arm.neon.sha1h(i32) #1 -declare <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8>) #1 +declare <16 x i8> @llvm.arm.neon.aesimc(<16 x i8>) #1 -declare <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8>) #1 +declare <16 x i8> @llvm.arm.neon.aesmc(<16 x i8>) #1 -declare <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.neon.aesd(<16 x i8>, <16 x i8>) #1 -declare <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8>, <16 x i8>) #1 +declare <16 x i8> @llvm.arm.neon.aese(<16 x i8>, <16 x i8>) #1 define <16 x i8> @test_vaeseq_u8(<16 x i8> %data, <16 x i8> %key) { ; CHECK: test_vaeseq_u8: ; CHECK: aese {{v[0-9]+}}.16b, {{v[0-9]+}}.16b ; CHECK-NO-CRYPTO: Cannot select: intrinsic %llvm.arm.neon.aese entry: - %aese.i = tail call <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8> %data, <16 x i8> %key) + %aese.i = tail call <16 x 
i8> @llvm.arm.neon.aese(<16 x i8> %data, <16 x i8> %key) ret <16 x i8> %aese.i } @@ -42,7 +42,7 @@ define <16 x i8> @test_vaesdq_u8(<16 x i8> %data, <16 x i8> %key) { ; CHECK: test_vaesdq_u8: ; CHECK: aesd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b entry: - %aesd.i = tail call <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8> %data, <16 x i8> %key) + %aesd.i = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %data, <16 x i8> %key) ret <16 x i8> %aesd.i } @@ -50,7 +50,7 @@ define <16 x i8> @test_vaesmcq_u8(<16 x i8> %data) { ; CHECK: test_vaesmcq_u8: ; CHECK: aesmc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b entry: - %aesmc.i = tail call <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8> %data) + %aesmc.i = tail call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %data) ret <16 x i8> %aesmc.i } @@ -58,7 +58,7 @@ define <16 x i8> @test_vaesimcq_u8(<16 x i8> %data) { ; CHECK: test_vaesimcq_u8: ; CHECK: aesimc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b entry: - %aesimc.i = tail call <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8> %data) + %aesimc.i = tail call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %data) ret <16 x i8> %aesimc.i } @@ -66,17 +66,15 @@ define i32 @test_vsha1h_u32(i32 %hash_e) { ; CHECK: test_vsha1h_u32: ; CHECK: sha1h {{s[0-9]+}}, {{s[0-9]+}} entry: - %sha1h.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0 - %sha1h1.i = tail call <1 x i32> @llvm.arm.neon.sha1h.v1i32(<1 x i32> %sha1h.i) - %0 = extractelement <1 x i32> %sha1h1.i, i32 0 - ret i32 %0 + %sha1h1.i = tail call i32 @llvm.arm.neon.sha1h(i32 %hash_e) + ret i32 %sha1h1.i } define <4 x i32> @test_vsha1su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w12_15) { ; CHECK: test_vsha1su1q_u32: ; CHECK: sha1su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s entry: - %sha1su12.i = tail call <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32> %tw0_3, <4 x i32> %w12_15) + %sha1su12.i = tail call <4 x i32> @llvm.arm.neon.sha1su1(<4 x i32> %tw0_3, <4 x i32> %w12_15) ret <4 x i32> %sha1su12.i } @@ -84,7 +82,7 @@ define <4 x i32> @test_vsha256su0q_u32(<4 x i32> %w0_3, <4 x i32> 
%w4_7) { ; CHECK: test_vsha256su0q_u32: ; CHECK: sha256su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s entry: - %sha256su02.i = tail call <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32> %w0_3, <4 x i32> %w4_7) + %sha256su02.i = tail call <4 x i32> @llvm.arm.neon.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) ret <4 x i32> %sha256su02.i } @@ -92,8 +90,7 @@ define <4 x i32> @test_vsha1cq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> ; CHECK: test_vsha1cq_u32: ; CHECK: sha1c {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s entry: - %sha1c.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0 - %sha1c1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1c(<4 x i32> %hash_abcd, <1 x i32> %sha1c.i, <4 x i32> %wk) + %sha1c1.i = tail call <4 x i32> @llvm.arm.neon.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) ret <4 x i32> %sha1c1.i } @@ -101,8 +98,7 @@ define <4 x i32> @test_vsha1pq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> ; CHECK: test_vsha1pq_u32: ; CHECK: sha1p {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s entry: - %sha1p.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0 - %sha1p1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1p(<4 x i32> %hash_abcd, <1 x i32> %sha1p.i, <4 x i32> %wk) + %sha1p1.i = tail call <4 x i32> @llvm.arm.neon.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) ret <4 x i32> %sha1p1.i } @@ -110,8 +106,7 @@ define <4 x i32> @test_vsha1mq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> ; CHECK: test_vsha1mq_u32: ; CHECK: sha1m {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s entry: - %sha1m.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0 - %sha1m1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1m(<4 x i32> %hash_abcd, <1 x i32> %sha1m.i, <4 x i32> %wk) + %sha1m1.i = tail call <4 x i32> @llvm.arm.neon.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) ret <4 x i32> %sha1m1.i } @@ -119,7 +114,7 @@ define <4 x i32> @test_vsha1su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> ; CHECK: test_vsha1su0q_u32: ; CHECK: sha1su0 {{v[0-9]+}}.4s, 
{{v[0-9]+}}.4s, {{v[0-9]+}}.4s entry: - %sha1su03.i = tail call <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11) + %sha1su03.i = tail call <4 x i32> @llvm.arm.neon.sha1su0(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11) ret <4 x i32> %sha1su03.i } @@ -127,7 +122,7 @@ define <4 x i32> @test_vsha256hq_u32(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, ; CHECK: test_vsha256hq_u32: ; CHECK: sha256h {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s entry: - %sha256h3.i = tail call <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) + %sha256h3.i = tail call <4 x i32> @llvm.arm.neon.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) ret <4 x i32> %sha256h3.i } @@ -135,7 +130,7 @@ define <4 x i32> @test_vsha256h2q_u32(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd ; CHECK: test_vsha256h2q_u32: ; CHECK: sha256h2 {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s entry: - %sha256h23.i = tail call <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) + %sha256h23.i = tail call <4 x i32> @llvm.arm.neon.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) ret <4 x i32> %sha256h23.i } @@ -143,7 +138,7 @@ define <4 x i32> @test_vsha256su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x ; CHECK: test_vsha256su1q_u32: ; CHECK: sha256su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s entry: - %sha256su13.i = tail call <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) + %sha256su13.i = tail call <4 x i32> @llvm.arm.neon.sha256su1(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) ret <4 x i32> %sha256su13.i } diff --git a/test/CodeGen/AArch64/neon-extract.ll b/test/CodeGen/AArch64/neon-extract.ll index 5c52cd3..cddc226 100644 --- a/test/CodeGen/AArch64/neon-extract.ll +++ b/test/CodeGen/AArch64/neon-extract.ll @@ -188,3 +188,35 @@ entry: %vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> 
ret <8 x i16> %vext } + +define <8 x i8> @test_undef_vext_s8(<8 x i8> %a) { +; CHECK: test_undef_vext_s8: +; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2 +entry: + %vext = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %vext +} + +define <16 x i8> @test_undef_vextq_s8(<16 x i8> %a) { +; CHECK: test_undef_vextq_s8: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6 +entry: + %vext = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %vext +} + +define <4 x i16> @test_undef_vext_s16(<4 x i16> %a) { +; CHECK: test_undef_vext_s16: +; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4 +entry: + %vext = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %vext +} + +define <8 x i16> @test_undef_vextq_s16(<8 x i16> %a) { +; CHECK: test_undef_vextq_s16: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6 +entry: + %vext = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %vext +} diff --git a/test/CodeGen/AArch64/neon-facge-facgt.ll b/test/CodeGen/AArch64/neon-facge-facgt.ll index 146256e..28e8212 100644 --- a/test/CodeGen/AArch64/neon-facge-facgt.ll +++ b/test/CodeGen/AArch64/neon-facge-facgt.ll @@ -1,20 +1,20 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) -declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) -declare <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double>, <2 x double>) +declare <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float>, <2 x float>) +declare <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float>, <4 x float>) +declare <2 x i64> @llvm.arm.neon.vacge.v2i64.v2f64(<2 x double>, <2 x double>) define <2 x i32> @facge_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) { ; Using registers other than v0, v1 and v2 are possible, but would be odd. 
; CHECK: facge_from_intr_v2i32: - %val = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %A, <2 x float> %B) + %val = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %A, <2 x float> %B) ; CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s ret <2 x i32> %val } define <4 x i32> @facge_from_intr_v4i32( <4 x float> %A, <4 x float> %B) { ; Using registers other than v0, v1 and v2 are possible, but would be odd. ; CHECK: facge_from_intr_v4i32: - %val = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %A, <4 x float> %B) + %val = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %A, <4 x float> %B) ; CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ret <4 x i32> %val } @@ -22,26 +22,26 @@ define <4 x i32> @facge_from_intr_v4i32( <4 x float> %A, <4 x float> %B) { define <2 x i64> @facge_from_intr_v2i64(<2 x double> %A, <2 x double> %B) { ; Using registers other than v0, v1 and v2 are possible, but would be odd. ; CHECK: facge_from_intr_v2i64: - %val = call <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double> %A, <2 x double> %B) + %val = call <2 x i64> @llvm.arm.neon.vacge.v2i64.v2f64(<2 x double> %A, <2 x double> %B) ; CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d ret <2 x i64> %val } -declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) -declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) -declare <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double>, <2 x double>) +declare <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float>, <2 x float>) +declare <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float>, <4 x float>) +declare <2 x i64> @llvm.arm.neon.vacgt.v2i64.v2f64(<2 x double>, <2 x double>) define <2 x i32> @facgt_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) { ; Using registers other than v0, v1 and v2 are possible, but would be odd. 
; CHECK: facgt_from_intr_v2i32: - %val = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %A, <2 x float> %B) + %val = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %A, <2 x float> %B) ; CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s ret <2 x i32> %val } define <4 x i32> @facgt_from_intr_v4i32( <4 x float> %A, <4 x float> %B) { ; Using registers other than v0, v1 and v2 are possible, but would be odd. ; CHECK: facgt_from_intr_v4i32: - %val = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %A, <4 x float> %B) + %val = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %A, <4 x float> %B) ; CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ret <4 x i32> %val } @@ -49,7 +49,7 @@ define <4 x i32> @facgt_from_intr_v4i32( <4 x float> %A, <4 x float> %B) { define <2 x i64> @facgt_from_intr_v2i64(<2 x double> %A, <2 x double> %B) { ; Using registers other than v0, v1 and v2 are possible, but would be odd. ; CHECK: facgt_from_intr_v2i64: - %val = call <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double> %A, <2 x double> %B) + %val = call <2 x i64> @llvm.arm.neon.vacgt.v2i64.v2f64(<2 x double> %A, <2 x double> %B) ; CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d ret <2 x i64> %val } diff --git a/test/CodeGen/AArch64/neon-fma.ll b/test/CodeGen/AArch64/neon-fma.ll index dcf4e28..af70302 100644 --- a/test/CodeGen/AArch64/neon-fma.ll +++ b/test/CodeGen/AArch64/neon-fma.ll @@ -1,21 +1,21 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s define <2 x float> @fmla2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -;CHECK: fmla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %tmp1 = fmul <2 x float> %A, %B; %tmp2 = fadd <2 x float> %C, %tmp1; ret <2 x float> %tmp2 } define <4 x float> @fmla4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) { -;CHECK: fmla {{v[0-31]+}}.4s, 
{{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %tmp1 = fmul <4 x float> %A, %B; %tmp2 = fadd <4 x float> %C, %tmp1; ret <4 x float> %tmp2 } define <2 x double> @fmla2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) { -;CHECK: fmla {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d +;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d %tmp1 = fmul <2 x double> %A, %B; %tmp2 = fadd <2 x double> %C, %tmp1; ret <2 x double> %tmp2 @@ -23,21 +23,21 @@ define <2 x double> @fmla2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> define <2 x float> @fmls2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -;CHECK: fmls {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %tmp1 = fmul <2 x float> %A, %B; %tmp2 = fsub <2 x float> %C, %tmp1; ret <2 x float> %tmp2 } define <4 x float> @fmls4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) { -;CHECK: fmls {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %tmp1 = fmul <4 x float> %A, %B; %tmp2 = fsub <4 x float> %C, %tmp1; ret <4 x float> %tmp2 } define <2 x double> @fmls2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) { -;CHECK: fmls {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d +;CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d %tmp1 = fmul <2 x double> %A, %B; %tmp2 = fsub <2 x double> %C, %tmp1; ret <2 x double> %tmp2 @@ -51,39 +51,39 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) define <2 x float> @fmla2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -;CHECK: fmla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %val = call <2 x float> @llvm.fma.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C) ret <2 x float> %val } 
define <4 x float> @fmla4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) { -;CHECK: fmla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %val = call <4 x float> @llvm.fma.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C) ret <4 x float> %val } define <2 x double> @fmla2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) { -;CHECK: fmla {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d +;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d %val = call <2 x double> @llvm.fma.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C) ret <2 x double> %val } define <2 x float> @fmls2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -;CHECK: fmls {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %negA = fsub <2 x float> , %A %val = call <2 x float> @llvm.fma.v2f32(<2 x float> %negA, <2 x float> %B, <2 x float> %C) ret <2 x float> %val } define <4 x float> @fmls4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) { -;CHECK: fmls {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %negA = fsub <4 x float> , %A %val = call <4 x float> @llvm.fma.v4f32(<4 x float> %negA, <4 x float> %B, <4 x float> %C) ret <4 x float> %val } define <2 x double> @fmls2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) { -;CHECK: fmls {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d +;CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d %negA = fsub <2 x double> , %A %val = call <2 x double> @llvm.fma.v2f64(<2 x double> %negA, <2 x double> %B, <2 x double> %C) ret <2 x double> %val @@ -94,19 +94,39 @@ declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>) define <2 x float> @fmuladd2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> 
%C) { -;CHECK: fmla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %val = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C) ret <2 x float> %val } define <4 x float> @fmuladd4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) { -;CHECK: fmla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %val = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C) ret <4 x float> %val } define <2 x double> @fmuladd2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) { -;CHECK: fmla {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d +;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d %val = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C) ret <2 x double> %val } + + +; Another set of tests that check for multiply single use + +define <2 x float> @fmla2xfloati_su(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +;CHECK-NOT: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp1 = fmul <2 x float> %A, %B; + %tmp2 = fadd <2 x float> %C, %tmp1; + %tmp3 = fadd <2 x float> %tmp2, %tmp1; + ret <2 x float> %tmp3 +} + +define <2 x double> @fmls2xdouble_su(<2 x double> %A, <2 x double> %B, <2 x double> %C) { +;CHECK-NOT: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp1 = fmul <2 x double> %A, %B; + %tmp2 = fsub <2 x double> %C, %tmp1; + %tmp3 = fsub <2 x double> %tmp2, %tmp1; + ret <2 x double> %tmp3 +} + diff --git a/test/CodeGen/AArch64/neon-fpround_f128.ll b/test/CodeGen/AArch64/neon-fpround_f128.ll new file mode 100644 index 0000000..a93f3f2 --- /dev/null +++ b/test/CodeGen/AArch64/neon-fpround_f128.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + +define <1 x double> @test_fpround_v1f128(<1 x fp128>* %a) { +; CHECK-LABEL: 
test_fpround_v1f128: +; CHECK: bl __trunctfdf2 + %b = load <1 x fp128>* %a + %c = fptrunc <1 x fp128> %b to <1 x double> + ret <1 x double> %c +} + +define <2 x double> @test_fpround_v2f128(<2 x fp128>* %a) { +; CHECK-LABEL: test_fpround_v2f128: +; CHECK: bl __trunctfdf2 +; CHECK: bl __trunctfdf2 + %b = load <2 x fp128>* %a + %c = fptrunc <2 x fp128> %b to <2 x double> + ret <2 x double> %c +} diff --git a/test/CodeGen/AArch64/neon-load-store-v1i32.ll b/test/CodeGen/AArch64/neon-load-store-v1i32.ll new file mode 100644 index 0000000..92f704d --- /dev/null +++ b/test/CodeGen/AArch64/neon-load-store-v1i32.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +; Test load/store of v1i8, v1i16, v1i32 types can be selected correctly +define void @load.store.v1i8(<1 x i8>* %ptr, <1 x i8>* %ptr2) { +; CHECK-LABEL: load.store.v1i8: +; CHECK: ldr b{{[0-9]+}}, [x{{[0-9]+|sp}}] +; CHECK: str b{{[0-9]+}}, [x{{[0-9]+|sp}}] + %a = load <1 x i8>* %ptr + store <1 x i8> %a, <1 x i8>* %ptr2 + ret void +} + +define void @load.store.v1i16(<1 x i16>* %ptr, <1 x i16>* %ptr2) { +; CHECK-LABEL: load.store.v1i16: +; CHECK: ldr h{{[0-9]+}}, [x{{[0-9]+|sp}}] +; CHECK: str h{{[0-9]+}}, [x{{[0-9]+|sp}}] + %a = load <1 x i16>* %ptr + store <1 x i16> %a, <1 x i16>* %ptr2 + ret void +} + +define void @load.store.v1i32(<1 x i32>* %ptr, <1 x i32>* %ptr2) { +; CHECK-LABEL: load.store.v1i32: +; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+|sp}}] +; CHECK: str s{{[0-9]+}}, [x{{[0-9]+|sp}}] + %a = load <1 x i32>* %ptr + store <1 x i32> %a, <1 x i32>* %ptr2 + ret void +} diff --git a/test/CodeGen/AArch64/neon-max-min-pairwise.ll b/test/CodeGen/AArch64/neon-max-min-pairwise.ll index d757aca..3e18077 100644 --- a/test/CodeGen/AArch64/neon-max-min-pairwise.ll +++ b/test/CodeGen/AArch64/neon-max-min-pairwise.ll @@ -308,3 +308,39 @@ define <2 x double> @test_fminnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { ret <2 x double> %val } +define i32 
@test_vminv_s32(<2 x i32> %a) { +; CHECK-LABEL: test_vminv_s32 +; CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %1 = tail call <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v2i32(<2 x i32> %a) + %2 = extractelement <1 x i32> %1, i32 0 + ret i32 %2 +} + +define i32 @test_vminv_u32(<2 x i32> %a) { +; CHECK-LABEL: test_vminv_u32 +; CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %1 = tail call <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v2i32(<2 x i32> %a) + %2 = extractelement <1 x i32> %1, i32 0 + ret i32 %2 +} + +define i32 @test_vmaxv_s32(<2 x i32> %a) { +; CHECK-LABEL: test_vmaxv_s32 +; CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %1 = tail call <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v2i32(<2 x i32> %a) + %2 = extractelement <1 x i32> %1, i32 0 + ret i32 %2 +} + +define i32 @test_vmaxv_u32(<2 x i32> %a) { +; CHECK-LABEL: test_vmaxv_u32 +; CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %1 = tail call <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v2i32(<2 x i32> %a) + %2 = extractelement <1 x i32> %1, i32 0 + ret i32 %2 +} + +declare <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v2i32(<2 x i32>) +declare <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v2i32(<2 x i32>) +declare <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v2i32(<2 x i32>) +declare <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v2i32(<2 x i32>) \ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-misc.ll b/test/CodeGen/AArch64/neon-misc.ll index 9660bf2..7ec36c2 100644 --- a/test/CodeGen/AArch64/neon-misc.ll +++ b/test/CodeGen/AArch64/neon-misc.ll @@ -894,13 +894,13 @@ define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 { define <2 x float> @test_vcvtx_f32_f64(<2 x double> %a) #0 { ; CHECK: fcvtxn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d - %vcvtx_f32_f641.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %a) #4 + %vcvtx_f32_f641.i = call <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double> %a) #4 ret <2 x 
float> %vcvtx_f32_f641.i } define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 { ; CHECK: fcvtxn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d - %vcvtx_f32_f641.i.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %b) #4 + %vcvtx_f32_f641.i.i = tail call <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double> %b) #4 %shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvtx_f32_f641.i.i, <4 x i32> ret <4 x float> %shuffle.i } @@ -1080,147 +1080,255 @@ define <2 x i64> @test_vcvtq_u64_f64(<2 x double> %a) #0 { ret <2 x i64> %vcvt.i } -define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 { +define <2 x i64> @test_vcvt_s64_f32(<2 x float> %a) #0 { +; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s +; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvt.i = fptosi <2 x float> %a to <2 x i64> + ret <2 x i64> %vcvt.i +} + +define <2 x i64> @test_vcvt_u64_f32(<2 x float> %a) #0 { +; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s +; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvt.i = fptoui <2 x float> %a to <2 x i64> + ret <2 x i64> %vcvt.i +} + +define <4 x i16> @test_vcvt_s16_f32(<4 x float> %a) #0 { +; CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s +; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s + %vcvt.i = fptosi <4 x float> %a to <4 x i16> + ret <4 x i16> %vcvt.i +} + +define <4 x i16> @test_vcvt_u16_f32(<4 x float> %a) #0 { +; CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s +; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s + %vcvt.i = fptoui <4 x float> %a to <4 x i16> + ret <4 x i16> %vcvt.i +} + +define <2 x i32> @test_vcvt_s32_f64(<2 x double> %a) #0 { +; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d +; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d + %vcvt.i = fptosi <2 x double> %a to <2 x i32> + ret <2 x i32> %vcvt.i +} + +define <2 x i32> @test_vcvt_u32_f64(<2 x double> %a) #0 { +; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d +; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d + %vcvt.i = fptoui <2 x double> %a to <2 x i32> + ret <2 x 
i32> %vcvt.i +} + +define <1 x i8> @test_vcvt_s8_f64(<1 x double> %a) #0 { +; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}} +; CHECK: ins v{{[0-9]+}}.b[0], w{{[0-9]+}} + %vcvt.i = fptosi <1 x double> %a to <1 x i8> + ret <1 x i8> %vcvt.i +} + +define <1 x i8> @test_vcvt_u8_f64(<1 x double> %a) #0 { +; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}} +; CHECK: ins v{{[0-9]+}}.b[0], w{{[0-9]+}} + %vcvt.i = fptoui <1 x double> %a to <1 x i8> + ret <1 x i8> %vcvt.i +} + +define <1 x i16> @test_vcvt_s16_f64(<1 x double> %a) #0 { +; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}} +; CHECK: ins v{{[0-9]+}}.h[0], w{{[0-9]+}} + %vcvt.i = fptosi <1 x double> %a to <1 x i16> + ret <1 x i16> %vcvt.i +} + +define <1 x i16> @test_vcvt_u16_f64(<1 x double> %a) #0 { +; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}} +; CHECK: ins v{{[0-9]+}}.h[0], w{{[0-9]+}} + %vcvt.i = fptoui <1 x double> %a to <1 x i16> + ret <1 x i16> %vcvt.i +} + +define <1 x i32> @test_vcvt_s32_f64_v1(<1 x double> %a) #0 { +; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}} +; CHECK: fmov s{{[0-9]+}}, w{{[0-9]+}} + %vcvt.i = fptosi <1 x double> %a to <1 x i32> + ret <1 x i32> %vcvt.i +} + +define <1 x i32> @test_vcvt_u32_f64_v1(<1 x double> %a) #0 { +; CHECK: fcvtzu w{{[0-9]+}}, d{{[0-9]+}} +; CHECK: fmov s{{[0-9]+}}, w{{[0-9]+}} + %vcvt.i = fptoui <1 x double> %a to <1 x i32> + ret <1 x i32> %vcvt.i +} + +define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) { +; CHECK-LABEL: test_vcvtn_s32_f32 ; CHECK: fcvtns v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtns_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %a) #4 + %vcvtns_f321.i = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> %a) ret <2 x i32> %vcvtns_f321.i } -define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 { +define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) { +; CHECK-LABEL: test_vcvtnq_s32_f32 ; CHECK: fcvtns v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtns_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %a) #4 + 
%vcvtns_f321.i = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> %a) ret <4 x i32> %vcvtns_f321.i } -define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) #0 { +define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) { +; CHECK-LABEL: test_vcvtnq_s64_f64 ; CHECK: fcvtns v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtns_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %a) #4 + %vcvtns_f641.i = call <2 x i64> @llvm.arm.neon.vcvtns.v2i64.v2f64(<2 x double> %a) ret <2 x i64> %vcvtns_f641.i } -define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 { +define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) { +; CHECK-LABEL: test_vcvtn_u32_f32 ; CHECK: fcvtnu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtnu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %a) #4 + %vcvtnu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> %a) ret <2 x i32> %vcvtnu_f321.i } -define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 { +define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) { +; CHECK-LABEL: test_vcvtnq_u32_f32 ; CHECK: fcvtnu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtnu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %a) #4 + %vcvtnu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> %a) ret <4 x i32> %vcvtnu_f321.i } -define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) #0 { +define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) { +; CHECK-LABEL: test_vcvtnq_u64_f64 ; CHECK: fcvtnu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtnu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %a) #4 + %vcvtnu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtnu.v2i64.v2f64(<2 x double> %a) ret <2 x i64> %vcvtnu_f641.i } -define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 { +define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) { +; CHECK-LABEL: test_vcvtp_s32_f32 ; CHECK: fcvtps v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtps_f321.i = tail call <2 x i32> 
@llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %a) #4 + %vcvtps_f321.i = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> %a) ret <2 x i32> %vcvtps_f321.i } -define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 { +define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) { +; CHECK-LABEL: test_vcvtpq_s32_f32 ; CHECK: fcvtps v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtps_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %a) #4 + %vcvtps_f321.i = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> %a) ret <4 x i32> %vcvtps_f321.i } -define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) #0 { +define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) { +; CHECK-LABEL: test_vcvtpq_s64_f64 ; CHECK: fcvtps v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtps_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %a) #4 + %vcvtps_f641.i = call <2 x i64> @llvm.arm.neon.vcvtps.v2i64.v2f64(<2 x double> %a) ret <2 x i64> %vcvtps_f641.i } -define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 { +define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) { +; CHECK-LABEL: test_vcvtp_u32_f32 ; CHECK: fcvtpu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtpu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %a) #4 + %vcvtpu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> %a) ret <2 x i32> %vcvtpu_f321.i } -define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 { +define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) { +; CHECK-LABEL: test_vcvtpq_u32_f32 ; CHECK: fcvtpu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtpu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %a) #4 + %vcvtpu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> %a) ret <4 x i32> %vcvtpu_f321.i } -define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) #0 { +define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) { +; CHECK-LABEL: test_vcvtpq_u64_f64 ; CHECK: fcvtpu 
v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtpu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %a) #4 + %vcvtpu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtpu.v2i64.v2f64(<2 x double> %a) ret <2 x i64> %vcvtpu_f641.i } -define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 { +define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) { +; CHECK-LABEL: test_vcvtm_s32_f32 ; CHECK: fcvtms v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtms_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %a) #4 + %vcvtms_f321.i = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> %a) ret <2 x i32> %vcvtms_f321.i } -define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 { +define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) { +; CHECK-LABEL: test_vcvtmq_s32_f32 ; CHECK: fcvtms v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtms_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %a) #4 + %vcvtms_f321.i = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> %a) ret <4 x i32> %vcvtms_f321.i } -define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) #0 { +define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) { +; CHECK-LABEL: test_vcvtmq_s64_f64 ; CHECK: fcvtms v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtms_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %a) #4 + %vcvtms_f641.i = call <2 x i64> @llvm.arm.neon.vcvtms.v2i64.v2f64(<2 x double> %a) ret <2 x i64> %vcvtms_f641.i } -define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 { +define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) { +; CHECK-LABEL: test_vcvtm_u32_f32 ; CHECK: fcvtmu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtmu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %a) #4 + %vcvtmu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> %a) ret <2 x i32> %vcvtmu_f321.i } -define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 { +define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> 
%a) { +; CHECK-LABEL: test_vcvtmq_u32_f32 ; CHECK: fcvtmu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtmu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %a) #4 + %vcvtmu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> %a) ret <4 x i32> %vcvtmu_f321.i } -define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) #0 { +define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) { +; CHECK-LABEL: test_vcvtmq_u64_f64 ; CHECK: fcvtmu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtmu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %a) #4 + %vcvtmu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtmu.v2i64.v2f64(<2 x double> %a) ret <2 x i64> %vcvtmu_f641.i } -define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 { +define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) { +; CHECK-LABEL: test_vcvta_s32_f32 ; CHECK: fcvtas v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtas_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %a) #4 + %vcvtas_f321.i = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> %a) ret <2 x i32> %vcvtas_f321.i } -define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 { +define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) { +; CHECK-LABEL: test_vcvtaq_s32_f32 ; CHECK: fcvtas v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtas_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %a) #4 + %vcvtas_f321.i = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> %a) ret <4 x i32> %vcvtas_f321.i } -define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) #0 { +define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) { +; CHECK-LABEL: test_vcvtaq_s64_f64 ; CHECK: fcvtas v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtas_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %a) #4 + %vcvtas_f641.i = call <2 x i64> @llvm.arm.neon.vcvtas.v2i64.v2f64(<2 x double> %a) ret <2 x i64> %vcvtas_f641.i } -define <2 x i32> @test_vcvta_u32_f32(<2 x float> 
%a) #0 { +define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) { +; CHECK-LABEL: test_vcvta_u32_f32 ; CHECK: fcvtau v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtau_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %a) #4 + %vcvtau_f321.i = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> %a) ret <2 x i32> %vcvtau_f321.i } -define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 { +define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) { +; CHECK-LABEL: test_vcvtaq_u32_f32 ; CHECK: fcvtau v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtau_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %a) #4 + %vcvtau_f321.i = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> %a) ret <4 x i32> %vcvtau_f321.i } -define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) #0 { +define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) { +; CHECK-LABEL: test_vcvtaq_u64_f64 ; CHECK: fcvtau v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtau_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %a) #4 + %vcvtau_f641.i = call <2 x i64> @llvm.arm.neon.vcvtau.v2i64.v2f64(<2 x double> %a) ret <2 x i64> %vcvtau_f641.i } @@ -1326,6 +1434,94 @@ define <2 x double> @test_vcvtq_f64_u64(<2 x i64> %a) #0 { ret <2 x double> %vcvt.i } +define <2 x float> @test_vcvt_f32_s64(<2 x i64> %a) #0 { +; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d +; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d + %vcvt.i = sitofp <2 x i64> %a to <2 x float> + ret <2 x float> %vcvt.i +} + +define <2 x float> @test_vcvt_f32_u64(<2 x i64> %a) #0 { +; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d +; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d + %vcvt.i = uitofp <2 x i64> %a to <2 x float> + ret <2 x float> %vcvt.i +} + +define <4 x float> @test_vcvt_f32_s16(<4 x i16> %a) #0 { +; CHECK: sshll v{{[0-9]+}}.4s, v{{[0-9]+}}.4h, #0 +; CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvt.i = sitofp <4 x i16> %a to <4 x float> + ret <4 x float> %vcvt.i +} + +define <4 x 
float> @test_vcvt_f32_u16(<4 x i16> %a) #0 { +; CHECK: ushll v{{[0-9]+}}.4s, v{{[0-9]+}}.4h, #0 +; CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvt.i = uitofp <4 x i16> %a to <4 x float> + ret <4 x float> %vcvt.i +} + +define <2 x double> @test_vcvt_f64_s32(<2 x i32> %a) #0 { +; CHECK: sshll v{{[0-9]+}}.2d, v{{[0-9]+}}.2s, #0 +; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvt.i = sitofp <2 x i32> %a to <2 x double> + ret <2 x double> %vcvt.i +} + +define <2 x double> @test_vcvt_f64_u32(<2 x i32> %a) #0 { +; CHECK: ushll v{{[0-9]+}}.2d, v{{[0-9]+}}.2s, #0 +; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvt.i = uitofp <2 x i32> %a to <2 x double> + ret <2 x double> %vcvt.i +} + +define <1 x double> @test_vcvt_f64_s8(<1 x i8> %a) #0 { +; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.b[0] +; CHECK: sxtb w{{[0-9]+}}, w{{[0-9]+}} +; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}} + %vcvt.i = sitofp <1 x i8> %a to <1 x double> + ret <1 x double> %vcvt.i +} + +define <1 x double> @test_vcvt_f64_u8(<1 x i8> %a) #0 { +; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.b[0] +; CHECK: and w{{[0-9]+}}, w{{[0-9]+}}, #0xff +; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}} + %vcvt.i = uitofp <1 x i8> %a to <1 x double> + ret <1 x double> %vcvt.i +} + +define <1 x double> @test_vcvt_f64_s16(<1 x i16> %a) #0 { +; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.h[0] +; CHECK: sxth w{{[0-9]+}}, w{{[0-9]+}} +; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}} + %vcvt.i = sitofp <1 x i16> %a to <1 x double> + ret <1 x double> %vcvt.i +} + +define <1 x double> @test_vcvt_f64_u16(<1 x i16> %a) #0 { +; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.h[0] +; CHECK: and w{{[0-9]+}}, w{{[0-9]+}}, #0xffff +; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}} + %vcvt.i = uitofp <1 x i16> %a to <1 x double> + ret <1 x double> %vcvt.i +} + +define <1 x double> @test_vcvt_f64_s32_v1(<1 x i32> %a) #0 { +; CHECK: fmov w{{[0-9]+}}, s{{[0-9]+}} +; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}} + %vcvt.i = sitofp <1 x i32> %a to <1 x double> + ret <1 x double> %vcvt.i +} + +define <1 x 
double> @test_vcvt_f64_u32_v1(<1 x i32> %a) #0 { +; CHECK: fmov w{{[0-9]+}}, s{{[0-9]+}} +; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}} + %vcvt.i = uitofp <1 x i32> %a to <1 x double> + ret <1 x double> %vcvt.i +} + declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) #2 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #2 @@ -1348,53 +1544,53 @@ declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) #2 declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) #2 -declare <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double>) #2 +declare <2 x i64> @llvm.arm.neon.vcvtau.v2i64.v2f64(<2 x double>) -declare <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float>) #2 +declare <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float>) -declare <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float>) #2 +declare <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float>) -declare <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double>) #2 +declare <2 x i64> @llvm.arm.neon.vcvtas.v2i64.v2f64(<2 x double>) -declare <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float>) #2 +declare <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float>) -declare <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float>) #2 +declare <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float>) -declare <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double>) #2 +declare <2 x i64> @llvm.arm.neon.vcvtmu.v2i64.v2f64(<2 x double>) -declare <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float>) #2 +declare <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float>) -declare <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float>) #2 +declare <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float>) -declare <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double>) #2 +declare <2 x i64> @llvm.arm.neon.vcvtms.v2i64.v2f64(<2 x double>) -declare <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float>) #2 +declare <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 
x float>) -declare <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float>) #2 +declare <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float>) -declare <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double>) #2 +declare <2 x i64> @llvm.arm.neon.vcvtpu.v2i64.v2f64(<2 x double>) -declare <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>) #2 +declare <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float>) -declare <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float>) #2 +declare <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float>) -declare <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double>) #2 +declare <2 x i64> @llvm.arm.neon.vcvtps.v2i64.v2f64(<2 x double>) -declare <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float>) #2 +declare <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float>) -declare <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float>) #2 +declare <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float>) -declare <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double>) #2 +declare <2 x i64> @llvm.arm.neon.vcvtnu.v2i64.v2f64(<2 x double>) -declare <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float>) #2 +declare <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float>) -declare <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float>) #2 +declare <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float>) -declare <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double>) #2 +declare <2 x i64> @llvm.arm.neon.vcvtns.v2i64.v2f64(<2 x double>) -declare <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float>) #2 +declare <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float>) -declare <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float>) #2 +declare <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float>) declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #3 @@ -1438,7 +1634,7 @@ declare <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float>) #2 declare <2 x float> 
@llvm.aarch64.neon.frintn.v2f32(<2 x float>) #2 -declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) #2 +declare <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double>) #2 declare <2 x float> @llvm.aarch64.neon.fcvtn.v2f32.v2f64(<2 x double>) #2 @@ -1624,56 +1820,56 @@ define <1 x i64> @test_vcvt_u64_f64(<1 x double> %a) { define <1 x i64> @test_vcvtn_s64_f64(<1 x double> %a) { ; CHECK-LABEL: test_vcvtn_s64_f64 ; CHECK: fcvtns d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %a) + %1 = call <1 x i64> @llvm.arm.neon.vcvtns.v1i64.v1f64(<1 x double> %a) ret <1 x i64> %1 } define <1 x i64> @test_vcvtn_u64_f64(<1 x double> %a) { ; CHECK-LABEL: test_vcvtn_u64_f64 ; CHECK: fcvtnu d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %a) + %1 = call <1 x i64> @llvm.arm.neon.vcvtnu.v1i64.v1f64(<1 x double> %a) ret <1 x i64> %1 } define <1 x i64> @test_vcvtp_s64_f64(<1 x double> %a) { ; CHECK-LABEL: test_vcvtp_s64_f64 ; CHECK: fcvtps d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %a) + %1 = call <1 x i64> @llvm.arm.neon.vcvtps.v1i64.v1f64(<1 x double> %a) ret <1 x i64> %1 } define <1 x i64> @test_vcvtp_u64_f64(<1 x double> %a) { ; CHECK-LABEL: test_vcvtp_u64_f64 ; CHECK: fcvtpu d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %a) + %1 = call <1 x i64> @llvm.arm.neon.vcvtpu.v1i64.v1f64(<1 x double> %a) ret <1 x i64> %1 } define <1 x i64> @test_vcvtm_s64_f64(<1 x double> %a) { ; CHECK-LABEL: test_vcvtm_s64_f64 ; CHECK: fcvtms d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %a) + %1 = call <1 x i64> @llvm.arm.neon.vcvtms.v1i64.v1f64(<1 x double> %a) ret <1 x i64> %1 } define <1 x i64> @test_vcvtm_u64_f64(<1 x double> %a) { ; CHECK-LABEL: test_vcvtm_u64_f64 ; CHECK: fcvtmu d{{[0-9]+}}, 
d{{[0-9]+}} - %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %a) + %1 = call <1 x i64> @llvm.arm.neon.vcvtmu.v1i64.v1f64(<1 x double> %a) ret <1 x i64> %1 } define <1 x i64> @test_vcvta_s64_f64(<1 x double> %a) { ; CHECK-LABEL: test_vcvta_s64_f64 ; CHECK: fcvtas d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %a) + %1 = call <1 x i64> @llvm.arm.neon.vcvtas.v1i64.v1f64(<1 x double> %a) ret <1 x i64> %1 } define <1 x i64> @test_vcvta_u64_f64(<1 x double> %a) { ; CHECK-LABEL: test_vcvta_u64_f64 ; CHECK: fcvtau d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %a) + %1 = call <1 x i64> @llvm.arm.neon.vcvtau.v1i64.v1f64(<1 x double> %a) ret <1 x i64> %1 } @@ -1691,14 +1887,14 @@ define <1 x double> @test_vcvt_f64_u64(<1 x i64> %a) { ret <1 x double> %1 } -declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.arm.neon.vcvtau.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.arm.neon.vcvtas.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.arm.neon.vcvtmu.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.arm.neon.vcvtms.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.arm.neon.vcvtpu.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.arm.neon.vcvtps.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.arm.neon.vcvtnu.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.arm.neon.vcvtns.v1i64.v1f64(<1 x 
double>) define <1 x double> @test_vrndn_f64(<1 x double> %a) { ; CHECK-LABEL: test_vrndn_f64 @@ -1796,4 +1992,23 @@ declare <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double>, <1 x double>) declare <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double>, <1 x double>) declare <1 x double> @llvm.sqrt.v1f64(<1 x double>) declare <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double>) -declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>) \ No newline at end of file +declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>) + +define i64 @test_vaddlv_s32(<2 x i32> %a) { +; CHECK-LABEL: test_vaddlv_s32 +; CHECK: saddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s + %1 = tail call <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v2i32(<2 x i32> %a) + %2 = extractelement <1 x i64> %1, i32 0 + ret i64 %2 +} + +define i64 @test_vaddlv_u32(<2 x i32> %a) { +; CHECK-LABEL: test_vaddlv_u32 +; CHECK: uaddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s + %1 = tail call <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v2i32(<2 x i32> %a) + %2 = extractelement <1 x i64> %1, i32 0 + ret i64 %2 +} + +declare <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v2i32(<2 x i32>) +declare <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v2i32(<2 x i32>) \ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-mla-mls.ll b/test/CodeGen/AArch64/neon-mla-mls.ll index 23e9223..71bb0e7 100644 --- a/test/CodeGen/AArch64/neon-mla-mls.ll +++ b/test/CodeGen/AArch64/neon-mla-mls.ll @@ -2,84 +2,84 @@ define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { -;CHECK: mla {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = mul <8 x i8> %A, %B; %tmp2 = add <8 x i8> %C, %tmp1; ret <8 x i8> %tmp2 } define <16 x i8> @mla16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { -;CHECK: mla {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = mul <16 x i8> %A, %B; %tmp2 = add <16 x i8> %C, %tmp1; ret 
<16 x i8> %tmp2 } define <4 x i16> @mla4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { -;CHECK: mla {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h +;CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h %tmp1 = mul <4 x i16> %A, %B; %tmp2 = add <4 x i16> %C, %tmp1; ret <4 x i16> %tmp2 } define <8 x i16> @mla8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { -;CHECK: mla {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h +;CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h %tmp1 = mul <8 x i16> %A, %B; %tmp2 = add <8 x i16> %C, %tmp1; ret <8 x i16> %tmp2 } define <2 x i32> @mla2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { -;CHECK: mla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %tmp1 = mul <2 x i32> %A, %B; %tmp2 = add <2 x i32> %C, %tmp1; ret <2 x i32> %tmp2 } define <4 x i32> @mla4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { -;CHECK: mla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %tmp1 = mul <4 x i32> %A, %B; %tmp2 = add <4 x i32> %C, %tmp1; ret <4 x i32> %tmp2 } define <8 x i8> @mls8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { -;CHECK: mls {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp1 = mul <8 x i8> %A, %B; %tmp2 = sub <8 x i8> %C, %tmp1; ret <8 x i8> %tmp2 } define <16 x i8> @mls16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { -;CHECK: mls {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp1 = mul <16 x i8> %A, %B; %tmp2 = sub <16 x i8> %C, %tmp1; ret <16 x i8> %tmp2 } define <4 x i16> @mls4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { -;CHECK: mls {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h +;CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h %tmp1 = mul <4 x i16> %A, %B; %tmp2 = sub <4 x i16> %C, %tmp1; ret <4 x i16> %tmp2 } define <8 x i16> @mls8xi16(<8 
x i16> %A, <8 x i16> %B, <8 x i16> %C) { -;CHECK: mls {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h +;CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h %tmp1 = mul <8 x i16> %A, %B; %tmp2 = sub <8 x i16> %C, %tmp1; ret <8 x i16> %tmp2 } define <2 x i32> @mls2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { -;CHECK: mls {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %tmp1 = mul <2 x i32> %A, %B; %tmp2 = sub <2 x i32> %C, %tmp1; ret <2 x i32> %tmp2 } define <4 x i32> @mls4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { -;CHECK: mls {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %tmp1 = mul <4 x i32> %A, %B; %tmp2 = sub <4 x i32> %C, %tmp1; ret <4 x i32> %tmp2 diff --git a/test/CodeGen/AArch64/neon-mov.ll b/test/CodeGen/AArch64/neon-mov.ll index 60b13b8..4035b91 100644 --- a/test/CodeGen/AArch64/neon-mov.ll +++ b/test/CodeGen/AArch64/neon-mov.ll @@ -1,204 +1,204 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s define <8 x i8> @movi8b() { -;CHECK: movi {{v[0-31]+}}.8b, #0x8 +;CHECK: movi {{v[0-9]+}}.8b, #0x8 ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > } define <16 x i8> @movi16b() { -;CHECK: movi {{v[0-31]+}}.16b, #0x8 +;CHECK: movi {{v[0-9]+}}.16b, #0x8 ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > } define <2 x i32> @movi2s_lsl0() { -;CHECK: movi {{v[0-31]+}}.2s, #0xff +;CHECK: movi {{v[0-9]+}}.2s, #0xff ret <2 x i32> < i32 255, i32 255 > } define <2 x i32> @movi2s_lsl8() { -;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #8 +;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #8 ret <2 x i32> < i32 65280, i32 65280 > } define <2 x i32> @movi2s_lsl16() { -;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #16 +;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #16 ret <2 x i32> < i32 16711680, i32 16711680 > } define <2 x i32> 
@movi2s_lsl24() { -;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #24 +;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #24 ret <2 x i32> < i32 4278190080, i32 4278190080 > } define <4 x i32> @movi4s_lsl0() { -;CHECK: movi {{v[0-31]+}}.4s, #0xff +;CHECK: movi {{v[0-9]+}}.4s, #0xff ret <4 x i32> < i32 255, i32 255, i32 255, i32 255 > } define <4 x i32> @movi4s_lsl8() { -;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #8 +;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #8 ret <4 x i32> < i32 65280, i32 65280, i32 65280, i32 65280 > } define <4 x i32> @movi4s_lsl16() { -;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #16 +;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #16 ret <4 x i32> < i32 16711680, i32 16711680, i32 16711680, i32 16711680 > } define <4 x i32> @movi4s_lsl24() { -;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #24 +;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #24 ret <4 x i32> < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080 > } define <4 x i16> @movi4h_lsl0() { -;CHECK: movi {{v[0-31]+}}.4h, #0xff +;CHECK: movi {{v[0-9]+}}.4h, #0xff ret <4 x i16> < i16 255, i16 255, i16 255, i16 255 > } define <4 x i16> @movi4h_lsl8() { -;CHECK: movi {{v[0-31]+}}.4h, #0xff, lsl #8 +;CHECK: movi {{v[0-9]+}}.4h, #0xff, lsl #8 ret <4 x i16> < i16 65280, i16 65280, i16 65280, i16 65280 > } define <8 x i16> @movi8h_lsl0() { -;CHECK: movi {{v[0-31]+}}.8h, #0xff +;CHECK: movi {{v[0-9]+}}.8h, #0xff ret <8 x i16> < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 > } define <8 x i16> @movi8h_lsl8() { -;CHECK: movi {{v[0-31]+}}.8h, #0xff, lsl #8 +;CHECK: movi {{v[0-9]+}}.8h, #0xff, lsl #8 ret <8 x i16> < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 > } define <2 x i32> @mvni2s_lsl0() { -;CHECK: mvni {{v[0-31]+}}.2s, #0x10 +;CHECK: mvni {{v[0-9]+}}.2s, #0x10 ret <2 x i32> < i32 4294967279, i32 4294967279 > } define <2 x i32> @mvni2s_lsl8() { -;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #8 +;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #8 ret <2 x i32> < i32 
4294963199, i32 4294963199 > } define <2 x i32> @mvni2s_lsl16() { -;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #16 +;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #16 ret <2 x i32> < i32 4293918719, i32 4293918719 > } define <2 x i32> @mvni2s_lsl24() { -;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #24 +;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #24 ret <2 x i32> < i32 4026531839, i32 4026531839 > } define <4 x i32> @mvni4s_lsl0() { -;CHECK: mvni {{v[0-31]+}}.4s, #0x10 +;CHECK: mvni {{v[0-9]+}}.4s, #0x10 ret <4 x i32> < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 > } define <4 x i32> @mvni4s_lsl8() { -;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #8 +;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #8 ret <4 x i32> < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 > } define <4 x i32> @mvni4s_lsl16() { -;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #16 +;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #16 ret <4 x i32> < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 > } define <4 x i32> @mvni4s_lsl24() { -;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #24 +;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #24 ret <4 x i32> < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839 > } define <4 x i16> @mvni4h_lsl0() { -;CHECK: mvni {{v[0-31]+}}.4h, #0x10 +;CHECK: mvni {{v[0-9]+}}.4h, #0x10 ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 > } define <4 x i16> @mvni4h_lsl8() { -;CHECK: mvni {{v[0-31]+}}.4h, #0x10, lsl #8 +;CHECK: mvni {{v[0-9]+}}.4h, #0x10, lsl #8 ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 > } define <8 x i16> @mvni8h_lsl0() { -;CHECK: mvni {{v[0-31]+}}.8h, #0x10 +;CHECK: mvni {{v[0-9]+}}.8h, #0x10 ret <8 x i16> < i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519 > } define <8 x i16> @mvni8h_lsl8() { -;CHECK: mvni {{v[0-31]+}}.8h, #0x10, lsl #8 +;CHECK: mvni {{v[0-9]+}}.8h, #0x10, lsl #8 ret <8 x i16> < i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439 > } 
define <2 x i32> @movi2s_msl8(<2 x i32> %a) { -;CHECK: movi {{v[0-31]+}}.2s, #0xff, msl #8 +;CHECK: movi {{v[0-9]+}}.2s, #0xff, msl #8 ret <2 x i32> < i32 65535, i32 65535 > } define <2 x i32> @movi2s_msl16() { -;CHECK: movi {{v[0-31]+}}.2s, #0xff, msl #16 +;CHECK: movi {{v[0-9]+}}.2s, #0xff, msl #16 ret <2 x i32> < i32 16777215, i32 16777215 > } define <4 x i32> @movi4s_msl8() { -;CHECK: movi {{v[0-31]+}}.4s, #0xff, msl #8 +;CHECK: movi {{v[0-9]+}}.4s, #0xff, msl #8 ret <4 x i32> < i32 65535, i32 65535, i32 65535, i32 65535 > } define <4 x i32> @movi4s_msl16() { -;CHECK: movi {{v[0-31]+}}.4s, #0xff, msl #16 +;CHECK: movi {{v[0-9]+}}.4s, #0xff, msl #16 ret <4 x i32> < i32 16777215, i32 16777215, i32 16777215, i32 16777215 > } define <2 x i32> @mvni2s_msl8() { -;CHECK: mvni {{v[0-31]+}}.2s, #0x10, msl #8 +;CHECK: mvni {{v[0-9]+}}.2s, #0x10, msl #8 ret <2 x i32> < i32 18446744073709547264, i32 18446744073709547264> } define <2 x i32> @mvni2s_msl16() { -;CHECK: mvni {{v[0-31]+}}.2s, #0x10, msl #16 +;CHECK: mvni {{v[0-9]+}}.2s, #0x10, msl #16 ret <2 x i32> < i32 18446744073708437504, i32 18446744073708437504> } define <4 x i32> @mvni4s_msl8() { -;CHECK: mvni {{v[0-31]+}}.4s, #0x10, msl #8 +;CHECK: mvni {{v[0-9]+}}.4s, #0x10, msl #8 ret <4 x i32> < i32 18446744073709547264, i32 18446744073709547264, i32 18446744073709547264, i32 18446744073709547264> } define <4 x i32> @mvni4s_msl16() { -;CHECK: mvni {{v[0-31]+}}.4s, #0x10, msl #16 +;CHECK: mvni {{v[0-9]+}}.4s, #0x10, msl #16 ret <4 x i32> < i32 18446744073708437504, i32 18446744073708437504, i32 18446744073708437504, i32 18446744073708437504> } define <2 x i64> @movi2d() { -;CHECK: movi {{v[0-31]+}}.2d, #0xff0000ff0000ffff +;CHECK: movi {{v[0-9]+}}.2d, #0xff0000ff0000ffff ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 > } define <1 x i64> @movid() { -;CHECK: movi {{d[0-31]+}}, #0xff0000ff0000ffff +;CHECK: movi {{d[0-9]+}}, #0xff0000ff0000ffff ret <1 x i64> < i64 18374687574888349695 > } define <2 x 
float> @fmov2s() { -;CHECK: fmov {{v[0-31]+}}.2s, #-12.00000000 +;CHECK: fmov {{v[0-9]+}}.2s, #-12.00000000 ret <2 x float> < float -1.2e1, float -1.2e1> } define <4 x float> @fmov4s() { -;CHECK: fmov {{v[0-31]+}}.4s, #-12.00000000 +;CHECK: fmov {{v[0-9]+}}.4s, #-12.00000000 ret <4 x float> < float -1.2e1, float -1.2e1, float -1.2e1, float -1.2e1> } define <2 x double> @fmov2d() { -;CHECK: fmov {{v[0-31]+}}.2d, #-12.00000000 +;CHECK: fmov {{v[0-9]+}}.2d, #-12.00000000 ret <2 x double> < double -1.2e1, double -1.2e1> } @@ -210,7 +210,9 @@ define <2 x i32> @movi1d_1() { declare <2 x i32> @test_movi1d(<2 x i32>, <2 x i32>) define <2 x i32> @movi1d() { -; CHECK: movi d1, #0xffffffff0000 +; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} +; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] +; CHECK-NEXT: movi d1, #0xffffffff0000 %1 = tail call <2 x i32> @test_movi1d(<2 x i32> , <2 x i32> ) ret <2 x i32> %1 } diff --git a/test/CodeGen/AArch64/neon-mul-div.ll b/test/CodeGen/AArch64/neon-mul-div.ll index e1be313..da22ce8 100644 --- a/test/CodeGen/AArch64/neon-mul-div.ll +++ b/test/CodeGen/AArch64/neon-mul-div.ll @@ -2,76 +2,628 @@ define <8 x i8> @mul8xi8(<8 x i8> %A, <8 x i8> %B) { -;CHECK: mul {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b +;CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b %tmp3 = mul <8 x i8> %A, %B; ret <8 x i8> %tmp3 } define <16 x i8> @mul16xi8(<16 x i8> %A, <16 x i8> %B) { -;CHECK: mul {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b +;CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b %tmp3 = mul <16 x i8> %A, %B; ret <16 x i8> %tmp3 } define <4 x i16> @mul4xi16(<4 x i16> %A, <4 x i16> %B) { -;CHECK: mul {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h +;CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h %tmp3 = mul <4 x i16> %A, %B; ret <4 x i16> %tmp3 } define <8 x i16> @mul8xi16(<8 x i16> %A, <8 x i16> %B) { -;CHECK: mul {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h +;CHECK: mul {{v[0-9]+}}.8h, 
{{v[0-9]+}}.8h, {{v[0-9]+}}.8h %tmp3 = mul <8 x i16> %A, %B; ret <8 x i16> %tmp3 } define <2 x i32> @mul2xi32(<2 x i32> %A, <2 x i32> %B) { -;CHECK: mul {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %tmp3 = mul <2 x i32> %A, %B; ret <2 x i32> %tmp3 } define <4 x i32> @mul4x32(<4 x i32> %A, <4 x i32> %B) { -;CHECK: mul {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %tmp3 = mul <4 x i32> %A, %B; ret <4 x i32> %tmp3 } +define <1 x i64> @mul1xi64(<1 x i64> %A, <1 x i64> %B) { +;CHECK-LABEL: mul1xi64: +;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}} + %tmp3 = mul <1 x i64> %A, %B; + ret <1 x i64> %tmp3 +} + +define <2 x i64> @mul2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK-LABEL: mul2xi64: +;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}} +;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}} + %tmp3 = mul <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + define <2 x float> @mul2xfloat(<2 x float> %A, <2 x float> %B) { -;CHECK: fmul {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %tmp3 = fmul <2 x float> %A, %B; ret <2 x float> %tmp3 } define <4 x float> @mul4xfloat(<4 x float> %A, <4 x float> %B) { -;CHECK: fmul {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %tmp3 = fmul <4 x float> %A, %B; ret <4 x float> %tmp3 } define <2 x double> @mul2xdouble(<2 x double> %A, <2 x double> %B) { -;CHECK: fmul {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d +;CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d %tmp3 = fmul <2 x double> %A, %B; ret <2 x double> %tmp3 } define <2 x float> @div2xfloat(<2 x float> %A, <2 x float> %B) { -;CHECK: fdiv {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s +;CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s %tmp3 = fdiv <2 x float> %A, %B; ret <2 x float> %tmp3 } define <4 x float> 
@div4xfloat(<4 x float> %A, <4 x float> %B) { -;CHECK: fdiv {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s +;CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s %tmp3 = fdiv <4 x float> %A, %B; ret <4 x float> %tmp3 } define <2 x double> @div2xdouble(<2 x double> %A, <2 x double> %B) { -;CHECK: fdiv {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d +;CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d %tmp3 = fdiv <2 x double> %A, %B; ret <2 x double> %tmp3 } +define <1 x i8> @sdiv1x8(<1 x i8> %A, <1 x i8> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <1 x i8> %A, %B; + ret <1 x i8> %tmp3 +} + +define <8 x i8> @sdiv8x8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @sdiv16x8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, 
{{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <1 x i16> @sdiv1x16(<1 x i16> %A, <1 x i16> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <1 x i16> %A, %B; + ret <1 x i16> %tmp3 +} + +define <4 x i16> @sdiv4x16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @sdiv8x16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <1 x i32> @sdiv1x32(<1 x i32> %A, <1 x i32> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <1 x i32> %A, %B; + ret <1 x i32> %tmp3 +} + +define <2 x i32> @sdiv2x32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @sdiv4x32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + +define <1 x i64> @sdiv1x64(<1 x i64> %A, <1 x i64> %B) { +;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = sdiv <1 
x i64> %A, %B; + ret <1 x i64> %tmp3 +} + +define <2 x i64> @sdiv2x64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = sdiv <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + +define <1 x i8> @udiv1x8(<1 x i8> %A, <1 x i8> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <1 x i8> %A, %B; + ret <1 x i8> %tmp3 +} + +define <8 x i8> @udiv8x8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @udiv16x8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <1 x i16> @udiv1x16(<1 x i16> %A, <1 x i16> %B) { +;CHECK: udiv {{w[0-9]+}}, 
{{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <1 x i16> %A, %B; + ret <1 x i16> %tmp3 +} + +define <4 x i16> @udiv4x16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @udiv8x16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <1 x i32> @udiv1x32(<1 x i32> %A, <1 x i32> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <1 x i32> %A, %B; + ret <1 x i32> %tmp3 +} + +define <2 x i32> @udiv2x32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @udiv4x32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + +define <1 x i64> @udiv1x64(<1 x i64> %A, <1 x i64> %B) { +;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = udiv <1 x i64> %A, %B; + ret <1 x i64> %tmp3 +} + +define <2 x i64> @udiv2x64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + 
%tmp3 = udiv <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + +define <1 x i8> @srem1x8(<1 x i8> %A, <1 x i8> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <1 x i8> %A, %B; + ret <1 x i8> %tmp3 +} + +define <8 x i8> @srem8x8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @srem16x8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv 
{{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <1 x i16> @srem1x16(<1 x i16> %A, <1 x i16> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <1 x i16> %A, %B; + ret <1 x i16> %tmp3 +} + +define <4 x i16> @srem4x16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv 
{{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @srem8x16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <1 x i32> @srem1x32(<1 x i32> %A, <1 x i32> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <1 x i32> %A, %B; + ret <1 x i32> %tmp3 +} + +define <2 x i32> @srem2x32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @srem4x32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv 
{{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + +define <1 x i64> @srem1x64(<1 x i64> %A, <1 x i64> %B) { +;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = srem <1 x i64> %A, %B; + ret <1 x i64> %tmp3 +} + +define <2 x i64> @srem2x64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = srem <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + +define <1 x i8> @urem1x8(<1 x i8> %A, <1 x i8> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <1 x i8> %A, %B; + ret <1 x i8> %tmp3 +} + +define <8 x i8> @urem8x8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, 
{{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @urem16x8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, 
{{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <1 x i16> @urem1x16(<1 x i16> %A, <1 x i16> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <1 x i16> %A, %B; + ret <1 x i16> %tmp3 +} + +define <4 x i16> @urem4x16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @urem8x16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, 
{{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <1 x i32> @urem1x32(<1 x i32> %A, <1 x i32> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <1 x i32> %A, %B; + ret <1 x i32> %tmp3 +} + +define <2 x i32> @urem2x32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @urem4x32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + +define <1 x i64> @urem1x64(<1 x i64> %A, <1 x i64> %B) { +;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = urem <1 x i64> %A, %B; + ret <1 x i64> %tmp3 +} + +define <2 x i64> @urem2x64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = urem <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + +define <2 x float> @frem2f32(<2 x float> %A, <2 x float> %B) { +; CHECK: bl fmodf +; 
CHECK: bl fmodf + %tmp3 = frem <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @frem4f32(<4 x float> %A, <4 x float> %B) { +; CHECK: bl fmodf +; CHECK: bl fmodf +; CHECK: bl fmodf +; CHECK: bl fmodf + %tmp3 = frem <4 x float> %A, %B; + ret <4 x float> %tmp3 +} + +define <1 x double> @frem1d64(<1 x double> %A, <1 x double> %B) { +; CHECK: bl fmod + %tmp3 = frem <1 x double> %A, %B; + ret <1 x double> %tmp3 +} + +define <2 x double> @frem2d64(<2 x double> %A, <2 x double> %B) { +; CHECK: bl fmod +; CHECK: bl fmod + %tmp3 = frem <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) @@ -179,3 +731,24 @@ define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) { %val = call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs) ret <2 x double> %val } + +define <1 x i8> @test_mul_v1i8(<1 x i8> %a, <1 x i8> %b) { +;CHECK-LABEL: test_mul_v1i8: +;CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %c = mul <1 x i8> %a, %b + ret <1 x i8> %c +} + +define <1 x i16> @test_mul_v1i16(<1 x i16> %a, <1 x i16> %b) { +;CHECK-LABEL: test_mul_v1i16: +;CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %c = mul <1 x i16> %a, %b + ret <1 x i16> %c +} + +define <1 x i32> @test_mul_v1i32(<1 x i32> %a, <1 x i32> %b) { +;CHECK-LABEL: test_mul_v1i32: +;CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %c = mul <1 x i32> %a, %b + ret <1 x i32> %c +} diff --git a/test/CodeGen/AArch64/neon-or-combine.ll b/test/CodeGen/AArch64/neon-or-combine.ll new file mode 100644 index 0000000..260f693 --- /dev/null +++ b/test/CodeGen/AArch64/neon-or-combine.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +; Check that the DAGCombiner does not crash with an assertion failure +; when performing a target specific combine to simplify a 'or' dag node 
+; according to the following rule: +; (or (and B, A), (and C, ~A)) => (VBSL A, B, C) +; The assertion failure was caused by an invalid comparison between APInt +; values with different 'BitWidth'. + +define <8 x i8> @test1(<8 x i8> %a, <8 x i8> %b) { + %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0 > + %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 > + %tmp3 = or <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +; CHECK-LABEL: test1 +; CHECK: ret + +define <16 x i8> @test2(<16 x i8> %a, <16 x i8> %b) { + %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = and <16 x i8> %b, < i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 > + %tmp3 = or <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +; CHECK-LABEL: test2 +; CHECK: ret + diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll index fa4d54d..a0b17e1 100644 --- a/test/CodeGen/AArch64/neon-perm.ll +++ b/test/CodeGen/AArch64/neon-perm.ll @@ -1030,6 +1030,1447 @@ entry: ret <8 x i16> %shuffle.i } +define <8 x i8> @test_same_vuzp1_s8(<8 x i8> %a) { +; CHECK: test_same_vuzp1_s8: +; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vuzp1q_s8(<16 x i8> %a) { +; CHECK: test_same_vuzp1q_s8: +; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vuzp1_s16(<4 x i16> %a) { +; CHECK: test_same_vuzp1_s16: +; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vuzp1q_s16(<8 x 
i16> %a) { +; CHECK: test_same_vuzp1q_s16: +; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_same_vuzp1q_s32(<4 x i32> %a) { +; CHECK: test_same_vuzp1q_s32: +; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_same_vuzp1_u8(<8 x i8> %a) { +; CHECK: test_same_vuzp1_u8: +; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vuzp1q_u8(<16 x i8> %a) { +; CHECK: test_same_vuzp1q_u8: +; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vuzp1_u16(<4 x i16> %a) { +; CHECK: test_same_vuzp1_u16: +; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vuzp1q_u16(<8 x i16> %a) { +; CHECK: test_same_vuzp1q_u16: +; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_same_vuzp1q_u32(<4 x i32> %a) { +; CHECK: test_same_vuzp1q_u32: +; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <4 x float> @test_same_vuzp1q_f32(<4 x float> %a) { +; CHECK: test_same_vuzp1q_f32: +; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> + ret <4 x float> 
%shuffle.i +} + +define <8 x i8> @test_same_vuzp1_p8(<8 x i8> %a) { +; CHECK: test_same_vuzp1_p8: +; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vuzp1q_p8(<16 x i8> %a) { +; CHECK: test_same_vuzp1q_p8: +; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vuzp1_p16(<4 x i16> %a) { +; CHECK: test_same_vuzp1_p16: +; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vuzp1q_p16(<8 x i16> %a) { +; CHECK: test_same_vuzp1q_p16: +; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_same_vuzp2_s8(<8 x i8> %a) { +; CHECK: test_same_vuzp2_s8: +; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vuzp2q_s8(<16 x i8> %a) { +; CHECK: test_same_vuzp2q_s8: +; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vuzp2_s16(<4 x i16> %a) { +; CHECK: test_same_vuzp2_s16: +; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vuzp2q_s16(<8 x i16> %a) { +; CHECK: test_same_vuzp2q_s16: +; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, 
<8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_same_vuzp2q_s32(<4 x i32> %a) { +; CHECK: test_same_vuzp2q_s32: +; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_same_vuzp2_u8(<8 x i8> %a) { +; CHECK: test_same_vuzp2_u8: +; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vuzp2q_u8(<16 x i8> %a) { +; CHECK: test_same_vuzp2q_u8: +; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vuzp2_u16(<4 x i16> %a) { +; CHECK: test_same_vuzp2_u16: +; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vuzp2q_u16(<8 x i16> %a) { +; CHECK: test_same_vuzp2q_u16: +; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_same_vuzp2q_u32(<4 x i32> %a) { +; CHECK: test_same_vuzp2q_u32: +; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <4 x float> @test_same_vuzp2q_f32(<4 x float> %a) { +; CHECK: test_same_vuzp2q_f32: +; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> + ret <4 x float> %shuffle.i +} + +define <8 x i8> @test_same_vuzp2_p8(<8 x i8> %a) { +; CHECK: test_same_vuzp2_p8: +; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b 
+entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vuzp2q_p8(<16 x i8> %a) { +; CHECK: test_same_vuzp2q_p8: +; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vuzp2_p16(<4 x i16> %a) { +; CHECK: test_same_vuzp2_p16: +; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vuzp2q_p16(<8 x i16> %a) { +; CHECK: test_same_vuzp2q_p16: +; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_same_vzip1_s8(<8 x i8> %a) { +; CHECK: test_same_vzip1_s8: +; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vzip1q_s8(<16 x i8> %a) { +; CHECK: test_same_vzip1q_s8: +; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vzip1_s16(<4 x i16> %a) { +; CHECK: test_same_vzip1_s16: +; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vzip1q_s16(<8 x i16> %a) { +; CHECK: test_same_vzip1q_s16: +; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_same_vzip1q_s32(<4 x i32> %a) { +; CHECK: test_same_vzip1q_s32: +; CHECK: zip1 
{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_same_vzip1_u8(<8 x i8> %a) { +; CHECK: test_same_vzip1_u8: +; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vzip1q_u8(<16 x i8> %a) { +; CHECK: test_same_vzip1q_u8: +; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vzip1_u16(<4 x i16> %a) { +; CHECK: test_same_vzip1_u16: +; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vzip1q_u16(<8 x i16> %a) { +; CHECK: test_same_vzip1q_u16: +; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_same_vzip1q_u32(<4 x i32> %a) { +; CHECK: test_same_vzip1q_u32: +; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <4 x float> @test_same_vzip1q_f32(<4 x float> %a) { +; CHECK: test_same_vzip1q_f32: +; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> + ret <4 x float> %shuffle.i +} + +define <8 x i8> @test_same_vzip1_p8(<8 x i8> %a) { +; CHECK: test_same_vzip1_p8: +; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vzip1q_p8(<16 x i8> %a) { 
+; CHECK: test_same_vzip1q_p8: +; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vzip1_p16(<4 x i16> %a) { +; CHECK: test_same_vzip1_p16: +; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vzip1q_p16(<8 x i16> %a) { +; CHECK: test_same_vzip1q_p16: +; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_same_vzip2_s8(<8 x i8> %a) { +; CHECK: test_same_vzip2_s8: +; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vzip2q_s8(<16 x i8> %a) { +; CHECK: test_same_vzip2q_s8: +; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vzip2_s16(<4 x i16> %a) { +; CHECK: test_same_vzip2_s16: +; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vzip2q_s16(<8 x i16> %a) { +; CHECK: test_same_vzip2q_s16: +; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_same_vzip2q_s32(<4 x i32> %a) { +; CHECK: test_same_vzip2q_s32: +; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define 
<8 x i8> @test_same_vzip2_u8(<8 x i8> %a) { +; CHECK: test_same_vzip2_u8: +; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vzip2q_u8(<16 x i8> %a) { +; CHECK: test_same_vzip2q_u8: +; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vzip2_u16(<4 x i16> %a) { +; CHECK: test_same_vzip2_u16: +; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vzip2q_u16(<8 x i16> %a) { +; CHECK: test_same_vzip2q_u16: +; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_same_vzip2q_u32(<4 x i32> %a) { +; CHECK: test_same_vzip2q_u32: +; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <4 x float> @test_same_vzip2q_f32(<4 x float> %a) { +; CHECK: test_same_vzip2q_f32: +; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> + ret <4 x float> %shuffle.i +} + +define <8 x i8> @test_same_vzip2_p8(<8 x i8> %a) { +; CHECK: test_same_vzip2_p8: +; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vzip2q_p8(<16 x i8> %a) { +; CHECK: test_same_vzip2q_p8: +; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, 
<16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vzip2_p16(<4 x i16> %a) { +; CHECK: test_same_vzip2_p16: +; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vzip2q_p16(<8 x i16> %a) { +; CHECK: test_same_vzip2q_p16: +; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_same_vtrn1_s8(<8 x i8> %a) { +; CHECK: test_same_vtrn1_s8: +; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vtrn1q_s8(<16 x i8> %a) { +; CHECK: test_same_vtrn1q_s8: +; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vtrn1_s16(<4 x i16> %a) { +; CHECK: test_same_vtrn1_s16: +; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vtrn1q_s16(<8 x i16> %a) { +; CHECK: test_same_vtrn1q_s16: +; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_same_vtrn1q_s32(<4 x i32> %a) { +; CHECK: test_same_vtrn1q_s32: +; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_same_vtrn1_u8(<8 x i8> %a) { +; CHECK: test_same_vtrn1_u8: +; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = 
shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vtrn1q_u8(<16 x i8> %a) { +; CHECK: test_same_vtrn1q_u8: +; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vtrn1_u16(<4 x i16> %a) { +; CHECK: test_same_vtrn1_u16: +; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vtrn1q_u16(<8 x i16> %a) { +; CHECK: test_same_vtrn1q_u16: +; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_same_vtrn1q_u32(<4 x i32> %a) { +; CHECK: test_same_vtrn1q_u32: +; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <4 x float> @test_same_vtrn1q_f32(<4 x float> %a) { +; CHECK: test_same_vtrn1q_f32: +; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> + ret <4 x float> %shuffle.i +} + +define <8 x i8> @test_same_vtrn1_p8(<8 x i8> %a) { +; CHECK: test_same_vtrn1_p8: +; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vtrn1q_p8(<16 x i8> %a) { +; CHECK: test_same_vtrn1q_p8: +; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vtrn1_p16(<4 x i16> %a) { +; CHECK: test_same_vtrn1_p16: +; CHECK: trn1 {{v[0-9]+}}.4h, 
{{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vtrn1q_p16(<8 x i16> %a) { +; CHECK: test_same_vtrn1q_p16: +; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_same_vtrn2_s8(<8 x i8> %a) { +; CHECK: test_same_vtrn2_s8: +; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vtrn2q_s8(<16 x i8> %a) { +; CHECK: test_same_vtrn2q_s8: +; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vtrn2_s16(<4 x i16> %a) { +; CHECK: test_same_vtrn2_s16: +; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vtrn2q_s16(<8 x i16> %a) { +; CHECK: test_same_vtrn2q_s16: +; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_same_vtrn2q_s32(<4 x i32> %a) { +; CHECK: test_same_vtrn2q_s32: +; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_same_vtrn2_u8(<8 x i8> %a) { +; CHECK: test_same_vtrn2_u8: +; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vtrn2q_u8(<16 x i8> %a) { +; CHECK: 
test_same_vtrn2q_u8: +; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vtrn2_u16(<4 x i16> %a) { +; CHECK: test_same_vtrn2_u16: +; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_same_vtrn2q_u16(<8 x i16> %a) { +; CHECK: test_same_vtrn2q_u16: +; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_same_vtrn2q_u32(<4 x i32> %a) { +; CHECK: test_same_vtrn2q_u32: +; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <4 x float> @test_same_vtrn2q_f32(<4 x float> %a) { +; CHECK: test_same_vtrn2q_f32: +; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> + ret <4 x float> %shuffle.i +} + +define <8 x i8> @test_same_vtrn2_p8(<8 x i8> %a) { +; CHECK: test_same_vtrn2_p8: +; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_same_vtrn2q_p8(<16 x i8> %a) { +; CHECK: test_same_vtrn2q_p8: +; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_same_vtrn2_p16(<4 x i16> %a) { +; CHECK: test_same_vtrn2_p16: +; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define 
<8 x i16> @test_same_vtrn2q_p16(<8 x i16> %a) { +; CHECK: test_same_vtrn2q_p16: +; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle.i +} + + +define <8 x i8> @test_undef_vuzp1_s8(<8 x i8> %a) { +; CHECK: test_undef_vuzp1_s8: +; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vuzp1q_s8(<16 x i8> %a) { +; CHECK: test_undef_vuzp1q_s8: +; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vuzp1_s16(<4 x i16> %a) { +; CHECK: test_undef_vuzp1_s16: +; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vuzp1q_s16(<8 x i16> %a) { +; CHECK: test_undef_vuzp1q_s16: +; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_undef_vuzp1q_s32(<4 x i32> %a) { +; CHECK: test_undef_vuzp1q_s32: +; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_undef_vuzp1_u8(<8 x i8> %a) { +; CHECK: test_undef_vuzp1_u8: +; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vuzp1q_u8(<16 x i8> %a) { +; CHECK: test_undef_vuzp1q_u8: +; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 
x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vuzp1_u16(<4 x i16> %a) { +; CHECK: test_undef_vuzp1_u16: +; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vuzp1q_u16(<8 x i16> %a) { +; CHECK: test_undef_vuzp1q_u16: +; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_undef_vuzp1q_u32(<4 x i32> %a) { +; CHECK: test_undef_vuzp1q_u32: +; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <4 x float> @test_undef_vuzp1q_f32(<4 x float> %a) { +; CHECK: test_undef_vuzp1q_f32: +; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + ret <4 x float> %shuffle.i +} + +define <8 x i8> @test_undef_vuzp1_p8(<8 x i8> %a) { +; CHECK: test_undef_vuzp1_p8: +; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vuzp1q_p8(<16 x i8> %a) { +; CHECK: test_undef_vuzp1q_p8: +; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vuzp1_p16(<4 x i16> %a) { +; CHECK: test_undef_vuzp1_p16: +; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vuzp1q_p16(<8 x i16> %a) { +; CHECK: test_undef_vuzp1q_p16: +; 
CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_undef_vuzp2_s8(<8 x i8> %a) { +; CHECK: test_undef_vuzp2_s8: +; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vuzp2q_s8(<16 x i8> %a) { +; CHECK: test_undef_vuzp2q_s8: +; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vuzp2_s16(<4 x i16> %a) { +; CHECK: test_undef_vuzp2_s16: +; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vuzp2q_s16(<8 x i16> %a) { +; CHECK: test_undef_vuzp2q_s16: +; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_undef_vuzp2q_s32(<4 x i32> %a) { +; CHECK: test_undef_vuzp2q_s32: +; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_undef_vuzp2_u8(<8 x i8> %a) { +; CHECK: test_undef_vuzp2_u8: +; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vuzp2q_u8(<16 x i8> %a) { +; CHECK: test_undef_vuzp2q_u8: +; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x 
i16> @test_undef_vuzp2_u16(<4 x i16> %a) { +; CHECK: test_undef_vuzp2_u16: +; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vuzp2q_u16(<8 x i16> %a) { +; CHECK: test_undef_vuzp2q_u16: +; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_undef_vuzp2q_u32(<4 x i32> %a) { +; CHECK: test_undef_vuzp2q_u32: +; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <4 x float> @test_undef_vuzp2q_f32(<4 x float> %a) { +; CHECK: test_undef_vuzp2q_f32: +; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + ret <4 x float> %shuffle.i +} + +define <8 x i8> @test_undef_vuzp2_p8(<8 x i8> %a) { +; CHECK: test_undef_vuzp2_p8: +; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vuzp2q_p8(<16 x i8> %a) { +; CHECK: test_undef_vuzp2q_p8: +; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vuzp2_p16(<4 x i16> %a) { +; CHECK: test_undef_vuzp2_p16: +; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vuzp2q_p16(<8 x i16> %a) { +; CHECK: test_undef_vuzp2q_p16: +; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = 
shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_undef_vzip1_s8(<8 x i8> %a) { +; CHECK: test_undef_vzip1_s8: +; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vzip1q_s8(<16 x i8> %a) { +; CHECK: test_undef_vzip1q_s8: +; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vzip1_s16(<4 x i16> %a) { +; CHECK: test_undef_vzip1_s16: +; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vzip1q_s16(<8 x i16> %a) { +; CHECK: test_undef_vzip1q_s16: +; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_undef_vzip1q_s32(<4 x i32> %a) { +; CHECK: test_undef_vzip1q_s32: +; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_undef_vzip1_u8(<8 x i8> %a) { +; CHECK: test_undef_vzip1_u8: +; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vzip1q_u8(<16 x i8> %a) { +; CHECK: test_undef_vzip1q_u8: +; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vzip1_u16(<4 x i16> %a) { +; CHECK: test_undef_vzip1_u16: +; 
CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vzip1q_u16(<8 x i16> %a) { +; CHECK: test_undef_vzip1q_u16: +; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_undef_vzip1q_u32(<4 x i32> %a) { +; CHECK: test_undef_vzip1q_u32: +; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <4 x float> @test_undef_vzip1q_f32(<4 x float> %a) { +; CHECK: test_undef_vzip1q_f32: +; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + ret <4 x float> %shuffle.i +} + +define <8 x i8> @test_undef_vzip1_p8(<8 x i8> %a) { +; CHECK: test_undef_vzip1_p8: +; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vzip1q_p8(<16 x i8> %a) { +; CHECK: test_undef_vzip1q_p8: +; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vzip1_p16(<4 x i16> %a) { +; CHECK: test_undef_vzip1_p16: +; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vzip1q_p16(<8 x i16> %a) { +; CHECK: test_undef_vzip1q_p16: +; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i 
+} + +define <8 x i8> @test_undef_vzip2_s8(<8 x i8> %a) { +; CHECK: test_undef_vzip2_s8: +; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vzip2q_s8(<16 x i8> %a) { +; CHECK: test_undef_vzip2q_s8: +; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vzip2_s16(<4 x i16> %a) { +; CHECK: test_undef_vzip2_s16: +; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vzip2q_s16(<8 x i16> %a) { +; CHECK: test_undef_vzip2q_s16: +; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_undef_vzip2q_s32(<4 x i32> %a) { +; CHECK: test_undef_vzip2q_s32: +; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_undef_vzip2_u8(<8 x i8> %a) { +; CHECK: test_undef_vzip2_u8: +; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vzip2q_u8(<16 x i8> %a) { +; CHECK: test_undef_vzip2q_u8: +; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vzip2_u16(<4 x i16> %a) { +; CHECK: test_undef_vzip2_u16: +; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = 
shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vzip2q_u16(<8 x i16> %a) { +; CHECK: test_undef_vzip2q_u16: +; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_undef_vzip2q_u32(<4 x i32> %a) { +; CHECK: test_undef_vzip2q_u32: +; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <4 x float> @test_undef_vzip2q_f32(<4 x float> %a) { +; CHECK: test_undef_vzip2q_f32: +; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + ret <4 x float> %shuffle.i +} + +define <8 x i8> @test_undef_vzip2_p8(<8 x i8> %a) { +; CHECK: test_undef_vzip2_p8: +; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vzip2q_p8(<16 x i8> %a) { +; CHECK: test_undef_vzip2q_p8: +; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vzip2_p16(<4 x i16> %a) { +; CHECK: test_undef_vzip2_p16: +; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vzip2q_p16(<8 x i16> %a) { +; CHECK: test_undef_vzip2q_p16: +; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_undef_vtrn1_s8(<8 x i8> %a) { +; CHECK: 
test_undef_vtrn1_s8: +; CHECK: ret +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vtrn1q_s8(<16 x i8> %a) { +; CHECK: test_undef_vtrn1q_s8: +; CHECK: ret +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vtrn1_s16(<4 x i16> %a) { +; CHECK: test_undef_vtrn1_s16: +; CHECK: ret +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vtrn1q_s16(<8 x i16> %a) { +; CHECK: test_undef_vtrn1q_s16: +; CHECK: ret +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_undef_vtrn1q_s32(<4 x i32> %a) { +; CHECK: test_undef_vtrn1q_s32: +; CHECK: ret +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_undef_vtrn1_u8(<8 x i8> %a) { +; CHECK: test_undef_vtrn1_u8: +; CHECK: ret +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vtrn1q_u8(<16 x i8> %a) { +; CHECK: test_undef_vtrn1q_u8: +; CHECK: ret +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vtrn1_u16(<4 x i16> %a) { +; CHECK: test_undef_vtrn1_u16: +; CHECK: ret +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vtrn1q_u16(<8 x i16> %a) { +; CHECK: test_undef_vtrn1q_u16: +; CHECK: ret +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_undef_vtrn1q_u32(<4 x i32> %a) { +; CHECK: test_undef_vtrn1q_u32: +; CHECK: ret +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x 
i32> undef, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <4 x float> @test_undef_vtrn1q_f32(<4 x float> %a) { +; CHECK: test_undef_vtrn1q_f32: +; CHECK: ret +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + ret <4 x float> %shuffle.i +} + +define <8 x i8> @test_undef_vtrn1_p8(<8 x i8> %a) { +; CHECK: test_undef_vtrn1_p8: +; CHECK: ret +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vtrn1q_p8(<16 x i8> %a) { +; CHECK: test_undef_vtrn1q_p8: +; CHECK: ret +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vtrn1_p16(<4 x i16> %a) { +; CHECK: test_undef_vtrn1_p16: +; CHECK: ret +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vtrn1q_p16(<8 x i16> %a) { +; CHECK: test_undef_vtrn1q_p16: +; CHECK: ret +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_undef_vtrn2_s8(<8 x i8> %a) { +; CHECK: test_undef_vtrn2_s8: +; CHECK: rev16 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vtrn2q_s8(<16 x i8> %a) { +; CHECK: test_undef_vtrn2q_s8: +; CHECK: rev16 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vtrn2_s16(<4 x i16> %a) { +; CHECK: test_undef_vtrn2_s16: +; CHECK: rev32 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vtrn2q_s16(<8 x i16> %a) { +; CHECK: test_undef_vtrn2q_s16: +; CHECK: rev32 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + 
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_undef_vtrn2q_s32(<4 x i32> %a) { +; CHECK: test_undef_vtrn2q_s32: +; CHECK: rev64 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_undef_vtrn2_u8(<8 x i8> %a) { +; CHECK: test_undef_vtrn2_u8: +; CHECK: rev16 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vtrn2q_u8(<16 x i8> %a) { +; CHECK: test_undef_vtrn2q_u8: +; CHECK: rev16 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vtrn2_u16(<4 x i16> %a) { +; CHECK: test_undef_vtrn2_u16: +; CHECK: rev32 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vtrn2q_u16(<8 x i16> %a) { +; CHECK: test_undef_vtrn2q_u16: +; CHECK: rev32 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_undef_vtrn2q_u32(<4 x i32> %a) { +; CHECK: test_undef_vtrn2q_u32: +; CHECK: rev64 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <4 x float> @test_undef_vtrn2q_f32(<4 x float> %a) { +; CHECK: test_undef_vtrn2q_f32: +; CHECK: rev64 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + ret <4 x float> %shuffle.i +} + +define <8 x i8> @test_undef_vtrn2_p8(<8 x i8> %a) { +; CHECK: test_undef_vtrn2_p8: +; CHECK: rev16 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 
x i8> %a, <8 x i8> undef, <8 x i32> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_undef_vtrn2q_p8(<16 x i8> %a) { +; CHECK: test_undef_vtrn2q_p8: +; CHECK: rev16 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_undef_vtrn2_p16(<4 x i16> %a) { +; CHECK: test_undef_vtrn2_p16: +; CHECK: rev32 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_undef_vtrn2q_p16(<8 x i16> %a) { +; CHECK: test_undef_vtrn2q_p16: +; CHECK: rev32 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle.i +} + define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) { ; CHECK: test_vuzp_s8: ; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b diff --git a/test/CodeGen/AArch64/neon-scalar-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-add-sub.ll index 09ca880..4f322e0 100644 --- a/test/CodeGen/AArch64/neon-scalar-add-sub.ll +++ b/test/CodeGen/AArch64/neon-scalar-add-sub.ll @@ -1,13 +1,13 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) { -;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} %tmp3 = add <1 x i64> %A, %B; ret <1 x i64> %tmp3 } define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) { -;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} %tmp3 = sub <1 x i64> %A, %B; ret <1 x i64> %tmp3 } @@ -18,14 +18,14 @@ declare <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64>, <1 x i64>) define <1 x i64> @test_add_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_add_v1i64: %tmp1 = call <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64> %lhs, <1 x i64> %rhs) -; 
CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +; CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } define <1 x i64> @test_uadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_uadd_v1i64: %tmp1 = call <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } @@ -35,14 +35,14 @@ declare <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64>, <1 x i64>) define <1 x i64> @test_sub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_sub_v1i64: %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64> %lhs, <1 x i64> %rhs) -; CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +; CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } define <1 x i64> @test_usub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_usub_v1i64: %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll index 8ce42de..247514c 100644 --- a/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll +++ b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll @@ -5,7 +5,7 @@ declare double @llvm.fma.f64(double, double, double) define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) { ; CHECK: test_fmla_ss4S - ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3] + ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a) ret float %tmp2 @@ -13,7 +13,7 @@ define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) { define float @test_fmla_ss4S_swap(float %a, float %b, <4 x float> %v) { ; CHECK: test_fmla_ss4S_swap - ; CHECK: fmla {{s[0-9]+}}, 
{{s[0-9]+}}, {{v[0-31]+}}.s[3] + ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = call float @llvm.fma.f32(float %tmp1, float %a, float %a) ret float %tmp2 @@ -21,7 +21,7 @@ define float @test_fmla_ss4S_swap(float %a, float %b, <4 x float> %v) { define float @test_fmla_ss2S(float %a, float %b, <2 x float> %v) { ; CHECK: test_fmla_ss2S - ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[1] + ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] %tmp1 = extractelement <2 x float> %v, i32 1 %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a) ret float %tmp2 @@ -29,7 +29,7 @@ define float @test_fmla_ss2S(float %a, float %b, <2 x float> %v) { define double @test_fmla_ddD(double %a, double %b, <1 x double> %v) { ; CHECK: test_fmla_ddD - ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[0] + ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] %tmp1 = extractelement <1 x double> %v, i32 0 %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a) ret double %tmp2 @@ -37,7 +37,7 @@ define double @test_fmla_ddD(double %a, double %b, <1 x double> %v) { define double @test_fmla_dd2D(double %a, double %b, <2 x double> %v) { ; CHECK: test_fmla_dd2D - ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1] + ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a) ret double %tmp2 @@ -45,7 +45,7 @@ define double @test_fmla_dd2D(double %a, double %b, <2 x double> %v) { define double @test_fmla_dd2D_swap(double %a, double %b, <2 x double> %v) { ; CHECK: test_fmla_dd2D_swap - ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1] + ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a) ret double %tmp2 @@ -53,7 +53,7 @@ define double 
@test_fmla_dd2D_swap(double %a, double %b, <2 x double> %v) { define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) { ; CHECK: test_fmls_ss4S - ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3] + ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = fsub float -0.0, %tmp1 %tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a) @@ -62,7 +62,7 @@ define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) { define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) { ; CHECK: test_fmls_ss4S_swap - ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3] + ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = fsub float -0.0, %tmp1 %tmp3 = call float @llvm.fma.f32(float %tmp1, float %tmp2, float %a) @@ -72,7 +72,7 @@ define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) { define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) { ; CHECK: test_fmls_ss2S - ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[1] + ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] %tmp1 = extractelement <2 x float> %v, i32 1 %tmp2 = fsub float -0.0, %tmp1 %tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a) @@ -81,7 +81,7 @@ define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) { define double @test_fmls_ddD(double %a, double %b, <1 x double> %v) { ; CHECK: test_fmls_ddD - ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[0] + ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] %tmp1 = extractelement <1 x double> %v, i32 0 %tmp2 = fsub double -0.0, %tmp1 %tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a) @@ -90,7 +90,7 @@ define double @test_fmls_ddD(double %a, double %b, <1 x double> %v) { define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) { ; CHECK: test_fmls_dd2D - ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, 
{{v[0-31]+}}.d[1] + ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = fsub double -0.0, %tmp1 %tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a) @@ -99,7 +99,7 @@ define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) { define double @test_fmls_dd2D_swap(double %a, double %b, <2 x double> %v) { ; CHECK: test_fmls_dd2D_swap - ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1] + ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = fsub double -0.0, %tmp1 %tmp3 = call double @llvm.fma.f64(double %tmp1, double %tmp2, double %a) diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll index 968ad3e..c9128e7 100644 --- a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll +++ b/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll @@ -2,7 +2,7 @@ define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) { ; CHECK: test_fmul_lane_ss2S - ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1] + ; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] %tmp1 = extractelement <2 x float> %v, i32 1 %tmp2 = fmul float %a, %tmp1; ret float %tmp2; @@ -10,7 +10,7 @@ define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) { define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) { ; CHECK: test_fmul_lane_ss2S_swap - ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1] + ; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] %tmp1 = extractelement <2 x float> %v, i32 1 %tmp2 = fmul float %tmp1, %a; ret float %tmp2; @@ -19,7 +19,7 @@ define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) { define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) { ; CHECK: test_fmul_lane_ss4S - ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3] + ; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 
%tmp2 = fmul float %a, %tmp1; ret float %tmp2; @@ -27,7 +27,7 @@ define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) { define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) { ; CHECK: test_fmul_lane_ss4S_swap - ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3] + ; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = fmul float %tmp1, %a; ret float %tmp2; @@ -36,7 +36,7 @@ define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) { define double @test_fmul_lane_ddD(double %a, <1 x double> %v) { ; CHECK: test_fmul_lane_ddD - ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0] + ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] %tmp1 = extractelement <1 x double> %v, i32 0 %tmp2 = fmul double %a, %tmp1; ret double %tmp2; @@ -46,7 +46,7 @@ define double @test_fmul_lane_ddD(double %a, <1 x double> %v) { define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) { ; CHECK: test_fmul_lane_dd2D - ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1] + ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = fmul double %a, %tmp1; ret double %tmp2; @@ -55,7 +55,7 @@ define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) { define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) { ; CHECK: test_fmul_lane_dd2D_swap - ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1] + ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = fmul double %tmp1, %a; ret double %tmp2; @@ -65,7 +65,7 @@ declare float @llvm.aarch64.neon.vmulx.f32(float, float) define float @test_fmulx_lane_f32(float %a, <2 x float> %v) { ; CHECK: test_fmulx_lane_f32 - ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1] + ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] %tmp1 = extractelement <2 x float> %v, i32 1 %tmp2 = call float 
@llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1) ret float %tmp2; @@ -73,7 +73,7 @@ define float @test_fmulx_lane_f32(float %a, <2 x float> %v) { define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) { ; CHECK: test_fmulx_laneq_f32 - ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3] + ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1) ret float %tmp2; @@ -81,7 +81,7 @@ define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) { define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) { ; CHECK: test_fmulx_laneq_f32_swap - ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3] + ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %tmp1, float %a) ret float %tmp2; @@ -91,7 +91,7 @@ declare double @llvm.aarch64.neon.vmulx.f64(double, double) define double @test_fmulx_lane_f64(double %a, <1 x double> %v) { ; CHECK: test_fmulx_lane_f64 - ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0] + ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] %tmp1 = extractelement <1 x double> %v, i32 0 %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1) ret double %tmp2; @@ -99,7 +99,7 @@ define double @test_fmulx_lane_f64(double %a, <1 x double> %v) { define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) { ; CHECK: test_fmulx_laneq_f64_0 - ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0] + ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] %tmp1 = extractelement <2 x double> %v, i32 0 %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1) ret double %tmp2; @@ -108,7 +108,7 @@ define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) { define double @test_fmulx_laneq_f64_1(double %a, <2 x double> %v) { ; CHECK: 
test_fmulx_laneq_f64_1 - ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1] + ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1) ret double %tmp2; @@ -116,7 +116,7 @@ define double @test_fmulx_laneq_f64_1(double %a, <2 x double> %v) { define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) { ; CHECK: test_fmulx_laneq_f64_1_swap - ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1] + ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %tmp1, double %a) ret double %tmp2; diff --git a/test/CodeGen/AArch64/neon-scalar-compare.ll b/test/CodeGen/AArch64/neon-scalar-compare.ll index 5f10cbb..e1f3964 100644 --- a/test/CodeGen/AArch64/neon-scalar-compare.ll +++ b/test/CodeGen/AArch64/neon-scalar-compare.ll @@ -122,28 +122,28 @@ entry: define <1 x i64> @test_vcage_f64(<1 x double> %a, <1 x double> %b) #0 { ; CHECK: test_vcage_f64 ; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %vcage2.i = tail call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %a, <1 x double> %b) #2 + %vcage2.i = tail call <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #2 ret <1 x i64> %vcage2.i } define <1 x i64> @test_vcagt_f64(<1 x double> %a, <1 x double> %b) #0 { ; CHECK: test_vcagt_f64 ; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %vcagt2.i = tail call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %a, <1 x double> %b) #2 + %vcagt2.i = tail call <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #2 ret <1 x i64> %vcagt2.i } define <1 x i64> @test_vcale_f64(<1 x double> %a, <1 x double> %b) #0 { ; CHECK: test_vcale_f64 ; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %vcage2.i = tail call <1 x i64> 
@llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %b, <1 x double> %a) #2 + %vcage2.i = tail call <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #2 ret <1 x i64> %vcage2.i } define <1 x i64> @test_vcalt_f64(<1 x double> %a, <1 x double> %b) #0 { ; CHECK: test_vcalt_f64 ; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %vcagt2.i = tail call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %b, <1 x double> %a) #2 + %vcagt2.i = tail call <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #2 ret <1 x i64> %vcagt2.i } @@ -271,7 +271,7 @@ define <1 x i64> @test_vceqz_s64(<1 x i64> %a) #0 { ; CHECK: test_vceqz_s64 ; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0 %1 = icmp eq <1 x i64> %a, zeroinitializer - %vceqz.i = zext <1 x i1> %1 to <1 x i64> + %vceqz.i = sext <1 x i1> %1 to <1 x i64> ret <1 x i64> %vceqz.i } @@ -279,7 +279,7 @@ define <1 x i64> @test_vceqz_u64(<1 x i64> %a) #0 { ; CHECK: test_vceqz_u64 ; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0 %1 = icmp eq <1 x i64> %a, zeroinitializer - %vceqz.i = zext <1 x i1> %1 to <1 x i64> + %vceqz.i = sext <1 x i1> %1 to <1 x i64> ret <1 x i64> %vceqz.i } @@ -287,7 +287,7 @@ define <1 x i64> @test_vceqz_p64(<1 x i64> %a) #0 { ; CHECK: test_vceqz_p64 ; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0 %1 = icmp eq <1 x i64> %a, zeroinitializer - %vceqz.i = zext <1 x i1> %1 to <1 x i64> + %vceqz.i = sext <1 x i1> %1 to <1 x i64> ret <1 x i64> %vceqz.i } @@ -295,7 +295,7 @@ define <2 x i64> @test_vceqzq_p64(<2 x i64> %a) #0 { ; CHECK: test_vceqzq_p64 ; CHECK: cmeq {{v[0-9]}}.2d, {{v[0-9]}}.2d, #0 %1 = icmp eq <2 x i64> %a, zeroinitializer - %vceqz.i = zext <2 x i1> %1 to <2 x i64> + %vceqz.i = sext <2 x i1> %1 to <2 x i64> ret <2 x i64> %vceqz.i } @@ -303,7 +303,7 @@ define <1 x i64> @test_vcgez_s64(<1 x i64> %a) #0 { ; CHECK: test_vcgez_s64 ; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0 %1 = icmp sge <1 x i64> %a, zeroinitializer - %vcgez.i = zext <1 x i1> %1 to <1 x 
i64> + %vcgez.i = sext <1 x i1> %1 to <1 x i64> ret <1 x i64> %vcgez.i } @@ -311,7 +311,7 @@ define <1 x i64> @test_vclez_s64(<1 x i64> %a) #0 { ; CHECK: test_vclez_s64 ; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0 %1 = icmp sle <1 x i64> %a, zeroinitializer - %vclez.i = zext <1 x i1> %1 to <1 x i64> + %vclez.i = sext <1 x i1> %1 to <1 x i64> ret <1 x i64> %vclez.i } @@ -319,7 +319,7 @@ define <1 x i64> @test_vcgtz_s64(<1 x i64> %a) #0 { ; CHECK: test_vcgtz_s64 ; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0 %1 = icmp sgt <1 x i64> %a, zeroinitializer - %vcgtz.i = zext <1 x i1> %1 to <1 x i64> + %vcgtz.i = sext <1 x i1> %1 to <1 x i64> ret <1 x i64> %vcgtz.i } @@ -327,12 +327,12 @@ define <1 x i64> @test_vcltz_s64(<1 x i64> %a) #0 { ; CHECK: test_vcltz_s64 ; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0 %1 = icmp slt <1 x i64> %a, zeroinitializer - %vcltz.i = zext <1 x i1> %1 to <1 x i64> + %vcltz.i = sext <1 x i1> %1 to <1 x i64> ret <1 x i64> %vcltz.i } -declare <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) -declare <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) +declare <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double>, <1 x double>) +declare <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double>, <1 x double>) declare <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) declare <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) declare <1 x i64> @llvm.aarch64.neon.vchs.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) diff --git a/test/CodeGen/AArch64/neon-scalar-copy.ll b/test/CodeGen/AArch64/neon-scalar-copy.ll index d433ff5..fadd734 100644 --- a/test/CodeGen/AArch64/neon-scalar-copy.ll +++ b/test/CodeGen/AArch64/neon-scalar-copy.ll @@ -2,21 +2,30 @@ define float @test_dup_sv2S(<2 x float> %v) { ;CHECK: test_dup_sv2S - ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[1] + ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1] %tmp1 = extractelement <2 x float> %v, i32 1 ret float 
%tmp1 } +define float @test_dup_sv2S_0(<2 x float> %v) { + ;CHECK-LABEL: test_dup_sv2S_0 + ;CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[0] + ;CHECK: ret + %tmp1 = extractelement <2 x float> %v, i32 0 + ret float %tmp1 +} + define float @test_dup_sv4S(<4 x float> %v) { - ;CHECK: test_dup_sv4S - ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[0] + ;CHECK-LABEL: test_dup_sv4S + ;CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[0] + ;CHECK: ret %tmp1 = extractelement <4 x float> %v, i32 0 ret float %tmp1 } define double @test_dup_dvD(<1 x double> %v) { ;CHECK: test_dup_dvD - ;CHECK-NOT: dup {{d[0-31]+}}, {{v[0-31]+}}.d[0] + ;CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0] ;CHECK: ret %tmp1 = extractelement <1 x double> %v, i32 0 ret double %tmp1 @@ -24,63 +33,71 @@ define double @test_dup_dvD(<1 x double> %v) { define double @test_dup_dv2D(<2 x double> %v) { ;CHECK: test_dup_dv2D - ;CHECK: dup {{d[0-31]+}}, {{v[0-31]+}}.d[1] + ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + ret double %tmp1 +} + +define double @test_dup_dv2D_0(<2 x double> %v) { + ;CHECK: test_dup_dv2D_0 + ;CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0] + ;CHECK: ret %tmp1 = extractelement <2 x double> %v, i32 1 ret double %tmp1 } define <1 x i8> @test_vector_dup_bv16B(<16 x i8> %v1) { ;CHECK: test_vector_dup_bv16B - ;CHECK: dup {{b[0-31]+}}, {{v[0-31]+}}.b[14] + ;CHECK: dup {{b[0-9]+}}, {{v[0-9]+}}.b[14] %shuffle.i = shufflevector <16 x i8> %v1, <16 x i8> undef, <1 x i32> ret <1 x i8> %shuffle.i } define <1 x i8> @test_vector_dup_bv8B(<8 x i8> %v1) { ;CHECK: test_vector_dup_bv8B - ;CHECK: dup {{b[0-31]+}}, {{v[0-31]+}}.b[7] + ;CHECK: dup {{b[0-9]+}}, {{v[0-9]+}}.b[7] %shuffle.i = shufflevector <8 x i8> %v1, <8 x i8> undef, <1 x i32> ret <1 x i8> %shuffle.i } define <1 x i16> @test_vector_dup_hv8H(<8 x i16> %v1) { ;CHECK: test_vector_dup_hv8H - ;CHECK: dup {{h[0-31]+}}, {{v[0-31]+}}.h[7] + ;CHECK: dup {{h[0-9]+}}, {{v[0-9]+}}.h[7] %shuffle.i = shufflevector <8 x i16> %v1, <8 x 
i16> undef, <1 x i32> ret <1 x i16> %shuffle.i } define <1 x i16> @test_vector_dup_hv4H(<4 x i16> %v1) { ;CHECK: test_vector_dup_hv4H - ;CHECK: dup {{h[0-31]+}}, {{v[0-31]+}}.h[3] + ;CHECK: dup {{h[0-9]+}}, {{v[0-9]+}}.h[3] %shuffle.i = shufflevector <4 x i16> %v1, <4 x i16> undef, <1 x i32> ret <1 x i16> %shuffle.i } define <1 x i32> @test_vector_dup_sv4S(<4 x i32> %v1) { ;CHECK: test_vector_dup_sv4S - ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[3] + ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[3] %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <1 x i32> ret <1 x i32> %shuffle } define <1 x i32> @test_vector_dup_sv2S(<2 x i32> %v1) { ;CHECK: test_vector_dup_sv2S - ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[1] + ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1] %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <1 x i32> ret <1 x i32> %shuffle } define <1 x i64> @test_vector_dup_dv2D(<2 x i64> %v1) { ;CHECK: test_vector_dup_dv2D - ;CHECK: dup {{d[0-31]+}}, {{v[0-31]+}}.d[1] + ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1] %shuffle.i = shufflevector <2 x i64> %v1, <2 x i64> undef, <1 x i32> ret <1 x i64> %shuffle.i } define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) { ;CHECK: test_vector_copy_dup_dv2D - ;CHECK: dup {{d[0-31]+}}, {{v[0-31]+}}.d[1] + ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1] %vget_lane = extractelement <2 x i64> %c, i32 1 %vset_lane = insertelement <1 x i64> undef, i64 %vget_lane, i32 0 ret <1 x i64> %vset_lane diff --git a/test/CodeGen/AArch64/neon-scalar-cvt.ll b/test/CodeGen/AArch64/neon-scalar-cvt.ll index a06d5d6..3a19bed 100644 --- a/test/CodeGen/AArch64/neon-scalar-cvt.ll +++ b/test/CodeGen/AArch64/neon-scalar-cvt.ll @@ -5,133 +5,129 @@ define float @test_vcvts_f32_s32(i32 %a) { ; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}} entry: %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %0 = call float @llvm.aarch64.neon.vcvtf32.s32(<1 x i32> %vcvtf.i) + %0 = call float @llvm.aarch64.neon.vcvtint2fps.f32.v1i32(<1 x i32> %vcvtf.i) ret 
float %0 } -declare float @llvm.aarch64.neon.vcvtf32.s32(<1 x i32>) +declare float @llvm.aarch64.neon.vcvtint2fps.f32.v1i32(<1 x i32>) define double @test_vcvtd_f64_s64(i64 %a) { ; CHECK: test_vcvtd_f64_s64 ; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}} entry: %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %0 = call double @llvm.aarch64.neon.vcvtf64.s64(<1 x i64> %vcvtf.i) + %0 = call double @llvm.aarch64.neon.vcvtint2fps.f64.v1i64(<1 x i64> %vcvtf.i) ret double %0 } -declare double @llvm.aarch64.neon.vcvtf64.s64(<1 x i64>) +declare double @llvm.aarch64.neon.vcvtint2fps.f64.v1i64(<1 x i64>) define float @test_vcvts_f32_u32(i32 %a) { ; CHECK: test_vcvts_f32_u32 ; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}} entry: %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %0 = call float @llvm.aarch64.neon.vcvtf32.u32(<1 x i32> %vcvtf.i) + %0 = call float @llvm.aarch64.neon.vcvtint2fpu.f32.v1i32(<1 x i32> %vcvtf.i) ret float %0 } -declare float @llvm.aarch64.neon.vcvtf32.u32(<1 x i32>) +declare float @llvm.aarch64.neon.vcvtint2fpu.f32.v1i32(<1 x i32>) define double @test_vcvtd_f64_u64(i64 %a) { ; CHECK: test_vcvtd_f64_u64 ; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}} entry: %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %0 = call double @llvm.aarch64.neon.vcvtf64.u64(<1 x i64> %vcvtf.i) + %0 = call double @llvm.aarch64.neon.vcvtint2fpu.f64.v1i64(<1 x i64> %vcvtf.i) ret double %0 } -declare double @llvm.aarch64.neon.vcvtf64.u64(<1 x i64>) +declare double @llvm.aarch64.neon.vcvtint2fpu.f64.v1i64(<1 x i64>) define float @test_vcvts_n_f32_s32(i32 %a) { ; CHECK: test_vcvts_n_f32_s32 ; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #1 entry: %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0 - %0 = call float @llvm.aarch64.neon.vcvtf32.n.s32(<1 x i32> %vcvtf, i32 1) + %0 = call float @llvm.aarch64.neon.vcvtfxs2fp.n.f32.v1i32(<1 x i32> %vcvtf, i32 1) ret float %0 } -declare float @llvm.aarch64.neon.vcvtf32.n.s32(<1 x i32>, i32) +declare float 
@llvm.aarch64.neon.vcvtfxs2fp.n.f32.v1i32(<1 x i32>, i32) define double @test_vcvtd_n_f64_s64(i64 %a) { ; CHECK: test_vcvtd_n_f64_s64 ; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #1 entry: %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0 - %0 = call double @llvm.aarch64.neon.vcvtf64.n.s64(<1 x i64> %vcvtf, i32 1) + %0 = call double @llvm.aarch64.neon.vcvtfxs2fp.n.f64.v1i64(<1 x i64> %vcvtf, i32 1) ret double %0 } -declare double @llvm.aarch64.neon.vcvtf64.n.s64(<1 x i64>, i32) +declare double @llvm.aarch64.neon.vcvtfxs2fp.n.f64.v1i64(<1 x i64>, i32) define float @test_vcvts_n_f32_u32(i32 %a) { ; CHECK: test_vcvts_n_f32_u32 ; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #1 entry: %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0 - %0 = call float @llvm.aarch64.neon.vcvtf32.n.u32(<1 x i32> %vcvtf, i32 1) + %0 = call float @llvm.aarch64.neon.vcvtfxu2fp.n.f32.v1i32(<1 x i32> %vcvtf, i32 1) ret float %0 } -declare float @llvm.aarch64.neon.vcvtf32.n.u32(<1 x i32>, i32) +declare float @llvm.aarch64.neon.vcvtfxu2fp.n.f32.v1i32(<1 x i32>, i32) define double @test_vcvtd_n_f64_u64(i64 %a) { ; CHECK: test_vcvtd_n_f64_u64 ; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #1 entry: %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0 - %0 = call double @llvm.aarch64.neon.vcvtf64.n.u64(<1 x i64> %vcvtf, i32 1) + %0 = call double @llvm.aarch64.neon.vcvtfxu2fp.n.f64.v1i64(<1 x i64> %vcvtf, i32 1) ret double %0 } -declare double @llvm.aarch64.neon.vcvtf64.n.u64(<1 x i64>, i32) +declare double @llvm.aarch64.neon.vcvtfxu2fp.n.f64.v1i64(<1 x i64>, i32) define i32 @test_vcvts_n_s32_f32(float %a) { ; CHECK: test_vcvts_n_s32_f32 ; CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #1 entry: - %fcvtzs = insertelement <1 x float> undef, float %a, i32 0 - %fcvtzs1 = call <1 x i32> @llvm.aarch64.neon.vcvts.n.s32.f32(<1 x float> %fcvtzs, i32 1) + %fcvtzs1 = call <1 x i32> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i32.f32(float %a, i32 1) %0 = extractelement <1 x i32> %fcvtzs1, i32 0 ret i32 %0 } -declare <1 x i32> 
@llvm.aarch64.neon.vcvts.n.s32.f32(<1 x float>, i32) +declare <1 x i32> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i32.f32(float, i32) define i64 @test_vcvtd_n_s64_f64(double %a) { ; CHECK: test_vcvtd_n_s64_f64 ; CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #1 entry: - %fcvtzs = insertelement <1 x double> undef, double %a, i32 0 - %fcvtzs1 = call <1 x i64> @llvm.aarch64.neon.vcvtd.n.s64.f64(<1 x double> %fcvtzs, i32 1) + %fcvtzs1 = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i64.f64(double %a, i32 1) %0 = extractelement <1 x i64> %fcvtzs1, i32 0 ret i64 %0 } -declare <1 x i64> @llvm.aarch64.neon.vcvtd.n.s64.f64(<1 x double>, i32) +declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i64.f64(double, i32) define i32 @test_vcvts_n_u32_f32(float %a) { ; CHECK: test_vcvts_n_u32_f32 ; CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #32 entry: - %fcvtzu = insertelement <1 x float> undef, float %a, i32 0 - %fcvtzu1 = call <1 x i32> @llvm.aarch64.neon.vcvts.n.u32.f32(<1 x float> %fcvtzu, i32 32) + %fcvtzu1 = call <1 x i32> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i32.f32(float %a, i32 32) %0 = extractelement <1 x i32> %fcvtzu1, i32 0 ret i32 %0 } -declare <1 x i32> @llvm.aarch64.neon.vcvts.n.u32.f32(<1 x float>, i32) +declare <1 x i32> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i32.f32(float, i32) define i64 @test_vcvtd_n_u64_f64(double %a) { ; CHECK: test_vcvtd_n_u64_f64 ; CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #64 entry: - %fcvtzu = insertelement <1 x double> undef, double %a, i32 0 - %fcvtzu1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtd.n.u64.f64(<1 x double> %fcvtzu, i32 64) + %fcvtzu1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i64.f64(double %a, i32 64) %0 = extractelement <1 x i64> %fcvtzu1, i32 0 ret i64 %0 } -declare <1 x i64> @llvm.aarch64.neon.vcvtd.n.u64.f64(<1 x double>, i32) +declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i64.f64(double, i32) diff --git a/test/CodeGen/AArch64/neon-scalar-ext.ll b/test/CodeGen/AArch64/neon-scalar-ext.ll new file mode 100644 index 
0000000..51dea06 --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-ext.ll @@ -0,0 +1,113 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +define <1 x i64> @test_zext_v1i32_v1i64(<2 x i32> %v) nounwind readnone { +; CHECK-LABEL: test_zext_v1i32_v1i64: +; CHECK: ushll v0.2d, v0.2s, #0 + %1 = extractelement <2 x i32> %v, i32 0 + %2 = insertelement <1 x i32> undef, i32 %1, i32 0 + %3 = zext <1 x i32> %2 to <1 x i64> + ret <1 x i64> %3 +} + +define <1 x i32> @test_zext_v1i16_v1i32(<4 x i16> %v) nounwind readnone { +; CHECK-LABEL: test_zext_v1i16_v1i32: +; CHECK: ushll v0.4s, v0.4h, #0 + %1 = extractelement <4 x i16> %v, i32 0 + %2 = insertelement <1 x i16> undef, i16 %1, i32 0 + %3 = zext <1 x i16> %2 to <1 x i32> + ret <1 x i32> %3 +} + +define <1 x i16> @test_zext_v1i8_v1i16(<8 x i8> %v) nounwind readnone { +; CHECK-LABEL: test_zext_v1i8_v1i16: +; CHECK: ushll v0.8h, v0.8b, #0 + %1 = extractelement <8 x i8> %v, i32 0 + %2 = insertelement <1 x i8> undef, i8 %1, i32 0 + %3 = zext <1 x i8> %2 to <1 x i16> + ret <1 x i16> %3 +} + +define <1 x i32> @test_zext_v1i8_v1i32(<8 x i8> %v) nounwind readnone { +; CHECK-LABEL: test_zext_v1i8_v1i32: +; CHECK: dup b0, v0.b[0] + %1 = extractelement <8 x i8> %v, i32 0 + %2 = insertelement <1 x i8> undef, i8 %1, i32 0 + %3 = zext <1 x i8> %2 to <1 x i32> + ret <1 x i32> %3 +} + +define <1 x i64> @test_zext_v1i16_v1i64(<4 x i16> %v) nounwind readnone { +; CHECK-LABEL: test_zext_v1i16_v1i64: +; CHECK: dup h0, v0.h[0] + %1 = extractelement <4 x i16> %v, i32 0 + %2 = insertelement <1 x i16> undef, i16 %1, i32 0 + %3 = zext <1 x i16> %2 to <1 x i64> + ret <1 x i64> %3 +} + +define <1 x i64> @test_zext_v1i8_v1i64(<8 x i8> %v) nounwind readnone { +; CHECK-LABEL: test_zext_v1i8_v1i64: +; CHECK: dup b0, v0.b[0] + %1 = extractelement <8 x i8> %v, i32 0 + %2 = insertelement <1 x i8> undef, i8 %1, i32 0 + %3 = zext <1 x i8> %2 to <1 x i64> + ret <1 x i64> %3 +} + +define <1 x i64> @test_sext_v1i32_v1i64(<2 x 
i32> %v) nounwind readnone { +; CHECK-LABEL: test_sext_v1i32_v1i64: +; CHECK: sshll v0.2d, v0.2s, #0 + %1 = extractelement <2 x i32> %v, i32 0 + %2 = insertelement <1 x i32> undef, i32 %1, i32 0 + %3 = sext <1 x i32> %2 to <1 x i64> + ret <1 x i64> %3 +} + +define <1 x i32> @test_sext_v1i16_v1i32(<4 x i16> %v) nounwind readnone { +; CHECK-LABEL: test_sext_v1i16_v1i32: +; CHECK: sshll v0.4s, v0.4h, #0 + %1 = extractelement <4 x i16> %v, i32 0 + %2 = insertelement <1 x i16> undef, i16 %1, i32 0 + %3 = sext <1 x i16> %2 to <1 x i32> + ret <1 x i32> %3 +} + +define <1 x i16> @test_sext_v1i8_v1i16(<8 x i8> %v) nounwind readnone { +; CHECK-LABEL: test_sext_v1i8_v1i16: +; CHECK: sshll v0.8h, v0.8b, #0 + %1 = extractelement <8 x i8> %v, i32 0 + %2 = insertelement <1 x i8> undef, i8 %1, i32 0 + %3 = sext <1 x i8> %2 to <1 x i16> + ret <1 x i16> %3 +} + +define <1 x i32> @test_sext_v1i8_v1i32(<8 x i8> %v) nounwind readnone { +; CHECK-LABEL: test_sext_v1i8_v1i32: +; CHECK: sshll v0.8h, v0.8b, #0 +; CHECK: sshll v0.4s, v0.4h, #0 + %1 = extractelement <8 x i8> %v, i32 0 + %2 = insertelement <1 x i8> undef, i8 %1, i32 0 + %3 = sext <1 x i8> %2 to <1 x i32> + ret <1 x i32> %3 +} + +define <1 x i64> @test_sext_v1i16_v1i64(<4 x i16> %v) nounwind readnone { +; CHECK-LABEL: test_sext_v1i16_v1i64: +; CHECK: sshll v0.4s, v0.4h, #0 +; CHECK: sshll v0.2d, v0.2s, #0 + %1 = extractelement <4 x i16> %v, i32 0 + %2 = insertelement <1 x i16> undef, i16 %1, i32 0 + %3 = sext <1 x i16> %2 to <1 x i64> + ret <1 x i64> %3 +} + +define <1 x i64> @test_sext_v1i8_v1i64(<8 x i8> %v) nounwind readnone { +; CHECK-LABEL: test_sext_v1i8_v1i64: +; CHECK: sshll v0.8h, v0.8b, #0 +; CHECK: sshll v0.4s, v0.4h, #0 +; CHECK: sshll v0.2d, v0.2s, #0 + %1 = extractelement <8 x i8> %v, i32 0 + %2 = insertelement <1 x i8> undef, i8 %1, i32 0 + %3 = sext <1 x i8> %2 to <1 x i64> + ret <1 x i64> %3 +} diff --git a/test/CodeGen/AArch64/neon-scalar-fabd.ll b/test/CodeGen/AArch64/neon-scalar-fabd.ll index 
75686d3..6343310 100644 --- a/test/CodeGen/AArch64/neon-scalar-fabd.ll +++ b/test/CodeGen/AArch64/neon-scalar-fabd.ll @@ -4,10 +4,7 @@ define float @test_vabds_f32(float %a, float %b) { ; CHECK-LABEL: test_vabds_f32 ; CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} entry: - %vabd.i = insertelement <1 x float> undef, float %a, i32 0 - %vabd1.i = insertelement <1 x float> undef, float %b, i32 0 - %vabd2.i = call <1 x float> @llvm.aarch64.neon.vabd.v1f32(<1 x float> %vabd.i, <1 x float> %vabd1.i) - %0 = extractelement <1 x float> %vabd2.i, i32 0 + %0 = call float @llvm.aarch64.neon.vabd.f32(float %a, float %a) ret float %0 } @@ -15,12 +12,9 @@ define double @test_vabdd_f64(double %a, double %b) { ; CHECK-LABEL: test_vabdd_f64 ; CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} entry: - %vabd.i = insertelement <1 x double> undef, double %a, i32 0 - %vabd1.i = insertelement <1 x double> undef, double %b, i32 0 - %vabd2.i = call <1 x double> @llvm.aarch64.neon.vabd.v1f64(<1 x double> %vabd.i, <1 x double> %vabd1.i) - %0 = extractelement <1 x double> %vabd2.i, i32 0 + %0 = call double @llvm.aarch64.neon.vabd.f64(double %a, double %b) ret double %0 } -declare <1 x double> @llvm.aarch64.neon.vabd.v1f64(<1 x double>, <1 x double>) -declare <1 x float> @llvm.aarch64.neon.vabd.v1f32(<1 x float>, <1 x float>) +declare double @llvm.aarch64.neon.vabd.f64(double, double) +declare float @llvm.aarch64.neon.vabd.f32(float, float) diff --git a/test/CodeGen/AArch64/neon-scalar-fcvt.ll b/test/CodeGen/AArch64/neon-scalar-fcvt.ll index d7b84fa..6cf30a7 100644 --- a/test/CodeGen/AArch64/neon-scalar-fcvt.ll +++ b/test/CodeGen/AArch64/neon-scalar-fcvt.ll @@ -6,250 +6,228 @@ define float @test_vcvtxn(double %a) { ; CHECK: test_vcvtxn ; CHECK: fcvtxn {{s[0-9]}}, {{d[0-9]}} entry: - %vcvtf.i = insertelement <1 x double> undef, double %a, i32 0 - %vcvtf1.i = tail call <1 x float> @llvm.aarch64.neon.fcvtxn.v1f32.v1f64(<1 x double> %vcvtf.i) - %0 = extractelement <1 x float> %vcvtf1.i, i32 0 - 
ret float %0 + %vcvtf = call float @llvm.aarch64.neon.fcvtxn(double %a) + ret float %vcvtf } -declare <1 x float> @llvm.aarch64.neon.fcvtxn.v1f32.v1f64(<1 x double>) +declare float @llvm.aarch64.neon.fcvtxn(double) define i32 @test_vcvtass(float %a) { ; CHECK: test_vcvtass ; CHECK: fcvtas {{s[0-9]}}, {{s[0-9]}} entry: - %vcvtas.i = insertelement <1 x float> undef, float %a, i32 0 - %vcvtas1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.v1f32(<1 x float> %vcvtas.i) + %vcvtas1.i = call <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.f32(float %a) %0 = extractelement <1 x i32> %vcvtas1.i, i32 0 ret i32 %0 } -declare <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.v1f32(<1 x float>) +declare <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.f32(float) define i64 @test_test_vcvtasd(double %a) { ; CHECK: test_test_vcvtasd ; CHECK: fcvtas {{d[0-9]}}, {{d[0-9]}} entry: - %vcvtas.i = insertelement <1 x double> undef, double %a, i32 0 - %vcvtas1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %vcvtas.i) + %vcvtas1.i = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.f64(double %a) %0 = extractelement <1 x i64> %vcvtas1.i, i32 0 ret i64 %0 } -declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.f64(double) define i32 @test_vcvtaus(float %a) { ; CHECK: test_vcvtaus ; CHECK: fcvtau {{s[0-9]}}, {{s[0-9]}} entry: - %vcvtau.i = insertelement <1 x float> undef, float %a, i32 0 - %vcvtau1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.v1f32(<1 x float> %vcvtau.i) + %vcvtau1.i = call <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.f32(float %a) %0 = extractelement <1 x i32> %vcvtau1.i, i32 0 ret i32 %0 } -declare <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.v1f32(<1 x float>) +declare <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.f32(float) define i64 @test_vcvtaud(double %a) { ; CHECK: test_vcvtaud ; CHECK: fcvtau {{d[0-9]}}, {{d[0-9]}} entry: - %vcvtau.i = insertelement <1 x double> undef, double %a, i32 
0 - %vcvtau1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %vcvtau.i) + %vcvtau1.i = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.f64(double %a) %0 = extractelement <1 x i64> %vcvtau1.i, i32 0 ret i64 %0 } -declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.f64(double) define i32 @test_vcvtmss(float %a) { ; CHECK: test_vcvtmss ; CHECK: fcvtms {{s[0-9]}}, {{s[0-9]}} entry: - %vcvtms.i = insertelement <1 x float> undef, float %a, i32 0 - %vcvtms1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.v1f32(<1 x float> %vcvtms.i) + %vcvtms1.i = call <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.f32(float %a) %0 = extractelement <1 x i32> %vcvtms1.i, i32 0 ret i32 %0 } -declare <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.v1f32(<1 x float>) +declare <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.f32(float) define i64 @test_vcvtmd_s64_f64(double %a) { ; CHECK: test_vcvtmd_s64_f64 ; CHECK: fcvtms {{d[0-9]}}, {{d[0-9]}} entry: - %vcvtms.i = insertelement <1 x double> undef, double %a, i32 0 - %vcvtms1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %vcvtms.i) + %vcvtms1.i = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.f64(double %a) %0 = extractelement <1 x i64> %vcvtms1.i, i32 0 ret i64 %0 } -declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.f64(double) define i32 @test_vcvtmus(float %a) { ; CHECK: test_vcvtmus ; CHECK: fcvtmu {{s[0-9]}}, {{s[0-9]}} entry: - %vcvtmu.i = insertelement <1 x float> undef, float %a, i32 0 - %vcvtmu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.v1f32(<1 x float> %vcvtmu.i) + %vcvtmu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.f32(float %a) %0 = extractelement <1 x i32> %vcvtmu1.i, i32 0 ret i32 %0 } -declare <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.v1f32(<1 x float>) +declare <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.f32(float) define 
i64 @test_vcvtmud(double %a) { ; CHECK: test_vcvtmud ; CHECK: fcvtmu {{d[0-9]}}, {{d[0-9]}} entry: - %vcvtmu.i = insertelement <1 x double> undef, double %a, i32 0 - %vcvtmu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %vcvtmu.i) + %vcvtmu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.f64(double %a) %0 = extractelement <1 x i64> %vcvtmu1.i, i32 0 ret i64 %0 } -declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.f64(double) define i32 @test_vcvtnss(float %a) { ; CHECK: test_vcvtnss ; CHECK: fcvtns {{s[0-9]}}, {{s[0-9]}} entry: - %vcvtns.i = insertelement <1 x float> undef, float %a, i32 0 - %vcvtns1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.v1f32(<1 x float> %vcvtns.i) + %vcvtns1.i = call <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.f32(float %a) %0 = extractelement <1 x i32> %vcvtns1.i, i32 0 ret i32 %0 } -declare <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.v1f32(<1 x float>) +declare <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.f32(float) define i64 @test_vcvtnd_s64_f64(double %a) { ; CHECK: test_vcvtnd_s64_f64 ; CHECK: fcvtns {{d[0-9]}}, {{d[0-9]}} entry: - %vcvtns.i = insertelement <1 x double> undef, double %a, i32 0 - %vcvtns1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %vcvtns.i) + %vcvtns1.i = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.f64(double %a) %0 = extractelement <1 x i64> %vcvtns1.i, i32 0 ret i64 %0 } -declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.f64(double) define i32 @test_vcvtnus(float %a) { ; CHECK: test_vcvtnus ; CHECK: fcvtnu {{s[0-9]}}, {{s[0-9]}} entry: - %vcvtnu.i = insertelement <1 x float> undef, float %a, i32 0 - %vcvtnu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.v1f32(<1 x float> %vcvtnu.i) + %vcvtnu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.f32(float %a) %0 = extractelement <1 x i32> 
%vcvtnu1.i, i32 0 ret i32 %0 } -declare <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.v1f32(<1 x float>) +declare <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.f32(float) define i64 @test_vcvtnud(double %a) { ; CHECK: test_vcvtnud ; CHECK: fcvtnu {{d[0-9]}}, {{d[0-9]}} entry: - %vcvtnu.i = insertelement <1 x double> undef, double %a, i32 0 - %vcvtnu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %vcvtnu.i) + %vcvtnu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.f64(double %a) %0 = extractelement <1 x i64> %vcvtnu1.i, i32 0 ret i64 %0 } -declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.f64(double) define i32 @test_vcvtpss(float %a) { ; CHECK: test_vcvtpss ; CHECK: fcvtps {{s[0-9]}}, {{s[0-9]}} entry: - %vcvtps.i = insertelement <1 x float> undef, float %a, i32 0 - %vcvtps1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.v1f32(<1 x float> %vcvtps.i) + %vcvtps1.i = call <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.f32(float %a) %0 = extractelement <1 x i32> %vcvtps1.i, i32 0 ret i32 %0 } -declare <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.v1f32(<1 x float>) +declare <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.f32(float) define i64 @test_vcvtpd_s64_f64(double %a) { ; CHECK: test_vcvtpd_s64_f64 ; CHECK: fcvtps {{d[0-9]}}, {{d[0-9]}} entry: - %vcvtps.i = insertelement <1 x double> undef, double %a, i32 0 - %vcvtps1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %vcvtps.i) + %vcvtps1.i = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.f64(double %a) %0 = extractelement <1 x i64> %vcvtps1.i, i32 0 ret i64 %0 } -declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.f64(double) define i32 @test_vcvtpus(float %a) { ; CHECK: test_vcvtpus ; CHECK: fcvtpu {{s[0-9]}}, {{s[0-9]}} entry: - %vcvtpu.i = insertelement <1 x float> undef, float %a, i32 0 - %vcvtpu1.i = tail call <1 x i32> 
@llvm.aarch64.neon.fcvtpu.v1i32.v1f32(<1 x float> %vcvtpu.i) + %vcvtpu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.f32(float %a) %0 = extractelement <1 x i32> %vcvtpu1.i, i32 0 ret i32 %0 } -declare <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.v1f32(<1 x float>) +declare <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.f32(float) define i64 @test_vcvtpud(double %a) { ; CHECK: test_vcvtpud ; CHECK: fcvtpu {{d[0-9]}}, {{d[0-9]}} entry: - %vcvtpu.i = insertelement <1 x double> undef, double %a, i32 0 - %vcvtpu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %vcvtpu.i) + %vcvtpu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.f64(double %a) %0 = extractelement <1 x i64> %vcvtpu1.i, i32 0 ret i64 %0 } -declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.f64(double) define i32 @test_vcvtss(float %a) { ; CHECK: test_vcvtss ; CHECK: fcvtzs {{s[0-9]}}, {{s[0-9]}} entry: - %vcvtzs.i = insertelement <1 x float> undef, float %a, i32 0 - %vcvtzs1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.v1f32(<1 x float> %vcvtzs.i) + %vcvtzs1.i = call <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.f32(float %a) %0 = extractelement <1 x i32> %vcvtzs1.i, i32 0 ret i32 %0 } -declare <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.v1f32(<1 x float>) +declare <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.f32(float) define i64 @test_vcvtd_s64_f64(double %a) { ; CHECK: test_vcvtd_s64_f64 ; CHECK: fcvtzs {{d[0-9]}}, {{d[0-9]}} entry: - %vcvzs.i = insertelement <1 x double> undef, double %a, i32 0 - %vcvzs1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %vcvzs.i) + %vcvzs1.i = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.f64(double %a) %0 = extractelement <1 x i64> %vcvzs1.i, i32 0 ret i64 %0 } -declare <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.f64(double) define i32 @test_vcvtus(float %a) { ; CHECK: 
test_vcvtus ; CHECK: fcvtzu {{s[0-9]}}, {{s[0-9]}} entry: - %vcvtzu.i = insertelement <1 x float> undef, float %a, i32 0 - %vcvtzu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.v1f32(<1 x float> %vcvtzu.i) + %vcvtzu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.f32(float %a) %0 = extractelement <1 x i32> %vcvtzu1.i, i32 0 ret i32 %0 } -declare <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.v1f32(<1 x float>) +declare <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.f32(float) define i64 @test_vcvtud(double %a) { ; CHECK: test_vcvtud ; CHECK: fcvtzu {{d[0-9]}}, {{d[0-9]}} entry: - %vcvtzu.i = insertelement <1 x double> undef, double %a, i32 0 - %vcvtzu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %vcvtzu.i) + %vcvtzu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.f64(double %a) %0 = extractelement <1 x i64> %vcvtzu1.i, i32 0 ret i64 %0 } -declare <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.f64(double) diff --git a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll index a6e5859..e0dce13 100644 --- a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll +++ b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll @@ -3,326 +3,280 @@ ;; Scalar Floating-point Compare define i32 @test_vceqs_f32(float %a, float %b) { -; CHECK: test_vceqs_f32 +; CHECK-LABEL: test_vceqs_f32 ; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} entry: - %vceq.i = insertelement <1 x float> undef, float %a, i32 0 - %vceq1.i = insertelement <1 x float> undef, float %b, i32 0 - %vceq2.i = call <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float> %vceq.i, <1 x float> %vceq1.i) - %0 = extractelement <1 x i32> %vceq2.i, i32 0 + %fceq2.i = call <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float %a, float %b) + %0 = extractelement <1 x i32> %fceq2.i, i32 0 ret i32 %0 } define i64 @test_vceqd_f64(double %a, double %b) { -; CHECK: test_vceqd_f64 +; 
CHECK-LABEL: test_vceqd_f64 ; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} entry: - %vceq.i = insertelement <1 x double> undef, double %a, i32 0 - %vceq1.i = insertelement <1 x double> undef, double %b, i32 0 - %vceq2.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f64(<1 x double> %vceq.i, <1 x double> %vceq1.i) - %0 = extractelement <1 x i64> %vceq2.i, i32 0 + %fceq2.i = call <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f64(double %a, double %b) + %0 = extractelement <1 x i64> %fceq2.i, i32 0 ret i64 %0 } -define <1 x i64> @test_vceqz_f64(<1 x double> %a) #0 { -; CHECK: test_vceqz_f64 +define <1 x i64> @test_vceqz_f64(<1 x double> %a) { +; CHECK-LABEL: test_vceqz_f64 ; CHECK: fcmeq {{d[0-9]+}}, {{d[0-9]+}}, #0.0 entry: %0 = fcmp oeq <1 x double> %a, zeroinitializer - %vceqz.i = zext <1 x i1> %0 to <1 x i64> + %vceqz.i = sext <1 x i1> %0 to <1 x i64> ret <1 x i64> %vceqz.i } define i32 @test_vceqzs_f32(float %a) { -; CHECK: test_vceqzs_f32 +; CHECK-LABEL: test_vceqzs_f32 ; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, #0.0 entry: - %vceq.i = insertelement <1 x float> undef, float %a, i32 0 - %vceq1.i = call <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float> %vceq.i, <1 x float> zeroinitializer) - %0 = extractelement <1 x i32> %vceq1.i, i32 0 + %fceq1.i = call <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float %a, float 0.0) + %0 = extractelement <1 x i32> %fceq1.i, i32 0 ret i32 %0 } define i64 @test_vceqzd_f64(double %a) { -; CHECK: test_vceqzd_f64 +; CHECK-LABEL: test_vceqzd_f64 ; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, #0.0 entry: - %vceq.i = insertelement <1 x double> undef, double %a, i32 0 - %vceq1.i = tail call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f32(<1 x double> %vceq.i, <1 x float> zeroinitializer) #5 - %0 = extractelement <1 x i64> %vceq1.i, i32 0 + %fceq1.i = call <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f32(double %a, float 0.0) + %0 = extractelement <1 x i64> %fceq1.i, i32 0 ret i64 %0 } define i32 @test_vcges_f32(float 
%a, float %b) { -; CHECK: test_vcges_f32 +; CHECK-LABEL: test_vcges_f32 ; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} entry: - %vcge.i = insertelement <1 x float> undef, float %a, i32 0 - %vcge1.i = insertelement <1 x float> undef, float %b, i32 0 - %vcge2.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> %vcge1.i) - %0 = extractelement <1 x i32> %vcge2.i, i32 0 + %fcge2.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float %b) + %0 = extractelement <1 x i32> %fcge2.i, i32 0 ret i32 %0 } define i64 @test_vcged_f64(double %a, double %b) { -; CHECK: test_vcged_f64 +; CHECK-LABEL: test_vcged_f64 ; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} entry: - %vcge.i = insertelement <1 x double> undef, double %a, i32 0 - %vcge1.i = insertelement <1 x double> undef, double %b, i32 0 - %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double> %vcge.i, <1 x double> %vcge1.i) - %0 = extractelement <1 x i64> %vcge2.i, i32 0 + %fcge2.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double %a, double %b) + %0 = extractelement <1 x i64> %fcge2.i, i32 0 ret i64 %0 } define i32 @test_vcgezs_f32(float %a) { -; CHECK: test_vcgezs_f32 +; CHECK-LABEL: test_vcgezs_f32 ; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, #0.0 entry: - %vcge.i = insertelement <1 x float> undef, float %a, i32 0 - %vcge1.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> zeroinitializer) - %0 = extractelement <1 x i32> %vcge1.i, i32 0 + %fcge1.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float 0.0) + %0 = extractelement <1 x i32> %fcge1.i, i32 0 ret i32 %0 } define i64 @test_vcgezd_f64(double %a) { -; CHECK: test_vcgezd_f64 +; CHECK-LABEL: test_vcgezd_f64 ; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, #0.0 entry: - %vcge.i = insertelement <1 x double> undef, double %a, i32 0 - %vcge1.i = tail call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f32(<1 x double> %vcge.i, 
<1 x float> zeroinitializer) #5 - %0 = extractelement <1 x i64> %vcge1.i, i32 0 + %fcge1.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f32(double %a, float 0.0) + %0 = extractelement <1 x i64> %fcge1.i, i32 0 ret i64 %0 } define i32 @test_vcgts_f32(float %a, float %b) { -; CHECK: test_vcgts_f32 +; CHECK-LABEL: test_vcgts_f32 ; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} entry: - %vcgt.i = insertelement <1 x float> undef, float %a, i32 0 - %vcgt1.i = insertelement <1 x float> undef, float %b, i32 0 - %vcgt2.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> %vcgt1.i) - %0 = extractelement <1 x i32> %vcgt2.i, i32 0 + %fcgt2.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float %b) + %0 = extractelement <1 x i32> %fcgt2.i, i32 0 ret i32 %0 } define i64 @test_vcgtd_f64(double %a, double %b) { -; CHECK: test_vcgtd_f64 +; CHECK-LABEL: test_vcgtd_f64 ; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} entry: - %vcgt.i = insertelement <1 x double> undef, double %a, i32 0 - %vcgt1.i = insertelement <1 x double> undef, double %b, i32 0 - %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double> %vcgt.i, <1 x double> %vcgt1.i) - %0 = extractelement <1 x i64> %vcgt2.i, i32 0 + %fcgt2.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double %a, double %b) + %0 = extractelement <1 x i64> %fcgt2.i, i32 0 ret i64 %0 } define i32 @test_vcgtzs_f32(float %a) { -; CHECK: test_vcgtzs_f32 +; CHECK-LABEL: test_vcgtzs_f32 ; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, #0.0 entry: - %vcgt.i = insertelement <1 x float> undef, float %a, i32 0 - %vcgt1.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> zeroinitializer) - %0 = extractelement <1 x i32> %vcgt1.i, i32 0 + %fcgt1.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float 0.0) + %0 = extractelement <1 x i32> %fcgt1.i, i32 0 ret i32 %0 } define i64 @test_vcgtzd_f64(double %a) { -; 
CHECK: test_vcgtzd_f64 +; CHECK-LABEL: test_vcgtzd_f64 ; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, #0.0 entry: - %vcgt.i = insertelement <1 x double> undef, double %a, i32 0 - %vcgt1.i = tail call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f32(<1 x double> %vcgt.i, <1 x float> zeroinitializer) #5 - %0 = extractelement <1 x i64> %vcgt1.i, i32 0 + %fcgt1.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f32(double %a, float 0.0) + %0 = extractelement <1 x i64> %fcgt1.i, i32 0 ret i64 %0 } define i32 @test_vcles_f32(float %a, float %b) { -; CHECK: test_vcles_f32 +; CHECK-LABEL: test_vcles_f32 ; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} entry: - %vcge.i = insertelement <1 x float> undef, float %a, i32 0 - %vcge1.i = insertelement <1 x float> undef, float %b, i32 0 - %vcge2.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> %vcge1.i) - %0 = extractelement <1 x i32> %vcge2.i, i32 0 + %fcge2.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float %b) + %0 = extractelement <1 x i32> %fcge2.i, i32 0 ret i32 %0 } define i64 @test_vcled_f64(double %a, double %b) { -; CHECK: test_vcled_f64 +; CHECK-LABEL: test_vcled_f64 ; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} entry: - %vcge.i = insertelement <1 x double> undef, double %a, i32 0 - %vcge1.i = insertelement <1 x double> undef, double %b, i32 0 - %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double> %vcge.i, <1 x double> %vcge1.i) - %0 = extractelement <1 x i64> %vcge2.i, i32 0 + %fcge2.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double %a, double %b) + %0 = extractelement <1 x i64> %fcge2.i, i32 0 ret i64 %0 } define i32 @test_vclezs_f32(float %a) { -; CHECK: test_vclezs_f32 +; CHECK-LABEL: test_vclezs_f32 ; CHECK: fcmle {{s[0-9]}}, {{s[0-9]}}, #0.0 entry: - %vcle.i = insertelement <1 x float> undef, float %a, i32 0 - %vcle1.i = call <1 x i32> @llvm.aarch64.neon.vclez.v1i32.v1f32.v1f32(<1 x float> %vcle.i, <1 x 
float> zeroinitializer) - %0 = extractelement <1 x i32> %vcle1.i, i32 0 + %fcle1.i = call <1 x i32> @llvm.aarch64.neon.fclez.v1i32.f32.f32(float %a, float 0.0) + %0 = extractelement <1 x i32> %fcle1.i, i32 0 ret i32 %0 } define i64 @test_vclezd_f64(double %a) { -; CHECK: test_vclezd_f64 +; CHECK-LABEL: test_vclezd_f64 ; CHECK: fcmle {{d[0-9]}}, {{d[0-9]}}, #0.0 entry: - %vcle.i = insertelement <1 x double> undef, double %a, i32 0 - %vcle1.i = tail call <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1f64.v1f32(<1 x double> %vcle.i, <1 x float> zeroinitializer) #5 - %0 = extractelement <1 x i64> %vcle1.i, i32 0 + %fcle1.i = call <1 x i64> @llvm.aarch64.neon.fclez.v1i64.f64.f32(double %a, float 0.0) + %0 = extractelement <1 x i64> %fcle1.i, i32 0 ret i64 %0 } define i32 @test_vclts_f32(float %a, float %b) { -; CHECK: test_vclts_f32 +; CHECK-LABEL: test_vclts_f32 ; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} entry: - %vcgt.i = insertelement <1 x float> undef, float %b, i32 0 - %vcgt1.i = insertelement <1 x float> undef, float %a, i32 0 - %vcgt2.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> %vcgt1.i) - %0 = extractelement <1 x i32> %vcgt2.i, i32 0 + %fcgt2.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float %b) + %0 = extractelement <1 x i32> %fcgt2.i, i32 0 ret i32 %0 } define i64 @test_vcltd_f64(double %a, double %b) { -; CHECK: test_vcltd_f64 +; CHECK-LABEL: test_vcltd_f64 ; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} entry: - %vcgt.i = insertelement <1 x double> undef, double %b, i32 0 - %vcgt1.i = insertelement <1 x double> undef, double %a, i32 0 - %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double> %vcgt.i, <1 x double> %vcgt1.i) - %0 = extractelement <1 x i64> %vcgt2.i, i32 0 + %fcgt2.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double %a, double %b) + %0 = extractelement <1 x i64> %fcgt2.i, i32 0 ret i64 %0 } define i32 @test_vcltzs_f32(float %a) { 
-; CHECK: test_vcltzs_f32 +; CHECK-LABEL: test_vcltzs_f32 ; CHECK: fcmlt {{s[0-9]}}, {{s[0-9]}}, #0.0 entry: - %vclt.i = insertelement <1 x float> undef, float %a, i32 0 - %vclt1.i = call <1 x i32> @llvm.aarch64.neon.vcltz.v1i32.v1f32.v1f32(<1 x float> %vclt.i, <1 x float> zeroinitializer) - %0 = extractelement <1 x i32> %vclt1.i, i32 0 + %fclt1.i = call <1 x i32> @llvm.aarch64.neon.fcltz.v1i32.f32.f32(float %a, float 0.0) + %0 = extractelement <1 x i32> %fclt1.i, i32 0 ret i32 %0 } define i64 @test_vcltzd_f64(double %a) { -; CHECK: test_vcltzd_f64 +; CHECK-LABEL: test_vcltzd_f64 ; CHECK: fcmlt {{d[0-9]}}, {{d[0-9]}}, #0.0 entry: - %vclt.i = insertelement <1 x double> undef, double %a, i32 0 - %vclt1.i = tail call <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1f64.v1f32(<1 x double> %vclt.i, <1 x float> zeroinitializer) #5 - %0 = extractelement <1 x i64> %vclt1.i, i32 0 + %fclt1.i = call <1 x i64> @llvm.aarch64.neon.fcltz.v1i64.f64.f32(double %a, float 0.0) + %0 = extractelement <1 x i64> %fclt1.i, i32 0 ret i64 %0 } define i32 @test_vcages_f32(float %a, float %b) { -; CHECK: test_vcages_f32 +; CHECK-LABEL: test_vcages_f32 ; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} entry: - %vcage.i = insertelement <1 x float> undef, float %a, i32 0 - %vcage1.i = insertelement <1 x float> undef, float %b, i32 0 - %vcage2.i = call <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float> %vcage.i, <1 x float> %vcage1.i) - %0 = extractelement <1 x i32> %vcage2.i, i32 0 + %fcage2.i = call <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float %a, float %b) + %0 = extractelement <1 x i32> %fcage2.i, i32 0 ret i32 %0 } define i64 @test_vcaged_f64(double %a, double %b) { -; CHECK: test_vcaged_f64 +; CHECK-LABEL: test_vcaged_f64 ; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} entry: - %vcage.i = insertelement <1 x double> undef, double %a, i32 0 - %vcage1.i = insertelement <1 x double> undef, double %b, i32 0 - %vcage2.i = call <1 x i64> 
@llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %vcage.i, <1 x double> %vcage1.i) - %0 = extractelement <1 x i64> %vcage2.i, i32 0 + %fcage2.i = call <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double %a, double %b) + %0 = extractelement <1 x i64> %fcage2.i, i32 0 ret i64 %0 } define i32 @test_vcagts_f32(float %a, float %b) { -; CHECK: test_vcagts_f32 +; CHECK-LABEL: test_vcagts_f32 ; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} entry: - %vcagt.i = insertelement <1 x float> undef, float %a, i32 0 - %vcagt1.i = insertelement <1 x float> undef, float %b, i32 0 - %vcagt2.i = call <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float> %vcagt.i, <1 x float> %vcagt1.i) - %0 = extractelement <1 x i32> %vcagt2.i, i32 0 + %fcagt2.i = call <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float %a, float %b) + %0 = extractelement <1 x i32> %fcagt2.i, i32 0 ret i32 %0 } define i64 @test_vcagtd_f64(double %a, double %b) { -; CHECK: test_vcagtd_f64 +; CHECK-LABEL: test_vcagtd_f64 ; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} entry: - %vcagt.i = insertelement <1 x double> undef, double %a, i32 0 - %vcagt1.i = insertelement <1 x double> undef, double %b, i32 0 - %vcagt2.i = call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %vcagt.i, <1 x double> %vcagt1.i) - %0 = extractelement <1 x i64> %vcagt2.i, i32 0 + %fcagt2.i = call <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double %a, double %b) + %0 = extractelement <1 x i64> %fcagt2.i, i32 0 ret i64 %0 } define i32 @test_vcales_f32(float %a, float %b) { -; CHECK: test_vcales_f32 +; CHECK-LABEL: test_vcales_f32 ; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} entry: - %vcage.i = insertelement <1 x float> undef, float %b, i32 0 - %vcage1.i = insertelement <1 x float> undef, float %a, i32 0 - %vcage2.i = call <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float> %vcage.i, <1 x float> %vcage1.i) - %0 = extractelement <1 x i32> %vcage2.i, i32 0 + %fcage2.i = call <1 x i32> 
@llvm.aarch64.neon.fcage.v1i32.f32.f32(float %a, float %b) + %0 = extractelement <1 x i32> %fcage2.i, i32 0 ret i32 %0 } define i64 @test_vcaled_f64(double %a, double %b) { -; CHECK: test_vcaled_f64 +; CHECK-LABEL: test_vcaled_f64 ; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} entry: - %vcage.i = insertelement <1 x double> undef, double %b, i32 0 - %vcage1.i = insertelement <1 x double> undef, double %a, i32 0 - %vcage2.i = call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %vcage.i, <1 x double> %vcage1.i) - %0 = extractelement <1 x i64> %vcage2.i, i32 0 + %fcage2.i = call <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double %a, double %b) + %0 = extractelement <1 x i64> %fcage2.i, i32 0 ret i64 %0 } define i32 @test_vcalts_f32(float %a, float %b) { -; CHECK: test_vcalts_f32 +; CHECK-LABEL: test_vcalts_f32 ; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} entry: - %vcalt.i = insertelement <1 x float> undef, float %b, i32 0 - %vcalt1.i = insertelement <1 x float> undef, float %a, i32 0 - %vcalt2.i = call <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float> %vcalt.i, <1 x float> %vcalt1.i) - %0 = extractelement <1 x i32> %vcalt2.i, i32 0 + %fcalt2.i = call <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float %a, float %b) + %0 = extractelement <1 x i32> %fcalt2.i, i32 0 ret i32 %0 } define i64 @test_vcaltd_f64(double %a, double %b) { -; CHECK: test_vcaltd_f64 +; CHECK-LABEL: test_vcaltd_f64 ; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} entry: - %vcalt.i = insertelement <1 x double> undef, double %b, i32 0 - %vcalt1.i = insertelement <1 x double> undef, double %a, i32 0 - %vcalt2.i = call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %vcalt.i, <1 x double> %vcalt1.i) - %0 = extractelement <1 x i64> %vcalt2.i, i32 0 + %fcalt2.i = call <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double %a, double %b) + %0 = extractelement <1 x i64> %fcalt2.i, i32 0 ret i64 %0 } -declare <1 x i32> 
@llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) -declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f32(<1 x double>, <1 x float>) -declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) -declare <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) -declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f32(<1 x double>, <1 x float>) -declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) -declare <1 x i32> @llvm.aarch64.neon.vclez.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) -declare <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1f64.v1f32(<1 x double>, <1 x float>) -declare <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) -declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f32(<1 x double>, <1 x float>) -declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) -declare <1 x i32> @llvm.aarch64.neon.vcltz.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) -declare <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1f64.v1f32(<1 x double>, <1 x float>) -declare <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) -declare <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) -declare <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) -declare <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) +declare <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float, float) +declare <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f32(double, float) +declare <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f64(double, double) +declare <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float, float) +declare <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f32(double, float) +declare <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double, double) +declare <1 x i32> @llvm.aarch64.neon.fclez.v1i32.f32.f32(float, float) +declare <1 
x i64> @llvm.aarch64.neon.fclez.v1i64.f64.f32(double, float) +declare <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float, float) +declare <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f32(double, float) +declare <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double, double) +declare <1 x i32> @llvm.aarch64.neon.fcltz.v1i32.f32.f32(float, float) +declare <1 x i64> @llvm.aarch64.neon.fcltz.v1i64.f64.f32(double, float) +declare <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float, float) +declare <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double, double) +declare <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float, float) +declare <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double, double) diff --git a/test/CodeGen/AArch64/neon-scalar-recip.ll b/test/CodeGen/AArch64/neon-scalar-recip.ll index f21c27b..100839b 100644 --- a/test/CodeGen/AArch64/neon-scalar-recip.ll +++ b/test/CodeGen/AArch64/neon-scalar-recip.ll @@ -3,56 +3,42 @@ define float @test_vrecpss_f32(float %a, float %b) { ; CHECK: test_vrecpss_f32 ; CHECK: frecps {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - %1 = insertelement <1 x float> undef, float %a, i32 0 - %2 = insertelement <1 x float> undef, float %b, i32 0 - %3 = call <1 x float> @llvm.arm.neon.vrecps.v1f32(<1 x float> %1, <1 x float> %2) - %4 = extractelement <1 x float> %3, i32 0 - ret float %4 + %1 = call float @llvm.aarch64.neon.vrecps.f32(float %a, float %b) + ret float %1 } define double @test_vrecpsd_f64(double %a, double %b) { ; CHECK: test_vrecpsd_f64 ; CHECK: frecps {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - %1 = insertelement <1 x double> undef, double %a, i32 0 - %2 = insertelement <1 x double> undef, double %b, i32 0 - %3 = call <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double> %1, <1 x double> %2) - %4 = extractelement <1 x double> %3, i32 0 - ret double %4 + %1 = call double @llvm.aarch64.neon.vrecps.f64(double %a, double %b) + ret double %1 } -declare <1 x float> @llvm.arm.neon.vrecps.v1f32(<1 x float>, <1 x float>) 
-declare <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double>, <1 x double>) +declare float @llvm.aarch64.neon.vrecps.f32(float, float) +declare double @llvm.aarch64.neon.vrecps.f64(double, double) define float @test_vrsqrtss_f32(float %a, float %b) { ; CHECK: test_vrsqrtss_f32 ; CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - %1 = insertelement <1 x float> undef, float %a, i32 0 - %2 = insertelement <1 x float> undef, float %b, i32 0 - %3 = call <1 x float> @llvm.arm.neon.vrsqrts.v1f32(<1 x float> %1, <1 x float> %2) - %4 = extractelement <1 x float> %3, i32 0 - ret float %4 + %1 = call float @llvm.aarch64.neon.vrsqrts.f32(float %a, float %b) + ret float %1 } define double @test_vrsqrtsd_f64(double %a, double %b) { ; CHECK: test_vrsqrtsd_f64 ; CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - %1 = insertelement <1 x double> undef, double %a, i32 0 - %2 = insertelement <1 x double> undef, double %b, i32 0 - %3 = call <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double> %1, <1 x double> %2) - %4 = extractelement <1 x double> %3, i32 0 - ret double %4 + %1 = call double @llvm.aarch64.neon.vrsqrts.f64(double %a, double %b) + ret double %1 } -declare <1 x float> @llvm.arm.neon.vrsqrts.v1f32(<1 x float>, <1 x float>) -declare <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double>, <1 x double>) +declare float @llvm.aarch64.neon.vrsqrts.f32(float, float) +declare double @llvm.aarch64.neon.vrsqrts.f64(double, double) define float @test_vrecpes_f32(float %a) { ; CHECK: test_vrecpes_f32 ; CHECK: frecpe {{s[0-9]+}}, {{s[0-9]+}} entry: - %vrecpe.i = insertelement <1 x float> undef, float %a, i32 0 - %vrecpe1.i = tail call <1 x float> @llvm.arm.neon.vrecpe.v1f32(<1 x float> %vrecpe.i) - %0 = extractelement <1 x float> %vrecpe1.i, i32 0 + %0 = call float @llvm.aarch64.neon.vrecpe.f32(float %a) ret float %0 } @@ -60,22 +46,18 @@ define double @test_vrecped_f64(double %a) { ; CHECK: test_vrecped_f64 ; CHECK: frecpe {{d[0-9]+}}, {{d[0-9]+}} entry: - %vrecpe.i = 
insertelement <1 x double> undef, double %a, i32 0 - %vrecpe1.i = tail call <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double> %vrecpe.i) - %0 = extractelement <1 x double> %vrecpe1.i, i32 0 + %0 = call double @llvm.aarch64.neon.vrecpe.f64(double %a) ret double %0 } -declare <1 x float> @llvm.arm.neon.vrecpe.v1f32(<1 x float>) -declare <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double>) +declare float @llvm.aarch64.neon.vrecpe.f32(float) +declare double @llvm.aarch64.neon.vrecpe.f64(double) define float @test_vrecpxs_f32(float %a) { ; CHECK: test_vrecpxs_f32 ; CHECK: frecpx {{s[0-9]+}}, {{s[0-9]+}} entry: - %vrecpx.i = insertelement <1 x float> undef, float %a, i32 0 - %vrecpx1.i = tail call <1 x float> @llvm.aarch64.neon.vrecpx.v1f32(<1 x float> %vrecpx.i) - %0 = extractelement <1 x float> %vrecpx1.i, i32 0 + %0 = call float @llvm.aarch64.neon.vrecpx.f32(float %a) ret float %0 } @@ -83,22 +65,18 @@ define double @test_vrecpxd_f64(double %a) { ; CHECK: test_vrecpxd_f64 ; CHECK: frecpx {{d[0-9]+}}, {{d[0-9]+}} entry: - %vrecpx.i = insertelement <1 x double> undef, double %a, i32 0 - %vrecpx1.i = tail call <1 x double> @llvm.aarch64.neon.vrecpx.v1f64(<1 x double> %vrecpx.i) - %0 = extractelement <1 x double> %vrecpx1.i, i32 0 + %0 = call double @llvm.aarch64.neon.vrecpx.f64(double %a) ret double %0 } -declare <1 x float> @llvm.aarch64.neon.vrecpx.v1f32(<1 x float>) -declare <1 x double> @llvm.aarch64.neon.vrecpx.v1f64(<1 x double>) +declare float @llvm.aarch64.neon.vrecpx.f32(float) +declare double @llvm.aarch64.neon.vrecpx.f64(double) define float @test_vrsqrtes_f32(float %a) { ; CHECK: test_vrsqrtes_f32 ; CHECK: frsqrte {{s[0-9]+}}, {{s[0-9]+}} entry: - %vrsqrte.i = insertelement <1 x float> undef, float %a, i32 0 - %vrsqrte1.i = tail call <1 x float> @llvm.arm.neon.vrsqrte.v1f32(<1 x float> %vrsqrte.i) - %0 = extractelement <1 x float> %vrsqrte1.i, i32 0 + %0 = call float @llvm.aarch64.neon.vrsqrte.f32(float %a) ret float %0 } @@ -106,11 +84,9 @@ define 
double @test_vrsqrted_f64(double %a) { ; CHECK: test_vrsqrted_f64 ; CHECK: frsqrte {{d[0-9]+}}, {{d[0-9]+}} entry: - %vrsqrte.i = insertelement <1 x double> undef, double %a, i32 0 - %vrsqrte1.i = tail call <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double> %vrsqrte.i) - %0 = extractelement <1 x double> %vrsqrte1.i, i32 0 + %0 = call double @llvm.aarch64.neon.vrsqrte.f64(double %a) ret double %0 } -declare <1 x float> @llvm.arm.neon.vrsqrte.v1f32(<1 x float>) -declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>) +declare float @llvm.aarch64.neon.vrsqrte.f32(float) +declare double @llvm.aarch64.neon.vrsqrte.f64(double) diff --git a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll index 80e8dc3..33ce5cf 100644 --- a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll +++ b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll @@ -4,210 +4,198 @@ declare <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64>) define <1 x i64> @test_addp_v1i64(<2 x i64> %a) { ; CHECK: test_addp_v1i64: - %val = call <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64> %a) -; CHECK: addp d0, v0.2d - ret <1 x i64> %val +; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d + %val = call <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64> %a) + ret <1 x i64> %val } -declare <1 x float> @llvm.aarch64.neon.vpfadd(<2 x float>) +declare float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float>) -define <1 x float> @test_faddp_v1f32(<2 x float> %a) { -; CHECK: test_faddp_v1f32: - %val = call <1 x float> @llvm.aarch64.neon.vpfadd(<2 x float> %a) -; CHECK: faddp s0, v0.2s - ret <1 x float> %val +define float @test_faddp_f32(<2 x float> %a) { +; CHECK: test_faddp_f32: +; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s + %val = call float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float> %a) + ret float %val } -declare <1 x double> @llvm.aarch64.neon.vpfaddq(<2 x double>) +declare double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double>) -define <1 x double> @test_faddp_v1f64(<2 
x double> %a) { -; CHECK: test_faddp_v1f64: - %val = call <1 x double> @llvm.aarch64.neon.vpfaddq(<2 x double> %a) -; CHECK: faddp d0, v0.2d - ret <1 x double> %val +define double @test_faddp_f64(<2 x double> %a) { +; CHECK: test_faddp_f64: +; CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d + %val = call double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double> %a) + ret double %val } -declare <1 x float> @llvm.aarch64.neon.vpmax(<2 x float>) +declare float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float>) -define <1 x float> @test_fmaxp_v1f32(<2 x float> %a) { -; CHECK: test_fmaxp_v1f32: - %val = call <1 x float> @llvm.aarch64.neon.vpmax(<2 x float> %a) -; CHECK: fmaxp s0, v0.2s - ret <1 x float> %val +define float @test_fmaxp_f32(<2 x float> %a) { +; CHECK: test_fmaxp_f32: +; CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s + %val = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a) + ret float %val } -declare <1 x double> @llvm.aarch64.neon.vpmaxq(<2 x double>) +declare double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double>) -define <1 x double> @test_fmaxp_v1f64(<2 x double> %a) { -; CHECK: test_fmaxp_v1f64: - %val = call <1 x double> @llvm.aarch64.neon.vpmaxq(<2 x double> %a) -; CHECK: fmaxp d0, v0.2d - ret <1 x double> %val +define double @test_fmaxp_f64(<2 x double> %a) { +; CHECK: test_fmaxp_f64: +; CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d + %val = call double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double> %a) + ret double %val } +declare float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float>) -declare <1 x float> @llvm.aarch64.neon.vpmin(<2 x float>) - -define <1 x float> @test_fminp_v1f32(<2 x float> %a) { -; CHECK: test_fminp_v1f32: - %val = call <1 x float> @llvm.aarch64.neon.vpmin(<2 x float> %a) -; CHECK: fminp s0, v0.2s - ret <1 x float> %val +define float @test_fminp_f32(<2 x float> %a) { +; CHECK: test_fminp_f32: +; CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s + %val = call float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float> %a) + ret float %val } -declare <1 
x double> @llvm.aarch64.neon.vpminq(<2 x double>) +declare double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double>) -define <1 x double> @test_fminp_v1f64(<2 x double> %a) { -; CHECK: test_fminp_v1f64: - %val = call <1 x double> @llvm.aarch64.neon.vpminq(<2 x double> %a) -; CHECK: fminp d0, v0.2d - ret <1 x double> %val +define double @test_fminp_f64(<2 x double> %a) { +; CHECK: test_fminp_f64: +; CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d + %val = call double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double> %a) + ret double %val } -declare <1 x float> @llvm.aarch64.neon.vpfmaxnm(<2 x float>) +declare float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float>) -define <1 x float> @test_fmaxnmp_v1f32(<2 x float> %a) { -; CHECK: test_fmaxnmp_v1f32: - %val = call <1 x float> @llvm.aarch64.neon.vpfmaxnm(<2 x float> %a) -; CHECK: fmaxnmp s0, v0.2s - ret <1 x float> %val +define float @test_fmaxnmp_f32(<2 x float> %a) { +; CHECK: test_fmaxnmp_f32: +; CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s + %val = call float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float> %a) + ret float %val } -declare <1 x double> @llvm.aarch64.neon.vpfmaxnmq(<2 x double>) +declare double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double>) -define <1 x double> @test_fmaxnmp_v1f64(<2 x double> %a) { -; CHECK: test_fmaxnmp_v1f64: - %val = call <1 x double> @llvm.aarch64.neon.vpfmaxnmq(<2 x double> %a) -; CHECK: fmaxnmp d0, v0.2d - ret <1 x double> %val +define double @test_fmaxnmp_f64(<2 x double> %a) { +; CHECK: test_fmaxnmp_f64: +; CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d + %val = call double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double> %a) + ret double %val } -declare <1 x float> @llvm.aarch64.neon.vpfminnm(<2 x float>) +declare float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float>) -define <1 x float> @test_fminnmp_v1f32(<2 x float> %a) { -; CHECK: test_fminnmp_v1f32: - %val = call <1 x float> @llvm.aarch64.neon.vpfminnm(<2 x float> %a) -; CHECK: fminnmp s0, v0.2s - ret <1 x float> %val 
+define float @test_fminnmp_f32(<2 x float> %a) { +; CHECK: test_fminnmp_f32: +; CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s + %val = call float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float> %a) + ret float %val } -declare <1 x double> @llvm.aarch64.neon.vpfminnmq(<2 x double>) +declare double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double>) -define <1 x double> @test_fminnmp_v1f64(<2 x double> %a) { -; CHECK: test_fminnmp_v1f64: - %val = call <1 x double> @llvm.aarch64.neon.vpfminnmq(<2 x double> %a) -; CHECK: fminnmp d0, v0.2d - ret <1 x double> %val +define double @test_fminnmp_f64(<2 x double> %a) { +; CHECK: test_fminnmp_f64: +; CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d + %val = call double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double> %a) + ret double %val } define float @test_vaddv_f32(<2 x float> %a) { ; CHECK-LABEL: test_vaddv_f32 ; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s - %1 = tail call <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v2f32(<2 x float> %a) - %2 = extractelement <1 x float> %1, i32 0 - ret float %2 + %1 = call float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float> %a) + ret float %1 } define float @test_vaddvq_f32(<4 x float> %a) { ; CHECK-LABEL: test_vaddvq_f32 ; CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s - %1 = tail call <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v4f32(<4 x float> %a) - %2 = extractelement <1 x float> %1, i32 0 - ret float %2 + %1 = call float @llvm.aarch64.neon.vpfadd.f32.v4f32(<4 x float> %a) + ret float %1 } define double @test_vaddvq_f64(<2 x double> %a) { ; CHECK-LABEL: test_vaddvq_f64 ; CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = tail call <1 x double> @llvm.aarch64.neon.vaddv.v1f64.v2f64(<2 x double> %a) - %2 = extractelement <1 x double> %1, i32 0 - ret double %2 + %1 = call double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double> %a) + ret double %1 } define float @test_vmaxv_f32(<2 x float> %a) { ; CHECK-LABEL: test_vmaxv_f32 ; CHECK: fmaxp 
{{s[0-9]+}}, {{v[0-9]+}}.2s - %1 = tail call <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v2f32(<2 x float> %a) - %2 = extractelement <1 x float> %1, i32 0 - ret float %2 + %1 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a) + ret float %1 } define double @test_vmaxvq_f64(<2 x double> %a) { ; CHECK-LABEL: test_vmaxvq_f64 ; CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxv.v1f64.v2f64(<2 x double> %a) - %2 = extractelement <1 x double> %1, i32 0 - ret double %2 + %1 = call double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double> %a) + ret double %1 } define float @test_vminv_f32(<2 x float> %a) { ; CHECK-LABEL: test_vminv_f32 ; CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s - %1 = tail call <1 x float> @llvm.aarch64.neon.vminv.v1f32.v2f32(<2 x float> %a) - %2 = extractelement <1 x float> %1, i32 0 - ret float %2 + %1 = call float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float> %a) + ret float %1 } define double @test_vminvq_f64(<2 x double> %a) { ; CHECK-LABEL: test_vminvq_f64 ; CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = tail call <1 x double> @llvm.aarch64.neon.vminv.v1f64.v2f64(<2 x double> %a) - %2 = extractelement <1 x double> %1, i32 0 - ret double %2 + %1 = call double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double> %a) + ret double %1 } define double @test_vmaxnmvq_f64(<2 x double> %a) { ; CHECK-LABEL: test_vmaxnmvq_f64 ; CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxnmv.v1f64.v2f64(<2 x double> %a) - %2 = extractelement <1 x double> %1, i32 0 - ret double %2 + %1 = call double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double> %a) + ret double %1 } define float @test_vmaxnmv_f32(<2 x float> %a) { ; CHECK-LABEL: test_vmaxnmv_f32 ; CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s - %1 = tail call <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v2f32(<2 x float> %a) - %2 = extractelement <1 x float> %1, i32 0 - ret float %2 + %1 = call float 
@llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float> %a) + ret float %1 } define double @test_vminnmvq_f64(<2 x double> %a) { ; CHECK-LABEL: test_vminnmvq_f64 ; CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = tail call <1 x double> @llvm.aarch64.neon.vminnmv.v1f64.v2f64(<2 x double> %a) - %2 = extractelement <1 x double> %1, i32 0 - ret double %2 + %1 = call double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double> %a) + ret double %1 } define float @test_vminnmv_f32(<2 x float> %a) { ; CHECK-LABEL: test_vminnmv_f32 ; CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s - %1 = tail call <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v2f32(<2 x float> %a) - %2 = extractelement <1 x float> %1, i32 0 - ret float %2 + %1 = call float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float> %a) + ret float %1 } define <2 x i64> @test_vpaddq_s64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vpaddq_s64 ; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - %1 = tail call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b) + %1 = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b) ret <2 x i64> %1 } define <2 x i64> @test_vpaddq_u64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vpaddq_u64 ; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - %1 = tail call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b) + %1 = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b) ret <2 x i64> %1 } define i64 @test_vaddvq_s64(<2 x i64> %a) { ; CHECK-LABEL: test_vaddvq_s64 ; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = tail call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a) + %1 = call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a) %2 = extractelement <1 x i64> %1, i32 0 ret i64 %2 } @@ -215,7 +203,7 @@ define i64 @test_vaddvq_s64(<2 x i64> %a) { define i64 @test_vaddvq_u64(<2 x i64> %a) { ; CHECK-LABEL: test_vaddvq_u64 ; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = tail call <1 x i64> 
@llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a) + %1 = call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a) %2 = extractelement <1 x i64> %1, i32 0 ret i64 %2 } @@ -224,24 +212,4 @@ declare <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64>) declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>) -declare <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v2f32(<2 x float>) - -declare <1 x double> @llvm.aarch64.neon.vminnmv.v1f64.v2f64(<2 x double>) - -declare <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v2f32(<2 x float>) - -declare <1 x double> @llvm.aarch64.neon.vmaxnmv.v1f64.v2f64(<2 x double>) - -declare <1 x double> @llvm.aarch64.neon.vminv.v1f64.v2f64(<2 x double>) - -declare <1 x float> @llvm.aarch64.neon.vminv.v1f32.v2f32(<2 x float>) - -declare <1 x double> @llvm.aarch64.neon.vmaxv.v1f64.v2f64(<2 x double>) - -declare <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v2f32(<2 x float>) - -declare <1 x double> @llvm.aarch64.neon.vaddv.v1f64.v2f64(<2 x double>) - -declare <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v4f32(<4 x float>) - -declare <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v2f32(<2 x float>) \ No newline at end of file +declare float @llvm.aarch64.neon.vpfadd.f32.v4f32(<4 x float>) diff --git a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll index 83ceb4e..7c9ffa0 100644 --- a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll +++ b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll @@ -7,14 +7,14 @@ declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_urshl_v1i64: %tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: urshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_srshl_v1i64: 
%tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: srshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } @@ -24,14 +24,14 @@ declare <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64>, <1 x i64>) define <1 x i64> @test_urshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_urshl_v1i64_aarch64: %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: urshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } define <1 x i64> @test_srshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_srshl_v1i64_aarch64: %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: srshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll index bd66f80..5c010ef 100644 --- a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll +++ b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll @@ -6,14 +6,14 @@ declare <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8>, <1 x i8>) define <1 x i8> @test_uqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { ; CHECK: test_uqadd_v1i8_aarch64: %tmp1 = call <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: uqadd {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} +;CHECK: uqadd {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} ret <1 x i8> %tmp1 } define <1 x i8> @test_sqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { ; CHECK: test_sqadd_v1i8_aarch64: %tmp1 = call <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: sqadd {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} +;CHECK: sqadd {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} ret <1 x i8> %tmp1 } @@ -23,14 +23,14 @@ declare <1 x i8> 
@llvm.arm.neon.vqsubs.v1i8(<1 x i8>, <1 x i8>) define <1 x i8> @test_uqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { ; CHECK: test_uqsub_v1i8_aarch64: %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: uqsub {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} +;CHECK: uqsub {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} ret <1 x i8> %tmp1 } define <1 x i8> @test_sqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { ; CHECK: test_sqsub_v1i8_aarch64: %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: sqsub {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} +;CHECK: sqsub {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} ret <1 x i8> %tmp1 } @@ -40,14 +40,14 @@ declare <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16>, <1 x i16>) define <1 x i16> @test_uqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { ; CHECK: test_uqadd_v1i16_aarch64: %tmp1 = call <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: uqadd {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} +;CHECK: uqadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} ret <1 x i16> %tmp1 } define <1 x i16> @test_sqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { ; CHECK: test_sqadd_v1i16_aarch64: %tmp1 = call <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: sqadd {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} +;CHECK: sqadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} ret <1 x i16> %tmp1 } @@ -57,14 +57,14 @@ declare <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16>, <1 x i16>) define <1 x i16> @test_uqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { ; CHECK: test_uqsub_v1i16_aarch64: %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: uqsub {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} +;CHECK: uqsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} ret <1 x i16> %tmp1 } define <1 x i16> @test_sqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { ; CHECK: test_sqsub_v1i16_aarch64: %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 
x i16> %lhs, <1 x i16> %rhs) -;CHECK: sqsub {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} +;CHECK: sqsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} ret <1 x i16> %tmp1 } @@ -74,14 +74,14 @@ declare <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32>, <1 x i32>) define <1 x i32> @test_uqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { ; CHECK: test_uqadd_v1i32_aarch64: %tmp1 = call <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: uqadd {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} +;CHECK: uqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret <1 x i32> %tmp1 } define <1 x i32> @test_sqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { ; CHECK: test_sqadd_v1i32_aarch64: %tmp1 = call <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: sqadd {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} +;CHECK: sqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret <1 x i32> %tmp1 } @@ -91,7 +91,7 @@ declare <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32>, <1 x i32>) define <1 x i32> @test_uqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { ; CHECK: test_uqsub_v1i32_aarch64: %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: uqsub {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} +;CHECK: uqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret <1 x i32> %tmp1 } @@ -99,7 +99,7 @@ define <1 x i32> @test_uqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { define <1 x i32> @test_sqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { ; CHECK: test_sqsub_v1i32_aarch64: %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: sqsub {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} +;CHECK: sqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret <1 x i32> %tmp1 } @@ -109,14 +109,14 @@ declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) define <1 x i64> @test_uqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_uqadd_v1i64_aarch64: %tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> 
%lhs, <1 x i64> %rhs) -;CHECK: uqadd {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } define <1 x i64> @test_sqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_sqadd_v1i64_aarch64: %tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sqadd {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } @@ -126,14 +126,14 @@ declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) define <1 x i64> @test_uqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_uqsub_v1i64_aarch64: %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: uqsub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } define <1 x i64> @test_sqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_sqsub_v1i64_aarch64: %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sqsub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll index 0fd67df..dbf9669 100644 --- a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll +++ b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll @@ -6,7 +6,7 @@ declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_uqrshl_v1i64: %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: uqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } @@ -14,7 +14,7 @@ define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, 
<1 x i64> %rhs) { define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_sqrshl_v1i64: %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } @@ -24,7 +24,7 @@ declare <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8>, <1 x i8>) define <1 x i8> @test_uqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { ; CHECK: test_uqrshl_v1i8_aarch64: %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: uqrshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} +;CHECK: uqrshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} ret <1 x i8> %tmp1 } @@ -32,7 +32,7 @@ define <1 x i8> @test_uqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { define <1 x i8> @test_sqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { ; CHECK: test_sqrshl_v1i8_aarch64: %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: sqrshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} +;CHECK: sqrshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} ret <1 x i8> %tmp1 } @@ -42,7 +42,7 @@ declare <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16>, <1 x i16>) define <1 x i16> @test_uqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { ; CHECK: test_uqrshl_v1i16_aarch64: %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: uqrshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} +;CHECK: uqrshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} ret <1 x i16> %tmp1 } @@ -50,7 +50,7 @@ define <1 x i16> @test_uqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { define <1 x i16> @test_sqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { ; CHECK: test_sqrshl_v1i16_aarch64: %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: sqrshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} +;CHECK: sqrshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} 
ret <1 x i16> %tmp1 } @@ -60,7 +60,7 @@ declare <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32>, <1 x i32>) define <1 x i32> @test_uqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { ; CHECK: test_uqrshl_v1i32_aarch64: %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: uqrshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} +;CHECK: uqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret <1 x i32> %tmp1 } @@ -68,7 +68,7 @@ define <1 x i32> @test_uqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { define <1 x i32> @test_sqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { ; CHECK: test_sqrshl_v1i32_aarch64: %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: sqrshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} +;CHECK: sqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret <1 x i32> %tmp1 } @@ -78,7 +78,7 @@ declare <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64>, <1 x i64>) define <1 x i64> @test_uqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_uqrshl_v1i64_aarch64: %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: uqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } @@ -86,7 +86,7 @@ define <1 x i64> @test_uqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { define <1 x i64> @test_sqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_sqrshl_v1i64_aarch64: %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll index 8fdea24..0a1f4c9 100644 --- a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll +++ 
b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll @@ -6,14 +6,14 @@ declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>) define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_uqshl_v1i64: %tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: uqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_sqshl_v1i64: %tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } @@ -23,14 +23,14 @@ declare <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8>, <1 x i8>) define <1 x i8> @test_uqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { ; CHECK: test_uqshl_v1i8_aarch64: %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: uqshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} +;CHECK: uqshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} ret <1 x i8> %tmp1 } define <1 x i8> @test_sqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { ; CHECK: test_sqshl_v1i8_aarch64: %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: sqshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} +;CHECK: sqshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} ret <1 x i8> %tmp1 } @@ -40,14 +40,14 @@ declare <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16>, <1 x i16>) define <1 x i16> @test_uqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { ; CHECK: test_uqshl_v1i16_aarch64: %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: uqshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} +;CHECK: uqshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} ret <1 x i16> %tmp1 } define <1 x i16> @test_sqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { ; 
CHECK: test_sqshl_v1i16_aarch64: %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: sqshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} +;CHECK: sqshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} ret <1 x i16> %tmp1 } @@ -57,14 +57,14 @@ declare <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32>, <1 x i32>) define <1 x i32> @test_uqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { ; CHECK: test_uqshl_v1i32_aarch64: %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: uqshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} +;CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret <1 x i32> %tmp1 } define <1 x i32> @test_sqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { ; CHECK: test_sqshl_v1i32_aarch64: %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: sqshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} +;CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret <1 x i32> %tmp1 } @@ -74,14 +74,14 @@ declare <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64>, <1 x i64>) define <1 x i64> @test_uqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_uqshl_v1i64_aarch64: %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: uqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } define <1 x i64> @test_sqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_sqshl_v1i64_aarch64: %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +;CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } diff --git a/test/CodeGen/AArch64/neon-scalar-shift.ll b/test/CodeGen/AArch64/neon-scalar-shift.ll index 1222be5..b712ea4 100644 --- a/test/CodeGen/AArch64/neon-scalar-shift.ll +++ b/test/CodeGen/AArch64/neon-scalar-shift.ll @@ -6,7 +6,7 @@ 
declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>) define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_ushl_v1i64: %tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -; CHECK: ushl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +; CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } @@ -14,7 +14,7 @@ define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_sshl_v1i64: %tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -; CHECK: sshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +; CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } @@ -24,15 +24,213 @@ declare <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64>, <1 x i64>) define <1 x i64> @test_ushl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_ushl_v1i64_aarch64: %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64> %lhs, <1 x i64> %rhs) -; CHECK: ushl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +; CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } define <1 x i64> @test_sshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { ; CHECK: test_sshl_v1i64_aarch64: %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64> %lhs, <1 x i64> %rhs) -; CHECK: sshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} +; CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <1 x i64> %tmp1 } +define <1 x i64> @test_vtst_s64(<1 x i64> %a, <1 x i64> %b) { +; CHECK-LABEL: test_vtst_s64 +; CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} +entry: + %0 = and <1 x i64> %a, %b + %1 = icmp ne <1 x i64> %0, zeroinitializer + %vtst.i = sext <1 x i1> %1 to <1 x i64> + ret <1 x i64> %vtst.i +} + +define <1 x i64> @test_vtst_u64(<1 x i64> %a, <1 x i64> %b) { +; CHECK-LABEL: test_vtst_u64 +; CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} +entry: + %0 = and <1 x i64> %a, %b + %1 = icmp 
ne <1 x i64> %0, zeroinitializer + %vtst.i = sext <1 x i1> %1 to <1 x i64> + ret <1 x i64> %vtst.i +} + +define <1 x i64> @test_vsli_n_p64(<1 x i64> %a, <1 x i64> %b) { +; CHECK-LABEL: test_vsli_n_p64 +; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #0 +entry: + %vsli_n2 = tail call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %a, <1 x i64> %b, i32 0) + ret <1 x i64> %vsli_n2 +} + +declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32) + +define <2 x i64> @test_vsliq_n_p64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vsliq_n_p64 +; CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +entry: + %vsli_n2 = tail call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %a, <2 x i64> %b, i32 0) + ret <2 x i64> %vsli_n2 +} + +declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) + +define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) { +; CHECK-LABEL: test_vrsqrte_u32 +; CHECK: ursqrte {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vrsqrte1.i = tail call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %a) + ret <2 x i32> %vrsqrte1.i +} + +define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) { +; CHECK-LABEL: test_vrsqrteq_u32 +; CHECK: ursqrte {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vrsqrte1.i = tail call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %a) + ret <4 x i32> %vrsqrte1.i +} + +define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) { +; CHECK-LABEL: test_vqshl_n_s8 +; CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0 +entry: + %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer) + ret <8 x i8> %vqshl_n +} + +declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>) + +define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) { +; CHECK-LABEL: test_vqshlq_n_s8 +; CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0 +entry: + %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer) + ret <16 x i8> %vqshl_n +} + +declare <16 x i8> 
@llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>) + +define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) { +; CHECK-LABEL: test_vqshl_n_s16 +; CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0 +entry: + %vqshl_n1 = tail call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> zeroinitializer) + ret <4 x i16> %vqshl_n1 +} + +declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>) + +define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) { +; CHECK-LABEL: test_vqshlq_n_s16 +; CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0 +entry: + %vqshl_n1 = tail call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> zeroinitializer) + ret <8 x i16> %vqshl_n1 +} + +declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>) + +define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) { +; CHECK-LABEL: test_vqshl_n_s32 +; CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 +entry: + %vqshl_n1 = tail call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> zeroinitializer) + ret <2 x i32> %vqshl_n1 +} + +declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>) + +define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) { +; CHECK-LABEL: test_vqshlq_n_s32 +; CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 +entry: + %vqshl_n1 = tail call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> zeroinitializer) + ret <4 x i32> %vqshl_n1 +} + +declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>) + +define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) { +; CHECK-LABEL: test_vqshlq_n_s64 +; CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 +entry: + %vqshl_n1 = tail call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> zeroinitializer) + ret <2 x i64> %vqshl_n1 +} + +declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>) + +define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) { +; CHECK-LABEL: test_vqshl_n_u8 +; CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0 +entry: + %vqshl_n = tail call <8 x i8> 
@llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer) + ret <8 x i8> %vqshl_n +} + +declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>) + +define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) { +; CHECK-LABEL: test_vqshlq_n_u8 +; CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0 +entry: + %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer) + ret <16 x i8> %vqshl_n +} + +declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>) + +define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) { +; CHECK-LABEL: test_vqshl_n_u16 +; CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0 +entry: + %vqshl_n1 = tail call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> zeroinitializer) + ret <4 x i16> %vqshl_n1 +} + +declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>) + +define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) { +; CHECK-LABEL: test_vqshlq_n_u16 +; CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0 +entry: + %vqshl_n1 = tail call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> zeroinitializer) + ret <8 x i16> %vqshl_n1 +} + +declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>) + +define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) { +; CHECK-LABEL: test_vqshl_n_u32 +; CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 +entry: + %vqshl_n1 = tail call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> zeroinitializer) + ret <2 x i32> %vqshl_n1 +} + +declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>) + +define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) { +; CHECK-LABEL: test_vqshlq_n_u32 +; CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 +entry: + %vqshl_n1 = tail call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> zeroinitializer) + ret <4 x i32> %vqshl_n1 +} + +declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>) + +define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) { +; 
CHECK-LABEL: test_vqshlq_n_u64 +; CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, +entry: + %vqshl_n1 = tail call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> zeroinitializer) + ret <2 x i64> %vqshl_n1 +} + +declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>) +declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>) diff --git a/test/CodeGen/AArch64/neon-select_cc.ll b/test/CodeGen/AArch64/neon-select_cc.ll new file mode 100644 index 0000000..f6b5d3c --- /dev/null +++ b/test/CodeGen/AArch64/neon-select_cc.ll @@ -0,0 +1,202 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + +define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) { +; CHECK-LABEL: test_select_cc_v8i8_i8: +; CHECK: and w0, w0, #0xff +; CHECK-NEXT: cmp w0, w1, uxtb +; CHECK-NEXT: csinv w0, wzr, wzr, ne +; CHECK-NEXT: dup v{{[0-9]+}}.8b, w0 +; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b + %cmp31 = icmp eq i8 %a, %b + %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d + ret <8x i8> %e +} + +define <8x i8> @test_select_cc_v8i8_f32(float %a, float %b, <8x i8> %c, <8x i8> %d ) { +; CHECK-LABEL: test_select_cc_v8i8_f32: +; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s +; CHECK-NEXT: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0] +; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b + %cmp31 = fcmp oeq float %a, %b + %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d + ret <8x i8> %e +} + +define <8x i8> @test_select_cc_v8i8_f64(double %a, double %b, <8x i8> %c, <8x i8> %d ) { +; CHECK-LABEL: test_select_cc_v8i8_f64: +; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d +; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b + %cmp31 = fcmp oeq double %a, %b + %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d + ret <8x i8> %e +} + +define <16x i8> @test_select_cc_v16i8_i8(i8 %a, i8 %b, <16x i8> %c, <16x i8> %d ) { +; CHECK-LABEL: test_select_cc_v16i8_i8: +; CHECK: and w0, 
w0, #0xff +; CHECK-NEXT: cmp w0, w1, uxtb +; CHECK-NEXT: csinv w0, wzr, wzr, ne +; CHECK-NEXT: dup v{{[0-9]+}}.16b, w0 +; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b + %cmp31 = icmp eq i8 %a, %b + %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d + ret <16x i8> %e +} + +define <16x i8> @test_select_cc_v16i8_f32(float %a, float %b, <16x i8> %c, <16x i8> %d ) { +; CHECK-LABEL: test_select_cc_v16i8_f32: +; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s +; CHECK-NEXT: dup v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0] +; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b + %cmp31 = fcmp oeq float %a, %b + %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d + ret <16x i8> %e +} + +define <16x i8> @test_select_cc_v16i8_f64(double %a, double %b, <16x i8> %c, <16x i8> %d ) { +; CHECK-LABEL: test_select_cc_v16i8_f64: +; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d +; CHECK-NEXT: dup v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0] +; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b + %cmp31 = fcmp oeq double %a, %b + %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d + ret <16x i8> %e +} + +define <4x i16> @test_select_cc_v4i16(i16 %a, i16 %b, <4x i16> %c, <4x i16> %d ) { +; CHECK-LABEL: test_select_cc_v4i16: +; CHECK: and w0, w0, #0xffff +; CHECK-NEXT: cmp w0, w1, uxth +; CHECK-NEXT: csinv w0, wzr, wzr, ne +; CHECK-NEXT: dup v{{[0-9]+}}.4h, w0 +; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b + %cmp31 = icmp eq i16 %a, %b + %e = select i1 %cmp31, <4x i16> %c, <4x i16> %d + ret <4x i16> %e +} + +define <8x i16> @test_select_cc_v8i16(i16 %a, i16 %b, <8x i16> %c, <8x i16> %d ) { +; CHECK-LABEL: test_select_cc_v8i16: +; CHECK: and w0, w0, #0xffff +; CHECK-NEXT: cmp w0, w1, uxth +; CHECK-NEXT: csinv w0, wzr, wzr, ne +; CHECK-NEXT: dup v{{[0-9]+}}.8h, w0 +; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b + %cmp31 = icmp eq i16 %a, %b + %e = select i1 %cmp31, <8x i16> %c, <8x i16> %d + ret <8x i16> %e +} + +define <2x i32> @test_select_cc_v2i32(i32 %a, i32 %b, <2x i32> %c, <2x i32> %d ) { +; CHECK-LABEL: test_select_cc_v2i32: 
+; CHECK: cmp w0, w1, uxtw +; CHECK-NEXT: csinv w0, wzr, wzr, ne +; CHECK-NEXT: dup v{{[0-9]+}}.2s, w0 +; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b + %cmp31 = icmp eq i32 %a, %b + %e = select i1 %cmp31, <2x i32> %c, <2x i32> %d + ret <2x i32> %e +} + +define <4x i32> @test_select_cc_v4i32(i32 %a, i32 %b, <4x i32> %c, <4x i32> %d ) { +; CHECK-LABEL: test_select_cc_v4i32: +; CHECK: cmp w0, w1, uxtw +; CHECK-NEXT: csinv w0, wzr, wzr, ne +; CHECK-NEXT: dup v{{[0-9]+}}.4s, w0 +; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b + %cmp31 = icmp eq i32 %a, %b + %e = select i1 %cmp31, <4x i32> %c, <4x i32> %d + ret <4x i32> %e +} + +define <1x i64> @test_select_cc_v1i64(i64 %a, i64 %b, <1x i64> %c, <1x i64> %d ) { +; CHECK-LABEL: test_select_cc_v1i64: +; CHECK: cmp x0, x1 +; CHECK-NEXT: csinv x0, xzr, xzr, ne +; CHECK-NEXT: fmov d{{[0-9]+}}, x0 +; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b + %cmp31 = icmp eq i64 %a, %b + %e = select i1 %cmp31, <1x i64> %c, <1x i64> %d + ret <1x i64> %e +} + +define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d ) { +; CHECK-LABEL: test_select_cc_v2i64: +; CHECK: cmp x0, x1 +; CHECK-NEXT: csinv x0, xzr, xzr, ne +; CHECK-NEXT: dup v{{[0-9]+}}.2d, x0 +; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b + %cmp31 = icmp eq i64 %a, %b + %e = select i1 %cmp31, <2x i64> %c, <2x i64> %d + ret <2x i64> %e +} + +define <1 x float> @test_select_cc_v1f32(float %a, float %b, <1 x float> %c, <1 x float> %d ) { +; CHECK-LABEL: test_select_cc_v1f32: +; CHECK: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, eq + %cmp31 = fcmp oeq float %a, %b + %e = select i1 %cmp31, <1 x float> %c, <1 x float> %d + ret <1 x float> %e +} + +define <2 x float> @test_select_cc_v2f32(float %a, float %b, <2 x float> %c, <2 x float> %d ) { +; CHECK-LABEL: test_select_cc_v2f32: +; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s +; CHECK-NEXT: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0] +; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b + %cmp31 = fcmp oeq float %a, %b + %e 
= select i1 %cmp31, <2 x float> %c, <2 x float> %d + ret <2 x float> %e +} + +define <4x float> @test_select_cc_v4f32(float %a, float %b, <4x float> %c, <4x float> %d ) { +; CHECK-LABEL: test_select_cc_v4f32: +; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s +; CHECK-NEXT: dup v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0] +; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b + %cmp31 = fcmp oeq float %a, %b + %e = select i1 %cmp31, <4x float> %c, <4x float> %d + ret <4x float> %e +} + +define <4x float> @test_select_cc_v4f32_icmp(i32 %a, i32 %b, <4x float> %c, <4x float> %d ) { +; CHECK-LABEL: test_select_cc_v4f32_icmp: +; CHECK: cmp w0, w1, uxtw +; CHECK: csinv w0, wzr, wzr, ne +; CHECK-NEXT: dup v{{[0-9]+}}.4s, w0 +; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b + %cmp31 = icmp eq i32 %a, %b + %e = select i1 %cmp31, <4x float> %c, <4x float> %d + ret <4x float> %e +} + +define <1 x double> @test_select_cc_v1f64(double %a, double %b, <1 x double> %c, <1 x double> %d ) { +; CHECK-LABEL: test_select_cc_v1f64: +; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d +; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b + %cmp31 = fcmp oeq double %a, %b + %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d + ret <1 x double> %e +} + +define <1 x double> @test_select_cc_v1f64_icmp(i64 %a, i64 %b, <1 x double> %c, <1 x double> %d ) { +; CHECK-LABEL: test_select_cc_v1f64_icmp: +; CHECK: cmp x0, x1 +; CHECK-NEXT: csinv x0, xzr, xzr, ne +; CHECK-NEXT: fmov d{{[0-9]+}}, x0 +; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b + %cmp31 = icmp eq i64 %a, %b + %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d + ret <1 x double> %e +} + +define <2 x double> @test_select_cc_v2f64(double %a, double %b, <2 x double> %c, <2 x double> %d ) { +; CHECK-LABEL: test_select_cc_v2f64: +; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d +; CHECK-NEXT: dup v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0] +; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b + %cmp31 = fcmp oeq double %a, %b + %e = select i1 %cmp31, <2 x double> %c, <2 x double> %d 
+ ret <2 x double> %e +} diff --git a/test/CodeGen/AArch64/neon-shift-left-long.ll b/test/CodeGen/AArch64/neon-shift-left-long.ll index d45c476..d10d551 100644 --- a/test/CodeGen/AArch64/neon-shift-left-long.ll +++ b/test/CodeGen/AArch64/neon-shift-left-long.ll @@ -191,3 +191,13 @@ define <2 x i64> @test_ushll2_shl0_v4i32(<4 x i32> %a) { %tmp = zext <2 x i32> %1 to <2 x i64> ret <2 x i64> %tmp } + +define <8 x i16> @test_ushll_cmp(<8 x i8> %a, <8 x i8> %b) #0 { +; CHECK: test_ushll_cmp: +; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK-NEXT: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0 + %cmp.i = icmp eq <8 x i8> %a, %b + %vcgtz.i.i = sext <8 x i1> %cmp.i to <8 x i8> + %vmovl.i.i.i = zext <8 x i8> %vcgtz.i.i to <8 x i16> + ret <8 x i16> %vmovl.i.i.i +} diff --git a/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll b/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll new file mode 100644 index 0000000..0b520d7 --- /dev/null +++ b/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll @@ -0,0 +1,333 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <8 x i8> @shl.v8i8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: shl.v8i8: +; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %c = shl <8 x i8> %a, %b + ret <8 x i8> %c +} + +define <4 x i16> @shl.v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: shl.v4i16: +; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %c = shl <4 x i16> %a, %b + ret <4 x i16> %c +} + +define <2 x i32> @shl.v2i32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: shl.v2i32: +; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %c = shl <2 x i32> %a, %b + ret <2 x i32> %c +} + +define <1 x i64> @shl.v1i64(<1 x i64> %a, <1 x i64> %b) { +; CHECK-LABEL: shl.v1i64: +; CHECK: ushl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %c = shl <1 x i64> %a, %b + ret <1 x i64> %c +} + +define <16 x i8> @shl.v16i8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: shl.v16i8: +; CHECK: ushl 
v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %c = shl <16 x i8> %a, %b + ret <16 x i8> %c +} + +define <8 x i16> @shl.v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: shl.v8i16: +; CHECK: ushl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h + %c = shl <8 x i16> %a, %b + ret <8 x i16> %c +} + +define <4 x i32> @shl.v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: shl.v4i32: +; CHECK: ushl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %c = shl <4 x i32> %a, %b + ret <4 x i32> %c +} + +define <2 x i64> @shl.v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: shl.v2i64: +; CHECK: ushl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %c = shl <2 x i64> %a, %b + ret <2 x i64> %c +} + +define <8 x i8> @lshr.v8i8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: lshr.v8i8: +; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %c = lshr <8 x i8> %a, %b + ret <8 x i8> %c +} + +define <4 x i16> @lshr.v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: lshr.v4i16: +; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %c = lshr <4 x i16> %a, %b + ret <4 x i16> %c +} + +define <2 x i32> @lshr.v2i32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: lshr.v2i32: +; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s +; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %c = lshr <2 x i32> %a, %b + ret <2 x i32> %c +} + +define <1 x i64> @lshr.v1i64(<1 x i64> %a, <1 x i64> %b) { +; CHECK-LABEL: lshr.v1i64: +; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: ushl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %c = lshr <1 x i64> %a, %b + ret <1 x i64> %c +} + +define <16 x i8> @lshr.v16i8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: lshr.v16i8: +; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +; CHECK: ushl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %c = lshr <16 x i8> %a, %b + ret <16 x i8> %c +} + +define <8 x i16> @lshr.v8i16(<8 x i16> %a, <8 x i16> %b) { +; 
CHECK-LABEL: lshr.v8i16: +; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +; CHECK: ushl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h + %c = lshr <8 x i16> %a, %b + ret <8 x i16> %c +} + +define <4 x i32> @lshr.v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: lshr.v4i32: +; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s +; CHECK: ushl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %c = lshr <4 x i32> %a, %b + ret <4 x i32> %c +} + +define <2 x i64> @lshr.v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: lshr.v2i64: +; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d +; CHECK: ushl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %c = lshr <2 x i64> %a, %b + ret <2 x i64> %c +} + +define <8 x i8> @ashr.v8i8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: ashr.v8i8: +; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +; CHECK: sshl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %c = ashr <8 x i8> %a, %b + ret <8 x i8> %c +} + +define <4 x i16> @ashr.v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: ashr.v4i16: +; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +; CHECK: sshl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %c = ashr <4 x i16> %a, %b + ret <4 x i16> %c +} + +define <2 x i32> @ashr.v2i32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: ashr.v2i32: +; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s +; CHECK: sshl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %c = ashr <2 x i32> %a, %b + ret <2 x i32> %c +} + +define <1 x i64> @ashr.v1i64(<1 x i64> %a, <1 x i64> %b) { +; CHECK-LABEL: ashr.v1i64: +; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: sshl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %c = ashr <1 x i64> %a, %b + ret <1 x i64> %c +} + +define <16 x i8> @ashr.v16i8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: ashr.v16i8: +; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b +; CHECK: sshl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %c = ashr <16 x i8> %a, %b + ret <16 x i8> %c +} + +define <8 x i16> @ashr.v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: ashr.v8i16: +; 
CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h +; CHECK: sshl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h + %c = ashr <8 x i16> %a, %b + ret <8 x i16> %c +} + +define <4 x i32> @ashr.v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: ashr.v4i32: +; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s +; CHECK: sshl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %c = ashr <4 x i32> %a, %b + ret <4 x i32> %c +} + +define <2 x i64> @ashr.v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: ashr.v2i64: +; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d +; CHECK: sshl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %c = ashr <2 x i64> %a, %b + ret <2 x i64> %c +} + +define <1 x i64> @shl.v1i64.0(<1 x i64> %a) { +; CHECK-LABEL: shl.v1i64.0: +; CHECK-NOT: shl d{{[0-9]+}}, d{{[0-9]+}}, #0 + %c = shl <1 x i64> %a, zeroinitializer + ret <1 x i64> %c +} + +define <2 x i32> @shl.v2i32.0(<2 x i32> %a) { +; CHECK-LABEL: shl.v2i32.0: +; CHECK-NOT: shl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #0 + %c = shl <2 x i32> %a, zeroinitializer + ret <2 x i32> %c +} + +; The following test cases test shl/ashr/lshr with v1i8/v1i16/v1i32 types + +define <1 x i8> @shl.v1i8(<1 x i8> %a, <1 x i8> %b) { +; CHECK-LABEL: shl.v1i8: +; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %c = shl <1 x i8> %a, %b + ret <1 x i8> %c +} + +define <1 x i16> @shl.v1i16(<1 x i16> %a, <1 x i16> %b) { +; CHECK-LABEL: shl.v1i16: +; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %c = shl <1 x i16> %a, %b + ret <1 x i16> %c +} + +define <1 x i32> @shl.v1i32(<1 x i32> %a, <1 x i32> %b) { +; CHECK-LABEL: shl.v1i32: +; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %c = shl <1 x i32> %a, %b + ret <1 x i32> %c +} + +define <1 x i8> @ashr.v1i8(<1 x i8> %a, <1 x i8> %b) { +; CHECK-LABEL: ashr.v1i8: +; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +; CHECK: sshl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %c = ashr <1 x i8> %a, %b + ret <1 x i8> %c +} + +define <1 x i16> @ashr.v1i16(<1 x i16> %a, <1 x i16> 
%b) { +; CHECK-LABEL: ashr.v1i16: +; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +; CHECK: sshl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %c = ashr <1 x i16> %a, %b + ret <1 x i16> %c +} + +define <1 x i32> @ashr.v1i32(<1 x i32> %a, <1 x i32> %b) { +; CHECK-LABEL: ashr.v1i32: +; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s +; CHECK: sshl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %c = ashr <1 x i32> %a, %b + ret <1 x i32> %c +} + +define <1 x i8> @lshr.v1i8(<1 x i8> %a, <1 x i8> %b) { +; CHECK-LABEL: lshr.v1i8: +; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b +; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %c = lshr <1 x i8> %a, %b + ret <1 x i8> %c +} + +define <1 x i16> @lshr.v1i16(<1 x i16> %a, <1 x i16> %b) { +; CHECK-LABEL: lshr.v1i16: +; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h +; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %c = lshr <1 x i16> %a, %b + ret <1 x i16> %c +} + +define <1 x i32> @lshr.v1i32(<1 x i32> %a, <1 x i32> %b) { +; CHECK-LABEL: lshr.v1i32: +; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s +; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %c = lshr <1 x i32> %a, %b + ret <1 x i32> %c +} + +define <1 x i8> @shl.v1i8.imm(<1 x i8> %a) { +; CHECK-LABEL: shl.v1i8.imm: +; CHECK: shl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3 + %c = shl <1 x i8> %a, + ret <1 x i8> %c +} + +define <1 x i16> @shl.v1i16.imm(<1 x i16> %a) { +; CHECK-LABEL: shl.v1i16.imm: +; CHECK: shl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #5 + %c = shl <1 x i16> %a, + ret <1 x i16> %c +} + +define <1 x i32> @shl.v1i32.imm(<1 x i32> %a) { +; CHECK-LABEL: shl.v1i32.imm: +; CHECK-NOT: shl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #0 + %c = shl <1 x i32> %a, zeroinitializer + ret <1 x i32> %c +} + +define <1 x i8> @ashr.v1i8.imm(<1 x i8> %a) { +; CHECK-LABEL: ashr.v1i8.imm: +; CHECK: sshr v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3 + %c = ashr <1 x i8> %a, + ret <1 x i8> %c +} + +define <1 x i16> @ashr.v1i16.imm(<1 x i16> %a) { +; CHECK-LABEL: ashr.v1i16.imm: +; CHECK: sshr 
v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #10 + %c = ashr <1 x i16> %a, + ret <1 x i16> %c +} + +define <1 x i32> @ashr.v1i32.imm(<1 x i32> %a) { +; CHECK-LABEL: ashr.v1i32.imm: +; CHECK: sshr v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #31 + %c = ashr <1 x i32> %a, + ret <1 x i32> %c +} + +define <1 x i8> @lshr.v1i8.imm(<1 x i8> %a) { +; CHECK-LABEL: lshr.v1i8.imm: +; CHECK: ushr v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3 + %c = lshr <1 x i8> %a, + ret <1 x i8> %c +} + +define <1 x i16> @lshr.v1i16.imm(<1 x i16> %a) { +; CHECK-LABEL: lshr.v1i16.imm: +; CHECK: ushr v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #10 + %c = lshr <1 x i16> %a, + ret <1 x i16> %c +} + +define <1 x i32> @lshr.v1i32.imm(<1 x i32> %a) { +; CHECK-LABEL: lshr.v1i32.imm: +; CHECK: ushr v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #31 + %c = lshr <1 x i32> %a, + ret <1 x i32> %c +} diff --git a/test/CodeGen/AArch64/neon-simd-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-ldst-one.ll index 3f28320..927c933 100644 --- a/test/CodeGen/AArch64/neon-simd-ldst-one.ll +++ b/test/CodeGen/AArch64/neon-simd-ldst-one.ll @@ -1,5 +1,8 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s +%struct.uint8x16x2_t = type { [2 x <16 x i8>] } +%struct.poly8x16x2_t = type { [2 x <16 x i8>] } +%struct.uint8x16x3_t = type { [3 x <16 x i8>] } %struct.int8x16x2_t = type { [2 x <16 x i8>] } %struct.int16x8x2_t = type { [2 x <8 x i16>] } %struct.int32x4x2_t = type { [2 x <4 x i32>] } @@ -37,6 +40,87 @@ %struct.float32x2x4_t = type { [4 x <2 x float>] } %struct.float64x1x4_t = type { [4 x <1 x double>] } +define <16 x i8> @test_ld_from_poll_v16i8(<16 x i8> %a) { +; CHECK-LABEL: test_ld_from_poll_v16i8 +; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} +; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] +entry: + %b = add <16 x i8> %a, + ret <16 x i8> %b +} + +define <8 x i16> @test_ld_from_poll_v8i16(<8 x i16> %a) { +; CHECK-LABEL: test_ld_from_poll_v8i16 +; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} +; CHECK-NEXT: ldr 
{{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] +entry: + %b = add <8 x i16> %a, + ret <8 x i16> %b +} + +define <4 x i32> @test_ld_from_poll_v4i32(<4 x i32> %a) { +; CHECK-LABEL: test_ld_from_poll_v4i32 +; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} +; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] +entry: + %b = add <4 x i32> %a, + ret <4 x i32> %b +} + +define <2 x i64> @test_ld_from_poll_v2i64(<2 x i64> %a) { +; CHECK-LABEL: test_ld_from_poll_v2i64 +; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} +; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] +entry: + %b = add <2 x i64> %a, + ret <2 x i64> %b +} + +define <4 x float> @test_ld_from_poll_v4f32(<4 x float> %a) { +; CHECK-LABEL: test_ld_from_poll_v4f32 +; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} +; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] +entry: + %b = fadd <4 x float> %a, + ret <4 x float> %b +} + +define <2 x double> @test_ld_from_poll_v2f64(<2 x double> %a) { +; CHECK-LABEL: test_ld_from_poll_v2f64 +; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} +; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] +entry: + %b = fadd <2 x double> %a, + ret <2 x double> %b +} + +define <8 x i8> @test_ld_from_poll_v8i8(<8 x i8> %a) { +; CHECK-LABEL: test_ld_from_poll_v8i8 +; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} +; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] +entry: + %b = add <8 x i8> %a, + ret <8 x i8> %b +} + +define <4 x i16> @test_ld_from_poll_v4i16(<4 x i16> %a) { +; CHECK-LABEL: test_ld_from_poll_v4i16 +; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} +; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] +entry: + %b = add <4 x i16> %a, + ret <4 x i16> %b +} + +define <2 x i32> @test_ld_from_poll_v2i32(<2 x i32> %a) { +; CHECK-LABEL: test_ld_from_poll_v2i32 +; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} +; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] +entry: + %b = add <2 x i32> %a, + ret <2 
x i32> %b +} + define <16 x i8> @test_vld1q_dup_s8(i8* %a) { ; CHECK-LABEL: test_vld1q_dup_s8 ; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0] @@ -155,6 +239,31 @@ entry: ret <1 x double> %1 } +define <1 x i64> @testDUP.v1i64(i64* %a, i64* %b) #0 { +; As there is a store operation depending on %1, LD1R pattern can't be selected. +; So LDR and FMOV should be emitted. +; CHECK-LABEL: testDUP.v1i64 +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}] +; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} +; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}] + %1 = load i64* %a, align 8 + store i64 %1, i64* %b, align 8 + %vecinit.i = insertelement <1 x i64> undef, i64 %1, i32 0 + ret <1 x i64> %vecinit.i +} + +define <1 x double> @testDUP.v1f64(double* %a, double* %b) #0 { +; As there is a store operation depending on %1, LD1R pattern can't be selected. +; So LDR and FMOV should be emitted. +; CHECK-LABEL: testDUP.v1f64 +; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}] +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] + %1 = load double* %a, align 8 + store double %1, double* %b, align 8 + %vecinit.i = insertelement <1 x double> undef, double %1, i32 0 + ret <1 x double> %vecinit.i +} + define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) { ; CHECK-LABEL: test_vld2q_dup_s8 ; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0] @@ -2110,4 +2219,81 @@ declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32) \ No newline at end of file +declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32) + +define %struct.int8x16x2_t 
@test_vld2q_lane_s8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) { +; CHECK-LABEL: test_vld2q_lane_s8 +; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0] +entry: + %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1 + %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1) + %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.int8x16x2_t %.fca.0.1.insert +} + +define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) { +; CHECK-LABEL: test_vld2q_lane_u8 +; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0] +entry: + %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1 + %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1) + %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.uint8x16x2_t %.fca.0.1.insert +} + +define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) { +; CHECK-LABEL: 
test_vld2q_lane_p8 +; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0] +entry: + %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1 + %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1) + %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.poly8x16x2_t %.fca.0.1.insert +} + +define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) { +; CHECK-LABEL: test_vld3q_lane_s8 +; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0] +entry: + %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1 + %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2 + %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1) + %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1 + 
%.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int8x16x3_t %.fca.0.2.insert +} + +define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) { +; CHECK-LABEL: test_vld3q_lane_u8 +; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0] +entry: + %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1 + %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2 + %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1) + %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.uint8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.uint8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.uint8x16x3_t %.fca.0.2.insert +} + diff --git a/test/CodeGen/AArch64/neon-simd-tbl.ll b/test/CodeGen/AArch64/neon-simd-tbl.ll index 8eac1e8..7a51c0f 100644 --- a/test/CodeGen/AArch64/neon-simd-tbl.ll +++ b/test/CodeGen/AArch64/neon-simd-tbl.ll @@ -1,45 +1,45 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -declare <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8>, <16 x i8>, <16 x 
i8>, <16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) -declare <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) -declare <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) -declare <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) -declare <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8>, <16 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) -declare <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8>, <16 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8>, <8 x i8>) -declare <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 
x i8>) -declare <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8>, <16 x i8>) -declare <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) -declare <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) { ; CHECK: test_vtbl1_s8: ; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b entry: %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> - %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b) ret <8 x i8> %vtbl11.i } @@ -47,7 +47,7 @@ define <8 x i8> @test_vqtbl1_s8(<16 x i8> %a, <8 x i8> %b) { ; CHECK: test_vqtbl1_s8: ; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b entry: - %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b) + %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b) ret <8 x i8> %vtbl1.i } @@ -58,7 +58,7 @@ entry: %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1 %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> 
%__a.coerce.fca.1.extract.i, <16 x i32> - %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b) ret <8 x i8> %vtbl17.i } @@ -68,7 +68,7 @@ define <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) { entry: %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 - %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b) + %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b) ret <8 x i8> %vtbl2.i } @@ -81,7 +81,7 @@ entry: %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2 %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> - %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b) + %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b) ret <8 x i8> %vtbl212.i } @@ -92,7 +92,7 @@ entry: %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 - %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b) + %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> 
%__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b) ret <8 x i8> %vtbl3.i } @@ -106,7 +106,7 @@ entry: %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3 %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> - %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b) + %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b) ret <8 x i8> %vtbl216.i } @@ -118,7 +118,7 @@ entry: %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 - %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b) + %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b) ret <8 x i8> %vtbl4.i } @@ -126,7 +126,7 @@ define <16 x i8> @test_vqtbl1q_s8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: test_vqtbl1q_s8: ; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b entry: - %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b) + %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b) ret <16 x i8> %vtbl1.i } @@ -136,7 +136,7 @@ define <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) { entry: %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] 
%a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 - %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b) + %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b) ret <16 x i8> %vtbl2.i } @@ -147,7 +147,7 @@ entry: %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 - %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b) + %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b) ret <16 x i8> %vtbl3.i } @@ -159,7 +159,7 @@ entry: %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 - %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b) + %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b) ret <16 x i8> %vtbl4.i } @@ -168,7 +168,7 @@ define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { ; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b entry: 
%vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> - %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c) + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c) %0 = icmp uge <8 x i8> %c, %1 = sext <8 x i1> %0 to <8 x i8> %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i) @@ -182,7 +182,7 @@ entry: %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1 %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> - %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c) + %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c) ret <8 x i8> %vtbx17.i } @@ -195,7 +195,7 @@ entry: %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2 %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> - %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c) + %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c) %0 = icmp uge <8 x i8> %c, %1 = sext <8 x i1> %0 to <8 x i8> %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i) @@ -212,7 +212,7 @@ entry: %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3 %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 
x i32> - %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c) + %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c) ret <8 x i8> %vtbx216.i } @@ -220,7 +220,7 @@ define <8 x i8> @test_vqtbx1_s8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) { ; CHECK: test_vqtbx1_s8: ; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b entry: - %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) + %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) ret <8 x i8> %vtbx1.i } @@ -230,7 +230,7 @@ define <8 x i8> @test_vqtbx2_s8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> entry: %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 - %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c) + %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c) ret <8 x i8> %vtbx2.i } @@ -241,7 +241,7 @@ entry: %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 - %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c) + %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c) 
ret <8 x i8> %vtbx3.i } @@ -253,7 +253,7 @@ entry: %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c) + %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c) ret <8 x i8> %vtbx4.i } @@ -261,7 +261,7 @@ define <16 x i8> @test_vqtbx1q_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ; CHECK: test_vqtbx1q_s8: ; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b entry: - %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) + %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) ret <16 x i8> %vtbx1.i } @@ -271,7 +271,7 @@ define <16 x i8> @test_vqtbx2q_s8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x entry: %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 - %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c) + %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c) ret <16 x i8> %vtbx2.i } @@ -282,7 +282,7 @@ entry: %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x 
i8>] %b.coerce, 1 %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 - %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c) + %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c) ret <16 x i8> %vtbx3.i } @@ -294,7 +294,7 @@ entry: %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c) + %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c) ret <16 x i8> %vtbx4.i } @@ -303,7 +303,7 @@ define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) { ; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b entry: %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> - %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b) ret <8 x i8> %vtbl11.i } @@ -311,7 +311,7 @@ define <8 x i8> @test_vqtbl1_u8(<16 x i8> %a, <8 x i8> %b) { ; CHECK: test_vqtbl1_u8: ; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b entry: - %vtbl1.i = tail call <8 x i8> 
@llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b) + %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b) ret <8 x i8> %vtbl1.i } @@ -322,7 +322,7 @@ entry: %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1 %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> - %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b) ret <8 x i8> %vtbl17.i } @@ -332,7 +332,7 @@ define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) { entry: %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 - %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b) + %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b) ret <8 x i8> %vtbl2.i } @@ -345,7 +345,7 @@ entry: %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2 %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> - %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b) + %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b) ret <8 x i8> %vtbl212.i } @@ -356,7 +356,7 @@ entry: %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x 
i8>] %a.coerce, 1 %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 - %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b) + %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b) ret <8 x i8> %vtbl3.i } @@ -370,7 +370,7 @@ entry: %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3 %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> - %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b) + %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b) ret <8 x i8> %vtbl216.i } @@ -382,7 +382,7 @@ entry: %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 - %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b) + %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b) ret <8 x i8> %vtbl4.i } @@ -390,7 +390,7 @@ define <16 x i8> @test_vqtbl1q_u8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: test_vqtbl1q_u8: ; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, 
{{v[0-9]+}}.16b entry: - %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b) + %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b) ret <16 x i8> %vtbl1.i } @@ -400,7 +400,7 @@ define <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) { entry: %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 - %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b) + %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b) ret <16 x i8> %vtbl2.i } @@ -411,7 +411,7 @@ entry: %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 - %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b) + %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b) ret <16 x i8> %vtbl3.i } @@ -423,7 +423,7 @@ entry: %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 - %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b) + %vtbl4.i = tail call 
<16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b) ret <16 x i8> %vtbl4.i } @@ -432,7 +432,7 @@ define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { ; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b entry: %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> - %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c) + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c) %0 = icmp uge <8 x i8> %c, %1 = sext <8 x i1> %0 to <8 x i8> %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i) @@ -446,7 +446,7 @@ entry: %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1 %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> - %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c) + %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c) ret <8 x i8> %vtbx17.i } @@ -459,7 +459,7 @@ entry: %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2 %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> - %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c) + %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c) %0 = icmp uge <8 x i8> %c, %1 = sext <8 x i1> %0 to <8 x i8> %vbsl.i = tail call 
<8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i) @@ -476,7 +476,7 @@ entry: %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3 %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> - %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c) + %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c) ret <8 x i8> %vtbx216.i } @@ -484,7 +484,7 @@ define <8 x i8> @test_vqtbx1_u8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) { ; CHECK: test_vqtbx1_u8: ; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b entry: - %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) + %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) ret <8 x i8> %vtbx1.i } @@ -494,7 +494,7 @@ define <8 x i8> @test_vqtbx2_u8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> entry: %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 - %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c) + %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c) ret <8 x i8> %vtbx2.i } @@ -505,7 +505,7 @@ entry: %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 - %vtbx3.i = tail 
call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c) + %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c) ret <8 x i8> %vtbx3.i } @@ -517,7 +517,7 @@ entry: %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c) + %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c) ret <8 x i8> %vtbx4.i } @@ -525,7 +525,7 @@ define <16 x i8> @test_vqtbx1q_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ; CHECK: test_vqtbx1q_u8: ; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b entry: - %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) + %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) ret <16 x i8> %vtbx1.i } @@ -535,7 +535,7 @@ define <16 x i8> @test_vqtbx2q_u8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x entry: %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 - %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, 
<16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c) + %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c) ret <16 x i8> %vtbx2.i } @@ -546,7 +546,7 @@ entry: %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 - %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c) + %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c) ret <16 x i8> %vtbx3.i } @@ -558,7 +558,7 @@ entry: %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c) + %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c) ret <16 x i8> %vtbx4.i } @@ -567,7 +567,7 @@ define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) { ; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b entry: %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> - %vtbl11.i = tail call <8 x i8> 
@llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b) ret <8 x i8> %vtbl11.i } @@ -575,7 +575,7 @@ define <8 x i8> @test_vqtbl1_p8(<16 x i8> %a, <8 x i8> %b) { ; CHECK: test_vqtbl1_p8: ; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b entry: - %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b) + %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b) ret <8 x i8> %vtbl1.i } @@ -586,7 +586,7 @@ entry: %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1 %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> - %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b) ret <8 x i8> %vtbl17.i } @@ -596,7 +596,7 @@ define <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) { entry: %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 - %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b) + %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b) ret <8 x i8> %vtbl2.i } @@ -609,7 +609,7 @@ entry: %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2 %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> - %vtbl212.i = tail call <8 
x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b) + %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b) ret <8 x i8> %vtbl212.i } @@ -620,7 +620,7 @@ entry: %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 - %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b) + %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b) ret <8 x i8> %vtbl3.i } @@ -634,7 +634,7 @@ entry: %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3 %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> - %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b) + %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b) ret <8 x i8> %vtbl216.i } @@ -646,7 +646,7 @@ entry: %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 - %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b) + %vtbl4.i = tail call <8 
x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b) ret <8 x i8> %vtbl4.i } @@ -654,7 +654,7 @@ define <16 x i8> @test_vqtbl1q_p8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: test_vqtbl1q_p8: ; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b entry: - %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b) + %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b) ret <16 x i8> %vtbl1.i } @@ -664,7 +664,7 @@ define <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) { entry: %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 - %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b) + %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b) ret <16 x i8> %vtbl2.i } @@ -675,7 +675,7 @@ entry: %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 - %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b) + %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b) ret <16 x i8> %vtbl3.i } @@ -687,7 +687,7 @@ entry: %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 
%__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 - %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b) + %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b) ret <16 x i8> %vtbl4.i } @@ -696,7 +696,7 @@ define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { ; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b entry: %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> - %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c) + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c) %0 = icmp uge <8 x i8> %c, %1 = sext <8 x i1> %0 to <8 x i8> %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i) @@ -710,7 +710,7 @@ entry: %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1 %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> - %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c) + %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c) ret <8 x i8> %vtbx17.i } @@ -723,7 +723,7 @@ entry: %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2 %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> %vtbl211.i = 
shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> - %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c) + %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c) %0 = icmp uge <8 x i8> %c, %1 = sext <8 x i1> %0 to <8 x i8> %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i) @@ -740,7 +740,7 @@ entry: %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3 %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> - %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c) + %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c) ret <8 x i8> %vtbx216.i } @@ -748,7 +748,7 @@ define <8 x i8> @test_vqtbx1_p8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) { ; CHECK: test_vqtbx1_p8: ; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b entry: - %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) + %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) ret <8 x i8> %vtbx1.i } @@ -758,7 +758,7 @@ define <8 x i8> @test_vqtbx2_p8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> entry: %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 - %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c) + %vtbx2.i = tail call <8 x i8> 
@llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c) ret <8 x i8> %vtbx2.i } @@ -769,7 +769,7 @@ entry: %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 - %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c) + %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c) ret <8 x i8> %vtbx3.i } @@ -781,7 +781,7 @@ entry: %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c) + %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c) ret <8 x i8> %vtbx4.i } @@ -789,7 +789,7 @@ define <16 x i8> @test_vqtbx1q_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ; CHECK: test_vqtbx1q_p8: ; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b entry: - %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) + %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> 
%b, <16 x i8> %c) ret <16 x i8> %vtbx1.i } @@ -799,7 +799,7 @@ define <16 x i8> @test_vqtbx2q_p8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x entry: %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 - %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c) + %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c) ret <16 x i8> %vtbx2.i } @@ -810,7 +810,7 @@ entry: %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 - %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c) + %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c) ret <16 x i8> %vtbx3.i } @@ -822,7 +822,7 @@ entry: %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c) + %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> 
%__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c) ret <16 x i8> %vtbx4.i } diff --git a/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll b/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll new file mode 100644 index 0000000..bb3300e --- /dev/null +++ b/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll @@ -0,0 +1,30 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +; This file tests the spill of FPR8/FPR16. The volatile loads/stores force the +; allocator to keep the value live until it's needed. + +%bigtype_v1i8 = type [20 x <1 x i8>] + +define void @spill_fpr8(%bigtype_v1i8* %addr) { +; CHECK-LABEL: spill_fpr8: +; CHECK: 1-byte Folded Spill +; CHECK: 1-byte Folded Reload + %val1 = load volatile %bigtype_v1i8* %addr + %val2 = load volatile %bigtype_v1i8* %addr + store volatile %bigtype_v1i8 %val1, %bigtype_v1i8* %addr + store volatile %bigtype_v1i8 %val2, %bigtype_v1i8* %addr + ret void +} + +%bigtype_v1i16 = type [20 x <1 x i16>] + +define void @spill_fpr16(%bigtype_v1i16* %addr) { +; CHECK-LABEL: spill_fpr16: +; CHECK: 2-byte Folded Spill +; CHECK: 2-byte Folded Reload + %val1 = load volatile %bigtype_v1i16* %addr + %val2 = load volatile %bigtype_v1i16* %addr + store volatile %bigtype_v1i16 %val1, %bigtype_v1i16* %addr + store volatile %bigtype_v1i16 %val2, %bigtype_v1i16* %addr + ret void +} \ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-truncStore-extLoad.ll b/test/CodeGen/AArch64/neon-truncStore-extLoad.ll new file mode 100644 index 0000000..e5b7694 --- /dev/null +++ b/test/CodeGen/AArch64/neon-truncStore-extLoad.ll @@ -0,0 +1,57 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +; A vector TruncStore can not be selected. +; Test a trunc IR and a vector store IR can be selected correctly. 
+define void @truncStore.v2i64(<2 x i64> %a, <2 x i32>* %result) { +; CHECK-LABEL: truncStore.v2i64: +; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d +; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %b = trunc <2 x i64> %a to <2 x i32> + store <2 x i32> %b, <2 x i32>* %result + ret void +} + +define void @truncStore.v4i32(<4 x i32> %a, <4 x i16>* %result) { +; CHECK-LABEL: truncStore.v4i32: +; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s +; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] + %b = trunc <4 x i32> %a to <4 x i16> + store <4 x i16> %b, <4 x i16>* %result + ret void +} + +define void @truncStore.v8i16(<8 x i16> %a, <8 x i8>* %result) { +; CHECK-LABEL: truncStore.v8i16: +; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h +; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] + %b = trunc <8 x i16> %a to <8 x i8> + store <8 x i8> %b, <8 x i8>* %result + ret void +} + +; A vector LoadExt can not be selected. +; Test a vector load IR and a sext/zext IR can be selected correctly. +define <4 x i32> @loadSExt.v4i8(<4 x i8>* %ref) { +; CHECK-LABEL: loadSExt.v4i8: +; CHECK: ldrsb + %a = load <4 x i8>* %ref + %conv = sext <4 x i8> %a to <4 x i32> + ret <4 x i32> %conv +} + +define <4 x i32> @loadZExt.v4i8(<4 x i8>* %ref) { +; CHECK-LABEL: loadZExt.v4i8: +; CHECK: ldrb + %a = load <4 x i8>* %ref + %conv = zext <4 x i8> %a to <4 x i32> + ret <4 x i32> %conv +} + +define i32 @loadExt.i32(<4 x i8>* %ref) { +; CHECK-LABEL: loadExt.i32: +; CHECK: ldrb + %a = load <4 x i8>* %ref + %vecext = extractelement <4 x i8> %a, i32 0 + %conv = zext i8 %vecext to i32 + ret i32 %conv +} \ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-v1i1-setcc.ll b/test/CodeGen/AArch64/neon-v1i1-setcc.ll new file mode 100644 index 0000000..6c7d009 --- /dev/null +++ b/test/CodeGen/AArch64/neon-v1i1-setcc.ll @@ -0,0 +1,68 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + +; This file tests the DAG node like "v1i1 SETCC v1i64,
v1i64". As the v1i1 type +; is illegal in AArch64 backend, the legalizer tries to scalarize this node. +; As the v1i64 operands of SETCC are legal types, they will not be scalarized. +; Currently the type legalizer will have an assertion failure as it assumes all +; operands of SETCC have been legalized. +; FIXME: If the algorithm of type scalarization is improved and can legalize +; "v1i1 SETCC" correctly, these test cases are not needed. + +define i64 @test_sext_extr_cmp_0(<1 x i64> %v1, <1 x i64> %v2) { +; CHECK-LABEL: test_sext_extr_cmp_0: +; CHECK: cmge d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = icmp sge <1 x i64> %v1, %v2 + %2 = extractelement <1 x i1> %1, i32 0 + %vget_lane = sext i1 %2 to i64 + ret i64 %vget_lane +} + +define i64 @test_sext_extr_cmp_1(<1 x double> %v1, <1 x double> %v2) { +; CHECK-LABEL: test_sext_extr_cmp_1: +; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = fcmp oeq <1 x double> %v1, %v2 + %2 = extractelement <1 x i1> %1, i32 0 + %vget_lane = sext i1 %2 to i64 + ret i64 %vget_lane +} + +define <1 x i64> @test_select_v1i1_0(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) { +; CHECK-LABEL: test_select_v1i1_0: +; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %1 = icmp eq <1 x i64> %v1, %v2 + %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3 + ret <1 x i64> %res +} + +define <1 x i64> @test_select_v1i1_1(<1 x double> %v1, <1 x double> %v2, <1 x i64> %v3) { +; CHECK-LABEL: test_select_v1i1_1: +; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %1 = fcmp oeq <1 x double> %v1, %v2 + %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3 + ret <1 x i64> %res +} + +define <1 x double> @test_select_v1i1_2(<1 x i64> %v1, <1 x i64> %v2, <1 x double> %v3) { +; CHECK-LABEL: test_select_v1i1_2: +; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: bsl v{{[0-9]+}}.8b,
v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %1 = icmp eq <1 x i64> %v1, %v2 + %res = select <1 x i1> %1, <1 x double> zeroinitializer, <1 x double> %v3 + ret <1 x double> %res +} + +define i32 @test_br_extr_cmp(<1 x i64> %v1, <1 x i64> %v2) { +; CHECK-LABEL: test_br_extr_cmp: +; CHECK: cmp x{{[0-9]+}}, x{{[0-9]+}} + %1 = icmp eq <1 x i64> %v1, %v2 + %2 = extractelement <1 x i1> %1, i32 0 + br i1 %2, label %if.end, label %if.then + +if.then: + ret i32 0; + +if.end: + ret i32 1; +} diff --git a/test/CodeGen/AArch64/neon-vector-list-spill.ll b/test/CodeGen/AArch64/neon-vector-list-spill.ll new file mode 100644 index 0000000..3ab69c4 --- /dev/null +++ b/test/CodeGen/AArch64/neon-vector-list-spill.ll @@ -0,0 +1,175 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast + +; FIXME: We should not generate ld/st for such register spill/fill, because the +; test case seems very simple and the register pressure is not high. If the +; spill/fill algorithm is optimized, this test case may not be triggered. And +; then we can delete it. 
+define i32 @spill.DPairReg(i8* %arg1, i32 %arg2) { +; CHECK-LABEL: spill.DPairReg: +; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] +; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] +entry: + %vld = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %arg1, i32 4) + %cmp = icmp eq i32 %arg2, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @foo() + br label %if.end + +if.end: + %vld.extract = extractvalue { <2 x i32>, <2 x i32> } %vld, 0 + %res = extractelement <2 x i32> %vld.extract, i32 1 + ret i32 %res +} + +define i16 @spill.DTripleReg(i8* %arg1, i32 %arg2) { +; CHECK-LABEL: spill.DTripleReg: +; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] +; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] +entry: + %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %arg1, i32 4) + %cmp = icmp eq i32 %arg2, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @foo() + br label %if.end + +if.end: + %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0 + %res = extractelement <4 x i16> %vld.extract, i32 1 + ret i16 %res +} + +define i16 @spill.DQuadReg(i8* %arg1, i32 %arg2) { +; CHECK-LABEL: spill.DQuadReg: +; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] +; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] +entry: + %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %arg1, i32 4) + %cmp = icmp eq i32 %arg2, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @foo() 
+ br label %if.end + +if.end: + %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0 + %res = extractelement <4 x i16> %vld.extract, i32 0 + ret i16 %res +} + +define i32 @spill.QPairReg(i8* %arg1, i32 %arg2) { +; CHECK-LABEL: spill.QPairReg: +; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] +; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] +entry: + %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %arg1, i32 4) + %cmp = icmp eq i32 %arg2, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @foo() + br label %if.end + +if.end: + %vld.extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0 + %res = extractelement <4 x i32> %vld.extract, i32 1 + ret i32 %res +} + +define float @spill.QTripleReg(i8* %arg1, i32 %arg2) { +; CHECK-LABEL: spill.QTripleReg: +; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] +; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] +entry: + %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %arg1, i32 4) + %cmp = icmp eq i32 %arg2, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @foo() + br label %if.end + +if.end: + %vld3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0 + %res = extractelement <4 x float> %vld3.extract, i32 1 + ret float %res +} + +define i8 @spill.QQuadReg(i8* %arg1, i32 %arg2) { +; CHECK-LABEL: spill.QQuadReg: +; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] +; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] +entry: + %vld =
tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %arg1, i32 4) + %cmp = icmp eq i32 %arg2, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @foo() + br label %if.end + +if.end: + %vld.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld, 0 + %res = extractelement <16 x i8> %vld.extract, i32 1 + ret i8 %res +} + +declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32) +declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32) +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32) + +declare void @foo() + +; FIXME: We should not generate ld/st for such register spill/fill, because the +; test case seems very simple and the register pressure is not high. If the +; spill/fill algorithm is optimized, this test case may not be triggered. And +; then we can delete it. 
+; check the spill for Register Class QPair_with_qsub_0_in_FPR128Lo +define <8 x i16> @test_2xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) { + tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8) + tail call void @foo() + %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> + %1 = bitcast <2 x i64> %sv to <8 x i16> + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> + %3 = mul <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; check the spill for Register Class QTriple_with_qsub_0_in_FPR128Lo +define <8 x i16> @test_3xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) { + tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8) + tail call void @foo() + %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> + %1 = bitcast <2 x i64> %sv to <8 x i16> + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> + %3 = mul <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; check the spill for Register Class QQuad_with_qsub_0_in_FPR128Lo +define <8 x i16> @test_4xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) { + tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8) + tail call void @foo() + %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> + %1 = bitcast <2 x i64> %sv to <8 x i16> + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> + %3 = mul <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) \ No newline at end of file diff --git a/test/CodeGen/AArch64/pic-eh-stubs.ll 
b/test/CodeGen/AArch64/pic-eh-stubs.ll index 6ec4b19..3404d3f 100644 --- a/test/CodeGen/AArch64/pic-eh-stubs.ll +++ b/test/CodeGen/AArch64/pic-eh-stubs.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s ; Make sure exception-handling PIC code can be linked correctly. An alternative ; to the sequence described below would have .gcc_except_table itself writable @@ -10,8 +11,8 @@ ; ... referring indirectly to stubs for its typeinfo ... ; CHECK: // @TType Encoding = indirect pcrel sdata8 ; ... one of which is "int"'s typeinfo -; CHECK: .Ltmp9: -; CHECK-NEXT: .xword .L_ZTIi.DW.stub-.Ltmp9 +; CHECK: .Ltmp7: +; CHECK-NEXT: .xword .L_ZTIi.DW.stub-.Ltmp7 ; .. and which is properly defined (in a writable section for the dynamic loader) later. ; CHECK: .section .data.rel,"aw" diff --git a/test/CodeGen/AArch64/ragreedy-csr.ll b/test/CodeGen/AArch64/ragreedy-csr.ll new file mode 100644 index 0000000..18a948b --- /dev/null +++ b/test/CodeGen/AArch64/ragreedy-csr.ll @@ -0,0 +1,297 @@ +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -regalloc=greedy -regalloc-csr-first-time-cost=15 | FileCheck %s + +; This testing case is reduced from 197.parser prune_match function. +; We make sure that we do not use callee-saved registers (x19 to x25). 
+; rdar://16162005 + +; CHECK-LABEL: prune_match: +; CHECK: entry +; CHECK: str x30, [sp +; CHECK-NOT: stp x25, +; CHECK-NOT: stp x23, x24 +; CHECK-NOT: stp x21, x22 +; CHECK-NOT: stp x19, x20 +; CHECK: if.end +; CHECK: return +; CHECK: ldr x30, [sp +; CHECK-NOT: ldp x19, x20 +; CHECK-NOT: ldp x21, x22 +; CHECK-NOT: ldp x23, x24 +; CHECK-NOT: ldp x25, + +%struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* } +%struct.Connector_struct = type { i16, i16, i8, i8, %struct.Connector_struct*, i8* } +%struct._RuneLocale = type { [8 x i8], [32 x i8], i32 (i8*, i64, i8**)*, i32 (i32, i8*, i64, i8**)*, i32, [256 x i32], [256 x i32], [256 x i32], %struct._RuneRange, %struct._RuneRange, %struct._RuneRange, i8*, i32, i32, %struct._RuneCharClass* } +%struct._RuneRange = type { i32, %struct._RuneEntry* } +%struct._RuneEntry = type { i32, i32, i32, i32* } +%struct._RuneCharClass = type { [14 x i8], i32 } +%struct.Exp_struct = type { i8, i8, i8, i8, %union.anon } +%union.anon = type { %struct.E_list_struct* } +%struct.E_list_struct = type { %struct.E_list_struct*, %struct.Exp_struct* } +%struct.domain_struct = type { i8*, i32, %struct.List_o_links_struct*, i32, i32, %struct.d_tree_leaf_struct*, %struct.domain_struct* } +%struct.d_tree_leaf_struct = type { %struct.domain_struct*, i32, %struct.d_tree_leaf_struct* } +@_DefaultRuneLocale = external global %struct._RuneLocale +declare i32 @__maskrune(i32, i64) #7 +define fastcc i32 @prune_match(%struct.Connector_struct* nocapture readonly %a, %struct.Connector_struct* nocapture readonly %b) #9 { +entry: + %label56 = bitcast %struct.Connector_struct* %a to i16* + %0 = load i16* %label56, align 2 + %label157 = bitcast %struct.Connector_struct* %b to i16* + %1 = load i16* %label157, align 2 + %cmp = icmp eq i16 %0, %1 + br i1 %cmp, label %if.end, label %return, !prof !988 +if.end: + %priority = getelementptr inbounds %struct.Connector_struct* %a, i64 0, i32 2 + %2 = load i8* %priority, align 1 + %priority5 = 
getelementptr inbounds %struct.Connector_struct* %b, i64 0, i32 2 + %3 = load i8* %priority5, align 1 + %string = getelementptr inbounds %struct.Connector_struct* %a, i64 0, i32 5 + %4 = load i8** %string, align 8 + %string7 = getelementptr inbounds %struct.Connector_struct* %b, i64 0, i32 5 + %5 = load i8** %string7, align 8 + br label %while.cond +while.cond: + %lsr.iv27 = phi i64 [ %lsr.iv.next28, %if.end17 ], [ 0, %if.end ] + %scevgep55 = getelementptr i8* %4, i64 %lsr.iv27 + %6 = load i8* %scevgep55, align 1 + %idxprom.i.i = sext i8 %6 to i64 + %isascii.i.i224 = icmp sgt i8 %6, -1 + br i1 %isascii.i.i224, label %cond.true.i.i, label %cond.false.i.i, !prof !181 +cond.true.i.i: + %arrayidx.i.i = getelementptr inbounds %struct._RuneLocale* @_DefaultRuneLocale, i64 0, i32 5, i64 %idxprom.i.i + %7 = load i32* %arrayidx.i.i, align 4 + %and.i.i = and i32 %7, 32768 + br label %isupper.exit +cond.false.i.i: + %8 = trunc i64 %idxprom.i.i to i8 + %conv8 = sext i8 %8 to i32 + %call3.i.i = tail call i32 @__maskrune(i32 %conv8, i64 32768) #3 + br label %isupper.exit +isupper.exit: + %tobool1.sink.i.in.i = phi i32 [ %and.i.i, %cond.true.i.i ], [ %call3.i.i, %cond.false.i.i ] + %tobool1.sink.i.i = icmp eq i32 %tobool1.sink.i.in.i, 0 + br i1 %tobool1.sink.i.i, label %lor.rhs, label %while.body, !prof !989 +lor.rhs: + %sunkaddr = ptrtoint i8* %5 to i64 + %sunkaddr58 = add i64 %sunkaddr, %lsr.iv27 + %sunkaddr59 = inttoptr i64 %sunkaddr58 to i8* + %9 = load i8* %sunkaddr59, align 1 + %idxprom.i.i214 = sext i8 %9 to i64 + %isascii.i.i213225 = icmp sgt i8 %9, -1 + br i1 %isascii.i.i213225, label %cond.true.i.i217, label %cond.false.i.i219, !prof !181 +cond.true.i.i217: + %arrayidx.i.i215 = getelementptr inbounds %struct._RuneLocale* @_DefaultRuneLocale, i64 0, i32 5, i64 %idxprom.i.i214 + %10 = load i32* %arrayidx.i.i215, align 4 + %and.i.i216 = and i32 %10, 32768 + br label %isupper.exit223 +cond.false.i.i219: + %11 = trunc i64 %idxprom.i.i214 to i8 + %conv9 = sext i8 %11 to i32 + 
%call3.i.i218 = tail call i32 @__maskrune(i32 %conv9, i64 32768) #3 + br label %isupper.exit223 +isupper.exit223: + %tobool1.sink.i.in.i220 = phi i32 [ %and.i.i216, %cond.true.i.i217 ], [ %call3.i.i218, %cond.false.i.i219 ] + %tobool1.sink.i.i221 = icmp eq i32 %tobool1.sink.i.in.i220, 0 + br i1 %tobool1.sink.i.i221, label %while.end, label %while.body, !prof !990 +while.body: + %sunkaddr60 = ptrtoint i8* %4 to i64 + %sunkaddr61 = add i64 %sunkaddr60, %lsr.iv27 + %sunkaddr62 = inttoptr i64 %sunkaddr61 to i8* + %12 = load i8* %sunkaddr62, align 1 + %sunkaddr63 = ptrtoint i8* %5 to i64 + %sunkaddr64 = add i64 %sunkaddr63, %lsr.iv27 + %sunkaddr65 = inttoptr i64 %sunkaddr64 to i8* + %13 = load i8* %sunkaddr65, align 1 + %cmp14 = icmp eq i8 %12, %13 + br i1 %cmp14, label %if.end17, label %return, !prof !991 +if.end17: + %lsr.iv.next28 = add i64 %lsr.iv27, 1 + br label %while.cond +while.end: + %14 = or i8 %3, %2 + %15 = icmp eq i8 %14, 0 + br i1 %15, label %if.then23, label %if.else88, !prof !992 +if.then23: + %sunkaddr66 = ptrtoint %struct.Connector_struct* %a to i64 + %sunkaddr67 = add i64 %sunkaddr66, 16 + %sunkaddr68 = inttoptr i64 %sunkaddr67 to i8** + %16 = load i8** %sunkaddr68, align 8 + %17 = load i8* %16, align 1 + %cmp26 = icmp eq i8 %17, 83 + %sunkaddr69 = ptrtoint i8* %4 to i64 + %sunkaddr70 = add i64 %sunkaddr69, %lsr.iv27 + %sunkaddr71 = inttoptr i64 %sunkaddr70 to i8* + %18 = load i8* %sunkaddr71, align 1 + br i1 %cmp26, label %land.lhs.true28, label %while.cond59.preheader, !prof !993 +land.lhs.true28: + switch i8 %18, label %land.rhs.preheader [ + i8 112, label %land.lhs.true35 + i8 0, label %return + ], !prof !994 +land.lhs.true35: + %sunkaddr72 = ptrtoint i8* %5 to i64 + %sunkaddr73 = add i64 %sunkaddr72, %lsr.iv27 + %sunkaddr74 = inttoptr i64 %sunkaddr73 to i8* + %19 = load i8* %sunkaddr74, align 1 + switch i8 %19, label %land.rhs.preheader [ + i8 112, label %land.lhs.true43 + ], !prof !995 +land.lhs.true43: + %20 = ptrtoint i8* %16 to i64 + %21 = 
sub i64 0, %20 + %scevgep52 = getelementptr i8* %4, i64 %21 + %scevgep53 = getelementptr i8* %scevgep52, i64 %lsr.iv27 + %scevgep54 = getelementptr i8* %scevgep53, i64 -1 + %cmp45 = icmp eq i8* %scevgep54, null + br i1 %cmp45, label %return, label %lor.lhs.false47, !prof !996 +lor.lhs.false47: + %22 = ptrtoint i8* %16 to i64 + %23 = sub i64 0, %22 + %scevgep47 = getelementptr i8* %4, i64 %23 + %scevgep48 = getelementptr i8* %scevgep47, i64 %lsr.iv27 + %scevgep49 = getelementptr i8* %scevgep48, i64 -2 + %cmp50 = icmp eq i8* %scevgep49, null + br i1 %cmp50, label %land.lhs.true52, label %while.cond59.preheader, !prof !997 +land.lhs.true52: + %sunkaddr75 = ptrtoint i8* %4 to i64 + %sunkaddr76 = add i64 %sunkaddr75, %lsr.iv27 + %sunkaddr77 = add i64 %sunkaddr76, -1 + %sunkaddr78 = inttoptr i64 %sunkaddr77 to i8* + %24 = load i8* %sunkaddr78, align 1 + %cmp55 = icmp eq i8 %24, 73 + %cmp61233 = icmp eq i8 %18, 0 + %or.cond265 = or i1 %cmp55, %cmp61233 + br i1 %or.cond265, label %return, label %land.rhs.preheader, !prof !998 +while.cond59.preheader: + %cmp61233.old = icmp eq i8 %18, 0 + br i1 %cmp61233.old, label %return, label %land.rhs.preheader, !prof !999 +land.rhs.preheader: + %scevgep33 = getelementptr i8* %5, i64 %lsr.iv27 + %scevgep43 = getelementptr i8* %4, i64 %lsr.iv27 + br label %land.rhs +land.rhs: + %lsr.iv = phi i64 [ 0, %land.rhs.preheader ], [ %lsr.iv.next, %if.then83 ] + %25 = phi i8 [ %27, %if.then83 ], [ %18, %land.rhs.preheader ] + %scevgep34 = getelementptr i8* %scevgep33, i64 %lsr.iv + %26 = load i8* %scevgep34, align 1 + %cmp64 = icmp eq i8 %26, 0 + br i1 %cmp64, label %return, label %while.body66, !prof !1000 +while.body66: + %cmp68 = icmp eq i8 %25, 42 + %cmp72 = icmp eq i8 %26, 42 + %or.cond = or i1 %cmp68, %cmp72 + br i1 %or.cond, label %if.then83, label %lor.lhs.false74, !prof !1001 +lor.lhs.false74: + %cmp77 = icmp ne i8 %25, %26 + %cmp81 = icmp eq i8 %25, 94 + %or.cond208 = or i1 %cmp77, %cmp81 + br i1 %or.cond208, label %return, label 
%if.then83, !prof !1002 +if.then83: + %scevgep44 = getelementptr i8* %scevgep43, i64 %lsr.iv + %scevgep45 = getelementptr i8* %scevgep44, i64 1 + %27 = load i8* %scevgep45, align 1 + %cmp61 = icmp eq i8 %27, 0 + %lsr.iv.next = add i64 %lsr.iv, 1 + br i1 %cmp61, label %return, label %land.rhs, !prof !999 +if.else88: + %cmp89 = icmp eq i8 %2, 1 + %cmp92 = icmp eq i8 %3, 2 + %or.cond159 = and i1 %cmp89, %cmp92 + br i1 %or.cond159, label %while.cond95.preheader, label %if.else123, !prof !1003 +while.cond95.preheader: + %sunkaddr79 = ptrtoint i8* %4 to i64 + %sunkaddr80 = add i64 %sunkaddr79, %lsr.iv27 + %sunkaddr81 = inttoptr i64 %sunkaddr80 to i8* + %28 = load i8* %sunkaddr81, align 1 + %cmp97238 = icmp eq i8 %28, 0 + br i1 %cmp97238, label %return, label %land.rhs99.preheader, !prof !1004 +land.rhs99.preheader: + %scevgep31 = getelementptr i8* %5, i64 %lsr.iv27 + %scevgep40 = getelementptr i8* %4, i64 %lsr.iv27 + br label %land.rhs99 +land.rhs99: + %lsr.iv17 = phi i64 [ 0, %land.rhs99.preheader ], [ %lsr.iv.next18, %if.then117 ] + %29 = phi i8 [ %31, %if.then117 ], [ %28, %land.rhs99.preheader ] + %scevgep32 = getelementptr i8* %scevgep31, i64 %lsr.iv17 + %30 = load i8* %scevgep32, align 1 + %cmp101 = icmp eq i8 %30, 0 + br i1 %cmp101, label %return, label %while.body104, !prof !1005 +while.body104: + %cmp107 = icmp eq i8 %29, %30 + %cmp111 = icmp eq i8 %29, 42 + %or.cond209 = or i1 %cmp107, %cmp111 + %cmp115 = icmp eq i8 %30, 94 + %or.cond210 = or i1 %or.cond209, %cmp115 + br i1 %or.cond210, label %if.then117, label %return, !prof !1006 +if.then117: + %scevgep41 = getelementptr i8* %scevgep40, i64 %lsr.iv17 + %scevgep42 = getelementptr i8* %scevgep41, i64 1 + %31 = load i8* %scevgep42, align 1 + %cmp97 = icmp eq i8 %31, 0 + %lsr.iv.next18 = add i64 %lsr.iv17, 1 + br i1 %cmp97, label %return, label %land.rhs99, !prof !1004 +if.else123: + %cmp124 = icmp eq i8 %3, 1 + %cmp127 = icmp eq i8 %2, 2 + %or.cond160 = and i1 %cmp124, %cmp127 + br i1 %or.cond160, label 
%while.cond130.preheader, label %return, !prof !1007 +while.cond130.preheader: + %sunkaddr82 = ptrtoint i8* %4 to i64 + %sunkaddr83 = add i64 %sunkaddr82, %lsr.iv27 + %sunkaddr84 = inttoptr i64 %sunkaddr83 to i8* + %32 = load i8* %sunkaddr84, align 1 + %cmp132244 = icmp eq i8 %32, 0 + br i1 %cmp132244, label %return, label %land.rhs134.preheader, !prof !1008 +land.rhs134.preheader: + %scevgep29 = getelementptr i8* %5, i64 %lsr.iv27 + %scevgep37 = getelementptr i8* %4, i64 %lsr.iv27 + br label %land.rhs134 +land.rhs134: + %lsr.iv22 = phi i64 [ 0, %land.rhs134.preheader ], [ %lsr.iv.next23, %if.then152 ] + %33 = phi i8 [ %35, %if.then152 ], [ %32, %land.rhs134.preheader ] + %scevgep30 = getelementptr i8* %scevgep29, i64 %lsr.iv22 + %34 = load i8* %scevgep30, align 1 + %cmp136 = icmp eq i8 %34, 0 + br i1 %cmp136, label %return, label %while.body139, !prof !1009 +while.body139: + %cmp142 = icmp eq i8 %33, %34 + %cmp146 = icmp eq i8 %34, 42 + %or.cond211 = or i1 %cmp142, %cmp146 + %cmp150 = icmp eq i8 %33, 94 + %or.cond212 = or i1 %or.cond211, %cmp150 + br i1 %or.cond212, label %if.then152, label %return, !prof !1010 +if.then152: + %scevgep38 = getelementptr i8* %scevgep37, i64 %lsr.iv22 + %scevgep39 = getelementptr i8* %scevgep38, i64 1 + %35 = load i8* %scevgep39, align 1 + %cmp132 = icmp eq i8 %35, 0 + %lsr.iv.next23 = add i64 %lsr.iv22, 1 + br i1 %cmp132, label %return, label %land.rhs134, !prof !1008 +return: + %retval.0 = phi i32 [ 0, %entry ], [ 1, %land.lhs.true52 ], [ 1, %land.lhs.true43 ], [ 0, %if.else123 ], [ 1, %while.cond59.preheader ], [ 1, %while.cond95.preheader ], [ 1, %while.cond130.preheader ], [ 1, %land.lhs.true28 ], [ 1, %if.then83 ], [ 0, %lor.lhs.false74 ], [ 1, %land.rhs ], [ 1, %if.then117 ], [ 0, %while.body104 ], [ 1, %land.rhs99 ], [ 1, %if.then152 ], [ 0, %while.body139 ], [ 1, %land.rhs134 ], [ 0, %while.body ] + ret i32 %retval.0 +} +!181 = metadata !{metadata !"branch_weights", i32 662038, i32 1} +!988 = metadata !{metadata 
!"branch_weights", i32 12091450, i32 1916} +!989 = metadata !{metadata !"branch_weights", i32 7564670, i32 4526781} +!990 = metadata !{metadata !"branch_weights", i32 7484958, i32 13283499} +!991 = metadata !{metadata !"branch_weights", i32 8677007, i32 4606493} +!992 = metadata !{metadata !"branch_weights", i32 -1172426948, i32 145094705} +!993 = metadata !{metadata !"branch_weights", i32 1468914, i32 5683688} +!994 = metadata !{metadata !"branch_weights", i32 114025221, i32 -1217548794, i32 -1199521551, i32 87712616} +!995 = metadata !{metadata !"branch_weights", i32 1853716452, i32 -444717951, i32 932776759} +!996 = metadata !{metadata !"branch_weights", i32 1004870, i32 20259} +!997 = metadata !{metadata !"branch_weights", i32 20071, i32 189} +!998 = metadata !{metadata !"branch_weights", i32 -1020255939, i32 572177766} +!999 = metadata !{metadata !"branch_weights", i32 2666513, i32 3466431} +!1000 = metadata !{metadata !"branch_weights", i32 5117635, i32 1859780} +!1001 = metadata !{metadata !"branch_weights", i32 354902465, i32 -1444604407} +!1002 = metadata !{metadata !"branch_weights", i32 -1762419279, i32 1592770684} +!1003 = metadata !{metadata !"branch_weights", i32 1435905930, i32 -1951930624} +!1004 = metadata !{metadata !"branch_weights", i32 1, i32 504888} +!1005 = metadata !{metadata !"branch_weights", i32 94662, i32 504888} +!1006 = metadata !{metadata !"branch_weights", i32 -1897793104, i32 160196332} +!1007 = metadata !{metadata !"branch_weights", i32 2074643678, i32 -29579071} +!1008 = metadata !{metadata !"branch_weights", i32 1, i32 226163} +!1009 = metadata !{metadata !"branch_weights", i32 58357, i32 226163} +!1010 = metadata !{metadata !"branch_weights", i32 -2072848646, i32 92907517} diff --git a/test/CodeGen/AArch64/sext_inreg.ll b/test/CodeGen/AArch64/sext_inreg.ll new file mode 100644 index 0000000..2f76081 --- /dev/null +++ b/test/CodeGen/AArch64/sext_inreg.ll @@ -0,0 +1,198 @@ +; RUN: llc < %s -verify-machineinstrs 
-mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+; For formal arguments, we have the following vector type promotion,
+; v2i8 is promoted to v2i32(f64)
+; v2i16 is promoted to v2i32(f64)
+; v4i8 is promoted to v4i16(f64)
+; v8i1 is promoted to v8i16(f128)
+
+define <2 x i8> @test_sext_inreg_v2i8i16(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i8i16
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
+  %1 = sext <2 x i8> %v1 to <2 x i16>
+  %2 = sext <2 x i8> %v2 to <2 x i16>
+  %3 = shufflevector <2 x i16> %1, <2 x i16> %2, <2 x i32> <i32 0, i32 2> ; lane 0 of each input, so both extensions stay live
+  %4 = trunc <2 x i16> %3 to <2 x i8>
+  ret <2 x i8> %4
+}
+
+define <2 x i8> @test_sext_inreg_v2i8i16_2(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i8i16_2
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
+  %a1 = shl <2 x i32> %v1, <i32 24, i32 24> ; shl+ashr by 24 sign-extends the low i8 of each i32 lane in place
+  %a2 = ashr <2 x i32> %a1, <i32 24, i32 24>
+  %b1 = shl <2 x i32> %v2, <i32 24, i32 24>
+  %b2 = ashr <2 x i32> %b1, <i32 24, i32 24>
+  %c = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> <i32 0, i32 2>
+  %d = trunc <2 x i32> %c to <2 x i8>
+  ret <2 x i8> %d
+}
+
+define <2 x i8> @test_sext_inreg_v2i8i32(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i8i32
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
+  %1 = sext <2 x i8> %v1 to <2 x i32>
+  %2 = sext <2 x i8> %v2 to <2 x i32>
+  %3 = shufflevector <2 x i32> %1, <2 x i32> %2, <2 x i32> <i32 0, i32 2>
+  %4 = trunc <2 x i32> %3 to <2 x i8>
+  ret <2 x i8> %4
+}
+
+define <2 x i8> @test_sext_inreg_v2i8i64(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i8i64
+; CHECK: ushll v1.2d, v1.2s, #0
+; CHECK: ushll v0.2d, v0.2s, #0
+; CHECK: shl v0.2d, v0.2d, #56
+; CHECK: sshr v0.2d, v0.2d, #56
+; CHECK: shl v1.2d, v1.2d, #56
+; CHECK: sshr v1.2d, v1.2d, #56
+  %1 = sext <2 x i8> %v1 to <2 x i64>
+  %2 = sext <2 x i8> %v2 to <2 x i64>
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
+  %4 = trunc <2 x i64> %3 to <2 x i8>
+  ret <2 x i8> %4
+}
+
+define <4 x i8> @test_sext_inreg_v4i8i16(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v4i8i16
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
+  %1 = sext <4 x i8> %v1 to <4 x i16>
+  %2 = sext <4 x i8> %v2 to <4 x i16>
+  %3 = shufflevector <4 x i16> %1, <4 x i16> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %4 = trunc <4 x i16> %3 to <4 x i8>
+  ret <4 x i8> %4
+}
+
+define <4 x i8> @test_sext_inreg_v4i8i16_2(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v4i8i16_2
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
+  %a1 = shl <4 x i16> %v1, <i16 8, i16 8, i16 8, i16 8> ; shl+ashr by 8 sign-extends the low i8 of each i16 lane in place
+  %a2 = ashr <4 x i16> %a1, <i16 8, i16 8, i16 8, i16 8>
+  %b1 = shl <4 x i16> %v2, <i16 8, i16 8, i16 8, i16 8>
+  %b2 = ashr <4 x i16> %b1, <i16 8, i16 8, i16 8, i16 8>
+  %c = shufflevector <4 x i16> %a2, <4 x i16> %b2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %d = trunc <4 x i16> %c to <4 x i8>
+  ret <4 x i8> %d
+}
+
+define <4 x i8> @test_sext_inreg_v4i8i32(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v4i8i32
+; CHECK: ushll v1.4s, v1.4h, #0
+; CHECK: ushll v0.4s, v0.4h, #0
+; CHECK: shl v0.4s, v0.4s, #24
+; CHECK: sshr v0.4s, v0.4s, #24
+; CHECK: shl v1.4s, v1.4s, #24
+; CHECK: sshr v1.4s, v1.4s, #24
+  %1 = sext <4 x i8> %v1 to <4 x i32>
+  %2 = sext <4 x i8> %v2 to <4 x i32>
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %4 = trunc <4 x i32> %3 to <4 x i8>
+  ret <4 x i8> %4
+}
+
+define <8 x i8> @test_sext_inreg_v8i8i16(<8 x i8> %v1, <8 x i8> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v8i8i16
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK: sshll v1.8h, v1.8b, #0
+  %1 = sext <8 x i8> %v1 to <8 x i16>
+  %2 = sext <8 x i8> %v2 to <8 x i16>
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %4 = trunc <8 x i16> %3 to <8 x i8>
+  ret <8 x i8> %4
+}
+
+define <8 x i1> @test_sext_inreg_v8i1i16(<8 x i1> %v1, <8 x i1> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v8i1i16
+; CHECK: ushll v1.8h, v1.8b, #0
+; CHECK: ushll v0.8h, v0.8b, #0
+; CHECK: shl v0.8h, v0.8h, #15
+; CHECK: sshr v0.8h, v0.8h, #15
+; CHECK: shl v1.8h, v1.8h, #15
+; CHECK: sshr v1.8h, v1.8h, #15
+  %1 = sext <8 x i1> %v1 to <8 x i16>
+  %2 = sext <8 x i1> %v2 to <8 x i16>
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %4 = trunc <8 x i16> %3 to <8 x i1>
+  ret <8 x i1> %4
+}
+
+define <2 x i16> @test_sext_inreg_v2i16i32(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i16i32
+; CHECK: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-NEXT: uzp1 v1.4s, v1.4s, v1.4s
+  %1 = sext <2 x i16> %v1 to <2 x i32>
+  %2 = sext <2 x i16> %v2 to <2 x i32>
+  %3 = shufflevector <2 x i32> %1, <2 x i32> %2, <2 x i32> <i32 0, i32 2>
+  %4 = trunc <2 x i32> %3 to <2 x i16>
+  ret <2 x i16> %4
+}
+
+define <2 x i16> @test_sext_inreg_v2i16i32_2(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i16i32_2
+; CHECK: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-NEXT: uzp1 v1.4s, v1.4s, v1.4s
+  %a1 = shl <2 x i32> %v1, <i32 16, i32 16> ; shl+ashr by 16 sign-extends the low i16 of each i32 lane in place
+  %a2 = ashr <2 x i32> %a1, <i32 16, i32 16>
+  %b1 = shl <2 x i32> %v2, <i32 16, i32 16>
+  %b2 = ashr <2 x i32> %b1, <i32 16, i32 16>
+  %c = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> <i32 0, i32 2>
+  %d = trunc <2 x i32> %c to <2 x i16>
+  ret <2 x i16> %d
+}
+
+define <2 x i16> @test_sext_inreg_v2i16i64(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i16i64
+; CHECK: ushll v1.2d, v1.2s, #0
+; CHECK: ushll v0.2d, v0.2s, #0
+; CHECK: shl v0.2d, v0.2d, #48
+; CHECK: sshr v0.2d, v0.2d, #48
+; CHECK: shl v1.2d, v1.2d, #48
+; CHECK: sshr v1.2d, v1.2d, #48
+  %1 = sext <2 x i16> %v1 to <2 x i64>
+  %2 = sext <2 x i16> %v2 to <2 x i64>
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
+  %4 = trunc <2 x i64> %3 to <2 x i16>
+  ret <2 x i16> %4
+}
+
+define <4 x i16> @test_sext_inreg_v4i16i32(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v4i16i32
+; CHECK: sshll v0.4s, v0.4h, #0
+; CHECK: sshll v1.4s, v1.4h, #0
+  %1 = sext <4 x i16> %v1 to <4 x i32>
+  %2 = sext <4 x i16> %v2 to <4 x i32>
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %4 = trunc <4 x i32> %3 to <4 x i16>
+  ret <4 x i16> %4
+}
+
+define <2 x i32> @test_sext_inreg_v2i32i64(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i32i64
+; CHECK: sshll v0.2d, v0.2s, #0
+; CHECK: sshll v1.2d, v1.2s, #0
+  %1 = sext <2 x i32> %v1 to <2 x i64>
+  %2 = sext <2 x i32> %v2 to <2 x i64>
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
+  %4 = trunc <2 x i64> %3 to <2 x i32>
+  ret <2 x i32> %4
+}
+
diff --git a/test/CodeGen/AArch64/sincospow-vector-expansion.ll b/test/CodeGen/AArch64/sincospow-vector-expansion.ll
new file mode 100644
index 0000000..259a55e
--- /dev/null
+++ b/test/CodeGen/AArch64/sincospow-vector-expansion.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+
+define <2 x float> @test_cos_v2f64(<2 x double> %v1) {
+; CHECK-LABEL: test_cos_v2f64:
+; CHECK: bl cos
+; CHECK: bl cos
+  %1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %v1)
+  %2 = fptrunc <2 x double> %1 to <2 x float>
+  ret <2 x float> %2
+}
+
+define <2 x float> @test_sin_v2f64(<2 x double> %v1) {
+; CHECK-LABEL: test_sin_v2f64:
+; CHECK: bl sin
+; CHECK: bl sin
+  %1 = call <2 x double> @llvm.sin.v2f64(<2 x double> %v1)
+  %2 = fptrunc <2 x double> %1 to <2 x float>
+  ret <2 x float> %2
+}
+
+define <2 x float> @test_pow_v2f64(<2 x double> %v1, <2 x double> %v2) {
+; 
CHECK-LABEL: test_pow_v2f64:
+; CHECK: bl pow
+; CHECK: bl pow
+  %wide = call <2 x double> @llvm.pow.v2f64(<2 x double> %v1, <2 x double> %v2) ; the checks above expect one libcall per lane
+  %narrow = fptrunc <2 x double> %wide to <2 x float>
+  ret <2 x float> %narrow
+}
+
+declare <2 x double> @llvm.cos.v2f64(<2 x double>)
+declare <2 x double> @llvm.sin.v2f64(<2 x double>)
+declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_cos_v2f32(<2 x float> %v1) {
+; CHECK-LABEL: test_cos_v2f32:
+; CHECK: bl cos
+; CHECK: bl cos
+  %res = call <2 x float> @llvm.cos.v2f32(<2 x float> %v1) ; two lanes, two expected calls
+  ret <2 x float> %res
+}
+
+define <2 x float> @test_sin_v2f32(<2 x float> %v1) {
+; CHECK-LABEL: test_sin_v2f32:
+; CHECK: bl sin
+; CHECK: bl sin
+  %res = call <2 x float> @llvm.sin.v2f32(<2 x float> %v1) ; two lanes, two expected calls
+  ret <2 x float> %res
+}
+
+define <2 x float> @test_pow_v2f32(<2 x float> %v1, <2 x float> %v2) {
+; CHECK-LABEL: test_pow_v2f32:
+; CHECK: bl pow
+; CHECK: bl pow
+  %res = call <2 x float> @llvm.pow.v2f32(<2 x float> %v1, <2 x float> %v2) ; two lanes, two expected calls
+  ret <2 x float> %res
+}
+
+declare <2 x float> @llvm.cos.v2f32(<2 x float>)
+declare <2 x float> @llvm.sin.v2f32(<2 x float>)
+declare <2 x float> @llvm.pow.v2f32(<2 x float>, <2 x float>)
+
+define <4 x float> @test_cos_v4f32(<4 x float> %v1) {
+; CHECK-LABEL: test_cos_v4f32:
+; CHECK: bl cos
+; CHECK: bl cos
+; CHECK: bl cos
+; CHECK: bl cos
+  %res = call <4 x float> @llvm.cos.v4f32(<4 x float> %v1) ; four lanes, four expected calls
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_sin_v4f32(<4 x float> %v1) {
+; CHECK-LABEL: test_sin_v4f32:
+; CHECK: bl sin
+; CHECK: bl sin
+; CHECK: bl sin
+; CHECK: bl sin
+  %res = call <4 x float> @llvm.sin.v4f32(<4 x float> %v1) ; four lanes, four expected calls
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_pow_v4f32(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_pow_v4f32:
+; CHECK: bl pow
+; CHECK: bl pow
+; CHECK: bl pow
+; CHECK: bl pow
+  %res = call <4 x float> @llvm.pow.v4f32(<4 x float> %v1, <4 x float> %v2) ; four lanes, four expected calls
+  ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.cos.v4f32(<4 x float>)
+declare <4 x float> @llvm.sin.v4f32(<4 x float>) +declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>) + diff --git a/test/CodeGen/AArch64/variadic.ll b/test/CodeGen/AArch64/variadic.ll index f3d376b..1c7f1e0 100644 --- a/test/CodeGen/AArch64/variadic.ll +++ b/test/CodeGen/AArch64/variadic.ll @@ -10,14 +10,12 @@ declare void @llvm.va_start(i8*) define void @test_simple(i32 %n, ...) { ; CHECK-LABEL: test_simple: ; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]] -; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var ; CHECK: mov x[[FPRBASE:[0-9]+]], sp ; CHECK: str q7, [x[[FPRBASE]], #112] ; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]] ; CHECK: str x7, [x[[GPRBASE]], #48] ; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]] -; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var ; CHECK-NOFP: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]] ; CHECK-NOFP: str x7, [x[[GPRBASE]], #48] ; CHECK-NOFP-NOT: str q7, @@ -27,8 +25,10 @@ define void @test_simple(i32 %n, ...) { ; CHECK: str q0, [sp] ; CHECK: str x1, [sp, #[[GPRFROMSP]]] +; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var ; CHECK-NOFP-NOT: str q0, [sp] +; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var %addr = bitcast %va_list* @var to i8* call void @llvm.va_start(i8* %addr) @@ -179,26 +179,63 @@ define void @test_va_copy() { ; Check beginning and end again: -; CHECK: ldr [[BLOCK:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var] ; CHECK: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var -; CHECK-NOFP: ldr [[BLOCK:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var] +; CHECK: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list +; CHECK: ldr [[BLOCK1:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var] +; CHECK: ldr [[BLOCK2:x[0-9]+]], [x[[SRC_LIST]], #24] +; CHECK: str [[BLOCK1]], [{{x[0-9]+}}, #:lo12:second_list] +; CHECK: str [[BLOCK2]], [x[[DEST_LIST]], #24] + ; CHECK-NOFP: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var +; CHECK-NOFP: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list +; 
CHECK-NOFP: ldr [[BLOCK1:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var] +; CHECK-NOFP: ldr [[BLOCK2:x[0-9]+]], [x[[SRC_LIST]], #24] +; CHECK-NOFP: str [[BLOCK1]], [{{x[0-9]+}}, #:lo12:second_list] +; CHECK-NOFP: str [[BLOCK2]], [x[[DEST_LIST]], #24] -; CHECK: str [[BLOCK]], [{{x[0-9]+}}, #:lo12:second_list] + ret void +; CHECK: ret +; CHECK-NOFP: ret +} -; CHECK: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24] -; CHECK: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list +%struct.s_3i = type { i32, i32, i32 } -; CHECK: str [[BLOCK]], [x[[DEST_LIST]], #24] +; This checks that, if the last named argument is not a multiple of 8 bytes, +; and is allocated on the stack, that __va_list.__stack is initialised to the +; first 8-byte aligned location above it. +define void @test_va_odd_struct_on_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, [1 x i64], %struct.s_3i* byval nocapture readnone align 4 %h, ...) { +; CHECK-LABEL: test_va_odd_struct_on_stack: -; CHECK-NOFP: str [[BLOCK]], [{{x[0-9]+}}, #:lo12:second_list] +; CHECK: sub sp, sp, #128 +; CHECK: mov x[[FPRBASE:[0-9]+]], sp +; CHECK: str q7, [x[[FPRBASE]], #112] -; CHECK-NOFP: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24] -; CHECK-NOFP: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list +; CHECK-NOT: str x{{[0-9]+}}, + +; CHECK-NOFP-NOT: str q7, +; CHECK-NOT: str x7, -; CHECK-NOFP: str [[BLOCK]], [x[[DEST_LIST]], #24] +; Omit the middle ones + +; CHECK: str q0, [sp] + %addr = bitcast %va_list* @var to i8* + call void @llvm.va_start(i8* %addr) +; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var +; CHECK: movn [[VR_OFFS:w[0-9]+]], #127 +; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28] +; CHECK: str wzr, [x[[VA_LIST]], #24] +; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #128 +; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16] +; This constant would be #140 if it was not 8-byte aligned +; CHECK: add [[STACK:x[0-9]+]], sp, #144 +; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] + +; CHECK-NOFP: add 
x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var +; This constant would be #12 if it was not 8-byte aligned +; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #16 +; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] +; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28] +; CHECK-NOFP: str wzr, [x[[VA_LIST]], #24] ret void -; CHECK: ret -; CHECK-NOFP: ret } -- cgit v1.1