Diffstat (limited to 'test/CodeGen/AArch64')
-rw-r--r-- test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll | 491
-rw-r--r-- test/CodeGen/AArch64/addsub.ll | 28
-rw-r--r-- test/CodeGen/AArch64/argument-blocks.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll | 8
-rw-r--r-- test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll | 4
-rw-r--r-- test/CodeGen/AArch64/arm64-aapcs.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-abi-varargs.ll | 6
-rw-r--r-- test/CodeGen/AArch64/arm64-anyregcc-crash.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-anyregcc.ll | 16
-rw-r--r-- test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll | 172
-rw-r--r-- test/CodeGen/AArch64/arm64-call-tailcalls.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll | 638
-rw-r--r-- test/CodeGen/AArch64/arm64-convert-v4f64.ll | 26
-rw-r--r-- test/CodeGen/AArch64/arm64-dup.ll | 37
-rw-r--r-- test/CodeGen/AArch64/arm64-fcopysign.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-join-reserved.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll | 14
-rw-r--r-- test/CodeGen/AArch64/arm64-neon-copy.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-neon-v8.1a.ll | 456
-rw-r--r-- test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll | 8
-rw-r--r-- test/CodeGen/AArch64/arm64-patchpoint.ll | 16
-rw-r--r-- test/CodeGen/AArch64/arm64-stackmap-nops.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-stackmap.ll | 22
-rw-r--r-- test/CodeGen/AArch64/arm64-vshuffle.ll | 95
-rw-r--r-- test/CodeGen/AArch64/bitcast.ll | 27
-rw-r--r-- test/CodeGen/AArch64/br-to-eh-lpad.ll | 8
-rw-r--r-- test/CodeGen/AArch64/concat_vector-scalar-combine.ll | 125
-rw-r--r-- test/CodeGen/AArch64/concat_vector-truncate-combine.ll | 2
-rw-r--r-- test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll | 18
-rw-r--r-- test/CodeGen/AArch64/dag-combine-invaraints.ll | 2
-rw-r--r-- test/CodeGen/AArch64/f16-instructions.ll | 765
-rw-r--r-- test/CodeGen/AArch64/fast-isel-int-ext5.ll | 19
-rw-r--r-- test/CodeGen/AArch64/fold-constants.ll | 21
-rw-r--r-- test/CodeGen/AArch64/fp16-instructions.ll | 109
-rw-r--r-- test/CodeGen/AArch64/global-merge-1.ll | 12
-rw-r--r-- test/CodeGen/AArch64/global-merge-2.ll | 6
-rw-r--r-- test/CodeGen/AArch64/global-merge-3.ll | 6
-rw-r--r-- test/CodeGen/AArch64/global-merge-4.ll | 2
-rw-r--r-- test/CodeGen/AArch64/merge-store.ll | 20
-rw-r--r-- test/CodeGen/AArch64/print-mrs-system-register.ll | 11
-rw-r--r-- test/CodeGen/AArch64/sibling-call.ll | 2
-rw-r--r-- test/CodeGen/AArch64/stackmap-liveness.ll | 47
-rw-r--r-- test/CodeGen/AArch64/tailcall-explicit-sret.ll | 106
-rw-r--r-- test/CodeGen/AArch64/tailcall-implicit-sret.ll | 46
-rw-r--r-- test/CodeGen/AArch64/tailcall-mem-intrinsics.ll | 31
-rw-r--r-- test/CodeGen/AArch64/vcvt-oversize.ll | 16
47 files changed, 3199 insertions, 257 deletions
diff --git a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
new file mode 100644
index 0000000..a31c66b
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -0,0 +1,491 @@
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; This test aims to check basic correctness of frame layout &
+; frame access code. There are 8 functions in this test file;
+; each implements one element of the Cartesian product of:
+; . a function having a VLA / no VLA,
+; . a function with dynamic stack realignment / no dynamic stack realignment,
+; . a function needing a frame pointer / no frame pointer,
+; since the presence/absence of these influences the frame
+; layout and which pointer (bp, sp or fp) is used to access
+; various parts of the frame.
+;
+; Furthermore, in every test function:
+; . there is always one integer and one floating-point argument, to be able
+;   to check that those are accessed correctly.
+; . there is always one local variable, to check that it is accessed
+;   correctly.
+;
+; The LLVM IR below was produced by clang on the following C++ code:
+;extern "C" int g();
+;extern "C" int novla_nodynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; volatile int l1;
+; return i10 + (int)d10 + l1 + g();
+;}
+;extern "C" int novla_nodynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; volatile int l1;
+; return i10 + (int)d10 + l1;
+;}
+;extern "C" int novla_dynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; alignas(128) volatile int l1;
+; return i10 + (int)d10 + l1 + g();
+;}
+;extern "C" int novla_dynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; alignas(128) volatile int l1;
+; return i10 + (int)d10 + l1;
+;}
+;
+;extern "C" int vla_nodynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; volatile int l1;
+; volatile int vla[i1];
+; return i10 + (int)d10 + l1 + g() + vla[0];
+;}
+;extern "C" int vla_nodynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; volatile int l1;
+; volatile int vla[i1];
+; return i10 + (int)d10 + l1 + vla[0];
+;}
+;extern "C" int vla_dynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; alignas(128) volatile int l1;
+; volatile int vla[i1];
+; return i10 + (int)d10 + l1 + g() + vla[0];
+;}
+;extern "C" int vla_dynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; alignas(128) volatile int l1;
+; volatile int vla[i1];
+; return i10 + (int)d10 + l1 + vla[0];
+;}
+
+
+
+define i32 @novla_nodynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+ %l1 = alloca i32, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %call = tail call i32 @g()
+ %add2 = add nsw i32 %add1, %call
+ ret i32 %add2
+}
+; CHECK-LABEL: novla_nodynamicrealign_call
+; CHECK: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
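+; (Here the CFA is x29 + 16, i.e. the value sp had on entry; each
+; .cfi_offset records where a callee-saved register is spilled relative
+; to that CFA.)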
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; Check correct access to local variable on the stack, through stack pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [sp, #12]
+; Check epilogue:
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+declare i32 @g() #0
+
+; Function Attrs: nounwind
+define i32 @novla_nodynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ ret i32 %add1
+}
+; CHECK-LABEL: novla_nodynamicrealign_nocall
+; Check that space is reserved for one local variable on the stack.
+; CHECK: sub sp, sp, #16 // =16
+; Check correct access to arguments passed on the stack, through stack pointer
+; CHECK: ldr d[[DARG:[0-9]+]], [sp, #40]
+; CHECK: ldr w[[IARG:[0-9]+]], [sp, #24]
+; Check correct access to local variable on the stack, through stack pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [sp, #12]
+; Check epilogue:
+; CHECK: add sp, sp, #16 // =16
+; CHECK: ret
+
+
+define i32 @novla_dynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+ %l1 = alloca i32, align 128
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %call = tail call i32 @g()
+ %add2 = add nsw i32 %add1, %call
+ ret i32 %add2
+}
+
+; CHECK-LABEL: novla_dynamicrealign_call
+; CHECK: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check the dynamic realignment of the stack pointer to a 128-byte boundary
+; CHECK: sub x9, sp, #96
+; CHECK: and sp, x9, #0xffffffffffffff80
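+; ((sp - 96) & ~0x7f: allocate the local area, then clear the low 7 bits
+; to round the stack pointer down to a 128-byte boundary.)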
+; Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; Check correct access to local variable on the stack, through re-aligned stack pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [sp]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #16 // =16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+; Function Attrs: nounwind
+define i32 @novla_dynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 128
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ ret i32 %add1
+}
+
+; CHECK-LABEL: novla_dynamicrealign_nocall
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; Check the dynamic realignment of the stack pointer to a 128-byte boundary
+; CHECK: sub x9, sp, #112
+; CHECK: and sp, x9, #0xffffffffffffff80
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; Check correct access to local variable on the stack, through re-aligned stack pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [sp]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: mov sp, x29
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+
+
+define i32 @vla_nodynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+ %l1 = alloca i32, align 4
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %call = tail call i32 @g()
+ %add2 = add nsw i32 %add1, %call
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add3 = add nsw i32 %add2, %1
+ ret i32 %add3
+}
+
+; CHECK-LABEL: vla_nodynamicrealign_call
+; CHECK: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check that space is reserved on the stack for the local variable,
+; rounded up to a multiple of 16 to keep the stack pointer 16-byte aligned.
+; CHECK: sub sp, sp, #16
+; Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
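+; In other words, sp -= (zext(w0) * 4 + 15) & ~0xf, so the allocation stays
+; a multiple of 16 bytes; x[[VLASPTMP]] (the new sp) is the VLA's address.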
+; Check correct access to local variable, through frame pointer
+; CHECK: ldur w[[ILOC:[0-9]+]], [x29, #-20]
+; Check correct access to the VLA variable through the pointer holding its address
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #16 // =16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+; Function Attrs: nounwind
+define i32 @vla_nodynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 4
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add2 = add nsw i32 %add1, %1
+ ret i32 %add2
+}
+
+; CHECK-LABEL: vla_nodynamicrealign_nocall
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; Check that space is reserved on the stack for the local variable,
+; rounded up to a multiple of 16 to keep the stack pointer 16-byte aligned.
+; CHECK: sub sp, sp, #16
+; Check correctness of cfi pseudo-instructions
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through frame pointer
+; CHECK: ldur w[[ILOC:[0-9]+]], [x29, #-4]
+; Check correct access to the VLA variable through the pointer holding its address
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: mov sp, x29
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+
+
+define i32 @vla_dynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+ %l1 = alloca i32, align 128
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %call = tail call i32 @g()
+ %add2 = add nsw i32 %add1, %call
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add3 = add nsw i32 %add2, %1
+ ret i32 %add3
+}
+
+; CHECK-LABEL: vla_dynamicrealign_call
+; CHECK: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK: stp x22, x21, [sp, #-48]!
+; CHECK: stp x20, x19, [sp, #16]
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #32]
+; CHECK: add x29, sp, #32
+; Check that the stack pointer gets re-aligned to 128
+; bytes & the base pointer (x19) gets initialized to
+; this 128-byte aligned area for local variables &
+; spill slots
+; CHECK: sub x9, sp, #80 // =80
+; CHECK: and sp, x9, #0xffffffffffffff80
+; CHECK: mov x19, sp
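+; (A base pointer is needed here because sp moves again below for the VLA,
+; while fp-relative offsets into the realigned area are not fixed constants.)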
+; Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
+; CHECK: .cfi_offset w21, -40
+; CHECK: .cfi_offset w22, -48
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; and set-up of base pointer (x19).
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through base pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [x19]
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #32
+; CHECK: ldp x29, x30, [sp, #32]
+; CHECK: ldp x20, x19, [sp, #16]
+; CHECK: ldp x22, x21, [sp], #48
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+; Function Attrs: nounwind
+define i32 @vla_dynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 128
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add2 = add nsw i32 %add1, %1
+ ret i32 %add2
+}
+
+; CHECK-LABEL: vla_dynamicrealign_nocall
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check that the stack pointer gets re-aligned to 128
+; bytes & the base pointer (x19) gets initialized to
+; this 128-byte aligned area for local variables &
+; spill slots
+; CHECK: sub x9, sp, #96
+; CHECK: and sp, x9, #0xffffffffffffff80
+; CHECK: mov x19, sp
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; and set-up of base pointer (x19).
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through base pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [x19]
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+
+
+; Function Attrs: nounwind
+define i32 @vla_dynamicrealign_nocall_large_align(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 32768
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 32768
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add2 = add nsw i32 %add1, %1
+ ret i32 %add2
+}
+
+; CHECK-LABEL: vla_dynamicrealign_nocall_large_align
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check that the stack pointer gets re-aligned to 32768
+; bytes & the base pointer (x19) gets initialized to
+; this 32768-byte aligned area for local variables &
+; spill slots
+; CHECK: sub x9, sp, #7, lsl #12
+; CHECK: and sp, x9, #0xffffffffffff8000
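+; (#7, lsl #12 is 28672; the AND clears the low 15 bits, rounding sp down
+; to a 32768-byte boundary.)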
+; CHECK: mov x19, sp
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; and set-up of base pointer (x19).
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through base pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [x19]
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+
+attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/AArch64/addsub.ll b/test/CodeGen/AArch64/addsub.ll
index 09b9f62..d6350a6 100644
--- a/test/CodeGen/AArch64/addsub.ll
+++ b/test/CodeGen/AArch64/addsub.ll
@@ -24,6 +24,34 @@ define void @add_small() {
ret void
}
+; Make sure we grab the imm variant when the register operand
+; can be implicitly zero-extended.
+; We used to generate something horrible like this:
+; wA = ldrb
+; xB = ldimm 12
+; xC = add xB, wA, uxtb
+; whereas this can be achieved with:
+; wA = ldrb
+; xC = add xA, #12 ; <- xA implicitly zero-extends wA.
+define void @add_small_imm(i8* %p, i64* %q, i32 %b, i32* %addr) {
+; CHECK-LABEL: add_small_imm:
+entry:
+
+; CHECK: ldrb w[[LOAD32:[0-9]+]], [x0]
+ %t = load i8, i8* %p
+ %promoted = zext i8 %t to i64
+ %zextt = zext i8 %t to i32
+ %add = add nuw i32 %zextt, %b
+
+; CHECK: add [[ADD2:x[0-9]+]], x[[LOAD32]], #12
+ %add2 = add nuw i64 %promoted, 12
+ store i32 %add, i32* %addr
+
+; CHECK: str [[ADD2]], [x1]
+ store i64 %add2, i64* %q
+ ret void
+}
+
; Add 12-bit immediates, shifted left by 12 bits
define void @add_med() {
; CHECK-LABEL: add_med:
diff --git a/test/CodeGen/AArch64/argument-blocks.ll b/test/CodeGen/AArch64/argument-blocks.ll
index f1dcfa6..3169abc 100644
--- a/test/CodeGen/AArch64/argument-blocks.ll
+++ b/test/CodeGen/AArch64/argument-blocks.ll
@@ -64,7 +64,7 @@ define void @test_varargs_stackalign() {
; CHECK-LABEL: test_varargs_stackalign:
; CHECK-DARWINPCS: stp {{w[0-9]+}}, {{w[0-9]+}}, [sp, #16]
- call void(...)* @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
+ call void(...) @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
index 41e22e9..b760261 100644
--- a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
+++ b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
@@ -16,11 +16,11 @@ entry:
%0 = load double, double* %d.addr, align 8
%1 = load double, double* %d.addr, align 8
%conv = fptoui double %1 to i64
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv)
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv)
%2 = load double, double* %d.addr, align 8
%3 = load double, double* %d.addr, align 8
%conv1 = fptoui double %3 to i32
- %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1)
+ %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1)
ret void
}
@@ -37,12 +37,12 @@ entry:
%conv = fpext float %0 to double
%1 = load float, float* %f.addr, align 4
%conv1 = fptoui float %1 to i64
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1)
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1)
%2 = load float, float* %f.addr, align 4
%conv2 = fpext float %2 to double
%3 = load float, float* %f.addr, align 4
%conv3 = fptoui float %3 to i32
- %call4 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3)
+ %call4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3)
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index 6266d1c..8784abd 100644
--- a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -7,13 +7,13 @@ define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: bar:
; CHECK: add.2d v[[REG:[0-9]+]], v0, v1
; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1
+; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1
; Without advanced copy optimization, we end up with cross-register-bank
; copies that cannot be coalesced.
; CHECK-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
; With advanced copy optimization, we end up with just one copy
; to insert the computed high part into the V register.
; CHECK-OPT-NOT: fmov
-; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1
; CHECK: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
; CHECK-NOOPT: fmov d0, [[COPY_REG3]]
; CHECK-OPT-NOT: fmov
@@ -23,9 +23,9 @@ define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; GENERIC-LABEL: bar:
; GENERIC: add v[[REG:[0-9]+]].2d, v0.2d, v1.2d
; GENERIC: add d[[REG3:[0-9]+]], d[[REG]], d1
+; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1
; GENERIC-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
; GENERIC-OPT-NOT: fmov
-; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1
; GENERIC: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
; GENERIC-NOOPT: fmov d0, [[COPY_REG3]]
; GENERIC-OPT-NOT: fmov
diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll
index 41c3ad5..390a3c7 100644
--- a/test/CodeGen/AArch64/arm64-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs.ll
@@ -78,7 +78,7 @@ declare void @variadic(i32 %a, ...)
; Under AAPCS variadic functions have the same calling convention as
; others. The extra arguments should go in registers rather than on the stack.
define void @test_variadic() {
- call void(i32, ...)* @variadic(i32 0, i64 1, double 2.0)
+ call void(i32, ...) @variadic(i32 0, i64 1, double 2.0)
; CHECK: fmov d0, #2.0
; CHECK: orr w1, wzr, #0x1
; CHECK: bl variadic
diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll
index f95fec6..03414b5 100644
--- a/test/CodeGen/AArch64/arm64-abi-varargs.ll
+++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -94,7 +94,7 @@ define i32 @main() nounwind ssp {
%10 = load i32, i32* %a10, align 4
%11 = load i32, i32* %a11, align 4
%12 = load i32, i32* %a12, align 4
- call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...)* @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
+ call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
ret i32 0
}
@@ -133,7 +133,7 @@ entry:
store <4 x i32> %y, <4 x i32>* %y.addr, align 16
%0 = load i32, i32* %x.addr, align 4
%1 = load <4 x i32>, <4 x i32>* %y.addr, align 16
- call void (i8*, ...)* @foo(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1)
+ call void (i8*, ...) @foo(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1)
ret void
}
@@ -186,6 +186,6 @@ entry:
%1 = load i32, i32* %x.addr, align 4
%2 = bitcast %struct.s41* %s41 to i128*
%3 = load i128, i128* %2, align 1
- call void (i8*, ...)* @foo2(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3)
+ call void (i8*, ...) @foo2(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3)
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-anyregcc-crash.ll b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
index 241cf97..56c62d5 100644
--- a/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
+++ b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
@@ -8,7 +8,7 @@ define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i6
i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) {
entry:
- %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32,
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32,
i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
diff --git a/test/CodeGen/AArch64/arm64-anyregcc.ll b/test/CodeGen/AArch64/arm64-anyregcc.ll
index e26875d..2a2f451 100644
--- a/test/CodeGen/AArch64/arm64-anyregcc.ll
+++ b/test/CodeGen/AArch64/arm64-anyregcc.ll
@@ -55,7 +55,7 @@
; CHECK-NEXT: .long 3
define i64 @test() nounwind ssp uwtable {
entry:
- call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3)
+ call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3)
ret i64 0
}
@@ -77,7 +77,7 @@ entry:
define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
entry:
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj)
ret i64 %ret
}
@@ -100,7 +100,7 @@ define i64 @property_access2() nounwind ssp uwtable {
entry:
%obj = alloca i64, align 8
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj)
ret i64 %ret
}
@@ -123,7 +123,7 @@ define i64 @property_access3() nounwind ssp uwtable {
entry:
%obj = alloca i64, align 8
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj)
ret i64 %ret
}
@@ -205,7 +205,7 @@ entry:
define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
entry:
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
ret i64 %ret
}
@@ -287,7 +287,7 @@ entry:
define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
entry:
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
ret i64 %ret
}
@@ -315,7 +315,7 @@ entry:
; CHECK-NEXT: .long 0
define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
- %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
ret i64 %result
}
@@ -355,7 +355,7 @@ entry:
define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
- %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
ret i64 %result
}
diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
index c280bef..d089767 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
@@ -1,6 +1,10 @@
; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
+; Note: we split the functions below into multiple BBs to isolate the call
+; instruction we want to test from fast-isel failing to select instructions
+; after it.
+
; CHECK-LABEL: test_i64_f64:
declare i64 @test_i64_f64_helper(double %p)
define void @test_i64_f64(double* %p, i64* %q) {
@@ -8,6 +12,8 @@ define void @test_i64_f64(double* %p, i64* %q) {
%1 = load double, double* %p
%2 = fadd double %1, %1
%3 = call i64 @test_i64_f64_helper(double %2)
+ br label %return_bb
+return_bb:
%4 = add i64 %3, %3
store i64 %4, i64* %q
ret void
@@ -20,6 +26,8 @@ define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) {
%1 = load <1 x i64>, <1 x i64>* %p
%2 = add <1 x i64> %1, %1
%3 = call i64 @test_i64_v1i64_helper(<1 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add i64 %3, %3
store i64 %4, i64* %q
ret void
@@ -32,6 +40,8 @@ define void @test_i64_v2f32(<2 x float>* %p, i64* %q) {
%1 = load <2 x float>, <2 x float>* %p
%2 = fadd <2 x float> %1, %1
%3 = call i64 @test_i64_v2f32_helper(<2 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add i64 %3, %3
store i64 %4, i64* %q
ret void
@@ -44,6 +54,8 @@ define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) {
%1 = load <2 x i32>, <2 x i32>* %p
%2 = add <2 x i32> %1, %1
%3 = call i64 @test_i64_v2i32_helper(<2 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add i64 %3, %3
store i64 %4, i64* %q
ret void
@@ -56,6 +68,8 @@ define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) {
%1 = load <4 x i16>, <4 x i16>* %p
%2 = add <4 x i16> %1, %1
%3 = call i64 @test_i64_v4i16_helper(<4 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add i64 %3, %3
store i64 %4, i64* %q
ret void
@@ -68,6 +82,8 @@ define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) {
%1 = load <8 x i8>, <8 x i8>* %p
%2 = add <8 x i8> %1, %1
%3 = call i64 @test_i64_v8i8_helper(<8 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add i64 %3, %3
store i64 %4, i64* %q
ret void
@@ -80,6 +96,8 @@ define void @test_f64_i64(i64* %p, double* %q) {
%1 = load i64, i64* %p
%2 = add i64 %1, %1
%3 = call double @test_f64_i64_helper(i64 %2)
+ br label %return_bb
+return_bb:
%4 = fadd double %3, %3
store double %4, double* %q
ret void
@@ -92,6 +110,8 @@ define void @test_f64_v1i64(<1 x i64>* %p, double* %q) {
%1 = load <1 x i64>, <1 x i64>* %p
%2 = add <1 x i64> %1, %1
%3 = call double @test_f64_v1i64_helper(<1 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = fadd double %3, %3
store double %4, double* %q
ret void
@@ -104,6 +124,8 @@ define void @test_f64_v2f32(<2 x float>* %p, double* %q) {
%1 = load <2 x float>, <2 x float>* %p
%2 = fadd <2 x float> %1, %1
%3 = call double @test_f64_v2f32_helper(<2 x float> %2)
+ br label %return_bb
+return_bb:
%4 = fadd double %3, %3
store double %4, double* %q
ret void
@@ -116,6 +138,8 @@ define void @test_f64_v2i32(<2 x i32>* %p, double* %q) {
%1 = load <2 x i32>, <2 x i32>* %p
%2 = add <2 x i32> %1, %1
%3 = call double @test_f64_v2i32_helper(<2 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = fadd double %3, %3
store double %4, double* %q
ret void
@@ -128,6 +152,8 @@ define void @test_f64_v4i16(<4 x i16>* %p, double* %q) {
%1 = load <4 x i16>, <4 x i16>* %p
%2 = add <4 x i16> %1, %1
%3 = call double @test_f64_v4i16_helper(<4 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = fadd double %3, %3
store double %4, double* %q
ret void
@@ -140,6 +166,8 @@ define void @test_f64_v8i8(<8 x i8>* %p, double* %q) {
%1 = load <8 x i8>, <8 x i8>* %p
%2 = add <8 x i8> %1, %1
%3 = call double @test_f64_v8i8_helper(<8 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = fadd double %3, %3
store double %4, double* %q
ret void
@@ -152,6 +180,8 @@ define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) {
%1 = load i64, i64* %p
%2 = add i64 %1, %1
%3 = call <1 x i64> @test_v1i64_i64_helper(i64 %2)
+ br label %return_bb
+return_bb:
%4 = add <1 x i64> %3, %3
store <1 x i64> %4, <1 x i64>* %q
ret void
@@ -164,6 +194,8 @@ define void @test_v1i64_f64(double* %p, <1 x i64>* %q) {
%1 = load double, double* %p
%2 = fadd double %1, %1
%3 = call <1 x i64> @test_v1i64_f64_helper(double %2)
+ br label %return_bb
+return_bb:
%4 = add <1 x i64> %3, %3
store <1 x i64> %4, <1 x i64>* %q
ret void
@@ -176,6 +208,8 @@ define void @test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) {
%1 = load <2 x float>, <2 x float>* %p
%2 = fadd <2 x float> %1, %1
%3 = call <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <1 x i64> %3, %3
store <1 x i64> %4, <1 x i64>* %q
ret void
@@ -188,6 +222,8 @@ define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) {
%1 = load <2 x i32>, <2 x i32>* %p
%2 = add <2 x i32> %1, %1
%3 = call <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add <1 x i64> %3, %3
store <1 x i64> %4, <1 x i64>* %q
ret void
@@ -200,6 +236,8 @@ define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) {
%1 = load <4 x i16>, <4 x i16>* %p
%2 = add <4 x i16> %1, %1
%3 = call <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add <1 x i64> %3, %3
store <1 x i64> %4, <1 x i64>* %q
ret void
@@ -212,6 +250,8 @@ define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) {
%1 = load <8 x i8>, <8 x i8>* %p
%2 = add <8 x i8> %1, %1
%3 = call <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add <1 x i64> %3, %3
store <1 x i64> %4, <1 x i64>* %q
ret void
@@ -224,6 +264,8 @@ define void @test_v2f32_i64(i64* %p, <2 x float>* %q) {
%1 = load i64, i64* %p
%2 = add i64 %1, %1
%3 = call <2 x float> @test_v2f32_i64_helper(i64 %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x float> %3, %3
store <2 x float> %4, <2 x float>* %q
ret void
@@ -236,6 +278,8 @@ define void @test_v2f32_f64(double* %p, <2 x float>* %q) {
%1 = load double, double* %p
%2 = fadd double %1, %1
%3 = call <2 x float> @test_v2f32_f64_helper(double %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x float> %3, %3
store <2 x float> %4, <2 x float>* %q
ret void
@@ -248,6 +292,8 @@ define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) {
%1 = load <1 x i64>, <1 x i64>* %p
%2 = add <1 x i64> %1, %1
%3 = call <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x float> %3, %3
store <2 x float> %4, <2 x float>* %q
ret void
@@ -261,6 +307,8 @@ define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) {
%1 = load <2 x i32>, <2 x i32>* %p
%2 = add <2 x i32> %1, %1
%3 = call <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x float> %3, %3
store <2 x float> %4, <2 x float>* %q
ret void
@@ -274,6 +322,8 @@ define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) {
%1 = load <4 x i16>, <4 x i16>* %p
%2 = add <4 x i16> %1, %1
%3 = call <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x float> %3, %3
store <2 x float> %4, <2 x float>* %q
ret void
@@ -287,6 +337,8 @@ define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) {
%1 = load <8 x i8>, <8 x i8>* %p
%2 = add <8 x i8> %1, %1
%3 = call <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x float> %3, %3
store <2 x float> %4, <2 x float>* %q
ret void
@@ -299,6 +351,8 @@ define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) {
%1 = load i64, i64* %p
%2 = add i64 %1, %1
%3 = call <2 x i32> @test_v2i32_i64_helper(i64 %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i32> %3, %3
store <2 x i32> %4, <2 x i32>* %q
ret void
@@ -311,6 +365,8 @@ define void @test_v2i32_f64(double* %p, <2 x i32>* %q) {
%1 = load double, double* %p
%2 = fadd double %1, %1
%3 = call <2 x i32> @test_v2i32_f64_helper(double %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i32> %3, %3
store <2 x i32> %4, <2 x i32>* %q
ret void
@@ -323,6 +379,8 @@ define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) {
%1 = load <1 x i64>, <1 x i64>* %p
%2 = add <1 x i64> %1, %1
%3 = call <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i32> %3, %3
store <2 x i32> %4, <2 x i32>* %q
ret void
@@ -336,6 +394,8 @@ define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) {
%1 = load <2 x float>, <2 x float>* %p
%2 = fadd <2 x float> %1, %1
%3 = call <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i32> %3, %3
store <2 x i32> %4, <2 x i32>* %q
ret void
@@ -349,6 +409,8 @@ define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) {
%1 = load <4 x i16>, <4 x i16>* %p
%2 = add <4 x i16> %1, %1
%3 = call <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i32> %3, %3
store <2 x i32> %4, <2 x i32>* %q
ret void
@@ -362,6 +424,8 @@ define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) {
%1 = load <8 x i8>, <8 x i8>* %p
%2 = add <8 x i8> %1, %1
%3 = call <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i32> %3, %3
store <2 x i32> %4, <2 x i32>* %q
ret void
@@ -374,6 +438,8 @@ define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) {
%1 = load i64, i64* %p
%2 = add i64 %1, %1
%3 = call <4 x i16> @test_v4i16_i64_helper(i64 %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i16> %3, %3
store <4 x i16> %4, <4 x i16>* %q
ret void
@@ -386,6 +452,8 @@ define void @test_v4i16_f64(double* %p, <4 x i16>* %q) {
%1 = load double, double* %p
%2 = fadd double %1, %1
%3 = call <4 x i16> @test_v4i16_f64_helper(double %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i16> %3, %3
store <4 x i16> %4, <4 x i16>* %q
ret void
@@ -398,6 +466,8 @@ define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) {
%1 = load <1 x i64>, <1 x i64>* %p
%2 = add <1 x i64> %1, %1
%3 = call <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i16> %3, %3
store <4 x i16> %4, <4 x i16>* %q
ret void
@@ -411,6 +481,8 @@ define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) {
%1 = load <2 x float>, <2 x float>* %p
%2 = fadd <2 x float> %1, %1
%3 = call <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i16> %3, %3
store <4 x i16> %4, <4 x i16>* %q
ret void
@@ -424,6 +496,8 @@ define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) {
%1 = load <2 x i32>, <2 x i32>* %p
%2 = add <2 x i32> %1, %1
%3 = call <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i16> %3, %3
store <4 x i16> %4, <4 x i16>* %q
ret void
@@ -437,6 +511,8 @@ define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) {
%1 = load <8 x i8>, <8 x i8>* %p
%2 = add <8 x i8> %1, %1
%3 = call <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i16> %3, %3
store <4 x i16> %4, <4 x i16>* %q
ret void
@@ -449,6 +525,8 @@ define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) {
%1 = load i64, i64* %p
%2 = add i64 %1, %1
%3 = call <8 x i8> @test_v8i8_i64_helper(i64 %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i8> %3, %3
store <8 x i8> %4, <8 x i8>* %q
ret void
@@ -461,6 +539,8 @@ define void @test_v8i8_f64(double* %p, <8 x i8>* %q) {
%1 = load double, double* %p
%2 = fadd double %1, %1
%3 = call <8 x i8> @test_v8i8_f64_helper(double %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i8> %3, %3
store <8 x i8> %4, <8 x i8>* %q
ret void
@@ -473,6 +553,8 @@ define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) {
%1 = load <1 x i64>, <1 x i64>* %p
%2 = add <1 x i64> %1, %1
%3 = call <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i8> %3, %3
store <8 x i8> %4, <8 x i8>* %q
ret void
@@ -486,6 +568,8 @@ define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) {
%1 = load <2 x float>, <2 x float>* %p
%2 = fadd <2 x float> %1, %1
%3 = call <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i8> %3, %3
store <8 x i8> %4, <8 x i8>* %q
ret void
@@ -499,6 +583,8 @@ define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) {
%1 = load <2 x i32>, <2 x i32>* %p
%2 = add <2 x i32> %1, %1
%3 = call <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i8> %3, %3
store <8 x i8> %4, <8 x i8>* %q
ret void
@@ -512,6 +598,8 @@ define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) {
%1 = load <4 x i16>, <4 x i16>* %p
%2 = add <4 x i16> %1, %1
%3 = call <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i8> %3, %3
store <8 x i8> %4, <8 x i8>* %q
ret void
@@ -524,6 +612,8 @@ define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) {
%1 = load <2 x double>, <2 x double>* %p
%2 = fadd <2 x double> %1, %1
%3 = call fp128 @test_f128_v2f64_helper(<2 x double> %2)
+ br label %return_bb
+return_bb:
%4 = fadd fp128 %3, %3
store fp128 %4, fp128* %q
ret void
@@ -536,6 +626,8 @@ define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) {
%1 = load <2 x i64>, <2 x i64>* %p
%2 = add <2 x i64> %1, %1
%3 = call fp128 @test_f128_v2i64_helper(<2 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = fadd fp128 %3, %3
store fp128 %4, fp128* %q
ret void
@@ -549,6 +641,8 @@ define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) {
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
%3 = call fp128 @test_f128_v4f32_helper(<4 x float> %2)
+ br label %return_bb
+return_bb:
%4 = fadd fp128 %3, %3
store fp128 %4, fp128* %q
ret void
@@ -562,6 +656,8 @@ define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) {
%1 = load <4 x i32>, <4 x i32>* %p
%2 = add <4 x i32> %1, %1
%3 = call fp128 @test_f128_v4i32_helper(<4 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = fadd fp128 %3, %3
store fp128 %4, fp128* %q
ret void
@@ -575,6 +671,8 @@ define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) {
%1 = load <8 x i16>, <8 x i16>* %p
%2 = add <8 x i16> %1, %1
%3 = call fp128 @test_f128_v8i16_helper(<8 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = fadd fp128 %3, %3
store fp128 %4, fp128* %q
ret void
@@ -588,6 +686,8 @@ define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) {
%1 = load <16 x i8>, <16 x i8>* %p
%2 = add <16 x i8> %1, %1
%3 = call fp128 @test_f128_v16i8_helper(<16 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = fadd fp128 %3, %3
store fp128 %4, fp128* %q
ret void
@@ -600,6 +700,8 @@ define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) {
%1 = load fp128, fp128* %p
%2 = fadd fp128 %1, %1
%3 = call <2 x double> @test_v2f64_f128_helper(fp128 %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x double> %3, %3
store <2 x double> %4, <2 x double>* %q
ret void
@@ -613,6 +715,8 @@ define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) {
%1 = load <2 x i64>, <2 x i64>* %p
%2 = add <2 x i64> %1, %1
%3 = call <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x double> %3, %3
store <2 x double> %4, <2 x double>* %q
ret void
@@ -627,6 +731,8 @@ define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) {
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
%3 = call <2 x double> @test_v2f64_v4f32_helper(<4 x float> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x double> %3, %3
store <2 x double> %4, <2 x double>* %q
ret void
@@ -641,6 +747,8 @@ define void @test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) {
%1 = load <4 x i32>, <4 x i32>* %p
%2 = add <4 x i32> %1, %1
%3 = call <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x double> %3, %3
store <2 x double> %4, <2 x double>* %q
ret void
@@ -655,6 +763,8 @@ define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) {
%1 = load <8 x i16>, <8 x i16>* %p
%2 = add <8 x i16> %1, %1
%3 = call <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x double> %3, %3
store <2 x double> %4, <2 x double>* %q
ret void
@@ -669,6 +779,8 @@ define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) {
%1 = load <16 x i8>, <16 x i8>* %p
%2 = add <16 x i8> %1, %1
%3 = call <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x double> %3, %3
store <2 x double> %4, <2 x double>* %q
ret void
@@ -681,6 +793,8 @@ define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) {
%1 = load fp128, fp128* %p
%2 = fadd fp128 %1, %1
%3 = call <2 x i64> @test_v2i64_f128_helper(fp128 %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i64> %3, %3
store <2 x i64> %4, <2 x i64>* %q
ret void
@@ -694,6 +808,8 @@ define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) {
%1 = load <2 x double>, <2 x double>* %p
%2 = fadd <2 x double> %1, %1
%3 = call <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i64> %3, %3
store <2 x i64> %4, <2 x i64>* %q
ret void
@@ -708,6 +824,8 @@ define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) {
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
%3 = call <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i64> %3, %3
store <2 x i64> %4, <2 x i64>* %q
ret void
@@ -722,6 +840,8 @@ define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) {
%1 = load <4 x i32>, <4 x i32>* %p
%2 = add <4 x i32> %1, %1
%3 = call <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i64> %3, %3
store <2 x i64> %4, <2 x i64>* %q
ret void
@@ -736,6 +856,8 @@ define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) {
%1 = load <8 x i16>, <8 x i16>* %p
%2 = add <8 x i16> %1, %1
%3 = call <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i64> %3, %3
store <2 x i64> %4, <2 x i64>* %q
ret void
@@ -750,6 +872,8 @@ define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) {
%1 = load <16 x i8>, <16 x i8>* %p
%2 = add <16 x i8> %1, %1
%3 = call <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i64> %3, %3
store <2 x i64> %4, <2 x i64>* %q
ret void
@@ -763,6 +887,8 @@ define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) {
%1 = load fp128, fp128* %p
%2 = fadd fp128 %1, %1
%3 = call <4 x float> @test_v4f32_f128_helper(fp128 %2)
+ br label %return_bb
+return_bb:
%4 = fadd <4 x float> %3, %3
store <4 x float> %4, <4 x float>* %q
ret void
@@ -777,6 +903,8 @@ define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) {
%1 = load <2 x double>, <2 x double>* %p
%2 = fadd <2 x double> %1, %1
%3 = call <4 x float> @test_v4f32_v2f64_helper(<2 x double> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <4 x float> %3, %3
store <4 x float> %4, <4 x float>* %q
ret void
@@ -791,6 +919,8 @@ define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) {
%1 = load <2 x i64>, <2 x i64>* %p
%2 = add <2 x i64> %1, %1
%3 = call <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <4 x float> %3, %3
store <4 x float> %4, <4 x float>* %q
ret void
@@ -806,6 +936,8 @@ define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) {
%1 = load <4 x i32>, <4 x i32>* %p
%2 = add <4 x i32> %1, %1
%3 = call <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <4 x float> %3, %3
store <4 x float> %4, <4 x float>* %q
ret void
@@ -821,6 +953,8 @@ define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) {
%1 = load <8 x i16>, <8 x i16>* %p
%2 = add <8 x i16> %1, %1
%3 = call <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <4 x float> %3, %3
store <4 x float> %4, <4 x float>* %q
ret void
@@ -836,6 +970,8 @@ define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) {
%1 = load <16 x i8>, <16 x i8>* %p
%2 = add <16 x i8> %1, %1
%3 = call <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <4 x float> %3, %3
store <4 x float> %4, <4 x float>* %q
ret void
@@ -849,6 +985,8 @@ define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) {
%1 = load fp128, fp128* %p
%2 = fadd fp128 %1, %1
%3 = call <4 x i32> @test_v4i32_f128_helper(fp128 %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i32> %3, %3
store <4 x i32> %4, <4 x i32>* %q
ret void
@@ -863,6 +1001,8 @@ define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) {
%1 = load <2 x double>, <2 x double>* %p
%2 = fadd <2 x double> %1, %1
%3 = call <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i32> %3, %3
store <4 x i32> %4, <4 x i32>* %q
ret void
@@ -877,6 +1017,8 @@ define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) {
%1 = load <2 x i64>, <2 x i64>* %p
%2 = add <2 x i64> %1, %1
%3 = call <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i32> %3, %3
store <4 x i32> %4, <4 x i32>* %q
ret void
@@ -892,6 +1034,8 @@ define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) {
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
%3 = call <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i32> %3, %3
store <4 x i32> %4, <4 x i32>* %q
ret void
@@ -907,6 +1051,8 @@ define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) {
%1 = load <8 x i16>, <8 x i16>* %p
%2 = add <8 x i16> %1, %1
%3 = call <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i32> %3, %3
store <4 x i32> %4, <4 x i32>* %q
ret void
@@ -922,6 +1068,8 @@ define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) {
%1 = load <16 x i8>, <16 x i8>* %p
%2 = add <16 x i8> %1, %1
%3 = call <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i32> %3, %3
store <4 x i32> %4, <4 x i32>* %q
ret void
@@ -935,6 +1083,8 @@ define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) {
%1 = load fp128, fp128* %p
%2 = fadd fp128 %1, %1
%3 = call <8 x i16> @test_v8i16_f128_helper(fp128 %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i16> %3, %3
store <8 x i16> %4, <8 x i16>* %q
ret void
@@ -949,6 +1099,8 @@ define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) {
%1 = load <2 x double>, <2 x double>* %p
%2 = fadd <2 x double> %1, %1
%3 = call <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i16> %3, %3
store <8 x i16> %4, <8 x i16>* %q
ret void
@@ -963,6 +1115,8 @@ define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) {
%1 = load <2 x i64>, <2 x i64>* %p
%2 = add <2 x i64> %1, %1
%3 = call <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i16> %3, %3
store <8 x i16> %4, <8 x i16>* %q
ret void
@@ -978,6 +1132,8 @@ define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) {
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
%3 = call <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i16> %3, %3
store <8 x i16> %4, <8 x i16>* %q
ret void
@@ -993,6 +1149,8 @@ define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) {
%1 = load <4 x i32>, <4 x i32>* %p
%2 = add <4 x i32> %1, %1
%3 = call <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i16> %3, %3
store <8 x i16> %4, <8 x i16>* %q
ret void
@@ -1008,6 +1166,8 @@ define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) {
%1 = load <16 x i8>, <16 x i8>* %p
%2 = add <16 x i8> %1, %1
%3 = call <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i16> %3, %3
store <8 x i16> %4, <8 x i16>* %q
ret void
@@ -1021,6 +1181,8 @@ define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) {
%1 = load fp128, fp128* %p
%2 = fadd fp128 %1, %1
%3 = call <16 x i8> @test_v16i8_f128_helper(fp128 %2)
+ br label %return_bb
+return_bb:
%4 = add <16 x i8> %3, %3
store <16 x i8> %4, <16 x i8>* %q
ret void
@@ -1035,6 +1197,8 @@ define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) {
%1 = load <2 x double>, <2 x double>* %p
%2 = fadd <2 x double> %1, %1
%3 = call <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %2)
+ br label %return_bb
+return_bb:
%4 = add <16 x i8> %3, %3
store <16 x i8> %4, <16 x i8>* %q
ret void
@@ -1049,6 +1213,8 @@ define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) {
%1 = load <2 x i64>, <2 x i64>* %p
%2 = add <2 x i64> %1, %1
%3 = call <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add <16 x i8> %3, %3
store <16 x i8> %4, <16 x i8>* %q
ret void
@@ -1064,6 +1230,8 @@ define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) {
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
%3 = call <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <16 x i8> %3, %3
store <16 x i8> %4, <16 x i8>* %q
ret void
@@ -1079,6 +1247,8 @@ define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) {
%1 = load <4 x i32>, <4 x i32>* %p
%2 = add <4 x i32> %1, %1
%3 = call <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add <16 x i8> %3, %3
store <16 x i8> %4, <16 x i8>* %q
ret void
@@ -1094,6 +1264,8 @@ define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) {
%1 = load <8 x i16>, <8 x i16>* %p
%2 = add <8 x i16> %1, %1
%3 = call <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add <16 x i8> %3, %3
store <16 x i8> %4, <16 x i8>* %q
ret void
diff --git a/test/CodeGen/AArch64/arm64-call-tailcalls.ll b/test/CodeGen/AArch64/arm64-call-tailcalls.ll
index 71d9327..6621db2 100644
--- a/test/CodeGen/AArch64/arm64-call-tailcalls.ll
+++ b/test/CodeGen/AArch64/arm64-call-tailcalls.ll
@@ -53,9 +53,9 @@ bb: ; preds = %entry
define i32 @t8(i32 %x) nounwind ssp {
; CHECK-LABEL: t8:
+; CHECK: b _c
; CHECK: b _a
; CHECK: b _b
-; CHECK: b _c
%and = and i32 %x, 1
%tobool = icmp eq i32 %and, 0
br i1 %tobool, label %if.end, label %if.then
diff --git a/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll b/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
new file mode 100644
index 0000000..f0b8299
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
@@ -0,0 +1,638 @@
+; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=NONSTRESS
+; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S -stress-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=STRESS
+; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S -disable-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=DISABLE
+
+; CodeGenPrepare should move the zext into the block with the load
+; so that SelectionDAG can select it with the load.
+;
+; OPTALL-LABEL: @foo
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+; OPTALL-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPTALL: store i32 [[ZEXT]], i32* %q
+; OPTALL: ret
+define void @foo(i8* %p, i32* %q) {
+entry:
+ %t = load i8, i8* %p
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i8 %t to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
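+
+; For illustration, a sketch of the IR expected after CodeGenPrepare runs on
+; @foo (the %promoted name is hypothetical): the zext is moved next to the
+; load so SelectionDAG can fold the pair into a zextload.
+;   entry:
+;     %t = load i8, i8* %p
+;     %promoted = zext i8 %t to i32
+;     %a = icmp slt i8 %t, 20
+;     br i1 %a, label %true, label %false
+;   true:
+;     store i32 %promoted, i32* %q
+;     ret void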
+
+; Check that we manage to form a zextload when an operation with only one
+; argument to explicitly extend is in the way.
+; OPTALL-LABEL: @promoteOneArg
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+; OPT-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT]], 2
+; Make sure the operation is not promoted when the promotion pass is disabled.
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], 2
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteOneArg(i8* %p, i32* %q) {
+entry:
+ %t = load i8, i8* %p
+ %add = add nuw i8 %t, 2
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to form a sextload when an operation with only one
+; argument to explicitly extend is in the way.
+; Version with sext.
+; OPTALL-LABEL: @promoteOneArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXT]], 2
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], 2
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteOneArgSExt(i8* %p, i32* %q) {
+entry:
+ %t = load i8, i8* %p
+ %add = add nsw i8 %t, 2
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = sext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to form a zextload when an operation with two
+; arguments to explicitly extend is in the way.
+; Extending %add will create two extensions:
+; 1. One for %b.
+; 2. One for %t.
+; #1 will not be removed as we do not know anything about %b.
+; #2 may not be merged with the load because %t is used in a comparison.
+; Since two extensions may be emitted in the end instead of one before the
+; transformation, the regular heuristic does not apply the optimization.
+;
+; OPTALL-LABEL: @promoteTwoArgZext
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteTwoArgZext(i8* %p, i32* %q, i8 %b) {
+entry:
+ %t = load i8, i8* %p
+ %add = add nuw i8 %t, %b
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to form a sextload when an operation with two
+; arguments to explicitly extend is in the way.
+; Version with sext.
+; OPTALL-LABEL: @promoteTwoArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[SEXTLD:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i8 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXTLD]], [[SEXTB]]
+;
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteTwoArgSExt(i8* %p, i32* %q, i8 %b) {
+entry:
+ %t = load i8, i8* %p
+ %add = add nsw i8 %t, %b
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = sext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we do not form a zextload if we need to introduce more than
+; one additional extension.
+; OPTALL-LABEL: @promoteThreeArgZext
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32
+; STRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+; STRESS-NEXT: [[ZEXTC:%[a-zA-Z_0-9-]+]] = zext i8 %c to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[TMP]], [[ZEXTC]]
+;
+; NONSTRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; NONSTRESS-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[TMP]], %c
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; DISABLE: add nuw i8
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteThreeArgZext(i8* %p, i32* %q, i8 %b, i8 %c) {
+entry:
+ %t = load i8, i8* %p
+ %tmp = add nuw i8 %t, %b
+ %add = add nuw i8 %tmp, %c
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
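+
+; Cost tally for the heuristic above: promoting %add through the load would
+; remove the final zext but introduce two new ones (for %b and %c), a net
+; loss, which is why only stress mode performs it.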
+
+; Check that we manage to form a zextload after promoting and merging
+; two extensions.
+; OPTALL-LABEL: @promoteMergeExtArgZExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i16 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32
+;
+; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteMergeExtArgZExt(i8* %p, i32* %q, i16 %b) {
+entry:
+ %t = load i8, i8* %p
+ %ext = zext i8 %t to i16
+ %add = add nuw i16 %ext, %b
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i16 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to form a sextload after promoting and merging
+; two extensions.
+; Version with sext.
+; OPTALL-LABEL: @promoteMergeExtArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = sext i16 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+;
+; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteMergeExtArgSExt(i8* %p, i32* %q, i16 %b) {
+entry:
+ %t = load i8, i8* %p
+ %ext = zext i8 %t to i16
+ %add = add nsw i16 %ext, %b
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = sext i16 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to catch all the extload opportunities that are exposed
+; by the different iterations of codegen prepare.
+; Moreover, check that we do not promote more than we need to.
+; Here is what is happening in this test (not necessarily in this order):
+; 1. We try to promote the operand of %sextadd.
+; a. This creates one sext of %ld2 and one of %zextld
+; b. The sext of %ld2 can be combined with %ld2, so we remove one sext but
+; introduce one. This is fine with the current heuristic: neutral.
+; => We have one zext of %zextld left and we created one sext of %ld2.
+; 2. We try to promote the operand of %sextaddza.
+; a. This creates one sext of %zexta and one of %zextld
+; b. The sext of %zexta does not lead to any load; it stays here, even if it
+; could have been combined with the zext of %a.
+; c. The sext of %zextld leads to %ld and can be combined with it. This is
+; done by promoting %zextld. This is fine with the current heuristic:
+; neutral.
+; => We have created a new zext of %ld and we created one sext of %zexta.
+; 3. We try to promote the operand of %sextaddb.
+; a. This creates one sext of %b and one of %zextld
+; b. The sext of %b is a dead-end, nothing to be done.
+; c. Same thing as 2.c. happens.
+; => We have created a new zext of %ld and we created one sext of %b.
+; 4. We try to promote the operand of the zext of %zextld introduced in #1.
+; a. Same thing as 2.c. happens.
+; b. %zextld does not have any other uses. It is dead-code eliminated.
+; => We have created a new zext of %ld and we removed a zext of %zextld and
+; a zext of %ld.
+; Currently we do not try to reuse existing extensions, so in the end we have
+; three identical zexts of %ld; SelectionDAG will CSE them.
+;
+; OPTALL-LABEL: @severalPromotions
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %addr1
+; OPT-NEXT: [[ZEXTLD1_1:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTLD1_2:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTLD1_3:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[LD2:%[a-zA-Z_0-9-]+]] = load i32, i32* %addr2
+; OPT-NEXT: [[SEXTLD2:%[a-zA-Z_0-9-]+]] = sext i32 [[LD2]] to i64
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD2]], [[ZEXTLD1_1]]
+; We do not combine this one: see 2.b.
+; OPT-NEXT: [[ZEXTA:%[a-zA-Z_0-9-]+]] = zext i8 %a to i32
+; OPT-NEXT: [[SEXTZEXTA:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXTA]] to i64
+; OPT-NEXT: [[RESZA:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTZEXTA]], [[ZEXTLD1_3]]
+; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; OPT-NEXT: [[RESB:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTB]], [[ZEXTLD1_2]]
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
+; DISABLE: [[ADDZA:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RESZA:%[a-zA-Z_0-9-]+]] = sext i32 [[ADDZA]] to i64
+; DISABLE: [[ADDB:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RESB:%[a-zA-Z_0-9-]+]] = sext i32 [[ADDB]] to i64
+;
+; OPTALL: call void @dummy(i64 [[RES]], i64 [[RESZA]], i64 [[RESB]])
+; OPTALL: ret
+define void @severalPromotions(i8* %addr1, i32* %addr2, i8 %a, i32 %b) {
+ %ld = load i8, i8* %addr1
+ %zextld = zext i8 %ld to i32
+ %ld2 = load i32, i32* %addr2
+ %add = add nsw i32 %ld2, %zextld
+ %sextadd = sext i32 %add to i64
+ %zexta = zext i8 %a to i32
+ %addza = add nsw i32 %zexta, %zextld
+ %sextaddza = sext i32 %addza to i64
+ %addb = add nsw i32 %b, %zextld
+ %sextaddb = sext i32 %addb to i64
+ call void @dummy(i64 %sextadd, i64 %sextaddza, i64 %sextaddb)
+ ret void
+}
+
+declare void @dummy(i64, i64, i64)
+
+; Make sure we do not try to promote vector types since the type promotion
+; helper does not support them for now.
+; OPTALL-LABEL: @vectorPromotion
+; OPTALL: [[SHL:%[a-zA-Z_0-9-]+]] = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8>
+; OPTALL: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext <2 x i32> [[SHL]] to <2 x i64>
+; OPTALL: ret
+define void @vectorPromotion() {
+entry:
+ %a = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8>
+ %b = zext <2 x i32> %a to <2 x i64>
+ ret void
+}
+
+@a = common global i32 0, align 4
+@c = common global [2 x i32] zeroinitializer, align 4
+
+; Make sure we support promotion of operands that produce a Value as opposed
+; to an Instruction.
+; This used to cause a crash.
+; OPTALL-LABEL: @promotionOfArgEndsUpInValue
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i16, i16* %addr
+;
+; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i16 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw nsw i32 [[SEXT]], zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @c, i64 0, i64 1), i32* @a) to i32)
+;
+; DISABLE-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw nsw i16 [[LD]], zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @c, i64 0, i64 1), i32* @a) to i16)
+; DISABLE-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+;
+; OPTALL-NEXT: ret i32 [[RES]]
+define i32 @promotionOfArgEndsUpInValue(i16* %addr) {
+entry:
+ %val = load i16, i16* %addr
+ %add = add nuw nsw i16 %val, zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @c, i64 0, i64 1), i32* @a) to i16)
+ %conv3 = sext i16 %add to i32
+ ret i32 %conv3
+}
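+
+; Note: the second operand of the add above is a constant expression (a zext
+; of an icmp over global addresses). A ConstantExpr is a Value but not an
+; Instruction, which is the case the promotion helper used to crash on.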
+
+; Check that we see that one zext can be derived from the other for free.
+; OPTALL-LABEL: @promoteTwoArgZextWithSourceExtendedTwice
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPT-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; OPT-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], 12
+; OPT-NEXT: store i32 [[RES32]], i32* %addr
+; OPT-NEXT: store i64 [[RES64]], i64* %q
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[RES2_32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], 12
+; DISABLE-NEXT: store i32 [[RES32]], i32* %addr
+; DISABLE-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES2_32]] to i64
+; DISABLE-NEXT: store i64 [[ZEXT64]], i64* %q
+;
+; OPTALL-NEXT: ret void
+define void @promoteTwoArgZextWithSourceExtendedTwice(i8* %p, i64* %q, i32 %b, i32* %addr) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nuw i32 %zextt, %b
+ %add2 = add nuw i32 %zextt, 12
+ store i32 %add, i32 *%addr
+ %s = zext i32 %add2 to i64
+ store i64 %s, i64* %q
+ ret void
+}
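+
+; On AArch64 both extensions above end up free once the load is a zextload:
+; ldrb zeroes the upper bits of its destination register, so (a sketch,
+; register names hypothetical):
+;   ldrb w8, [x0]    ; w8 is the i32 zext, x8 is already the i64 zext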
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free sext. If we would have promoted
+; all the way through the load we would end up with a free zext and a
+; non-free sext (of %b).
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+; STRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = trunc i64 [[IDX64]] to i32
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, i32* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i32 [[RES32]], i32* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeSExtFromAddrMode(i8* %p, i32 %b, i32* %addr) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = getelementptr inbounds i32, i32* %addr, i64 %idx64
+ store i32 %add, i32 *%staddr
+ ret void
+}
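+
+; The sext is free in the non-promoted form because it folds into the
+; store's addressing mode (a sketch, register allocation may differ):
+;   ldrb w8, [x0]
+;   add  w8, w8, w1
+;   str  w8, [x2, w8, sxtw #2]   ; the i32 -> i64 sext is absorbed here
+; The same reasoning applies to the i64 and i128 variants that follow.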
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free sext. If we would have promoted
+; all the way through the load we would end up with a free zext and a
+; non-free sext (of %b).
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode64
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i64, i64* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i64 %stuff, i64* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeSExtFromAddrMode64(i8* %p, i32 %b, i64* %addr, i64 %stuff) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = getelementptr inbounds i64, i64* %addr, i64 %idx64
+ store i64 %stuff, i64 *%staddr
+ ret void
+}
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free sext. If we would have promoted
+; all the way through the load we would end up with a free zext and a
+; non-free sext (of %b).
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode128
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i128, i128* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i128 %stuff, i128* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeSExtFromAddrMode128(i8* %p, i32 %b, i128* %addr, i128 %stuff) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = getelementptr inbounds i128, i128* %addr, i64 %idx64
+ store i128 %stuff, i128 *%staddr
+ ret void
+}
+
+
+; Check that the promotion does happen here, unlike in the variants above:
+; no AArch64 addressing mode can absorb a sext for an i256 access, so the
+; sext was never free to begin with and promoting all the way through the
+; load does not increase the cost.
+; OPTALL-LABEL: @promoteSExtFromAddrMode256
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; OPT-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i256, i256* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i256 %stuff, i256* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @promoteSExtFromAddrMode256(i8* %p, i32 %b, i256* %addr, i256 %stuff) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = getelementptr inbounds i256, i256* %addr, i64 %idx64
+ store i256 %stuff, i256 *%staddr
+ ret void
+}
+
+; Check that we do not increase the cost of the code.
+; The input has two free zexts.
+; When we promote all the way through the load, we end up with
+; a free zext and a non-free zext (of %b).
+; However, the current target lowering says a zext from i32 to i64 is free,
+; so the promotion happens: the cost does not change and it may
+; expose more opportunities.
+; This would need to be fixed at some point.
+; OPTALL-LABEL: @doNotPromoteFreeZExtFromAddrMode
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; This transformation should really happen only in stress mode.
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i32 %b to i64
+; OPT-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], [[ZEXTB]]
+; OPT-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = trunc i64 [[IDX64]] to i32
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, i32* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i32 [[RES32]], i32* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeZExtFromAddrMode(i8* %p, i32 %b, i32* %addr) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nuw i32 %zextt, %b
+ %idx64 = zext i32 %add to i64
+ %staddr = getelementptr inbounds i32, i32* %addr, i64 %idx64
+ store i32 %add, i32 *%staddr
+ ret void
+}
+
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromShift
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = shl i64 [[IDX64]], 12
+; OPTALL-NEXT: ret i64 %staddr
+define i64 @doNotPromoteFreeSExtFromShift(i8* %p, i32 %b) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = shl i64 %idx64, 12
+ ret i64 %staddr
+}
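+
+; The sext is free here because a sign extension followed by a left shift
+; folds into a single bitfield insert (a sketch, register names
+; hypothetical):
+;   sbfiz x0, x8, #12, #32    ; == shl i64 (sext i32 w8 to i64), 12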
+
+; Same comment as doNotPromoteFreeZExtFromAddrMode.
+; OPTALL-LABEL: @doNotPromoteFreeZExtFromShift
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; This transformation should really happen only in stress mode.
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i32 %b to i64
+; OPT-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], [[ZEXTB]]
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = shl i64 [[IDX64]], 12
+; OPTALL-NEXT: ret i64 %staddr
+define i64 @doNotPromoteFreeZExtFromShift(i8* %p, i32 %b) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nuw i32 %zextt, %b
+ %idx64 = zext i32 %add to i64
+ %staddr = shl i64 %idx64, 12
+ ret i64 %staddr
+}
+
+; The input has one free zext and one non-free sext.
+; When we promote all the way through to the load, we end up with
+; a free zext, a free sext (%ld1), and a non-free sext (of %cst).
+; However, we then generate a load pair and the free sext (%ld1) becomes
+; non-free. So technically, we would trade one non-free sext for two
+; non-free sexts.
+; This would need to be fixed at some point.
+; OPTALL-LABEL: @doNotPromoteBecauseOfPairedLoad
+; OPTALL: [[LD0:%[a-zA-Z_0-9-]+]] = load i32, i32* %p
+; OPTALL: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, i32* %p, i64 1
+; OPTALL: [[LD1:%[a-zA-Z_0-9-]+]] = load i32, i32* [[GEP]]
+;
+; This transformation should really happen only in stress mode.
+; OPT-NEXT: [[SEXTLD1:%[a-zA-Z_0-9-]+]] = sext i32 [[LD1]] to i64
+; OPT-NEXT: [[SEXTCST:%[a-zA-Z_0-9-]+]] = sext i32 %cst to i64
+; OPT-NEXT: [[SEXTRES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD1]], [[SEXTCST]]
+;
+; DISABLE-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[LD1]], %cst
+; DISABLE-NEXT: [[SEXTRES:%[a-zA-Z_0-9-]+]] = sext i32 [[RES]] to i64
+;
+; OPTALL-NEXT: [[ZEXTLD0:%[a-zA-Z_0-9-]+]] = zext i32 [[LD0]] to i64
+; OPTALL-NEXT: [[FINAL:%[a-zA-Z_0-9-]+]] = add i64 [[SEXTRES]], [[ZEXTLD0]]
+; OPTALL-NEXT: ret i64 [[FINAL]]
+define i64 @doNotPromoteBecauseOfPairedLoad(i32* %p, i32 %cst) {
+ %ld0 = load i32, i32* %p
+ %idxLd1 = getelementptr inbounds i32, i32* %p, i64 1
+ %ld1 = load i32, i32* %idxLd1
+ %res = add nsw i32 %ld1, %cst
+ %sextres = sext i32 %res to i64
+ %zextLd0 = zext i32 %ld0 to i64
+ %final = add i64 %sextres, %zextLd0
+ ret i64 %final
+}
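+
+; Why the paired load hurts: once %ld0 and %ld1 merge into a single ldp, the
+; sign extension of %ld1 can no longer be folded into its load (a standalone
+; load could have used ldrsw), so it costs a separate instruction. A sketch,
+; register names hypothetical:
+;   ldp  w8, w9, [x0]    ; %ld0 and %ld1 loaded together
+;   sxtw x9, w9          ; the formerly free sext of %ld1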
diff --git a/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index c6b7d83..c4e3e4e 100644
--- a/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -3,11 +3,11 @@
define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) {
; CHECK: fptosi_v4f64_to_v4i16
-; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v1.2d
-; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v0.2d
-; CHECK-DAG: xtn v[[LHS_NA:[0-9]+]].2s, v[[LHS]].2d
-; CHECK-DAG: xtn v[[RHS_NA:[0-9]+]].2s, v[[RHS]].2d
-; CHECK: uzp1 v0.4h, v[[RHS_NA]].4h, v[[LHS_NA]].4h
+; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v0.2d
+; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v1.2d
+; CHECK-DAG: xtn v[[MID:[0-9]+]].2s, v[[LHS]].2d
+; CHECK-DAG: xtn2 v[[MID]].4s, v[[RHS]].2d
+; CHECK: xtn v0.4h, v[[MID]].4s
%tmp1 = load <4 x double>, <4 x double>* %ptr
%tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
ret <4 x i16> %tmp2
@@ -15,17 +15,17 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) {
define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) {
; CHECK: fptosi_v4f64_to_v4i8
-; CHECK-DAG: fcvtzs v[[CONV3:[0-9]+]].2d, v3.2d
-; CHECK-DAG: fcvtzs v[[CONV2:[0-9]+]].2d, v2.2d
-; CHECK-DAG: fcvtzs v[[CONV1:[0-9]+]].2d, v1.2d
; CHECK-DAG: fcvtzs v[[CONV0:[0-9]+]].2d, v0.2d
-; CHECK-DAG: xtn v[[NA3:[0-9]+]].2s, v[[CONV3]].2d
+; CHECK-DAG: fcvtzs v[[CONV1:[0-9]+]].2d, v1.2d
+; CHECK-DAG: fcvtzs v[[CONV2:[0-9]+]].2d, v2.2d
+; CHECK-DAG: fcvtzs v[[CONV3:[0-9]+]].2d, v3.2d
; CHECK-DAG: xtn v[[NA2:[0-9]+]].2s, v[[CONV2]].2d
-; CHECK-DAG: xtn v[[NA1:[0-9]+]].2s, v[[CONV1]].2d
+; CHECK-DAG: xtn2 v[[NA2]].4s, v[[CONV3]].2d
; CHECK-DAG: xtn v[[NA0:[0-9]+]].2s, v[[CONV0]].2d
-; CHECK-DAG: uzp1 v[[TMP1:[0-9]+]].4h, v[[CONV2]].4h, v[[CONV3]].4h
-; CHECK-DAG: uzp1 v[[TMP2:[0-9]+]].4h, v[[CONV0]].4h, v[[CONV1]].4h
-; CHECK: uzp1 v0.8b, v[[TMP2]].8b, v[[TMP1]].8b
+; CHECK-DAG: xtn2 v[[NA0]].4s, v[[CONV1]].2d
+; CHECK-DAG: xtn v[[TMP1:[0-9]+]].4h, v[[NA0]].4s
+; CHECK-DAG: xtn2 v[[TMP1]].8h, v[[NA2]].4s
+; CHECK: xtn v0.8b, v[[TMP1]].8h
%tmp1 = load <8 x double>, <8 x double>* %ptr
%tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
ret <8 x i8> %tmp2
diff --git a/test/CodeGen/AArch64/arm64-dup.ll b/test/CodeGen/AArch64/arm64-dup.ll
index 849e227..c6b7de3 100644
--- a/test/CodeGen/AArch64/arm64-dup.ll
+++ b/test/CodeGen/AArch64/arm64-dup.ll
@@ -321,3 +321,40 @@ entry:
%sub = sub <4 x i16> %a, %mul
ret <4 x i16> %sub
}
+
+; Also test the DUP path in the PerfectShuffle generator.
+
+; CHECK-LABEL: test_perfectshuffle_dupext_v4i16:
+; CHECK-NEXT: dup.4h v0, v0[0]
+; CHECK-NEXT: ext.8b v0, v0, v1, #4
+define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+ %r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+ ret <4 x i16> %r
+}
+
+; CHECK-LABEL: test_perfectshuffle_dupext_v4f16:
+; CHECK-NEXT: dup.4h v0, v0[0]
+; CHECK-NEXT: ext.8b v0, v0, v1, #4
+; CHECK-NEXT: ret
+define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
+ %r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+ ret <4 x half> %r
+}
+
+; CHECK-LABEL: test_perfectshuffle_dupext_v4i32:
+; CHECK-NEXT: dup.4s v0, v0[0]
+; CHECK-NEXT: ext.16b v0, v0, v1, #8
+; CHECK-NEXT: ret
+define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+ %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+ ret <4 x i32> %r
+}
+
+; CHECK-LABEL: test_perfectshuffle_dupext_v4f32:
+; CHECK-NEXT: dup.4s v0, v0[0]
+; CHECK-NEXT: ext.16b v0, v0, v1, #8
+; CHECK-NEXT: ret
+define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
+ %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+ ret <4 x float> %r
+}
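+
+; Note on the expected lowering: the mask <0, 0, 4, 5> duplicates lane 0 of
+; %a and then appends the low half of %b, which is exactly a dup (splat of
+; lane 0) followed by an ext (concatenate and extract), matching the checks
+; above.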
diff --git a/test/CodeGen/AArch64/arm64-fcopysign.ll b/test/CodeGen/AArch64/arm64-fcopysign.ll
index 66241df..feffd41 100644
--- a/test/CodeGen/AArch64/arm64-fcopysign.ll
+++ b/test/CodeGen/AArch64/arm64-fcopysign.ll
@@ -39,7 +39,7 @@ entry:
; CHECK: fcvt s0, d0
; CHECK: movi.4s v[[CONST:[0-9]+]], #0x80, lsl #24
; CHECK: bit.16b v{{[0-9]+}}, v0, v[[CONST]]
- %0 = tail call double (...)* @bar() nounwind
+ %0 = tail call double (...) @bar() nounwind
%1 = fptrunc double %0 to float
%2 = tail call float @copysignf(float 5.000000e-01, float %1) nounwind readnone
%3 = fadd float %1, %2
diff --git a/test/CodeGen/AArch64/arm64-join-reserved.ll b/test/CodeGen/AArch64/arm64-join-reserved.ll
index e99168b..dee0344 100644
--- a/test/CodeGen/AArch64/arm64-join-reserved.ll
+++ b/test/CodeGen/AArch64/arm64-join-reserved.ll
@@ -10,7 +10,7 @@ target triple = "arm64-apple-macosx10"
; CHECK: ret
define void @g() nounwind ssp {
entry:
- tail call void (i32, ...)* @f(i32 0, i32 0) nounwind
+ tail call void (i32, ...) @f(i32 0, i32 0) nounwind
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
new file mode 100644
index 0000000..5bc4d71
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=arm64-apple-ios -aarch64-strict-align < %s | FileCheck %s
+
+; Small (16 bytes here) unaligned memcpys should stay memcpy calls if
+; strict-alignment is turned on.
+define void @t0(i8* %out, i8* %in) {
+; CHECK-LABEL: t0:
+; CHECK: orr w2, wzr, #0x10
+; CHECK-NEXT: bl _memcpy
+entry:
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 16, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
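+
+; Without -aarch64-strict-align the same call would be expanded inline into
+; wide loads and stores, which strict alignment forbids here since the
+; pointers are only known to be 1-byte aligned; hence the libcall survives.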
diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index 4a92c3d..b74a406 100644
--- a/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1086,7 +1086,7 @@ define <2 x i32> @test_concat_diff_v1i32_v1i32(i32 %a, i32 %b) {
; CHECK-LABEL: test_concat_diff_v1i32_v1i32:
; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK-NEXT: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: ins {{v[0-9]+}}.s[1], w{{[0-9]+}}
entry:
%c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
%d = insertelement <2 x i32> undef, i32 %c, i32 0
diff --git a/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
new file mode 100644
index 0000000..51ed8a1
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
@@ -0,0 +1,456 @@
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V8a
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple
+
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
+declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)
+
+declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
+declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)
+
+declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
+declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
+
+;-----------------------------------------------------------------------------
+; RDMA Vector
+; test for SIMDThreeSameVectorSQRDMLxHTiedHS
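+;
+; The shape being matched throughout this file is a sqrdmulh intrinsic whose
+; result feeds a saturating add or sub, e.g. (from the first test below):
+;   %prod   = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
+;   %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; With +v8.1a the pair should fuse into a single sqrdmlah (sqrdmlsh for the
+; sqsub case); without it, the two instructions stay separate.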
+
+define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v4i16:
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
+ %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
+; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.4h
+; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2
+ ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v8i16:
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
+ %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
+; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.8h
+; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
+ ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v2i32:
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
+ %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
+; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.2s
+; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2
+ ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v4i32:
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
+ %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
+; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.4s
+; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2
+ ret <4 x i32> %retval
+}
+
+define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v4i16:
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
+ %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
+; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.4h
+; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2
+ ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v8i16:
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
+ %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
+; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.8h
+; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2
+ ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v2i32:
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
+ %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
+; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.2s
+; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2
+ ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v4i32:
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
+ %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
+; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.4s
+; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2
+ ret <4 x i32> %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Vector, by element
+; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
+
+define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlah_lane_s16:
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+ %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
+; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.h[3]
+; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2[3]
+ ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlahq_lane_s16:
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+ %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
+; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.h[2]
+; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2[2]
+ ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlah_lane_s32:
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+ %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
+; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.s[1]
+; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2[1]
+ ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlahq_lane_s32:
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+ %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
+; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.s[0]
+; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2[0]
+ ret <4 x i32> %retval
+}
+
+define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlsh_lane_s16:
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+ %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
+; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.h[3]
+; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2[3]
+ ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlshq_lane_s16:
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+ %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
+; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.h[2]
+; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2[2]
+ ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlsh_lane_s32:
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+ %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
+; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.s[1]
+; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2[1]
+ ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlshq_lane_s32:
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+ %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
+; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.s[0]
+; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2[0]
+ ret <4 x i32> %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Vector, by element, extracted
+; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
+; i32 tests are for "def : Pat" in SIMDIndexedSQRDMLxHSDTied
+
+define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+ %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
+ %retval = extractelement <4 x i16> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-V81a: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}}, v0, v1[1]
+ ret i16 %retval
+}
+
+define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+ %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
+ %retval = extractelement <8 x i16> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-V81a: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}}, v0, v1[1]
+ ret i16 %retval
+}
+
+define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+ %extract = extractelement <2 x i32> %prod, i64 0
+ %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
+; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
+; CHECK-V81a: sqrdmlah v2.2s, v0.2s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlah.2s v2, v0, v1[0]
+ ret i32 %retval
+}
+
+define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+ %extract = extractelement <4 x i32> %prod, i64 0
+ %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
+; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
+; CHECK-V81a: sqrdmlah v2.4s, v0.4s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlah.4s v2, v0, v1[0]
+ ret i32 %retval
+}
+
+define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+ %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
+ %retval = extractelement <4 x i16> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}}, v0, v1[1]
+ ret i16 %retval
+}
+
+define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+ %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
+ %retval = extractelement <8 x i16> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}}, v0, v1[1]
+ ret i16 %retval
+}
+
+define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+ %extract = extractelement <2 x i32> %prod, i64 0
+ %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
+; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
+; CHECK-V81a: sqrdmlsh v2.2s, v0.2s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlsh.2s v2, v0, v1[0]
+ ret i32 %retval
+}
+
+define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+ %extract = extractelement <4 x i32> %prod, i64 0
+ %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
+; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
+; CHECK-V81a: sqrdmlsh v2.4s, v0.4s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlsh.4s v2, v0, v1[0]
+ ret i32 %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Scalar
+; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstInfo.td
+
+define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
+; CHECK-LABEL: test_sqrdmlah_v1i16:
+ %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
+ %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
+ %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
+ %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
+ %retval = extractelement <4 x i16> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+ ret i16 %retval
+}
+
+define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
+; CHECK-LABEL: test_sqrdmlah_v1i32:
+ %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
+ %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
+ %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
+ %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
+ %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
+ %retval = extractelement <4 x i32> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+ ret i32 %retval
+}
+
+
+define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
+; CHECK-LABEL: test_sqrdmlsh_v1i16:
+ %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
+ %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
+ %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
+ %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
+ %retval = extractelement <4 x i16> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+ ret i16 %retval
+}
+
+define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
+; CHECK-LABEL: test_sqrdmlsh_v1i32:
+ %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
+ %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
+ %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
+ %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
+ %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
+ %retval = extractelement <4 x i32> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+ ret i32 %retval
+}
+
+define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
+; CHECK-LABEL: test_sqrdmlah_i32:
+ %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
+ %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
+; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret i32 %retval
+}
+
+define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_i32:
+ %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
+ %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
+; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret i32 %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Scalar, by element
+; i16 tests are covered by the tests in the section above, with IR written in ACLE style
+; i32 tests exercise i32_indexed in SIMDIndexedSQRDMLxHSDTied
+
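+; In terms of the sketch above, the by-element forms take the multiplier from
+; one lane of a vector operand (hypothetical helper):
+;
+;   int16_t sqrdmlah_lane_s16_ref(int16_t acc, int16_t x,
+;                                 const int16_t v[4], int lane) {
+;     return sqrdmlah_s16_ref(acc, x, v[lane]);
+;   }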
+define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
+; CHECK-LABEL: test_sqrdmlah_extract_i16:
+ %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
+ %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle)
+ %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
+ %retval = extractelement <4 x i16> %retval_vec, i32 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
+; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
+; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
+ ret i16 %retval
+}
+
+define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_extract_i32:
+ %extract = extractelement <4 x i32> %rhs, i32 3
+ %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
+ %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
+; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
+ ret i32 %retval
+}
+
+define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
+; CHECK-LABEL: test_sqrdmlshq_extract_i16:
+ %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1>
+ %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle)
+ %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
+ %retval = extractelement <8 x i16> %retval_vec, i32 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
+; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
+; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
+ ret i16 %retval
+}
+
+define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_extract_i32:
+ %extract = extractelement <4 x i32> %rhs, i32 3
+ %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
+ %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
+; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
+ ret i32 %retval
+}
diff --git a/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
index 5a740d8..2651f11 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
@@ -9,7 +9,7 @@
define void @clobberScratch(i32* %p) {
%v = load i32, i32* %p
tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
- tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
store i32 %v, i32* %p
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
index 8f79f80..b8236c5 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
@@ -23,9 +23,9 @@ entry:
; FAST-NEXT: movk x16, #0xbeef
; FAST-NEXT: blr x16
%resolveCall2 = inttoptr i64 281474417671919 to i8*
- %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+ %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
%resolveCall3 = inttoptr i64 244837814038255 to i8*
- tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+ tail call webkit_jscc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
ret void
}
@@ -59,7 +59,7 @@ entry:
; FAST-NEXT: movk x16, #0xbeef
; FAST-NEXT: blr x16
%call = inttoptr i64 281474417671919 to i8*
- %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
ret i64 %result
}
@@ -101,7 +101,7 @@ entry:
; FAST-NEXT: movk x16, #0xbeef
; FAST-NEXT: blr x16
%call = inttoptr i64 281474417671919 to i8*
- %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
ret i64 %result
}
diff --git a/test/CodeGen/AArch64/arm64-patchpoint.ll b/test/CodeGen/AArch64/arm64-patchpoint.ll
index cf06653..d9ec7e5 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint.ll
@@ -16,9 +16,9 @@ entry:
; CHECK-NEXT: blr x16
; CHECK: ret
%resolveCall2 = inttoptr i64 244837814094590 to i8*
- %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
%resolveCall3 = inttoptr i64 244837814094591 to i8*
- tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 20, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 20, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
ret i64 %result
}
@@ -38,7 +38,7 @@ entry:
store i64 11, i64* %metadata
store i64 12, i64* %metadata
store i64 13, i64* %metadata
- call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
+ call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
ret void
}
@@ -51,14 +51,14 @@ entry:
%tmp80 = add i64 %tmp79, -16
%tmp81 = inttoptr i64 %tmp80 to i64*
%tmp82 = load i64, i64* %tmp81, align 8
- tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
- tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+ tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
%tmp83 = load i64, i64* %tmp33, align 8
%tmp84 = add i64 %tmp83, -24
%tmp85 = inttoptr i64 %tmp84 to i64*
%tmp86 = load i64, i64* %tmp85, align 8
- tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
- tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+ tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
ret i64 10
}
@@ -74,7 +74,7 @@ entry:
; CHECK-NEXT: nop
; CHECK-NEXT: ldp
; CHECK-NEXT: ret
- %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2)
+ %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2)
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-stackmap-nops.ll b/test/CodeGen/AArch64/arm64-stackmap-nops.ll
index 5915b64..2647ac4 100644
--- a/test/CodeGen/AArch64/arm64-stackmap-nops.ll
+++ b/test/CodeGen/AArch64/arm64-stackmap-nops.ll
@@ -8,7 +8,7 @@ entry:
; CHECK: nop
; CHECK-NEXT: nop
; CHECK-NOT: nop
- tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 16)
+ tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 0, i32 16)
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-stackmap.ll b/test/CodeGen/AArch64/arm64-stackmap.ll
index 29e4484..1a4df7a 100644
--- a/test/CodeGen/AArch64/arm64-stackmap.ll
+++ b/test/CodeGen/AArch64/arm64-stackmap.ll
@@ -78,7 +78,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define void @constantargs() {
entry:
%0 = inttoptr i64 244837814094590 to i8*
- tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 20, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 20, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
ret void
}
@@ -100,7 +100,7 @@ entry:
; Runtime void->void call.
call void inttoptr (i64 244837814094590 to void ()*)()
; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
- call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
+ call void (i64, i32, ...) @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
ret void
}
@@ -126,7 +126,7 @@ entry:
cold:
; OSR patchpoint with 12-byte nop-slide and 2 live vars.
%thunk = inttoptr i64 244837814094590 to i8*
- call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4, i32 20, i8* %thunk, i32 0, i64 %a, i64 %b)
+ call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4, i32 20, i8* %thunk, i32 0, i64 %a, i64 %b)
unreachable
ret:
ret void
@@ -142,7 +142,7 @@ ret:
define i64 @propertyRead(i64* %obj) {
entry:
%resolveRead = inttoptr i64 244837814094590 to i8*
- %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveRead, i32 1, i64* %obj)
+ %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveRead, i32 1, i64* %obj)
%add = add i64 %result, 3
ret i64 %add
}
@@ -162,7 +162,7 @@ entry:
define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
entry:
%resolveWrite = inttoptr i64 244837814094590 to i8*
- call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+ call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
ret void
}
@@ -184,7 +184,7 @@ entry:
define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
entry:
%resolveCall = inttoptr i64 244837814094590 to i8*
- call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 7, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 7, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
ret void
}
@@ -206,7 +206,7 @@ entry:
define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
entry:
%resolveCall = inttoptr i64 244837814094590 to i8*
- %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 8, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 8, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
%add = add i64 %result, 3
ret i64 %add
}
@@ -226,7 +226,7 @@ entry:
; CHECK-NEXT: .short 29
define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) {
entry:
- call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 11, i32 20, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
+ call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 20, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
ret void
}
@@ -245,7 +245,7 @@ entry:
; CHECK-NEXT: .short 29
define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29) {
entry:
- call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29)
+ call void (i64, i32, ...) @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29)
ret void
}
@@ -263,7 +263,7 @@ entry:
; CHECK-NEXT: .long 33
define void @liveConstant() {
- tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 8, i32 33)
+ tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 15, i32 8, i32 33)
ret void
}
@@ -280,7 +280,7 @@ define void @liveConstant() {
; CHECK-NEXT: .long -{{[0-9]+}}
define void @clobberLR(i32 %a) {
tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x31}"() nounwind
- tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
+ tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-vshuffle.ll b/test/CodeGen/AArch64/arm64-vshuffle.ll
index 75e0d80..15ea21b 100644
--- a/test/CodeGen/AArch64/arm64-vshuffle.ll
+++ b/test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -1,22 +1,8 @@
; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -mcpu=cyclone | FileCheck %s
-; The mask:
-; CHECK: lCPI0_0:
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 255 ; 0xff
-; The second vector is legalized to undef and the elements of the first vector
-; are used instead.
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 4 ; 0x4
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 0 ; 0x0
; CHECK: test1
-; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI0_0
-; CHECK: movi.8h v[[REG1:[0-9]+]], #0x1, lsl #8
-; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+; CHECK: movi d[[REG0:[0-9]+]], #0000000000000000
define <8 x i1> @test1() {
entry:
%Shuff = shufflevector <8 x i1> <i1 0, i1 1, i1 2, i1 3, i1 4, i1 5, i1 6,
@@ -30,18 +16,16 @@ entry:
; CHECK: lCPI1_0:
; CHECK: .byte 0 ; 0x0
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 7 ; 0x7
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
; CHECK: test2
-; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI1_0@PAGEOFF]
-; CHECK: adrp x[[REG2:[0-9]+]], lCPI1_1@PAGE
-; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG2]], lCPI1_1@PAGEOFF]
-; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+; CHECK: adrp x[[REG2:[0-9]+]], lCPI1_0@PAGE
+; CHECK: ldr d[[REG1:[0-9]+]], [x[[REG2]], lCPI1_0@PAGEOFF]
define <8 x i1>@test2() {
bb:
%Shuff = shufflevector <8 x i1> zeroinitializer,
@@ -51,28 +35,8 @@ bb:
ret <8 x i1> %Shuff
}
-; CHECK: lCPI2_0:
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 0 ; 0x0
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 0 ; 0x0
; CHECK: test3
-; CHECK: adrp x[[REG3:[0-9]+]], lCPI2_0@PAGE
-; CHECK: ldr q[[REG0:[0-9]+]], [x[[REG3]], lCPI2_0@PAGEOFF]
-; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG3]], lCPI2_1@PAGEOFF]
-; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+; CHECK: movi.4s v{{[0-9]+}}, #0x1
define <16 x i1> @test3(i1* %ptr, i32 %v) {
bb:
%Shuff = shufflevector <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <16 x i1> undef,
@@ -81,29 +45,26 @@ bb:
i32 14, i32 0>
ret <16 x i1> %Shuff
}
-; CHECK: lCPI3_1:
+; CHECK: lCPI3_0:
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
; CHECK: .byte 0 ; 0x0
; CHECK: .byte 1 ; 0x1
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 18 ; 0x12
-; CHECK: .byte 4 ; 0x4
-; CHECK: .byte 5 ; 0x5
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 7 ; 0x7
-; CHECK: .byte 8 ; 0x8
-; CHECK: .byte 31 ; 0x1f
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 30 ; 0x1e
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 13 ; 0xd
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 15 ; 0xf
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
; CHECK: _test4:
-; CHECK: ldr q[[REG1:[0-9]+]]
-; CHECK: movi.2d v[[REG0:[0-9]+]], #0000000000000000
-; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_1@PAGE
-; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_1@PAGEOFF]
-; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG0]], v[[REG1]] }, v[[REG2]]
+; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_0@PAGE
+; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_0@PAGEOFF]
define <16 x i1> @test4(i1* %ptr, i32 %v) {
bb:
%Shuff = shufflevector <16 x i1> zeroinitializer,
diff --git a/test/CodeGen/AArch64/bitcast.ll b/test/CodeGen/AArch64/bitcast.ll
new file mode 100644
index 0000000..e88ea9e
--- /dev/null
+++ b/test/CodeGen/AArch64/bitcast.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s
+
+; PR23065: SCALAR_TO_VECTOR implies the top elements 1 to N-1 of the N-element vector are undefined.
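+; In both tests the final shuffle reads lane 1 of the <4 x i16> bitcast, i.e.
+; the high half of a 32-bit constant below 2^16, so the correct result is an
+; all-zero vector (the movi below); treating the bitcast as scalar_to_vector
+; would wrongly leave that lane undefined.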
+
+define <4 x i16> @foo1(<2 x i32> %a) {
+; CHECK-LABEL: foo1:
+; CHECK: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+
+ %1 = shufflevector <2 x i32> <i32 58712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
+; Can't optimize the following bitcast to scalar_to_vector.
+ %2 = bitcast <2 x i32> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ ret <4 x i16> %3
+}
+
+define <4 x i16> @foo2(<2 x i32> %a) {
+; CHECK-LABEL: foo2:
+; CHECK: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+
+ %1 = shufflevector <2 x i32> <i32 712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
+; Can't optimize the following bitcast to scalar_to_vector.
+ %2 = bitcast <2 x i32> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ ret <4 x i16> %3
+}
diff --git a/test/CodeGen/AArch64/br-to-eh-lpad.ll b/test/CodeGen/AArch64/br-to-eh-lpad.ll
index e948b87..f304ba4 100644
--- a/test/CodeGen/AArch64/br-to-eh-lpad.ll
+++ b/test/CodeGen/AArch64/br-to-eh-lpad.ll
@@ -30,12 +30,12 @@ invoke.cont7:
unreachable
if.end50.thread:
- tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125)
- tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 128)
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125)
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 128)
unreachable
invoke.cont33:
- tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 119)
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 119)
unreachable
invoke.cont41:
@@ -51,7 +51,7 @@ lpad40:
br label %finally.catchall
finally.catchall:
- tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125)
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125)
unreachable
}
diff --git a/test/CodeGen/AArch64/concat_vector-scalar-combine.ll b/test/CodeGen/AArch64/concat_vector-scalar-combine.ll
new file mode 100644
index 0000000..1c64af6
--- /dev/null
+++ b/test/CodeGen/AArch64/concat_vector-scalar-combine.ll
@@ -0,0 +1,125 @@
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Test the (concat_vectors (bitcast (scalar)), ..) pattern.
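+; The expectation is that the vectors are built with dup/ins directly from
+; the source registers, with no round-trip through memory.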
+
+define <8 x i8> @test_concat_scalar_v2i8_to_v8i8_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalar_v2i8_to_v8i8_dup:
+; CHECK-NEXT: dup.4h v0, w0
+; CHECK-NEXT: ret
+ %t = trunc i32 %x to i16
+ %0 = bitcast i16 %t to <2 x i8>
+ %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <8 x i8> %1
+}
+
+define <8 x i8> @test_concat_scalar_v4i8_to_v8i8_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalar_v4i8_to_v8i8_dup:
+; CHECK-NEXT: dup.2s v0, w0
+; CHECK-NEXT: ret
+ %0 = bitcast i32 %x to <4 x i8>
+ %1 = shufflevector <4 x i8> %0, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i8> %1
+}
+
+define <8 x i16> @test_concat_scalar_v2i16_to_v8i16_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalar_v2i16_to_v8i16_dup:
+; CHECK-NEXT: dup.4s v0, w0
+; CHECK-NEXT: ret
+ %0 = bitcast i32 %x to <2 x i16>
+ %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 0, i32 1, i32 0, i32 1>
+ ret <8 x i16> %1
+}
+
+define <8 x i8> @test_concat_scalars_2x_v2i8_to_v8i8(i32 %x, i32 %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_2x_v2i8_to_v8i8:
+; CHECK-NEXT: ins.h v0[0], w0
+; CHECK-NEXT: ins.h v0[1], w1
+; CHECK-NEXT: ins.h v0[3], w1
+; CHECK-NEXT: ret
+ %tx = trunc i32 %x to i16
+ %ty = trunc i32 %y to i16
+ %bx = bitcast i16 %tx to <2 x i8>
+ %by = bitcast i16 %ty to <2 x i8>
+ %r = shufflevector <2 x i8> %bx, <2 x i8> %by, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 2, i32 3>
+ ret <8 x i8> %r
+}
+
+define <8 x i8> @test_concat_scalars_2x_v4i8_to_v8i8_dup(i32 %x, i32 %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_2x_v4i8_to_v8i8_dup:
+; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: ins.s v0[1], w0
+; CHECK-NEXT: ret
+ %bx = bitcast i32 %x to <4 x i8>
+ %by = bitcast i32 %y to <4 x i8>
+ %r = shufflevector <4 x i8> %bx, <4 x i8> %by, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i8> %r
+}
+
+define <8 x i16> @test_concat_scalars_2x_v2i16_to_v8i16_dup(i32 %x, i32 %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_2x_v2i16_to_v8i16_dup:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ins.s v0[1], w1
+; CHECK-NEXT: ins.s v0[2], w1
+; CHECK-NEXT: ins.s v0[3], w0
+; CHECK-NEXT: ret
+ %bx = bitcast i32 %x to <2 x i16>
+ %by = bitcast i32 %y to <2 x i16>
+ %r = shufflevector <2 x i16> %bx, <2 x i16> %by, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1>
+ ret <8 x i16> %r
+}
+
+; Also make sure we minimize bitcasts.
+
+; This is a pretty artificial testcase: make sure we bitcast to floating-point
+; if any of the scalars is floating-point.
+define <8 x i8> @test_concat_scalars_mixed_2x_v2i8_to_v8i8(float %dummy, i32 %x, half %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_mixed_2x_v2i8_to_v8i8:
+; CHECK-NEXT: fmov s[[X:[0-9]+]], w0
+; CHECK-NEXT: ins.h v0[0], v[[X]][0]
+; CHECK-NEXT: ins.h v0[1], v1[0]
+; CHECK-NEXT: ins.h v0[2], v[[X]][0]
+; CHECK-NEXT: ins.h v0[3], v1[0]
+; CHECK-NEXT: ret
+ %t = trunc i32 %x to i16
+ %0 = bitcast i16 %t to <2 x i8>
+ %y0 = bitcast half %y to <2 x i8>
+ %1 = shufflevector <2 x i8> %0, <2 x i8> %y0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i8> %1
+}
+
+define <2 x float> @test_concat_scalars_fp_2x_v2i8_to_v8i8(float %dummy, half %x, half %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_fp_2x_v2i8_to_v8i8:
+; CHECK-NEXT: ins.h v0[0], v1[0]
+; CHECK-NEXT: ins.h v0[1], v2[0]
+; CHECK-NEXT: ins.h v0[2], v1[0]
+; CHECK-NEXT: ins.h v0[3], v2[0]
+; CHECK-NEXT: ret
+ %0 = bitcast half %x to <2 x i8>
+ %y0 = bitcast half %y to <2 x i8>
+ %1 = shufflevector <2 x i8> %0, <2 x i8> %y0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %2 = bitcast <8 x i8> %1 to <2 x float>
+ ret <2 x float> %2
+}
+
+define <4 x float> @test_concat_scalar_fp_v2i16_to_v16i8_dup(float %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalar_fp_v2i16_to_v16i8_dup:
+; CHECK-NEXT: dup.4s v0, v0[0]
+; CHECK-NEXT: ret
+ %0 = bitcast float %x to <2 x i16>
+ %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 0, i32 1, i32 0, i32 1>
+ %2 = bitcast <8 x i16> %1 to <4 x float>
+ ret <4 x float> %2
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/concat_vector-truncate-combine.ll b/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
index c510e27..ee52786 100644
--- a/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
+++ b/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
@@ -2,6 +2,8 @@
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+; Test the (concat_vectors (trunc), (trunc)) pattern.
+
define <4 x i16> @test_concat_truncate_v2i64_to_v4i16(<2 x i64> %a, <2 x i64> %b) #0 {
entry:
; CHECK-LABEL: test_concat_truncate_v2i64_to_v4i16:
diff --git a/test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll b/test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll
new file mode 100644
index 0000000..eb6c80d
--- /dev/null
+++ b/test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Test the (concat_vectors (bitcast (trunc (scalar))), undef..) pattern.
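+; Only the two low byte lanes of the result are defined, so moving the
+; scalar into lane 0 with a single fmov is sufficient.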
+
+define <8 x i8> @test_concat_from_truncated_scalar(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_from_truncated_scalar:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ret
+ %t = trunc i32 %x to i16
+ %0 = bitcast i16 %t to <2 x i8>
+ %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i8> %1
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/dag-combine-invaraints.ll b/test/CodeGen/AArch64/dag-combine-invaraints.ll
index 3614133..ac2d057 100644
--- a/test/CodeGen/AArch64/dag-combine-invaraints.ll
+++ b/test/CodeGen/AArch64/dag-combine-invaraints.ll
@@ -20,7 +20,7 @@ main_:
%DHSelect = select i1 %tmp8, i32 %tmp9, i32 %tmp10
store i32 %DHSelect, i32* %i32X, align 4
%tmp15 = load i32, i32* %i32X, align 4
- %tmp17 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str2, i32 0, i32 0), i32 %tmp15)
+ %tmp17 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str2, i32 0, i32 0), i32 %tmp15)
ret i32 0
; CHECK: main:
diff --git a/test/CodeGen/AArch64/f16-instructions.ll b/test/CodeGen/AArch64/f16-instructions.ll
new file mode 100644
index 0000000..be5e2e5
--- /dev/null
+++ b/test/CodeGen/AArch64/f16-instructions.ll
@@ -0,0 +1,765 @@
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
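+; Scalar half has no native arithmetic here, so each operation is promoted:
+; convert to single precision, operate (or call the float libm routine for
+; frem and the libm-backed intrinsics), then convert back.
+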
+; CHECK-LABEL: test_fadd:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fadd(half %a, half %b) #0 {
+ %r = fadd half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fsub:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fsub s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fsub(half %a, half %b) #0 {
+ %r = fsub half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fmul:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fmul(half %a, half %b) #0 {
+ %r = fmul half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fdiv:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fdiv s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fdiv(half %a, half %b) #0 {
+ %r = fdiv half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_frem:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: bl {{_?}}fmodf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_frem(half %a, half %b) #0 {
+ %r = frem half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_store:
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+define void @test_store(half %a, half* %b) #0 {
+ store half %a, half* %b
+ ret void
+}
+
+; CHECK-LABEL: test_load:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: ret
+define half @test_load(half* %a) #0 {
+ %r = load half, half* %a
+ ret half %r
+}
+
+declare half @test_callee(half %a, half %b) #0
+
+; CHECK-LABEL: test_call:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: bl {{_?}}test_callee
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_call(half %a, half %b) #0 {
+ %r = call half @test_callee(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_call_flipped:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: mov.16b v2, v0
+; CHECK-NEXT: mov.16b v0, v1
+; CHECK-NEXT: mov.16b v1, v2
+; CHECK-NEXT: bl {{_?}}test_callee
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_call_flipped(half %a, half %b) #0 {
+ %r = call half @test_callee(half %b, half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_tailcall_flipped:
+; CHECK-NEXT: mov.16b v2, v0
+; CHECK-NEXT: mov.16b v0, v1
+; CHECK-NEXT: mov.16b v1, v2
+; CHECK-NEXT: b {{_?}}test_callee
+define half @test_tailcall_flipped(half %a, half %b) #0 {
+ %r = tail call half @test_callee(half %b, half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_select:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_select(half %a, half %b, i1 zeroext %c) #0 {
+ %r = select i1 %c, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_select_cc:
+; CHECK-DAG: fcvt s3, h3
+; CHECK-DAG: fcvt s2, h2
+; CHECK-DAG: fcvt s1, h1
+; CHECK-DAG: fcvt s0, h0
+; CHECK-DAG: fcmp s2, s3
+; CHECK-DAG: cset [[CC:w[0-9]+]], ne
+; CHECK-DAG: cmp [[CC]], #0
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
+ %cc = fcmp une half %c, %d
+ %r = select i1 %cc, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fcmp_une:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+define i1 @test_fcmp_une(half %a, half %b) #0 {
+ %r = fcmp une half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ueq:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: orr [[TRUE:w[0-9]+]], wzr, #0x1
+; CHECK-NEXT: csel [[CC:w[0-9]+]], [[TRUE]], wzr, eq
+; CHECK-NEXT: csel w0, [[TRUE]], [[CC]], vs
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ueq(half %a, half %b) #0 {
+ %r = fcmp ueq half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ugt:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, hi
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ugt(half %a, half %b) #0 {
+ %r = fcmp ugt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_uge:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, pl
+; CHECK-NEXT: ret
+define i1 @test_fcmp_uge(half %a, half %b) #0 {
+ %r = fcmp uge half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ult:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ult(half %a, half %b) #0 {
+ %r = fcmp ult half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ule:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, le
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ule(half %a, half %b) #0 {
+ %r = fcmp ule half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_uno:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, vs
+; CHECK-NEXT: ret
+define i1 @test_fcmp_uno(half %a, half %b) #0 {
+ %r = fcmp uno half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_one:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: orr [[TRUE:w[0-9]+]], wzr, #0x1
+; CHECK-NEXT: csel [[CC:w[0-9]+]], [[TRUE]], wzr, mi
+; CHECK-NEXT: csel w0, [[TRUE]], [[CC]], gt
+; CHECK-NEXT: ret
+define i1 @test_fcmp_one(half %a, half %b) #0 {
+ %r = fcmp one half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oeq:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+define i1 @test_fcmp_oeq(half %a, half %b) #0 {
+ %r = fcmp oeq half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ogt:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ogt(half %a, half %b) #0 {
+ %r = fcmp ogt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oge:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, ge
+; CHECK-NEXT: ret
+define i1 @test_fcmp_oge(half %a, half %b) #0 {
+ %r = fcmp oge half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_olt:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, mi
+; CHECK-NEXT: ret
+define i1 @test_fcmp_olt(half %a, half %b) #0 {
+ %r = fcmp olt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ole:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, ls
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ole(half %a, half %b) #0 {
+ %r = fcmp ole half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ord:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, vc
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ord(half %a, half %b) #0 {
+ %r = fcmp ord half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_br_cc:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: b.mi [[BRCC_ELSE:.?LBB[0-9_]+]]
+; CHECK-NEXT: str wzr, [x0]
+; CHECK-NEXT: ret
+; CHECK-NEXT: [[BRCC_ELSE]]:
+; CHECK-NEXT: str wzr, [x1]
+; CHECK-NEXT: ret
+define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 {
+ %c = fcmp uge half %a, %b
+ br i1 %c, label %then, label %else
+then:
+ store i32 0, i32* %p1
+ ret void
+else:
+ store i32 0, i32* %p2
+ ret void
+}
+
+; CHECK-LABEL: test_phi:
+; CHECK: mov x[[PTR:[0-9]+]], x0
+; CHECK: ldr h[[AB:[0-9]+]], [x[[PTR]]]
+; CHECK: [[LOOP:LBB[0-9_]+]]:
+; CHECK: mov.16b v[[R:[0-9]+]], v[[AB]]
+; CHECK: ldr h[[AB]], [x[[PTR]]]
+; CHECK: mov x0, x[[PTR]]
+; CHECK: bl {{_?}}test_dummy
+; CHECK: mov.16b v0, v[[R]]
+; CHECK: ret
+define half @test_phi(half* %p1) #0 {
+entry:
+ %a = load half, half* %p1
+ br label %loop
+loop:
+ %r = phi half [%a, %entry], [%b, %loop]
+ %b = load half, half* %p1
+ %c = call i1 @test_dummy(half* %p1)
+ br i1 %c, label %loop, label %return
+return:
+ ret half %r
+}
+declare i1 @test_dummy(half* %p1) #0
+
+; CHECK-LABEL: test_fptosi_i32:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzs w0, s0
+; CHECK-NEXT: ret
+define i32 @test_fptosi_i32(half %a) #0 {
+ %r = fptosi half %a to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_fptosi_i64:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzs x0, s0
+; CHECK-NEXT: ret
+define i64 @test_fptosi_i64(half %a) #0 {
+ %r = fptosi half %a to i64
+ ret i64 %r
+}
+
+; CHECK-LABEL: test_fptoui_i32:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzu w0, s0
+; CHECK-NEXT: ret
+define i32 @test_fptoui_i32(half %a) #0 {
+ %r = fptoui half %a to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_fptoui_i64:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzu x0, s0
+; CHECK-NEXT: ret
+define i64 @test_fptoui_i64(half %a) #0 {
+ %r = fptoui half %a to i64
+ ret i64 %r
+}
+
+; CHECK-LABEL: test_uitofp_i32:
+; CHECK-NEXT: ucvtf s0, w0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_uitofp_i32(i32 %a) #0 {
+ %r = uitofp i32 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_uitofp_i64:
+; CHECK-NEXT: ucvtf s0, x0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_uitofp_i64(i64 %a) #0 {
+ %r = uitofp i64 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i32:
+; CHECK-NEXT: scvtf s0, w0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_sitofp_i32(i32 %a) #0 {
+ %r = sitofp i32 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i64:
+; CHECK-NEXT: scvtf s0, x0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_sitofp_i64(i64 %a) #0 {
+ %r = sitofp i64 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_float:
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fptrunc_float(float %a) #0 {
+ %r = fptrunc float %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_double:
+; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: ret
+define half @test_fptrunc_double(double %a) #0 {
+ %r = fptrunc double %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fpext_float:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: ret
+define float @test_fpext_float(half %a) #0 {
+ %r = fpext half %a to float
+ ret float %r
+}
+
+; CHECK-LABEL: test_fpext_double:
+; CHECK-NEXT: fcvt d0, h0
+; CHECK-NEXT: ret
+define double @test_fpext_double(half %a) #0 {
+ %r = fpext half %a to double
+ ret double %r
+}
+
+; CHECK-LABEL: test_bitcast_halftoi16:
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+define i16 @test_bitcast_halftoi16(half %a) #0 {
+ %r = bitcast half %a to i16
+ ret i16 %r
+}
+
+; CHECK-LABEL: test_bitcast_i16tohalf:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ret
+define half @test_bitcast_i16tohalf(i16 %a) #0 {
+ %r = bitcast i16 %a to half
+ ret half %r
+}
+
+declare half @llvm.sqrt.f16(half %a) #0
+declare half @llvm.powi.f16(half %a, i32 %b) #0
+declare half @llvm.sin.f16(half %a) #0
+declare half @llvm.cos.f16(half %a) #0
+declare half @llvm.pow.f16(half %a, half %b) #0
+declare half @llvm.exp.f16(half %a) #0
+declare half @llvm.exp2.f16(half %a) #0
+declare half @llvm.log.f16(half %a) #0
+declare half @llvm.log10.f16(half %a) #0
+declare half @llvm.log2.f16(half %a) #0
+declare half @llvm.fma.f16(half %a, half %b, half %c) #0
+declare half @llvm.fabs.f16(half %a) #0
+declare half @llvm.minnum.f16(half %a, half %b) #0
+declare half @llvm.maxnum.f16(half %a, half %b) #0
+declare half @llvm.copysign.f16(half %a, half %b) #0
+declare half @llvm.floor.f16(half %a) #0
+declare half @llvm.ceil.f16(half %a) #0
+declare half @llvm.trunc.f16(half %a) #0
+declare half @llvm.rint.f16(half %a) #0
+declare half @llvm.nearbyint.f16(half %a) #0
+declare half @llvm.round.f16(half %a) #0
+declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
+
+; CHECK-LABEL: test_sqrt:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fsqrt s0, s0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_sqrt(half %a) #0 {
+ %r = call half @llvm.sqrt.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_powi:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}__powisf2
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_powi(half %a, i32 %b) #0 {
+ %r = call half @llvm.powi.f16(half %a, i32 %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_sin:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}sinf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_sin(half %a) #0 {
+ %r = call half @llvm.sin.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_cos:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}cosf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_cos(half %a) #0 {
+ %r = call half @llvm.cos.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_pow:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: bl {{_?}}powf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_pow(half %a, half %b) #0 {
+ %r = call half @llvm.pow.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_exp:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}expf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_exp(half %a) #0 {
+ %r = call half @llvm.exp.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_exp2:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}exp2f
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_exp2(half %a) #0 {
+ %r = call half @llvm.exp2.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_log:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}logf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_log(half %a) #0 {
+ %r = call half @llvm.log.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_log10:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}log10f
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_log10(half %a) #0 {
+ %r = call half @llvm.log10.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_log2:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}log2f
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_log2(half %a) #0 {
+ %r = call half @llvm.log2.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_fma:
+; CHECK-NEXT: fcvt s2, h2
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fmadd s0, s0, s1, s2
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fma(half %a, half %b, half %c) #0 {
+ %r = call half @llvm.fma.f16(half %a, half %b, half %c)
+ ret half %r
+}
+
+; CHECK-LABEL: test_fabs:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fabs s0, s0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fabs(half %a) #0 {
+ %r = call half @llvm.fabs.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_minnum:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: bl {{_?}}fminf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_minnum(half %a, half %b) #0 {
+ %r = call half @llvm.minnum.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_maxnum:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: bl {{_?}}fmaxf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_maxnum(half %a, half %b) #0 {
+ %r = call half @llvm.maxnum.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT: bit.16b v0, v1, v2
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_copysign(half %a, half %b) #0 {
+ %r = call half @llvm.copysign.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_floor:
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: frintm s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: ret
+define half @test_floor(half %a) #0 {
+ %r = call half @llvm.floor.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_ceil:
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: frintp s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: ret
+define half @test_ceil(half %a) #0 {
+ %r = call half @llvm.ceil.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_trunc:
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: frintz s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: ret
+define half @test_trunc(half %a) #0 {
+ %r = call half @llvm.trunc.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_rint:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: frintx s0, s0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_rint(half %a) #0 {
+ %r = call half @llvm.rint.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_nearbyint:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: frinti s0, s0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_nearbyint(half %a) #0 {
+ %r = call half @llvm.nearbyint.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_round:
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: frinta s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: ret
+define half @test_round(half %a) #0 {
+ %r = call half @llvm.round.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_fmuladd:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h2
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fmuladd(half %a, half %b, half %c) #0 {
+ %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
+ ret half %r
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/fast-isel-int-ext5.ll b/test/CodeGen/AArch64/fast-isel-int-ext5.ll
new file mode 100644
index 0000000..0f9ec62
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext5.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: int_ext_opt
+define i64 @int_ext_opt(i8* %addr, i1 %c1, i1 %c2) {
+entry:
+ %0 = load i8, i8* %addr
+ br i1 %c1, label %bb1, label %bb2
+
+bb1:
+ %1 = zext i8 %0 to i64
+ br i1 %c2, label %bb2, label %exit
+
+bb2:
+ %2 = phi i64 [1, %entry], [%1, %bb1]
+ ret i64 %2
+
+exit:
+ ret i64 0
+}
diff --git a/test/CodeGen/AArch64/fold-constants.ll b/test/CodeGen/AArch64/fold-constants.ll
new file mode 100644
index 0000000..2dd0d12
--- /dev/null
+++ b/test/CodeGen/AArch64/fold-constants.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+
+define i64 @dotests_616() {
+; CHECK-LABEL: dotests_616
+; CHECK: movi d0, #0000000000000000
+; CHECK-NEXT: umov w8, v0.b[2]
+; CHECK-NEXT: sbfx w8, w8, #0, #1
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <2 x i64> zeroinitializer to <8 x i16>
+ %1 = and <8 x i16> zeroinitializer, %0
+ %2 = icmp ne <8 x i16> %1, zeroinitializer
+ %3 = extractelement <8 x i1> %2, i32 2
+ %vgetq_lane285 = sext i1 %3 to i16
+ %vset_lane = insertelement <4 x i16> undef, i16 %vgetq_lane285, i32 0
+ %4 = bitcast <4 x i16> %vset_lane to <1 x i64>
+ %vget_lane = extractelement <1 x i64> %4, i32 0
+ ret i64 %vget_lane
+}
diff --git a/test/CodeGen/AArch64/fp16-instructions.ll b/test/CodeGen/AArch64/fp16-instructions.ll
deleted file mode 100644
index ba96694..0000000
--- a/test/CodeGen/AArch64/fp16-instructions.ll
+++ /dev/null
@@ -1,109 +0,0 @@
-; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
-
-define half @add_h(half %a, half %b) {
-entry:
-; CHECK-LABEL: add_h:
-; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
-; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
-; CHECK: fadd [[RES:s[0-9]+]], [[OP1]], [[OP2]]
-; CHECK: fcvt h0, [[RES]]
- %0 = fadd half %a, %b
- ret half %0
-}
-
-
-define half @sub_h(half %a, half %b) {
-entry:
-; CHECK-LABEL: sub_h:
-; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
-; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
-; CHECK: fsub [[RES:s[0-9]+]], [[OP1]], [[OP2]]
-; CHECK: fcvt h0, [[RES]]
- %0 = fsub half %a, %b
- ret half %0
-}
-
-
-define half @mul_h(half %a, half %b) {
-entry:
-; CHECK-LABEL: mul_h:
-; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
-; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
-; CHECK: fmul [[RES:s[0-9]+]], [[OP1]], [[OP2]]
-; CHECK: fcvt h0, [[RES]]
- %0 = fmul half %a, %b
- ret half %0
-}
-
-
-define half @div_h(half %a, half %b) {
-entry:
-; CHECK-LABEL: div_h:
-; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
-; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
-; CHECK: fdiv [[RES:s[0-9]+]], [[OP1]], [[OP2]]
-; CHECK: fcvt h0, [[RES]]
- %0 = fdiv half %a, %b
- ret half %0
-}
-
-
-define half @load_h(half* %a) {
-entry:
-; CHECK-LABEL: load_h:
-; CHECK: ldr h0, [x0]
- %0 = load half, half* %a, align 4
- ret half %0
-}
-
-
-define void @store_h(half* %a, half %b) {
-entry:
-; CHECK-LABEL: store_h:
-; CHECK: str h0, [x0]
- store half %b, half* %a, align 4
- ret void
-}
-
-define half @s_to_h(float %a) {
-; CHECK-LABEL: s_to_h:
-; CHECK: fcvt h0, s0
- %1 = fptrunc float %a to half
- ret half %1
-}
-
-define half @d_to_h(double %a) {
-; CHECK-LABEL: d_to_h:
-; CHECK: fcvt h0, d0
- %1 = fptrunc double %a to half
- ret half %1
-}
-
-define float @h_to_s(half %a) {
-; CHECK-LABEL: h_to_s:
-; CHECK: fcvt s0, h0
- %1 = fpext half %a to float
- ret float %1
-}
-
-define double @h_to_d(half %a) {
-; CHECK-LABEL: h_to_d:
-; CHECK: fcvt d0, h0
- %1 = fpext half %a to double
- ret double %1
-}
-
-define half @bitcast_i_to_h(i16 %a) {
-; CHECK-LABEL: bitcast_i_to_h:
-; CHECK: fmov s0, w0
- %1 = bitcast i16 %a to half
- ret half %1
-}
-
-
-define i16 @bitcast_h_to_i(half %a) {
-; CHECK-LABEL: bitcast_h_to_i:
-; CHECK: fmov w0, s0
- %1 = bitcast half %a to i16
- ret i16 %1
-}
diff --git a/test/CodeGen/AArch64/global-merge-1.ll b/test/CodeGen/AArch64/global-merge-1.ll
index b404389..14b0430 100644
--- a/test/CodeGen/AArch64/global-merge-1.ll
+++ b/test/CodeGen/AArch64/global-merge-1.ll
@@ -1,11 +1,11 @@
-; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
-; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
@m = internal global i32 0, align 4
@n = internal global i32 0, align 4
diff --git a/test/CodeGen/AArch64/global-merge-2.ll b/test/CodeGen/AArch64/global-merge-2.ll
index d5967b9..af68403 100644
--- a/test/CodeGen/AArch64/global-merge-2.ll
+++ b/test/CodeGen/AArch64/global-merge-2.ll
@@ -1,6 +1,6 @@
-; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS

@x = global i32 0, align 4
@y = global i32 0, align 4
diff --git a/test/CodeGen/AArch64/global-merge-3.ll b/test/CodeGen/AArch64/global-merge-3.ll
index 15035c0..9251083 100644
--- a/test/CodeGen/AArch64/global-merge-3.ll
+++ b/test/CodeGen/AArch64/global-merge-3.ll
@@ -1,6 +1,6 @@
-; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS

@x = global [1000 x i32] zeroinitializer, align 1
@y = global [1000 x i32] zeroinitializer, align 1
diff --git a/test/CodeGen/AArch64/global-merge-4.ll b/test/CodeGen/AArch64/global-merge-4.ll
index 8fb7747..bc6b68a 100644
--- a/test/CodeGen/AArch64/global-merge-4.ll
+++ b/test/CodeGen/AArch64/global-merge-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -o - | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
target triple = "arm64-apple-ios7.0.0"
diff --git a/test/CodeGen/AArch64/merge-store.ll b/test/CodeGen/AArch64/merge-store.ll
new file mode 100644
index 0000000..18dbad4
--- /dev/null
+++ b/test/CodeGen/AArch64/merge-store.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=aarch64 %s -o - | FileCheck %s
+
+@g0 = external global <3 x float>, align 16
+@g1 = external global <3 x float>, align 4
+
+; CHECK: ldr s[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]]{{\]}}, #4
+; CHECK: ld1{{\.?s?}} { v[[R0]]{{\.?s?}} }[1], {{\[}}[[R1]]{{\]}}
+; CHECK: str d[[R0]]
+
+define void @blam() {
+ %tmp4 = getelementptr inbounds <3 x float>, <3 x float>* @g1, i64 0, i64 0
+ %tmp5 = load <3 x float>, <3 x float>* @g0, align 16
+ %tmp6 = extractelement <3 x float> %tmp5, i64 0
+ store float %tmp6, float* %tmp4
+ %tmp7 = getelementptr inbounds float, float* %tmp4, i64 1
+ %tmp8 = load <3 x float>, <3 x float>* @g0, align 16
+ %tmp9 = extractelement <3 x float> %tmp8, i64 1
+ store float %tmp9, float* %tmp7
+ ret void
+}
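+
+; A sketch for illustration only (not part of the original test): the two
+; scalar stores in @blam amount to copying the low two lanes of @g0 to @g1
+; with a single 64-bit store, which is the form the `str d` check above
+; corresponds to.
+define void @blam_merged() {
+ %p = bitcast <3 x float>* @g1 to <2 x float>*
+ %v = load <3 x float>, <3 x float>* @g0, align 16
+ %lo = shufflevector <3 x float> %v, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+ store <2 x float> %lo, <2 x float>* %p, align 4
+ ret void
+}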
diff --git a/test/CodeGen/AArch64/print-mrs-system-register.ll b/test/CodeGen/AArch64/print-mrs-system-register.ll
new file mode 100644
index 0000000..3411ed6
--- /dev/null
+++ b/test/CodeGen/AArch64/print-mrs-system-register.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=arm64-apple-darwin %s -o - | FileCheck %s
+
+; CHECK: mrs x0, CPM_IOACC_CTL_EL3
+
+define void @foo1() #0 {
+entry:
+ tail call void asm sideeffect "mrs x0, cpm_ioacc_ctl_el3", ""()
+ ret void
+}
+
+attributes #0 = { "target-cpu"="cyclone" }
diff --git a/test/CodeGen/AArch64/sibling-call.ll b/test/CodeGen/AArch64/sibling-call.ll
index 34d45d8..a68fdec 100644
--- a/test/CodeGen/AArch64/sibling-call.ll
+++ b/test/CodeGen/AArch64/sibling-call.ll
@@ -75,8 +75,8 @@ define void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {

; CHECK: ldr [[VAL0:x[0-9]+]],
; CHECK: ldr [[VAL1:x[0-9]+]],
-; CHECK: str [[VAL1]],
; CHECK: str [[VAL0]],
+; CHECK: str [[VAL1]],

; CHECK-NOT: add sp, sp,
; CHECK: b callee_stack16
diff --git a/test/CodeGen/AArch64/stackmap-liveness.ll b/test/CodeGen/AArch64/stackmap-liveness.ll
new file mode 100644
index 0000000..6b37aac
--- /dev/null
+++ b/test/CodeGen/AArch64/stackmap-liveness.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mtriple=aarch64-apple-darwin | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 1
+; Num LargeConstants
+; CHECK-NEXT: .long 0
+; Num Callsites
+; CHECK-NEXT: .long 1
+
+; Functions and stack size
+; CHECK-NEXT: .quad _stackmap_liveness
+; CHECK-NEXT: .quad 16
+
+; Test that the return register is recognized as a live-out.
+define i64 @stackmap_liveness(i1 %c) {
+; CHECK-LABEL: .long L{{.*}}-_stackmap_liveness
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 0
+; Padding
+; CHECK-NEXT: .short 0
+; Num LiveOut Entries: 2
+; CHECK-NEXT: .short 2
+; LiveOut Entry 0: X0
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; LiveOut Entry 1: SP
+; CHECK-NEXT: .short 31
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; Align
+; CHECK-NEXT: .align 3
+ %1 = select i1 %c, i64 1, i64 2
+ call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 32, i8* null, i32 0)
+ ret i64 %1
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
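+
+; Decoding note, for illustration only: each live-out entry above is encoded
+; as <Dwarf regnum: .short, reserved: .byte, size in bytes: .byte>, so
+; (0, 0, 8) is the 8-byte return value in X0 and (31, 0, 8) is SP.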
+
diff --git a/test/CodeGen/AArch64/tailcall-explicit-sret.ll b/test/CodeGen/AArch64/tailcall-explicit-sret.ll
new file mode 100644
index 0000000..4d80f2a
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-explicit-sret.ll
@@ -0,0 +1,106 @@
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Check that we don't try to tail-call with a non-forwarded sret parameter.
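+; (On AArch64 the sret pointer is passed in x8, a caller-saved temporary, so
+; it cannot be assumed to survive until a tail call; only forwarding our own
+; incoming x8 unchanged is safe.)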
+declare void @test_explicit_sret(i1024* sret) #0
+
+; This is the only OK case, where we forward the explicit sret pointer.
+
+; CHECK-LABEL: _test_tailcall_explicit_sret:
+; CHECK-NEXT: b _test_explicit_sret
+define void @test_tailcall_explicit_sret(i1024* sret %arg) #0 {
+ tail call void @test_explicit_sret(i1024* %arg)
+ ret void
+}
+
+; CHECK-LABEL: _test_call_explicit_sret:
+; CHECK-NOT: mov x8
+; CHECK: bl _test_explicit_sret
+; CHECK: ret
+define void @test_call_explicit_sret(i1024* sret %arg) #0 {
+ call void @test_explicit_sret(i1024* %arg)
+ ret void
+}
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_alloca_unused:
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK: ret
+define void @test_tailcall_explicit_sret_alloca_unused() #0 {
+ %l = alloca i1024, align 8
+ tail call void @test_explicit_sret(i1024* %l)
+ ret void
+}
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_alloca_dummyusers:
+; CHECK: ldr [[PTRLOAD1:x[0-9]+]], [x0]
+; CHECK: str [[PTRLOAD1]], [sp]
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK: ret
+define void @test_tailcall_explicit_sret_alloca_dummyusers(i1024* %ptr) #0 {
+ %l = alloca i1024, align 8
+ %r = load i1024, i1024* %ptr, align 8
+ store i1024 %r, i1024* %l, align 8
+ tail call void @test_explicit_sret(i1024* %l)
+ ret void
+}
+
+; This is too conservative, but doesn't really happen in practice.
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_gep:
+; CHECK: add x8, x0, #128
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK: ret
+define void @test_tailcall_explicit_sret_gep(i1024* %ptr) #0 {
+ %ptr2 = getelementptr i1024, i1024* %ptr, i32 1
+ tail call void @test_explicit_sret(i1024* %ptr2)
+ ret void
+}
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_alloca_returned:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_tailcall_explicit_sret_alloca_returned() #0 {
+ %l = alloca i1024, align 8
+ tail call void @test_explicit_sret(i1024* %l)
+ %r = load i1024, i1024* %l, align 8
+ ret i1024 %r
+}
+
+; CHECK-LABEL: _test_indirect_tailcall_explicit_sret_nosret_arg:
+; CHECK-DAG: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK-DAG: mov [[FPTR:x[0-9]+]], x0
+; CHECK: mov x0, sp
+; CHECK-NEXT: blr [[FPTR]]
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define void @test_indirect_tailcall_explicit_sret_nosret_arg(i1024* sret %arg, void (i1024*)* %f) #0 {
+ %l = alloca i1024, align 8
+ tail call void %f(i1024* %l)
+ %r = load i1024, i1024* %l, align 8
+ store i1024 %r, i1024* %arg, align 8
+ ret void
+}
+
+; CHECK-LABEL: _test_indirect_tailcall_explicit_sret_:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: blr x0
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define void @test_indirect_tailcall_explicit_sret_(i1024* sret %arg, i1024 ()* %f) #0 {
+ %ret = tail call i1024 %f()
+ store i1024 %ret, i1024* %arg, align 8
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/tailcall-implicit-sret.ll b/test/CodeGen/AArch64/tailcall-implicit-sret.ll
new file mode 100644
index 0000000..5d68059
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-implicit-sret.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Check that we don't try to tail-call with an sret-demoted return.
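+; (The i1024 result is demoted to a hidden sret pointer that the caller
+; materializes in x8 and must copy out of after the call returns; a plain
+; tail-call branch would skip that copy-out, so `bl` is required.)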
+
+declare i1024 @test_sret() #0
+
+; CHECK-LABEL: _test_call_sret:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_sret
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_call_sret() #0 {
+ %a = call i1024 @test_sret()
+ ret i1024 %a
+}
+
+; CHECK-LABEL: _test_tailcall_sret:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_sret
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_tailcall_sret() #0 {
+ %a = tail call i1024 @test_sret()
+ ret i1024 %a
+}
+
+; CHECK-LABEL: _test_indirect_tailcall_sret:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: blr x0
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_indirect_tailcall_sret(i1024 ()* %f) #0 {
+ %a = tail call i1024 %f()
+ ret i1024 %a
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/tailcall-mem-intrinsics.ll b/test/CodeGen/AArch64/tailcall-mem-intrinsics.ll
new file mode 100644
index 0000000..b970fb1
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-mem-intrinsics.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
+
+; CHECK-LABEL: tail_memcpy:
+; CHECK: b memcpy
+define void @tail_memcpy(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: tail_memmove:
+; CHECK: b memmove
+define void @tail_memmove(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+ tail call void @llvm.memmove.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: tail_memset:
+; CHECK: b memset
+define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 {
+entry:
+ tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
+
+attributes #0 = { nounwind }
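+
+; A contrast sketch, for illustration only (not part of the original test):
+; without the `tail` marker the call is not a candidate for tail-call
+; lowering, so this variant is expected to emit `bl memcpy` followed by `ret`
+; rather than the `b memcpy` branch checked above.
+define void @notail_memcpy(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+ ret void
+}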
diff --git a/test/CodeGen/AArch64/vcvt-oversize.ll b/test/CodeGen/AArch64/vcvt-oversize.ll
new file mode 100644
index 0000000..066a4b6
--- /dev/null
+++ b/test/CodeGen/AArch64/vcvt-oversize.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+define <8 x i8> @float_to_i8(<8 x float>* %in) {
+; CHECK-LABEL: float_to_i8:
+; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v0.4s, v0.4s
+; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v1.4s, v1.4s
+; CHECK-DAG: fcvtzu v[[LSB2:[0-9]+]].4s, v[[LSB]].4s
+; CHECK-DAG: fcvtzu v[[MSB2:[0-9]+]].4s, v[[MSB]].4s
+; CHECK-DAG: xtn v[[TMP:[0-9]+]].4h, v[[LSB2]].4s
+; CHECK-DAG: xtn2 v[[TMP]].8h, v[[MSB2]].4s
+; CHECK-DAG: xtn v0.8b, v[[TMP]].8h
+ %l = load <8 x float>, <8 x float>* %in
+ %scale = fmul <8 x float> %l, <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>
+ %conv = fptoui <8 x float> %scale to <8 x i8>
+ ret <8 x i8> %conv
+}
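+
+; A sibling sketch, for illustration only (assumption, not part of the
+; original test: the <8 x i16> case legalizes the same way but stops after
+; the first xtn/xtn2 pair, since one narrowing step already reaches the
+; 16-bit element type).
+define <8 x i16> @float_to_i16(<8 x float>* %in) {
+ %l = load <8 x float>, <8 x float>* %in
+ %conv = fptoui <8 x float> %l to <8 x i16>
+ ret <8 x i16> %conv
+}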