Diffstat (limited to 'test/CodeGen/AArch64')
47 files changed, 3199 insertions, 257 deletions
diff --git a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll new file mode 100644 index 0000000..a31c66b --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll @@ -0,0 +1,491 @@ +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; This test aims to check basic correctness of frame layout &
+; frame access code. There are 8 functions in this test file;
+; each function implements one element in the cartesian product
+; of:
+; . a function having a VLA/no VLA
+; . a function with dynamic stack realignment/no dynamic stack realignment
+; . a function needing a frame pointer/no frame pointer,
+; since the presence/absence of these has influence on the frame
+; layout and which pointer to use to access various parts of the
+; frame (bp,sp,fp).
+;
+; Furthermore, in every test function:
+; . there is always one integer and one floating-point argument, to be able
+; to check those are accessed correctly.
+; . there is always one local variable, to check that it is accessed
+; correctly.
+;
+; The LLVM-IR below was produced by clang on the following C++ code:
+;extern "C" int g();
+;extern "C" int novla_nodynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; volatile int l1;
+; return i10 + (int)d10 + l1 + g();
+;}
+;extern "C" int novla_nodynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; volatile int l1;
+; return i10 + (int)d10 + l1;
+;}
+;extern "C" int novla_dynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; alignas(128) volatile int l1;
+; return i10 + (int)d10 + l1 + g();
+;}
+;extern "C" int novla_dynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; alignas(128) volatile int l1;
+; return i10 + (int)d10 + l1;
+;}
+;
+;extern "C" int vla_nodynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; volatile int l1;
+; volatile int vla[i1];
+; return i10 + (int)d10 + l1 + g() + vla[0];
+;}
+;extern "C" int vla_nodynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; volatile int l1; +; volatile int vla[i1]; +; return i10 + (int)d10 + l1 + vla[0]; +;} +;extern "C" int vla_dynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, +; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10) +;{ +; // use an argument passed on the stack. +; alignas(128) volatile int l1; +; volatile int vla[i1]; +; return i10 + (int)d10 + l1 + g() + vla[0]; +;} +;extern "C" int vla_dynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, +; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10) +;{ +; // use an argument passed on the stack. +; alignas(128) volatile int l1; +; volatile int vla[i1]; +; return i10 + (int)d10 + l1 + vla[0]; +;} + + + +define i32 @novla_nodynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 { +entry: + %l1 = alloca i32, align 4 + %conv = fptosi double %d10 to i32 + %add = add nsw i32 %conv, %i10 + %l1.0.l1.0. = load volatile i32, i32* %l1, align 4 + %add1 = add nsw i32 %add, %l1.0.l1.0. + %call = tail call i32 @g() + %add2 = add nsw i32 %add1, %call + ret i32 %add2 +} +; CHECK-LABEL: novla_nodynamicrealign_call +; CHECK: .cfi_startproc +; Check that used callee-saved registers are saved +; CHECK: stp x20, x19, [sp, #-32]! +; Check that the frame pointer is created: +; CHECK: stp x29, x30, [sp, #16] +; CHECK: add x29, sp, #16 +; Check correctness of cfi pseudo-instructions +; CHECK: .cfi_def_cfa w29, 16 +; CHECK: .cfi_offset w30, -8 +; CHECK: .cfi_offset w29, -16 +; CHECK: .cfi_offset w19, -24 +; CHECK: .cfi_offset w20, -32 +; Check correct access to arguments passed on the stack, through frame pointer +; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40] +; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24] +; Check correct access to local variable on the stack, through stack pointer +; CHECK: ldr w[[ILOC:[0-9]+]], [sp, #12] +; Check epilogue: +; CHECK: ldp x29, x30, [sp, #16] +; CHECK: ldp x20, x19, [sp], #32 +; CHECK: ret +; CHECK: .cfi_endproc + + +declare i32 @g() #0 + +; Function Attrs: nounwind +define i32 @novla_nodynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 { +entry: + %l1 = alloca i32, align 4 + %conv = fptosi double %d10 to i32 + %add = add nsw i32 %conv, %i10 + %l1.0.l1.0. = load volatile i32, i32* %l1, align 4 + %add1 = add nsw i32 %add, %l1.0.l1.0. + ret i32 %add1 +} +; CHECK-LABEL: novla_nodynamicrealign_nocall +; Check that space is reserved for one local variable on the stack. 
+; CHECK: sub sp, sp, #16 // =16
+; Check correct access to arguments passed on the stack, through stack pointer
+; CHECK: ldr d[[DARG:[0-9]+]], [sp, #40]
+; CHECK: ldr w[[IARG:[0-9]+]], [sp, #24]
+; Check correct access to local variable on the stack, through stack pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [sp, #12]
+; Check epilogue:
+; CHECK: add sp, sp, #16 // =16
+; CHECK: ret
+
+
+define i32 @novla_dynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+ %l1 = alloca i32, align 128
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %call = tail call i32 @g()
+ %add2 = add nsw i32 %add1, %call
+ ret i32 %add2
+}
+
+; CHECK-LABEL: novla_dynamicrealign_call
+; CHECK: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check the dynamic realignment of the stack pointer to a 128-byte boundary
+; CHECK: sub x9, sp, #96
+; CHECK: and sp, x9, #0xffffffffffffff80
+; Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; Check correct access to local variable on the stack, through re-aligned stack pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [sp]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #16 // =16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+; Function Attrs: nounwind
+define i32 @novla_dynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 128
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ ret i32 %add1
+}
+
+; CHECK-LABEL: novla_dynamicrealign_nocall
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; Check the dynamic realignment of the stack pointer to a 128-byte boundary
+; CHECK: sub x9, sp, #112
+; CHECK: and sp, x9, #0xffffffffffffff80
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; Check correct access to local variable on the stack, through re-aligned stack pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [sp]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: mov sp, x29
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+
+
+define i32 @vla_nodynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+ %l1 = alloca i32, align 4
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %call = tail call i32 @g()
+ %add2 = add nsw i32 %add1, %call
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add3 = add nsw i32 %add2, %1
+ ret i32 %add3
+}
+
+; CHECK-LABEL: vla_nodynamicrealign_call
+; CHECK: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check that space is reserved on the stack for the local variable,
+; rounded up to a multiple of 16 to keep the stack pointer 16-byte aligned.
+; CHECK: sub sp, sp, #16
+; Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through frame pointer
+; CHECK: ldur w[[ILOC:[0-9]+]], [x29, #-20]
+; Check correct accessing of the VLA variable through the base pointer
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #16 // =16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+; Function Attrs: nounwind
+define i32 @vla_nodynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 4
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add2 = add nsw i32 %add1, %1
+ ret i32 %add2
+}
+
+; CHECK-LABEL: vla_nodynamicrealign_nocall
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; Check that space is reserved on the stack for the local variable,
+; rounded up to a multiple of 16 to keep the stack pointer 16-byte aligned.
+; CHECK: sub sp, sp, #16
+; Check correctness of cfi pseudo-instructions
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through frame pointer
+; CHECK: ldur w[[ILOC:[0-9]+]], [x29, #-4]
+; Check correct accessing of the VLA variable through the base pointer
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: mov sp, x29
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+
+
+define i32 @vla_dynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+ %l1 = alloca i32, align 128
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %call = tail call i32 @g()
+ %add2 = add nsw i32 %add1, %call
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add3 = add nsw i32 %add2, %1
+ ret i32 %add3
+}
+
+; CHECK-LABEL: vla_dynamicrealign_call
+; CHECK: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK: stp x22, x21, [sp, #-48]!
+; CHECK: stp x20, x19, [sp, #16]
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #32]
+; CHECK: add x29, sp, #32
+; Check that the stack pointer gets re-aligned to 128
+; bytes & the base pointer (x19) gets initialized to
+; this 128-byte aligned area for local variables &
+; spill slots
+; CHECK: sub x9, sp, #80 // =80
+; CHECK: and sp, x9, #0xffffffffffffff80
+; CHECK: mov x19, sp
+; Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
+; CHECK: .cfi_offset w21, -40
+; CHECK: .cfi_offset w22, -48
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; and set-up of base pointer (x19).
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through base pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [x19]
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #32
+; CHECK: ldp x29, x30, [sp, #32]
+; CHECK: ldp x20, x19, [sp, #16]
+; CHECK: ldp x22, x21, [sp], #48
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+; Function Attrs: nounwind
+define i32 @vla_dynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 128
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add2 = add nsw i32 %add1, %1
+ ret i32 %add2
+}
+
+; CHECK-LABEL: vla_dynamicrealign_nocall
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check that the stack pointer gets re-aligned to 128
+; bytes & the base pointer (x19) gets initialized to
+; this 128-byte aligned area for local variables &
+; spill slots
+; CHECK: sub x9, sp, #96
+; CHECK: and sp, x9, #0xffffffffffffff80
+; CHECK: mov x19, sp
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; and set-up of base pointer (x19).
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through base pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [x19]
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+
+
+; Function Attrs: nounwind
+define i32 @vla_dynamicrealign_nocall_large_align(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 32768
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 32768
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add2 = add nsw i32 %add1, %1
+ ret i32 %add2
+}
+
+; CHECK-LABEL: vla_dynamicrealign_nocall_large_align
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check that the stack pointer gets re-aligned to 32768
+; bytes & the base pointer (x19) gets initialized to
+; this 32768-byte aligned area for local variables &
+; spill slots
+; CHECK: sub x9, sp, #7, lsl #12
+; CHECK: and sp, x9, #0xffffffffffff8000
+; CHECK: mov x19, sp
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; and set-up of base pointer (x19).
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through base pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [x19]
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+
+attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/AArch64/addsub.ll b/test/CodeGen/AArch64/addsub.ll index 09b9f62..d6350a6 100644 --- a/test/CodeGen/AArch64/addsub.ll +++ b/test/CodeGen/AArch64/addsub.ll @@ -24,6 +24,34 @@ define void @add_small() {
 ret void
 }
+; Make sure we grab the imm variant when the register operand
+; can be implicitly zero-extended.
+; We used to generate something horrible like this:
+; wA = ldrb
+; xB = ldimm 12
+; xC = add xB, wA, uxtb
+; whereas this can be achieved with:
+; wA = ldrb
+; xC = add xA, #12 ; <- xA implicitly zero-extends wA.
+define void @add_small_imm(i8* %p, i64* %q, i32 %b, i32* %addr) {
+; CHECK-LABEL: add_small_imm:
+entry:
+
+; CHECK: ldrb w[[LOAD32:[0-9]+]], [x0]
+ %t = load i8, i8* %p
+ %promoted = zext i8 %t to i64
+ %zextt = zext i8 %t to i32
+ %add = add nuw i32 %zextt, %b
+
+; CHECK: add [[ADD2:x[0-9]+]], x[[LOAD32]], #12
+ %add2 = add nuw i64 %promoted, 12
+ store i32 %add, i32* %addr
+
+; CHECK: str [[ADD2]], [x1]
+ store i64 %add2, i64* %q
+ ret void
+}
+
 ; Add 12-bit immediates, shifted left by 12 bits
 define void @add_med() {
 ; CHECK-LABEL: add_med:
diff --git a/test/CodeGen/AArch64/argument-blocks.ll b/test/CodeGen/AArch64/argument-blocks.ll index f1dcfa6..3169abc 100644 --- a/test/CodeGen/AArch64/argument-blocks.ll +++ b/test/CodeGen/AArch64/argument-blocks.ll @@ -64,7 +64,7 @@ define void @test_varargs_stackalign() {
 ; CHECK-LABEL: test_varargs_stackalign:
 ; CHECK-DARWINPCS: stp {{w[0-9]+}}, {{w[0-9]+}}, [sp, #16]
- call void(...)* @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
+ call void(...) @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
 ret void
 }
diff --git a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll index 41e22e9..b760261 100644 --- a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll +++ b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll @@ -16,11 +16,11 @@ entry:
 %0 = load double, double* %d.addr, align 8
 %1 = load double, double* %d.addr, align 8
 %conv = fptoui double %1 to i64
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv)
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv)
 %2 = load double, double* %d.addr, align 8
 %3 = load double, double* %d.addr, align 8
 %conv1 = fptoui double %3 to i32
- %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1)
+ %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1)
 ret void
 }
@@ -37,12 +37,12 @@ entry:
 %conv = fpext float %0 to double
 %1 = load float, float* %f.addr, align 4
 %conv1 = fptoui float %1 to i64
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1)
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1)
 %2 = load float, float* %f.addr, align 4
 %conv2 = fpext float %2 to double
 %3 = load float, float* %f.addr, align 4
 %conv3 = fptoui float %3 to i32
- %call4 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3)
+ %call4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3)
 ret void
 }
diff --git a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll index 6266d1c..8784abd 100644 --- a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll +++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll @@ -7,13 +7,13 @@ define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
 ; CHECK-LABEL: bar:
 ; CHECK: add.2d v[[REG:[0-9]+]], v0, v1
 ; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1
+; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1
 ; Without advanced copy optimization, we end up with cross-register
 ; bank copies that cannot be coalesced.
 ; CHECK-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
 ; With advanced copy optimization, we end up with just one copy
 ; to insert the computed high part into the V register.
; CHECK-OPT-NOT: fmov -; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1 ; CHECK: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]] ; CHECK-NOOPT: fmov d0, [[COPY_REG3]] ; CHECK-OPT-NOT: fmov @@ -23,9 +23,9 @@ define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone { ; GENERIC-LABEL: bar: ; GENERIC: add v[[REG:[0-9]+]].2d, v0.2d, v1.2d ; GENERIC: add d[[REG3:[0-9]+]], d[[REG]], d1 +; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1 ; GENERIC-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]] ; GENERIC-OPT-NOT: fmov -; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1 ; GENERIC: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]] ; GENERIC-NOOPT: fmov d0, [[COPY_REG3]] ; GENERIC-OPT-NOT: fmov diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll index 41c3ad5..390a3c7 100644 --- a/test/CodeGen/AArch64/arm64-aapcs.ll +++ b/test/CodeGen/AArch64/arm64-aapcs.ll @@ -78,7 +78,7 @@ declare void @variadic(i32 %a, ...) ; Under AAPCS variadic functions have the same calling convention as ; others. The extra arguments should go in registers rather than on the stack. define void @test_variadic() { - call void(i32, ...)* @variadic(i32 0, i64 1, double 2.0) + call void(i32, ...) @variadic(i32 0, i64 1, double 2.0) ; CHECK: fmov d0, #2.0 ; CHECK: orr w1, wzr, #0x1 ; CHECK: bl variadic diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll index f95fec6..03414b5 100644 --- a/test/CodeGen/AArch64/arm64-abi-varargs.ll +++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll @@ -94,7 +94,7 @@ define i32 @main() nounwind ssp { %10 = load i32, i32* %a10, align 4 %11 = load i32, i32* %a11, align 4 %12 = load i32, i32* %a12, align 4 - call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...)* @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12) + call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12) ret i32 0 } @@ -133,7 +133,7 @@ entry: store <4 x i32> %y, <4 x i32>* %y.addr, align 16 %0 = load i32, i32* %x.addr, align 4 %1 = load <4 x i32>, <4 x i32>* %y.addr, align 16 - call void (i8*, ...)* @foo(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1) + call void (i8*, ...) @foo(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1) ret void } @@ -186,6 +186,6 @@ entry: %1 = load i32, i32* %x.addr, align 4 %2 = bitcast %struct.s41* %s41 to i128* %3 = load i128, i128* %2, align 1 - call void (i8*, ...)* @foo2(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3) + call void (i8*, ...) @foo2(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3) ret void } diff --git a/test/CodeGen/AArch64/arm64-anyregcc-crash.ll b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll index 241cf97..56c62d5 100644 --- a/test/CodeGen/AArch64/arm64-anyregcc-crash.ll +++ b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll @@ -8,7 +8,7 @@ define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i6 i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24, i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) { entry: - %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32, + %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) 
@llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32, i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8, i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16, i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24, diff --git a/test/CodeGen/AArch64/arm64-anyregcc.ll b/test/CodeGen/AArch64/arm64-anyregcc.ll index e26875d..2a2f451 100644 --- a/test/CodeGen/AArch64/arm64-anyregcc.ll +++ b/test/CodeGen/AArch64/arm64-anyregcc.ll @@ -55,7 +55,7 @@ ; CHECK-NEXT: .long 3 define i64 @test() nounwind ssp uwtable { entry: - call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3) + call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3) ret i64 0 } @@ -77,7 +77,7 @@ entry: define i64 @property_access1(i8* %obj) nounwind ssp uwtable { entry: %f = inttoptr i64 281474417671919 to i8* - %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj) + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj) ret i64 %ret } @@ -100,7 +100,7 @@ define i64 @property_access2() nounwind ssp uwtable { entry: %obj = alloca i64, align 8 %f = inttoptr i64 281474417671919 to i8* - %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj) + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj) ret i64 %ret } @@ -123,7 +123,7 @@ define i64 @property_access3() nounwind ssp uwtable { entry: %obj = alloca i64, align 8 %f = inttoptr i64 281474417671919 to i8* - %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj) + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj) ret i64 %ret } @@ -205,7 +205,7 @@ entry: define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable { entry: %f = inttoptr i64 281474417671919 to i8* - %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) ret i64 %ret } @@ -287,7 +287,7 @@ entry: define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable { entry: %f = inttoptr i64 281474417671919 to i8* - %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) 
@llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
 ret i64 %ret
 }
@@ -315,7 +315,7 @@ entry:
 ; CHECK-NEXT: .long 0
 define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
 entry:
- %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
 tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
 ret i64 %result
 }
@@ -355,7 +355,7 @@ entry:
 define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
 entry:
 tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
- %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
 ret i64 %result
 }
diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll index c280bef..d089767 100644 --- a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll +++ b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll @@ -1,6 +1,10 @@
 ; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
 ; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
+; Note: we split the functions into multiple BBs below to isolate the call
+; instruction we want to test from fast-isel failing to select instructions
+; after it.
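As a minimal sketch of that splitting pattern (the function and helper names here are hypothetical, not part of this commit): the call under test stays in its own block, and all uses of its result go behind an unconditional branch, so a fast-isel selection failure in the tail cannot disturb selection of the call itself.
; declare i64 @some_helper(i64)
; define void @example(i64* %p, i64* %q) {
;   %v = load i64, i64* %p
;   %r = call i64 @some_helper(i64 %v) ; call under test, isolated in entry
;   br label %return_bb
; return_bb:                           ; if fast-isel bails out on the code
;   %s = add i64 %r, %r                ; below, the call above is unaffected
;   store i64 %s, i64* %q
;   ret void
; }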
+ ; CHECK-LABEL: test_i64_f64: declare i64 @test_i64_f64_helper(double %p) define void @test_i64_f64(double* %p, i64* %q) { @@ -8,6 +12,8 @@ define void @test_i64_f64(double* %p, i64* %q) { %1 = load double, double* %p %2 = fadd double %1, %1 %3 = call i64 @test_i64_f64_helper(double %2) + br label %return_bb +return_bb: %4 = add i64 %3, %3 store i64 %4, i64* %q ret void @@ -20,6 +26,8 @@ define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) { %1 = load <1 x i64>, <1 x i64>* %p %2 = add <1 x i64> %1, %1 %3 = call i64 @test_i64_v1i64_helper(<1 x i64> %2) + br label %return_bb +return_bb: %4 = add i64 %3, %3 store i64 %4, i64* %q ret void @@ -32,6 +40,8 @@ define void @test_i64_v2f32(<2 x float>* %p, i64* %q) { %1 = load <2 x float>, <2 x float>* %p %2 = fadd <2 x float> %1, %1 %3 = call i64 @test_i64_v2f32_helper(<2 x float> %2) + br label %return_bb +return_bb: %4 = add i64 %3, %3 store i64 %4, i64* %q ret void @@ -44,6 +54,8 @@ define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) { %1 = load <2 x i32>, <2 x i32>* %p %2 = add <2 x i32> %1, %1 %3 = call i64 @test_i64_v2i32_helper(<2 x i32> %2) + br label %return_bb +return_bb: %4 = add i64 %3, %3 store i64 %4, i64* %q ret void @@ -56,6 +68,8 @@ define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) { %1 = load <4 x i16>, <4 x i16>* %p %2 = add <4 x i16> %1, %1 %3 = call i64 @test_i64_v4i16_helper(<4 x i16> %2) + br label %return_bb +return_bb: %4 = add i64 %3, %3 store i64 %4, i64* %q ret void @@ -68,6 +82,8 @@ define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) { %1 = load <8 x i8>, <8 x i8>* %p %2 = add <8 x i8> %1, %1 %3 = call i64 @test_i64_v8i8_helper(<8 x i8> %2) + br label %return_bb +return_bb: %4 = add i64 %3, %3 store i64 %4, i64* %q ret void @@ -80,6 +96,8 @@ define void @test_f64_i64(i64* %p, double* %q) { %1 = load i64, i64* %p %2 = add i64 %1, %1 %3 = call double @test_f64_i64_helper(i64 %2) + br label %return_bb +return_bb: %4 = fadd double %3, %3 store double %4, double* %q ret void @@ -92,6 +110,8 @@ define void @test_f64_v1i64(<1 x i64>* %p, double* %q) { %1 = load <1 x i64>, <1 x i64>* %p %2 = add <1 x i64> %1, %1 %3 = call double @test_f64_v1i64_helper(<1 x i64> %2) + br label %return_bb +return_bb: %4 = fadd double %3, %3 store double %4, double* %q ret void @@ -104,6 +124,8 @@ define void @test_f64_v2f32(<2 x float>* %p, double* %q) { %1 = load <2 x float>, <2 x float>* %p %2 = fadd <2 x float> %1, %1 %3 = call double @test_f64_v2f32_helper(<2 x float> %2) + br label %return_bb +return_bb: %4 = fadd double %3, %3 store double %4, double* %q ret void @@ -116,6 +138,8 @@ define void @test_f64_v2i32(<2 x i32>* %p, double* %q) { %1 = load <2 x i32>, <2 x i32>* %p %2 = add <2 x i32> %1, %1 %3 = call double @test_f64_v2i32_helper(<2 x i32> %2) + br label %return_bb +return_bb: %4 = fadd double %3, %3 store double %4, double* %q ret void @@ -128,6 +152,8 @@ define void @test_f64_v4i16(<4 x i16>* %p, double* %q) { %1 = load <4 x i16>, <4 x i16>* %p %2 = add <4 x i16> %1, %1 %3 = call double @test_f64_v4i16_helper(<4 x i16> %2) + br label %return_bb +return_bb: %4 = fadd double %3, %3 store double %4, double* %q ret void @@ -140,6 +166,8 @@ define void @test_f64_v8i8(<8 x i8>* %p, double* %q) { %1 = load <8 x i8>, <8 x i8>* %p %2 = add <8 x i8> %1, %1 %3 = call double @test_f64_v8i8_helper(<8 x i8> %2) + br label %return_bb +return_bb: %4 = fadd double %3, %3 store double %4, double* %q ret void @@ -152,6 +180,8 @@ define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) { %1 = load i64, i64* %p %2 = add i64 %1, %1 %3 = call <1 x i64> 
@test_v1i64_i64_helper(i64 %2) + br label %return_bb +return_bb: %4 = add <1 x i64> %3, %3 store <1 x i64> %4, <1 x i64>* %q ret void @@ -164,6 +194,8 @@ define void @test_v1i64_f64(double* %p, <1 x i64>* %q) { %1 = load double, double* %p %2 = fadd double %1, %1 %3 = call <1 x i64> @test_v1i64_f64_helper(double %2) + br label %return_bb +return_bb: %4 = add <1 x i64> %3, %3 store <1 x i64> %4, <1 x i64>* %q ret void @@ -176,6 +208,8 @@ define void @test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) { %1 = load <2 x float>, <2 x float>* %p %2 = fadd <2 x float> %1, %1 %3 = call <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %2) + br label %return_bb +return_bb: %4 = add <1 x i64> %3, %3 store <1 x i64> %4, <1 x i64>* %q ret void @@ -188,6 +222,8 @@ define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) { %1 = load <2 x i32>, <2 x i32>* %p %2 = add <2 x i32> %1, %1 %3 = call <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %2) + br label %return_bb +return_bb: %4 = add <1 x i64> %3, %3 store <1 x i64> %4, <1 x i64>* %q ret void @@ -200,6 +236,8 @@ define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) { %1 = load <4 x i16>, <4 x i16>* %p %2 = add <4 x i16> %1, %1 %3 = call <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %2) + br label %return_bb +return_bb: %4 = add <1 x i64> %3, %3 store <1 x i64> %4, <1 x i64>* %q ret void @@ -212,6 +250,8 @@ define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) { %1 = load <8 x i8>, <8 x i8>* %p %2 = add <8 x i8> %1, %1 %3 = call <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %2) + br label %return_bb +return_bb: %4 = add <1 x i64> %3, %3 store <1 x i64> %4, <1 x i64>* %q ret void @@ -224,6 +264,8 @@ define void @test_v2f32_i64(i64* %p, <2 x float>* %q) { %1 = load i64, i64* %p %2 = add i64 %1, %1 %3 = call <2 x float> @test_v2f32_i64_helper(i64 %2) + br label %return_bb +return_bb: %4 = fadd <2 x float> %3, %3 store <2 x float> %4, <2 x float>* %q ret void @@ -236,6 +278,8 @@ define void @test_v2f32_f64(double* %p, <2 x float>* %q) { %1 = load double, double* %p %2 = fadd double %1, %1 %3 = call <2 x float> @test_v2f32_f64_helper(double %2) + br label %return_bb +return_bb: %4 = fadd <2 x float> %3, %3 store <2 x float> %4, <2 x float>* %q ret void @@ -248,6 +292,8 @@ define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) { %1 = load <1 x i64>, <1 x i64>* %p %2 = add <1 x i64> %1, %1 %3 = call <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %2) + br label %return_bb +return_bb: %4 = fadd <2 x float> %3, %3 store <2 x float> %4, <2 x float>* %q ret void @@ -261,6 +307,8 @@ define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) { %1 = load <2 x i32>, <2 x i32>* %p %2 = add <2 x i32> %1, %1 %3 = call <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %2) + br label %return_bb +return_bb: %4 = fadd <2 x float> %3, %3 store <2 x float> %4, <2 x float>* %q ret void @@ -274,6 +322,8 @@ define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) { %1 = load <4 x i16>, <4 x i16>* %p %2 = add <4 x i16> %1, %1 %3 = call <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %2) + br label %return_bb +return_bb: %4 = fadd <2 x float> %3, %3 store <2 x float> %4, <2 x float>* %q ret void @@ -287,6 +337,8 @@ define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) { %1 = load <8 x i8>, <8 x i8>* %p %2 = add <8 x i8> %1, %1 %3 = call <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %2) + br label %return_bb +return_bb: %4 = fadd <2 x float> %3, %3 store <2 x float> %4, <2 x float>* %q ret void @@ -299,6 +351,8 @@ define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) 
{ %1 = load i64, i64* %p %2 = add i64 %1, %1 %3 = call <2 x i32> @test_v2i32_i64_helper(i64 %2) + br label %return_bb +return_bb: %4 = add <2 x i32> %3, %3 store <2 x i32> %4, <2 x i32>* %q ret void @@ -311,6 +365,8 @@ define void @test_v2i32_f64(double* %p, <2 x i32>* %q) { %1 = load double, double* %p %2 = fadd double %1, %1 %3 = call <2 x i32> @test_v2i32_f64_helper(double %2) + br label %return_bb +return_bb: %4 = add <2 x i32> %3, %3 store <2 x i32> %4, <2 x i32>* %q ret void @@ -323,6 +379,8 @@ define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) { %1 = load <1 x i64>, <1 x i64>* %p %2 = add <1 x i64> %1, %1 %3 = call <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %2) + br label %return_bb +return_bb: %4 = add <2 x i32> %3, %3 store <2 x i32> %4, <2 x i32>* %q ret void @@ -336,6 +394,8 @@ define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) { %1 = load <2 x float>, <2 x float>* %p %2 = fadd <2 x float> %1, %1 %3 = call <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %2) + br label %return_bb +return_bb: %4 = add <2 x i32> %3, %3 store <2 x i32> %4, <2 x i32>* %q ret void @@ -349,6 +409,8 @@ define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) { %1 = load <4 x i16>, <4 x i16>* %p %2 = add <4 x i16> %1, %1 %3 = call <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %2) + br label %return_bb +return_bb: %4 = add <2 x i32> %3, %3 store <2 x i32> %4, <2 x i32>* %q ret void @@ -362,6 +424,8 @@ define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) { %1 = load <8 x i8>, <8 x i8>* %p %2 = add <8 x i8> %1, %1 %3 = call <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %2) + br label %return_bb +return_bb: %4 = add <2 x i32> %3, %3 store <2 x i32> %4, <2 x i32>* %q ret void @@ -374,6 +438,8 @@ define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) { %1 = load i64, i64* %p %2 = add i64 %1, %1 %3 = call <4 x i16> @test_v4i16_i64_helper(i64 %2) + br label %return_bb +return_bb: %4 = add <4 x i16> %3, %3 store <4 x i16> %4, <4 x i16>* %q ret void @@ -386,6 +452,8 @@ define void @test_v4i16_f64(double* %p, <4 x i16>* %q) { %1 = load double, double* %p %2 = fadd double %1, %1 %3 = call <4 x i16> @test_v4i16_f64_helper(double %2) + br label %return_bb +return_bb: %4 = add <4 x i16> %3, %3 store <4 x i16> %4, <4 x i16>* %q ret void @@ -398,6 +466,8 @@ define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) { %1 = load <1 x i64>, <1 x i64>* %p %2 = add <1 x i64> %1, %1 %3 = call <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %2) + br label %return_bb +return_bb: %4 = add <4 x i16> %3, %3 store <4 x i16> %4, <4 x i16>* %q ret void @@ -411,6 +481,8 @@ define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) { %1 = load <2 x float>, <2 x float>* %p %2 = fadd <2 x float> %1, %1 %3 = call <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %2) + br label %return_bb +return_bb: %4 = add <4 x i16> %3, %3 store <4 x i16> %4, <4 x i16>* %q ret void @@ -424,6 +496,8 @@ define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) { %1 = load <2 x i32>, <2 x i32>* %p %2 = add <2 x i32> %1, %1 %3 = call <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %2) + br label %return_bb +return_bb: %4 = add <4 x i16> %3, %3 store <4 x i16> %4, <4 x i16>* %q ret void @@ -437,6 +511,8 @@ define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) { %1 = load <8 x i8>, <8 x i8>* %p %2 = add <8 x i8> %1, %1 %3 = call <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %2) + br label %return_bb +return_bb: %4 = add <4 x i16> %3, %3 store <4 x i16> %4, <4 x i16>* %q ret void @@ -449,6 +525,8 @@ define void @test_v8i8_i64(i64* %p, <8 x 
i8>* %q) { %1 = load i64, i64* %p %2 = add i64 %1, %1 %3 = call <8 x i8> @test_v8i8_i64_helper(i64 %2) + br label %return_bb +return_bb: %4 = add <8 x i8> %3, %3 store <8 x i8> %4, <8 x i8>* %q ret void @@ -461,6 +539,8 @@ define void @test_v8i8_f64(double* %p, <8 x i8>* %q) { %1 = load double, double* %p %2 = fadd double %1, %1 %3 = call <8 x i8> @test_v8i8_f64_helper(double %2) + br label %return_bb +return_bb: %4 = add <8 x i8> %3, %3 store <8 x i8> %4, <8 x i8>* %q ret void @@ -473,6 +553,8 @@ define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) { %1 = load <1 x i64>, <1 x i64>* %p %2 = add <1 x i64> %1, %1 %3 = call <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %2) + br label %return_bb +return_bb: %4 = add <8 x i8> %3, %3 store <8 x i8> %4, <8 x i8>* %q ret void @@ -486,6 +568,8 @@ define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) { %1 = load <2 x float>, <2 x float>* %p %2 = fadd <2 x float> %1, %1 %3 = call <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %2) + br label %return_bb +return_bb: %4 = add <8 x i8> %3, %3 store <8 x i8> %4, <8 x i8>* %q ret void @@ -499,6 +583,8 @@ define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) { %1 = load <2 x i32>, <2 x i32>* %p %2 = add <2 x i32> %1, %1 %3 = call <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %2) + br label %return_bb +return_bb: %4 = add <8 x i8> %3, %3 store <8 x i8> %4, <8 x i8>* %q ret void @@ -512,6 +598,8 @@ define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) { %1 = load <4 x i16>, <4 x i16>* %p %2 = add <4 x i16> %1, %1 %3 = call <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %2) + br label %return_bb +return_bb: %4 = add <8 x i8> %3, %3 store <8 x i8> %4, <8 x i8>* %q ret void @@ -524,6 +612,8 @@ define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) { %1 = load <2 x double>, <2 x double>* %p %2 = fadd <2 x double> %1, %1 %3 = call fp128 @test_f128_v2f64_helper(<2 x double> %2) + br label %return_bb +return_bb: %4 = fadd fp128 %3, %3 store fp128 %4, fp128* %q ret void @@ -536,6 +626,8 @@ define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) { %1 = load <2 x i64>, <2 x i64>* %p %2 = add <2 x i64> %1, %1 %3 = call fp128 @test_f128_v2i64_helper(<2 x i64> %2) + br label %return_bb +return_bb: %4 = fadd fp128 %3, %3 store fp128 %4, fp128* %q ret void @@ -549,6 +641,8 @@ define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) { %1 = load <4 x float>, <4 x float>* %p %2 = fadd <4 x float> %1, %1 %3 = call fp128 @test_f128_v4f32_helper(<4 x float> %2) + br label %return_bb +return_bb: %4 = fadd fp128 %3, %3 store fp128 %4, fp128* %q ret void @@ -562,6 +656,8 @@ define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) { %1 = load <4 x i32>, <4 x i32>* %p %2 = add <4 x i32> %1, %1 %3 = call fp128 @test_f128_v4i32_helper(<4 x i32> %2) + br label %return_bb +return_bb: %4 = fadd fp128 %3, %3 store fp128 %4, fp128* %q ret void @@ -575,6 +671,8 @@ define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) { %1 = load <8 x i16>, <8 x i16>* %p %2 = add <8 x i16> %1, %1 %3 = call fp128 @test_f128_v8i16_helper(<8 x i16> %2) + br label %return_bb +return_bb: %4 = fadd fp128 %3, %3 store fp128 %4, fp128* %q ret void @@ -588,6 +686,8 @@ define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) { %1 = load <16 x i8>, <16 x i8>* %p %2 = add <16 x i8> %1, %1 %3 = call fp128 @test_f128_v16i8_helper(<16 x i8> %2) + br label %return_bb +return_bb: %4 = fadd fp128 %3, %3 store fp128 %4, fp128* %q ret void @@ -600,6 +700,8 @@ define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) { %1 = load fp128, fp128* %p %2 = fadd fp128 %1, %1 %3 = 
call <2 x double> @test_v2f64_f128_helper(fp128 %2) + br label %return_bb +return_bb: %4 = fadd <2 x double> %3, %3 store <2 x double> %4, <2 x double>* %q ret void @@ -613,6 +715,8 @@ define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) { %1 = load <2 x i64>, <2 x i64>* %p %2 = add <2 x i64> %1, %1 %3 = call <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %2) + br label %return_bb +return_bb: %4 = fadd <2 x double> %3, %3 store <2 x double> %4, <2 x double>* %q ret void @@ -627,6 +731,8 @@ define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) { %1 = load <4 x float>, <4 x float>* %p %2 = fadd <4 x float> %1, %1 %3 = call <2 x double> @test_v2f64_v4f32_helper(<4 x float> %2) + br label %return_bb +return_bb: %4 = fadd <2 x double> %3, %3 store <2 x double> %4, <2 x double>* %q ret void @@ -641,6 +747,8 @@ define void @test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) { %1 = load <4 x i32>, <4 x i32>* %p %2 = add <4 x i32> %1, %1 %3 = call <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %2) + br label %return_bb +return_bb: %4 = fadd <2 x double> %3, %3 store <2 x double> %4, <2 x double>* %q ret void @@ -655,6 +763,8 @@ define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) { %1 = load <8 x i16>, <8 x i16>* %p %2 = add <8 x i16> %1, %1 %3 = call <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %2) + br label %return_bb +return_bb: %4 = fadd <2 x double> %3, %3 store <2 x double> %4, <2 x double>* %q ret void @@ -669,6 +779,8 @@ define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) { %1 = load <16 x i8>, <16 x i8>* %p %2 = add <16 x i8> %1, %1 %3 = call <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %2) + br label %return_bb +return_bb: %4 = fadd <2 x double> %3, %3 store <2 x double> %4, <2 x double>* %q ret void @@ -681,6 +793,8 @@ define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) { %1 = load fp128, fp128* %p %2 = fadd fp128 %1, %1 %3 = call <2 x i64> @test_v2i64_f128_helper(fp128 %2) + br label %return_bb +return_bb: %4 = add <2 x i64> %3, %3 store <2 x i64> %4, <2 x i64>* %q ret void @@ -694,6 +808,8 @@ define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) { %1 = load <2 x double>, <2 x double>* %p %2 = fadd <2 x double> %1, %1 %3 = call <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %2) + br label %return_bb +return_bb: %4 = add <2 x i64> %3, %3 store <2 x i64> %4, <2 x i64>* %q ret void @@ -708,6 +824,8 @@ define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) { %1 = load <4 x float>, <4 x float>* %p %2 = fadd <4 x float> %1, %1 %3 = call <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %2) + br label %return_bb +return_bb: %4 = add <2 x i64> %3, %3 store <2 x i64> %4, <2 x i64>* %q ret void @@ -722,6 +840,8 @@ define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) { %1 = load <4 x i32>, <4 x i32>* %p %2 = add <4 x i32> %1, %1 %3 = call <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %2) + br label %return_bb +return_bb: %4 = add <2 x i64> %3, %3 store <2 x i64> %4, <2 x i64>* %q ret void @@ -736,6 +856,8 @@ define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) { %1 = load <8 x i16>, <8 x i16>* %p %2 = add <8 x i16> %1, %1 %3 = call <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %2) + br label %return_bb +return_bb: %4 = add <2 x i64> %3, %3 store <2 x i64> %4, <2 x i64>* %q ret void @@ -750,6 +872,8 @@ define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) { %1 = load <16 x i8>, <16 x i8>* %p %2 = add <16 x i8> %1, %1 %3 = call <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %2) + br label %return_bb +return_bb: %4 = 
add <2 x i64> %3, %3 store <2 x i64> %4, <2 x i64>* %q ret void @@ -763,6 +887,8 @@ define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) { %1 = load fp128, fp128* %p %2 = fadd fp128 %1, %1 %3 = call <4 x float> @test_v4f32_f128_helper(fp128 %2) + br label %return_bb +return_bb: %4 = fadd <4 x float> %3, %3 store <4 x float> %4, <4 x float>* %q ret void @@ -777,6 +903,8 @@ define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) { %1 = load <2 x double>, <2 x double>* %p %2 = fadd <2 x double> %1, %1 %3 = call <4 x float> @test_v4f32_v2f64_helper(<2 x double> %2) + br label %return_bb +return_bb: %4 = fadd <4 x float> %3, %3 store <4 x float> %4, <4 x float>* %q ret void @@ -791,6 +919,8 @@ define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) { %1 = load <2 x i64>, <2 x i64>* %p %2 = add <2 x i64> %1, %1 %3 = call <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %2) + br label %return_bb +return_bb: %4 = fadd <4 x float> %3, %3 store <4 x float> %4, <4 x float>* %q ret void @@ -806,6 +936,8 @@ define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) { %1 = load <4 x i32>, <4 x i32>* %p %2 = add <4 x i32> %1, %1 %3 = call <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %2) + br label %return_bb +return_bb: %4 = fadd <4 x float> %3, %3 store <4 x float> %4, <4 x float>* %q ret void @@ -821,6 +953,8 @@ define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) { %1 = load <8 x i16>, <8 x i16>* %p %2 = add <8 x i16> %1, %1 %3 = call <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %2) + br label %return_bb +return_bb: %4 = fadd <4 x float> %3, %3 store <4 x float> %4, <4 x float>* %q ret void @@ -836,6 +970,8 @@ define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) { %1 = load <16 x i8>, <16 x i8>* %p %2 = add <16 x i8> %1, %1 %3 = call <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %2) + br label %return_bb +return_bb: %4 = fadd <4 x float> %3, %3 store <4 x float> %4, <4 x float>* %q ret void @@ -849,6 +985,8 @@ define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) { %1 = load fp128, fp128* %p %2 = fadd fp128 %1, %1 %3 = call <4 x i32> @test_v4i32_f128_helper(fp128 %2) + br label %return_bb +return_bb: %4 = add <4 x i32> %3, %3 store <4 x i32> %4, <4 x i32>* %q ret void @@ -863,6 +1001,8 @@ define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) { %1 = load <2 x double>, <2 x double>* %p %2 = fadd <2 x double> %1, %1 %3 = call <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %2) + br label %return_bb +return_bb: %4 = add <4 x i32> %3, %3 store <4 x i32> %4, <4 x i32>* %q ret void @@ -877,6 +1017,8 @@ define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) { %1 = load <2 x i64>, <2 x i64>* %p %2 = add <2 x i64> %1, %1 %3 = call <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %2) + br label %return_bb +return_bb: %4 = add <4 x i32> %3, %3 store <4 x i32> %4, <4 x i32>* %q ret void @@ -892,6 +1034,8 @@ define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) { %1 = load <4 x float>, <4 x float>* %p %2 = fadd <4 x float> %1, %1 %3 = call <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %2) + br label %return_bb +return_bb: %4 = add <4 x i32> %3, %3 store <4 x i32> %4, <4 x i32>* %q ret void @@ -907,6 +1051,8 @@ define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) { %1 = load <8 x i16>, <8 x i16>* %p %2 = add <8 x i16> %1, %1 %3 = call <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %2) + br label %return_bb +return_bb: %4 = add <4 x i32> %3, %3 store <4 x i32> %4, <4 x i32>* %q ret void @@ -922,6 +1068,8 @@ define void @test_v4i32_v16i8(<16 x i8>* 
%p, <4 x i32>* %q) { %1 = load <16 x i8>, <16 x i8>* %p %2 = add <16 x i8> %1, %1 %3 = call <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %2) + br label %return_bb +return_bb: %4 = add <4 x i32> %3, %3 store <4 x i32> %4, <4 x i32>* %q ret void @@ -935,6 +1083,8 @@ define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) { %1 = load fp128, fp128* %p %2 = fadd fp128 %1, %1 %3 = call <8 x i16> @test_v8i16_f128_helper(fp128 %2) + br label %return_bb +return_bb: %4 = add <8 x i16> %3, %3 store <8 x i16> %4, <8 x i16>* %q ret void @@ -949,6 +1099,8 @@ define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) { %1 = load <2 x double>, <2 x double>* %p %2 = fadd <2 x double> %1, %1 %3 = call <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %2) + br label %return_bb +return_bb: %4 = add <8 x i16> %3, %3 store <8 x i16> %4, <8 x i16>* %q ret void @@ -963,6 +1115,8 @@ define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) { %1 = load <2 x i64>, <2 x i64>* %p %2 = add <2 x i64> %1, %1 %3 = call <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %2) + br label %return_bb +return_bb: %4 = add <8 x i16> %3, %3 store <8 x i16> %4, <8 x i16>* %q ret void @@ -978,6 +1132,8 @@ define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) { %1 = load <4 x float>, <4 x float>* %p %2 = fadd <4 x float> %1, %1 %3 = call <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %2) + br label %return_bb +return_bb: %4 = add <8 x i16> %3, %3 store <8 x i16> %4, <8 x i16>* %q ret void @@ -993,6 +1149,8 @@ define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) { %1 = load <4 x i32>, <4 x i32>* %p %2 = add <4 x i32> %1, %1 %3 = call <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %2) + br label %return_bb +return_bb: %4 = add <8 x i16> %3, %3 store <8 x i16> %4, <8 x i16>* %q ret void @@ -1008,6 +1166,8 @@ define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) { %1 = load <16 x i8>, <16 x i8>* %p %2 = add <16 x i8> %1, %1 %3 = call <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %2) + br label %return_bb +return_bb: %4 = add <8 x i16> %3, %3 store <8 x i16> %4, <8 x i16>* %q ret void @@ -1021,6 +1181,8 @@ define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) { %1 = load fp128, fp128* %p %2 = fadd fp128 %1, %1 %3 = call <16 x i8> @test_v16i8_f128_helper(fp128 %2) + br label %return_bb +return_bb: %4 = add <16 x i8> %3, %3 store <16 x i8> %4, <16 x i8>* %q ret void @@ -1035,6 +1197,8 @@ define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) { %1 = load <2 x double>, <2 x double>* %p %2 = fadd <2 x double> %1, %1 %3 = call <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %2) + br label %return_bb +return_bb: %4 = add <16 x i8> %3, %3 store <16 x i8> %4, <16 x i8>* %q ret void @@ -1049,6 +1213,8 @@ define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) { %1 = load <2 x i64>, <2 x i64>* %p %2 = add <2 x i64> %1, %1 %3 = call <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %2) + br label %return_bb +return_bb: %4 = add <16 x i8> %3, %3 store <16 x i8> %4, <16 x i8>* %q ret void @@ -1064,6 +1230,8 @@ define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) { %1 = load <4 x float>, <4 x float>* %p %2 = fadd <4 x float> %1, %1 %3 = call <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %2) + br label %return_bb +return_bb: %4 = add <16 x i8> %3, %3 store <16 x i8> %4, <16 x i8>* %q ret void @@ -1079,6 +1247,8 @@ define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) { %1 = load <4 x i32>, <4 x i32>* %p %2 = add <4 x i32> %1, %1 %3 = call <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %2) + br label %return_bb 
+return_bb:
 %4 = add <16 x i8> %3, %3
 store <16 x i8> %4, <16 x i8>* %q
 ret void
@@ -1094,6 +1264,8 @@ define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) {
 %1 = load <8 x i16>, <8 x i16>* %p
 %2 = add <8 x i16> %1, %1
 %3 = call <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %2)
+ br label %return_bb
+return_bb:
 %4 = add <16 x i8> %3, %3
 store <16 x i8> %4, <16 x i8>* %q
 ret void
diff --git a/test/CodeGen/AArch64/arm64-call-tailcalls.ll b/test/CodeGen/AArch64/arm64-call-tailcalls.ll
index 71d9327..6621db2 100644
--- a/test/CodeGen/AArch64/arm64-call-tailcalls.ll
+++ b/test/CodeGen/AArch64/arm64-call-tailcalls.ll
@@ -53,9 +53,9 @@ bb: ; preds = %entry
 define i32 @t8(i32 %x) nounwind ssp {
 ; CHECK-LABEL: t8:
+; CHECK: b _c
 ; CHECK: b _a
 ; CHECK: b _b
-; CHECK: b _c
 %and = and i32 %x, 1
 %tobool = icmp eq i32 %and, 0
 br i1 %tobool, label %if.end, label %if.then
diff --git a/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll b/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
new file mode 100644
index 0000000..f0b8299
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
@@ -0,0 +1,638 @@
+; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=NONSTRESS
+; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S -stress-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=STRESS
+; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S -disable-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=DISABLE
+
+; CodeGenPrepare should move the zext into the block with the load
+; so that SelectionDAG can select it with the load.
+;
+; OPTALL-LABEL: @foo
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+; OPTALL-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPTALL: store i32 [[ZEXT]], i32* %q
+; OPTALL: ret
+define void @foo(i8* %p, i32* %q) {
+entry:
+ %t = load i8, i8* %p
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i8 %t to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to form a zextload when an operation with only one
+; argument to explicitly extend is in the way.
+; OPTALL-LABEL: @promoteOneArg
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+; OPT-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT]], 2
+; Make sure the operation is not promoted when the promotion pass is disabled.
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], 2
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteOneArg(i8* %p, i32* %q) {
+entry:
+ %t = load i8, i8* %p
+ %add = add nuw i8 %t, 2
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
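; Illustrative sketch (editorial, not part of the original test; the value
; names below are hypothetical): the OPT lines above expect @promoteOneArg
; to be rewritten roughly as
;   %t   = load i8, i8* %p
;   %z   = zext i8 %t to i32   ; extension hoisted next to the load
;   %add = add nuw i32 %z, 2   ; operation now performed on i32
; so that SelectionDAG can select the load/zext pair as a single
; zero-extending load (ldrb on AArch64).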
+; Check that we manage to form a sextload when an operation with only one
+; argument to explicitly extend is in the way.
+; Version with sext.
+; OPTALL-LABEL: @promoteOneArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXT]], 2
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], 2
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteOneArgSExt(i8* %p, i32* %q) {
+entry:
+ %t = load i8, i8* %p
+ %add = add nsw i8 %t, 2
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = sext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to form a zextload when an operation with two
+; arguments to explicitly extend is in the way.
+; Extending %add will create two extensions:
+; 1. One for %b.
+; 2. One for %t.
+; #1 will not be removed as we do not know anything about %b.
+; #2 may not be merged with the load because %t is used in a comparison.
+; Since two extensions may be emitted in the end instead of one before the
+; transformation, the regular heuristic does not apply the optimization.
+;
+; OPTALL-LABEL: @promoteTwoArgZext
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteTwoArgZext(i8* %p, i32* %q, i8 %b) {
+entry:
+ %t = load i8, i8* %p
+ %add = add nuw i8 %t, %b
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to form a sextload when an operation with two
+; arguments to explicitly extend is in the way.
+; Version with sext.
+; OPTALL-LABEL: @promoteTwoArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[SEXTLD:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i8 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXTLD]], [[SEXTB]]
+;
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteTwoArgSExt(i8* %p, i32* %q, i8 %b) {
+entry:
+ %t = load i8, i8* %p
+ %add = add nsw i8 %t, %b
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = sext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we do not form a zextload if we need to introduce more than
+; one additional extension.
+; OPTALL-LABEL: @promoteThreeArgZext
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32
+; STRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+; STRESS-NEXT: [[ZEXTC:%[a-zA-Z_0-9-]+]] = zext i8 %c to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[TMP]], [[ZEXTC]]
+;
+; NONSTRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; NONSTRESS-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[TMP]], %c
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; DISABLE: add nuw i8
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteThreeArgZext(i8* %p, i32* %q, i8 %b, i8 %c) {
+entry:
+ %t = load i8, i8* %p
+ %tmp = add nuw i8 %t, %b
+ %add = add nuw i8 %tmp, %c
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to form a zextload after promoting and merging
+; two extensions.
+; OPTALL-LABEL: @promoteMergeExtArgZExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i16 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32
+;
+; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteMergeExtArgZExt(i8* %p, i32* %q, i16 %b) {
+entry:
+ %t = load i8, i8* %p
+ %ext = zext i8 %t to i16
+ %add = add nuw i16 %ext, %b
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i16 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
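; Illustrative sketch (editorial, hedged): "promoting and merging" means the
; chained extensions in @promoteMergeExtArgZExt above,
;   %ext = zext i8 %t to i16
;   %s   = zext i16 %add to i32
; collapse, in stress mode, into one direct extension placed next to the
; load (matching the STRESS lines above):
;   %z = zext i8 %t to i32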
+; Check that we manage to form a sextload after promoting and merging
+; two extensions.
+; Version with sext.
+; OPTALL-LABEL: @promoteMergeExtArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = sext i16 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+;
+; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteMergeExtArgSExt(i8* %p, i32* %q, i16 %b) {
+entry:
+ %t = load i8, i8* %p
+ %ext = zext i8 %t to i16
+ %add = add nsw i16 %ext, %b
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = sext i16 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to catch all the extload opportunities that are exposed
+; by the different iterations of codegen prepare.
+; Moreover, check that we do not promote more than we need to.
+; Here is what is happening in this test (not necessarily in this order):
+; 1. We try to promote the operand of %sextadd.
+;    a. This creates one sext of %ld2 and one of %zextld
+;    b. The sext of %ld2 can be combined with %ld2, so we remove one sext but
+;       introduce one. This is fine with the current heuristic: neutral.
+;    => We have one zext of %zextld left and we created one sext of %ld2.
+; 2. We try to promote the operand of %sextaddza.
+;    a. This creates one sext of %zexta and one of %zextld
+;    b. The sext of %zexta does not lead to any load, it stays here, even if it
+;       could have been combined with the zext of %a.
+;    c. The sext of %zextld leads to %ld and can be combined with it. This is
+;       done by promoting %zextld. This is fine with the current heuristic:
+;       neutral.
+;    => We have created a new zext of %ld and we created one sext of %zexta.
+; 3. We try to promote the operand of %sextaddb.
+;    a. This creates one sext of %b and one of %zextld
+;    b. The sext of %b is a dead-end, nothing to be done.
+;    c. Same thing as 2.c. happens.
+;    => We have created a new zext of %ld and we created one sext of %b.
+; 4. We try to promote the operand of the zext of %zextld introduced in #1.
+;    a. Same thing as 2.c. happens.
+;    b. %zextld does not have any other uses. It is dead code.
+;    => We have created a new zext of %ld and we removed a zext of %zextld and
+;       a zext of %ld.
+; Currently we do not try to reuse existing extensions, so in the end we have
+; 3 identical zext of %ld. The extensions will be CSE'ed by SDag.
+;
+; OPTALL-LABEL: @severalPromotions
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %addr1
+; OPT-NEXT: [[ZEXTLD1_1:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTLD1_2:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTLD1_3:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[LD2:%[a-zA-Z_0-9-]+]] = load i32, i32* %addr2
+; OPT-NEXT: [[SEXTLD2:%[a-zA-Z_0-9-]+]] = sext i32 [[LD2]] to i64
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD2]], [[ZEXTLD1_1]]
+; We do not combine this one: see 2.b.
+; OPT-NEXT: [[ZEXTA:%[a-zA-Z_0-9-]+]] = zext i8 %a to i32
+; OPT-NEXT: [[SEXTZEXTA:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXTA]] to i64
+; OPT-NEXT: [[RESZA:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTZEXTA]], [[ZEXTLD1_3]]
+; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; OPT-NEXT: [[RESB:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTB]], [[ZEXTLD1_2]]
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
+; DISABLE: [[ADDZA:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RESZA:%[a-zA-Z_0-9-]+]] = sext i32 [[ADDZA]] to i64
+; DISABLE: [[ADDB:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RESB:%[a-zA-Z_0-9-]+]] = sext i32 [[ADDB]] to i64
+;
+; OPTALL: call void @dummy(i64 [[RES]], i64 [[RESZA]], i64 [[RESB]])
+; OPTALL: ret
+define void @severalPromotions(i8* %addr1, i32* %addr2, i8 %a, i32 %b) {
+ %ld = load i8, i8* %addr1
+ %zextld = zext i8 %ld to i32
+ %ld2 = load i32, i32* %addr2
+ %add = add nsw i32 %ld2, %zextld
+ %sextadd = sext i32 %add to i64
+ %zexta = zext i8 %a to i32
+ %addza = add nsw i32 %zexta, %zextld
+ %sextaddza = sext i32 %addza to i64
+ %addb = add nsw i32 %b, %zextld
+ %sextaddb = sext i32 %addb to i64
+ call void @dummy(i64 %sextadd, i64 %sextaddza, i64 %sextaddb)
+ ret void
+}
+
+declare void @dummy(i64, i64, i64)
+
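; Editorial summary (hedged; value names assumed): after the four iterations
; described above, the promoted part of @severalPromotions is expected to
; look like
;   %ld = load i8, i8* %addr1
;   %z1 = zext i8 %ld to i64   ; three identical zexts of %ld remain,
;   %z2 = zext i8 %ld to i64   ; to be CSE'd later by SelectionDAG
;   %z3 = zext i8 %ld to i64
; which is exactly what the three OPT-NEXT [[ZEXTLD1_*]] lines pin down.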
+; Make sure we do not try to promote vector types since the type promotion
+; helper does not support them for now.
+; OPTALL-LABEL: @vectorPromotion
+; OPTALL: [[SHL:%[a-zA-Z_0-9-]+]] = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8>
+; OPTALL: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext <2 x i32> [[SHL]] to <2 x i64>
+; OPTALL: ret
+define void @vectorPromotion() {
+entry:
+ %a = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8>
+ %b = zext <2 x i32> %a to <2 x i64>
+ ret void
+}
+
+@a = common global i32 0, align 4
+@c = common global [2 x i32] zeroinitializer, align 4
+
+; Make sure we support promotion of operands that produce a Value as opposed
+; to an instruction.
+; This used to cause a crash.
+; OPTALL-LABEL: @promotionOfArgEndsUpInValue
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i16, i16* %addr
+;
+; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i16 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw nsw i32 [[SEXT]], zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @c, i64 0, i64 1), i32* @a) to i32)
+;
+; DISABLE-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw nsw i16 [[LD]], zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @c, i64 0, i64 1), i32* @a) to i16)
+; DISABLE-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+;
+; OPTALL-NEXT: ret i32 [[RES]]
+define i32 @promotionOfArgEndsUpInValue(i16* %addr) {
+entry:
+ %val = load i16, i16* %addr
+ %add = add nuw nsw i16 %val, zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @c, i64 0, i64 1), i32* @a) to i16)
+ %conv3 = sext i16 %add to i32
+ ret i32 %conv3
+}
+
+; Check that we see that one zext can be derived from the other for free.
+; OPTALL-LABEL: @promoteTwoArgZextWithSourceExtendedTwice
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPT-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; OPT-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], 12
+; OPT-NEXT: store i32 [[RES32]], i32* %addr
+; OPT-NEXT: store i64 [[RES64]], i64* %q
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[RES2_32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], 12
+; DISABLE-NEXT: store i32 [[RES32]], i32* %addr
+; DISABLE-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES2_32]] to i64
+; DISABLE-NEXT: store i64 [[ZEXT64]], i64* %q
+;
+; OPTALL-NEXT: ret void
+define void @promoteTwoArgZextWithSourceExtendedTwice(i8* %p, i64* %q, i32 %b, i32* %addr) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nuw i32 %zextt, %b
+ %add2 = add nuw i32 %zextt, 12
+ store i32 %add, i32 *%addr
+ %s = zext i32 %add2 to i64
+ store i64 %s, i64* %q
+ ret void
+}
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free sext. If we had promoted
+; all the way through the load, we would have ended up with a free zext
+; and a non-free sext (of %b).
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+; STRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = trunc i64 [[IDX64]] to i32
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, i32* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i32 [[RES32]], i32* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeSExtFromAddrMode(i8* %p, i32 %b, i32* %addr) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = getelementptr inbounds i32, i32* %addr, i64 %idx64
+ store i32 %add, i32 *%staddr
+ ret void
+}
+
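; Editorial note (hedged) on why the sext above counts as "free": a sext
; whose only use is a getelementptr index can usually be folded into the
; AArch64 addressing mode of the access, e.g. something like
;   str w8, [x2, w9, sxtw #2]
; so it costs no extra instruction. Promoting through the load would trade
; that free sext for a real sext of %b, hence the non-stress heuristic
; declines.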
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free sext. If we had promoted
+; all the way through the load, we would have ended up with a free zext
+; and a non-free sext (of %b).
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode64
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i64, i64* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i64 %stuff, i64* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeSExtFromAddrMode64(i8* %p, i32 %b, i64* %addr, i64 %stuff) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = getelementptr inbounds i64, i64* %addr, i64 %idx64
+ store i64 %stuff, i64 *%staddr
+ ret void
+}
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free sext. If we had promoted
+; all the way through the load, we would have ended up with a free zext
+; and a non-free sext (of %b).
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode128
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i128, i128* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i128 %stuff, i128* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeSExtFromAddrMode128(i8* %p, i32 %b, i128* %addr, i128 %stuff) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = getelementptr inbounds i128, i128* %addr, i64 %idx64
+ store i128 %stuff, i128 *%staddr
+ ret void
+}
+
+
+; Check that this time the promotion happens (hence the test name): the
+; input still has one free zext and one sext, but an i256 access cannot
+; fold the sext into its addressing mode, so promoting all the way through
+; the load does not make the code more expensive (see the OPT checks below).
+; OPTALL-LABEL: @promoteSExtFromAddrMode256
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; OPT-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i256, i256* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i256 %stuff, i256* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @promoteSExtFromAddrMode256(i8* %p, i32 %b, i256* %addr, i256 %stuff) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = getelementptr inbounds i256, i256* %addr, i64 %idx64
+ store i256 %stuff, i256 *%staddr
+ ret void
+}
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free zext.
+; When we promote all the way through the load, we end up with
+; a free zext and a non-free zext (of %b).
+; However, the current target lowering says zext i32 to i64 is free,
+; so the promotion happens because the cost did not change and may
+; expose more opportunities.
+; This would need to be fixed at some point.
+; OPTALL-LABEL: @doNotPromoteFreeZExtFromAddrMode
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; This transformation should really happen only for stress mode.
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i32 %b to i64
+; OPT-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], [[ZEXTB]]
+; OPT-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = trunc i64 [[IDX64]] to i32
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, i32* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i32 [[RES32]], i32* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeZExtFromAddrMode(i8* %p, i32 %b, i32* %addr) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nuw i32 %zextt, %b
+ %idx64 = zext i32 %add to i64
+ %staddr = getelementptr inbounds i32, i32* %addr, i64 %idx64
+ store i32 %add, i32 *%staddr
+ ret void
+}
+
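; Editorial note (hedged) on "zext i32 to i64 is free": on AArch64 every
; instruction that writes a w register implicitly zeroes the upper 32 bits
; of the corresponding x register, e.g.
;   add w8, w9, w10   ; x8[63:32] are cleared as a side effect
; so the i32 -> i64 zext needs no instruction of its own and the promotion
; is judged cost-neutral.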
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromShift
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = shl i64 [[IDX64]], 12
+; OPTALL-NEXT: ret i64 %staddr
+define i64 @doNotPromoteFreeSExtFromShift(i8* %p, i32 %b) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = shl i64 %idx64, 12
+ ret i64 %staddr
+}
+
+; Same comment as doNotPromoteFreeZExtFromAddrMode.
+; OPTALL-LABEL: @doNotPromoteFreeZExtFromShift
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; This transformation should really happen only for stress mode.
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i32 %b to i64
+; OPT-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], [[ZEXTB]]
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = shl i64 [[IDX64]], 12
+; OPTALL-NEXT: ret i64 %staddr
+define i64 @doNotPromoteFreeZExtFromShift(i8* %p, i32 %b) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nuw i32 %zextt, %b
+ %idx64 = zext i32 %add to i64
+ %staddr = shl i64 %idx64, 12
+ ret i64 %staddr
+}
+
+; The input has one free zext and one non-free sext.
+; When we promote all the way through to the load, we end up with
+; a free zext, a free sext (%ld1), and a non-free sext (of %cst).
+; However, when we generate a load pair, the free sext (%ld1) becomes
+; non-free. So technically, we trade one non-free sext for two non-free
+; sexts.
+; This would need to be fixed at some point.
+; OPTALL-LABEL: @doNotPromoteBecauseOfPairedLoad
+; OPTALL: [[LD0:%[a-zA-Z_0-9-]+]] = load i32, i32* %p
+; OPTALL: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, i32* %p, i64 1
+; OPTALL: [[LD1:%[a-zA-Z_0-9-]+]] = load i32, i32* [[GEP]]
+;
+; This transformation should really happen only for stress mode.
+; OPT-NEXT: [[SEXTLD1:%[a-zA-Z_0-9-]+]] = sext i32 [[LD1]] to i64 +; OPT-NEXT: [[SEXTCST:%[a-zA-Z_0-9-]+]] = sext i32 %cst to i64 +; OPT-NEXT: [[SEXTRES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD1]], [[SEXTCST]] +; +; DISABLE-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[LD1]], %cst +; DISABLE-NEXT: [[SEXTRES:%[a-zA-Z_0-9-]+]] = sext i32 [[RES]] to i64 +; +; OPTALL-NEXT: [[ZEXTLD0:%[a-zA-Z_0-9-]+]] = zext i32 [[LD0]] to i64 +; OPTALL-NEXT: [[FINAL:%[a-zA-Z_0-9-]+]] = add i64 [[SEXTRES]], [[ZEXTLD0]] +; OPTALL-NEXT: ret i64 [[FINAL]] +define i64 @doNotPromoteBecauseOfPairedLoad(i32* %p, i32 %cst) { + %ld0 = load i32, i32* %p + %idxLd1 = getelementptr inbounds i32, i32* %p, i64 1 + %ld1 = load i32, i32* %idxLd1 + %res = add nsw i32 %ld1, %cst + %sextres = sext i32 %res to i64 + %zextLd0 = zext i32 %ld0 to i64 + %final = add i64 %sextres, %zextLd0 + ret i64 %final +} diff --git a/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/test/CodeGen/AArch64/arm64-convert-v4f64.ll index c6b7d83..c4e3e4e 100644 --- a/test/CodeGen/AArch64/arm64-convert-v4f64.ll +++ b/test/CodeGen/AArch64/arm64-convert-v4f64.ll @@ -3,11 +3,11 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) { ; CHECK: fptosi_v4f64_to_v4i16 -; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v1.2d -; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v0.2d -; CHECK-DAG: xtn v[[LHS_NA:[0-9]+]].2s, v[[LHS]].2d -; CHECK-DAG: xtn v[[RHS_NA:[0-9]+]].2s, v[[RHS]].2d -; CHECK: uzp1 v0.4h, v[[RHS_NA]].4h, v[[LHS_NA]].4h +; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v0.2d +; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v1.2d +; CHECK-DAG: xtn v[[MID:[0-9]+]].2s, v[[LHS]].2d +; CHECK-DAG: xtn2 v[[MID]].4s, v[[RHS]].2d +; CHECK: xtn v0.4h, v[[MID]].4s %tmp1 = load <4 x double>, <4 x double>* %ptr %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16> ret <4 x i16> %tmp2 @@ -15,17 +15,17 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) { define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) { ; CHECK: fptosi_v4f64_to_v4i8 -; CHECK-DAG: fcvtzs v[[CONV3:[0-9]+]].2d, v3.2d -; CHECK-DAG: fcvtzs v[[CONV2:[0-9]+]].2d, v2.2d -; CHECK-DAG: fcvtzs v[[CONV1:[0-9]+]].2d, v1.2d ; CHECK-DAG: fcvtzs v[[CONV0:[0-9]+]].2d, v0.2d -; CHECK-DAG: xtn v[[NA3:[0-9]+]].2s, v[[CONV3]].2d +; CHECK-DAG: fcvtzs v[[CONV1:[0-9]+]].2d, v1.2d +; CHECK-DAG: fcvtzs v[[CONV2:[0-9]+]].2d, v2.2d +; CHECK-DAG: fcvtzs v[[CONV3:[0-9]+]].2d, v3.2d ; CHECK-DAG: xtn v[[NA2:[0-9]+]].2s, v[[CONV2]].2d -; CHECK-DAG: xtn v[[NA1:[0-9]+]].2s, v[[CONV1]].2d +; CHECK-DAG: xtn2 v[[NA2]].4s, v[[CONV3]].2d ; CHECK-DAG: xtn v[[NA0:[0-9]+]].2s, v[[CONV0]].2d -; CHECK-DAG: uzp1 v[[TMP1:[0-9]+]].4h, v[[CONV2]].4h, v[[CONV3]].4h -; CHECK-DAG: uzp1 v[[TMP2:[0-9]+]].4h, v[[CONV0]].4h, v[[CONV1]].4h -; CHECK: uzp1 v0.8b, v[[TMP2]].8b, v[[TMP1]].8b +; CHECK-DAG: xtn2 v[[NA0]].4s, v[[CONV1]].2d +; CHECK-DAG: xtn v[[TMP1:[0-9]+]].4h, v[[NA0]].4s +; CHECK-DAG: xtn2 v[[TMP1]].8h, v[[NA2]].4s +; CHECK: xtn v0.8b, v[[TMP1]].8h %tmp1 = load <8 x double>, <8 x double>* %ptr %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8> ret <8 x i8> %tmp2 diff --git a/test/CodeGen/AArch64/arm64-dup.ll b/test/CodeGen/AArch64/arm64-dup.ll index 849e227..c6b7de3 100644 --- a/test/CodeGen/AArch64/arm64-dup.ll +++ b/test/CodeGen/AArch64/arm64-dup.ll @@ -321,3 +321,40 @@ entry: %sub = sub <4 x i16> %a, %mul ret <4 x i16> %sub } + +; Also test the DUP path in the PerfectShuffle generator. 
+ +; CHECK-LABEL: test_perfectshuffle_dupext_v4i16: +; CHECK-NEXT: dup.4h v0, v0[0] +; CHECK-NEXT: ext.8b v0, v0, v1, #4 +define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { + %r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5> + ret <4 x i16> %r +} + +; CHECK-LABEL: test_perfectshuffle_dupext_v4f16: +; CHECK-NEXT: dup.4h v0, v0[0] +; CHECK-NEXT: ext.8b v0, v0, v1, #4 +; CHECK-NEXT: ret +define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind { + %r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5> + ret <4 x half> %r +} + +; CHECK-LABEL: test_perfectshuffle_dupext_v4i32: +; CHECK-NEXT: dup.4s v0, v0[0] +; CHECK-NEXT: ext.16b v0, v0, v1, #8 +; CHECK-NEXT: ret +define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { + %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5> + ret <4 x i32> %r +} + +; CHECK-LABEL: test_perfectshuffle_dupext_v4f32: +; CHECK-NEXT: dup.4s v0, v0[0] +; CHECK-NEXT: ext.16b v0, v0, v1, #8 +; CHECK-NEXT: ret +define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind { + %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5> + ret <4 x float> %r +} diff --git a/test/CodeGen/AArch64/arm64-fcopysign.ll b/test/CodeGen/AArch64/arm64-fcopysign.ll index 66241df..feffd41 100644 --- a/test/CodeGen/AArch64/arm64-fcopysign.ll +++ b/test/CodeGen/AArch64/arm64-fcopysign.ll @@ -39,7 +39,7 @@ entry: ; CHECK: fcvt s0, d0 ; CHECK: movi.4s v[[CONST:[0-9]+]], #0x80, lsl #24 ; CHECK: bit.16b v{{[0-9]+}}, v0, v[[CONST]] - %0 = tail call double (...)* @bar() nounwind + %0 = tail call double (...) @bar() nounwind %1 = fptrunc double %0 to float %2 = tail call float @copysignf(float 5.000000e-01, float %1) nounwind readnone %3 = fadd float %1, %2 diff --git a/test/CodeGen/AArch64/arm64-join-reserved.ll b/test/CodeGen/AArch64/arm64-join-reserved.ll index e99168b..dee0344 100644 --- a/test/CodeGen/AArch64/arm64-join-reserved.ll +++ b/test/CodeGen/AArch64/arm64-join-reserved.ll @@ -10,7 +10,7 @@ target triple = "arm64-apple-macosx10" ; CHECK: ret define void @g() nounwind ssp { entry: - tail call void (i32, ...)* @f(i32 0, i32 0) nounwind + tail call void (i32, ...) @f(i32 0, i32 0) nounwind ret void } diff --git a/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll new file mode 100644 index 0000000..5bc4d71 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll @@ -0,0 +1,14 @@ +; RUN: llc -mtriple=arm64-apple-ios -aarch64-strict-align < %s | FileCheck %s + +; Small (16-bytes here) unaligned memcpys should stay memcpy calls if +; strict-alignment is turned on. 
+define void @t0(i8* %out, i8* %in) { +; CHECK-LABEL: t0: +; CHECK: orr w2, wzr, #0x10 +; CHECK-NEXT: bl _memcpy +entry: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 16, i32 1, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll index 4a92c3d..b74a406 100644 --- a/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1086,7 +1086,7 @@ define <2 x i32> @test_concat_diff_v1i32_v1i32(i32 %a, i32 %b) { ; CHECK-LABEL: test_concat_diff_v1i32_v1i32: ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}} ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}} -; CHECK-NEXT: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: ins {{v[0-9]+}}.s[1], w{{[0-9]+}} entry: %c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) %d = insertelement <2 x i32> undef, i32 %c, i32 0 diff --git a/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/test/CodeGen/AArch64/arm64-neon-v8.1a.ll new file mode 100644 index 0000000..51ed8a1 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-neon-v8.1a.ll @@ -0,0 +1,456 @@ +; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V8a +; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a +; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple + +declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) +declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) +declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16) + +declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) +declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) +declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16) + +declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) +declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32) +declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16) + +;----------------------------------------------------------------------------- +; RDMA Vector +; test for SIMDThreeSameVectorSQRDMLxHTiedHS + +define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { +; CHECK-LABEL: test_sqrdmlah_v4i16: + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) + %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h +; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.4h +; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2 + ret <4 x i16> %retval +} + +define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { +; CHECK-LABEL: test_sqrdmlah_v8i16: + %prod = call <8 x i16> 
@llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
+ %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
+; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.8h
+; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
+ ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v2i32:
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
+ %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
+; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.2s
+; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2
+ ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v4i32:
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
+ %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
+; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.4s
+; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2
+ ret <4 x i32> %retval
+}
+
+define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v4i16:
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
+ %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
+; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.4h
+; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2
+ ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v8i16:
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
+ %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
+; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.8h
+; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2
+ ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v2i32:
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
+ %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
+; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.2s
+; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2
+ ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v4i32:
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
+ %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
+; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.4s
+; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2
+ ret <4 x i32> %retval
+}
+
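; Editorial sketch (hedged): the pattern exercised throughout this file is
; the backend fusing the intrinsic pair
;   %prod   = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %y)
;   %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; into a single v8.1-A accumulating instruction,
;   sqrdmlah v0.4s, v1.4s, v2.4s
; while plain v8-A keeps the separate sqrdmulh (and sqadd), as the
; CHECK-V8a lines verify.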
+;-----------------------------------------------------------------------------
+; RDMA Vector, by element
+; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
+
+define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlah_lane_s16:
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+ %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
+; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.h[3]
+; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2[3]
+ ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlahq_lane_s16:
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+ %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
+; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.h[2]
+; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2[2]
+ ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlah_lane_s32:
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+ %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
+; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.s[1]
+; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2[1]
+ ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlahq_lane_s32:
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+ %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
+; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.s[0]
+; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2[0]
+ ret <4 x i32> %retval
+}
+
+define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlsh_lane_s16:
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+ %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
+; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.h[3]
+; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2[3]
+ ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlshq_lane_s16:
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+ %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
+; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.h[2]
+; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2[2]
+ ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlsh_lane_s32:
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %prod = call <2 x i32>
@llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) + %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1] +; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.s[1] +; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2[1] + ret <2 x i32> %retval +} + +define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) { +; CHECK-LABEL: test_sqrdmlshq_lane_s32: +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) + %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0] +; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.s[0] +; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2[0] + ret <4 x i32> %retval +} + +;----------------------------------------------------------------------------- +; RDMA Vector, by element, extracted +; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style +; i32 tests are for "def : Pat" in SIMDIndexedSQRDMLxHSDTied + +define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) { +; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16: +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1> + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) + %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod) + %retval = extractelement <4 x i16> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1] +; CHECK-V81a: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[1] +; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}}, v0, v1[1] + ret i16 %retval +} + +define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) { +; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16: +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1> + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) + %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod) + %retval = extractelement <8 x i16> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1] +; CHECK-V81a: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[1] +; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}}, v0, v1[1] + ret i16 %retval +} + +define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) + %extract = extractelement <2 x i32> %prod, i64 0 + %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract) +; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0] +; CHECK-V81a: sqrdmlah v2.2s, v0.2s, v1.s[0] +; CHECK-V81a-apple: sqrdmlah.2s v2, v0, v1[0] + ret i32 %retval +} + +define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) { +; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32: +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %prod = call <4 x i32> 
@llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) + %extract = extractelement <4 x i32> %prod, i64 0 + %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract) +; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0] +; CHECK-V81a: sqrdmlah v2.4s, v0.4s, v1.s[0] +; CHECK-V81a-apple: sqrdmlah.4s v2, v0, v1[0] + ret i32 %retval +} + +define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) { +; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16: +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1> + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) + %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod) + %retval = extractelement <4 x i16> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1] +; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.4h, v0.4h, v1.h[1] +; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}}, v0, v1[1] + ret i16 %retval +} + +define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) { +; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16: +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1> + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) + %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod) + %retval = extractelement <8 x i16> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1] +; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.8h, v0.8h, v1.h[1] +; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}}, v0, v1[1] + ret i16 %retval +} + +define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) + %extract = extractelement <2 x i32> %prod, i64 0 + %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract) +; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0] +; CHECK-V81a: sqrdmlsh v2.2s, v0.2s, v1.s[0] +; CHECK-V81a-apple: sqrdmlsh.2s v2, v0, v1[0] + ret i32 %retval +} + +define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) { +; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32: +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) + %extract = extractelement <4 x i32> %prod, i64 0 + %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract) +; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0] +; CHECK-V81a: sqrdmlsh v2.4s, v0.4s, v1.s[0] +; CHECK-V81a-apple: sqrdmlsh.4s v2, v0, v1[0] + ret i32 %retval +} + +;----------------------------------------------------------------------------- +; RDMA Scalar +; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstInfo.td + +define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) { +; CHECK-LABEL: test_sqrdmlah_v1i16: + %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0 + %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0 + %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec) + %acc_vec = 
insertelement <4 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec) + %retval = extractelement <4 x i16> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} + ret i16 %retval +} + +define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) { +; CHECK-LABEL: test_sqrdmlah_v1i32: + %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0 + %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0 + %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec) + %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0 + %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec) + %retval = extractelement <4 x i32> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} + ret i32 %retval +} + + +define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) { +; CHECK-LABEL: test_sqrdmlsh_v1i16: + %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0 + %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0 + %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec) + %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec) + %retval = extractelement <4 x i16> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} + ret i16 %retval +} + +define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) { +; CHECK-LABEL: test_sqrdmlsh_v1i32: + %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0 + %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0 + %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec) + %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0 + %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec) + %retval = extractelement <4 x i32> %retval_vec, i64 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} + ret i32 %retval +} +define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) { +; CHECK-LABEL: test_sqrdmlah_i32: + %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs) + %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod) +; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret i32 %retval +} + +define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) { +; CHECK-LABEL: test_sqrdmlsh_i32: + %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs) + %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod) +; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; 
CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret i32 %retval +} + +;----------------------------------------------------------------------------- +; RDMA Scalar, by element +; i16 tests are performed via tests in above chapter, with IR in ACLE style +; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied + +define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) { +; CHECK-LABEL: test_sqrdmlah_extract_i16: + %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1> + %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0 + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle) + %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod) + %retval = extractelement <4 x i16> %retval_vec, i32 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1] +; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1] +; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, v0[1] + ret i16 %retval +} + +define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmlah_extract_i32: + %extract = extractelement <4 x i32> %rhs, i32 3 + %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract) + %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod) +; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3] +; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3] +; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3] + ret i32 %retval +} + +define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) { +; CHECK-LABEL: test_sqrdmlshq_extract_i16: + %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1> + %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0 + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle) + %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0 + %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod) + %retval = extractelement <8 x i16> %retval_vec, i32 0 +; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1] +; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1] +; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}}, {{v[0-9]+}}, v0[1] + ret i16 %retval +} + +define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmlsh_extract_i32: + %extract = extractelement <4 x i32> %rhs, i32 3 + %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract) + %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod) +; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3] +; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3] +; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3] + ret i32 %retval +} diff --git a/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll index 5a740d8..2651f11 100644 --- a/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll +++ b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll @@ -9,7 +9,7 @@ define void @clobberScratch(i32* %p) { %v = load i32, i32* %p tail call void asm sideeffect "nop", 
"~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind - tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v) + tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v) store i32 %v, i32* %p ret void } diff --git a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll index 8f79f80..b8236c5 100644 --- a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll +++ b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll @@ -23,9 +23,9 @@ entry: ; FAST-NEXT: movk x16, #0xbeef ; FAST-NEXT: blr x16 %resolveCall2 = inttoptr i64 281474417671919 to i8* - %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2) + %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2) %resolveCall3 = inttoptr i64 244837814038255 to i8* - tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result) + tail call webkit_jscc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result) ret void } @@ -59,7 +59,7 @@ entry: ; FAST-NEXT: movk x16, #0xbeef ; FAST-NEXT: blr x16 %call = inttoptr i64 281474417671919 to i8* - %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6) + %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6) ret i64 %result } @@ -101,7 +101,7 @@ entry: ; FAST-NEXT: movk x16, #0xbeef ; FAST-NEXT: blr x16 %call = inttoptr i64 281474417671919 to i8* - %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10) + %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10) ret i64 %result } diff --git a/test/CodeGen/AArch64/arm64-patchpoint.ll b/test/CodeGen/AArch64/arm64-patchpoint.ll index cf06653..d9ec7e5 100644 --- a/test/CodeGen/AArch64/arm64-patchpoint.ll +++ b/test/CodeGen/AArch64/arm64-patchpoint.ll @@ -16,9 +16,9 @@ entry: ; CHECK-NEXT: blr x16 ; CHECK: ret %resolveCall2 = inttoptr i64 244837814094590 to i8* - %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4) + %result = tail call i64 (i64, i32, i8*, i32, ...) 
@llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4) %resolveCall3 = inttoptr i64 244837814094591 to i8* - tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 20, i8* %resolveCall3, i32 2, i64 %p1, i64 %result) + tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 20, i8* %resolveCall3, i32 2, i64 %p1, i64 %result) ret i64 %result } @@ -38,7 +38,7 @@ entry: store i64 11, i64* %metadata store i64 12, i64* %metadata store i64 13, i64* %metadata - call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata) + call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata) ret void } @@ -51,14 +51,14 @@ entry: %tmp80 = add i64 %tmp79, -16 %tmp81 = inttoptr i64 %tmp80 to i64* %tmp82 = load i64, i64* %tmp81, align 8 - tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82) - tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82) + tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82) + tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82) %tmp83 = load i64, i64* %tmp33, align 8 %tmp84 = add i64 %tmp83, -24 %tmp85 = inttoptr i64 %tmp84 to i64* %tmp86 = load i64, i64* %tmp85, align 8 - tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86) - tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86) + tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86) + tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86) ret i64 10 } @@ -74,7 +74,7 @@ entry: ; CHECK-NEXT: nop ; CHECK-NEXT: ldp ; CHECK-NEXT: ret - %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2) + %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2) ret void } diff --git a/test/CodeGen/AArch64/arm64-stackmap-nops.ll b/test/CodeGen/AArch64/arm64-stackmap-nops.ll index 5915b64..2647ac4 100644 --- a/test/CodeGen/AArch64/arm64-stackmap-nops.ll +++ b/test/CodeGen/AArch64/arm64-stackmap-nops.ll @@ -8,7 +8,7 @@ entry: ; CHECK: nop ; CHECK-NEXT: nop ; CHECK-NOT: nop - tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 16) + tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 0, i32 16) ret void } diff --git a/test/CodeGen/AArch64/arm64-stackmap.ll b/test/CodeGen/AArch64/arm64-stackmap.ll index 29e4484..1a4df7a 100644 --- a/test/CodeGen/AArch64/arm64-stackmap.ll +++ b/test/CodeGen/AArch64/arm64-stackmap.ll @@ -78,7 +78,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" define void @constantargs() { entry: %0 = inttoptr i64 244837814094590 to i8* - tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 20, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296) + tail call void (i64, i32, i8*, i32, ...) 
@llvm.experimental.patchpoint.void(i64 1, i32 20, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296) ret void } @@ -100,7 +100,7 @@ entry: ; Runtime void->void call. call void inttoptr (i64 244837814094590 to void ()*)() ; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars. - call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b) + call void (i64, i32, ...) @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b) ret void } @@ -126,7 +126,7 @@ entry: cold: ; OSR patchpoint with 12-byte nop-slide and 2 live vars. %thunk = inttoptr i64 244837814094590 to i8* - call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4, i32 20, i8* %thunk, i32 0, i64 %a, i64 %b) + call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4, i32 20, i8* %thunk, i32 0, i64 %a, i64 %b) unreachable ret: ret void @@ -142,7 +142,7 @@ ret: define i64 @propertyRead(i64* %obj) { entry: %resolveRead = inttoptr i64 244837814094590 to i8* - %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveRead, i32 1, i64* %obj) + %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveRead, i32 1, i64* %obj) %add = add i64 %result, 3 ret i64 %add } @@ -162,7 +162,7 @@ entry: define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) { entry: %resolveWrite = inttoptr i64 244837814094590 to i8* - call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveWrite, i32 2, i64* %obj, i64 %a) + call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveWrite, i32 2, i64* %obj, i64 %a) ret void } @@ -184,7 +184,7 @@ entry: define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) { entry: %resolveCall = inttoptr i64 244837814094590 to i8* - call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 7, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2) + call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 7, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2) ret void } @@ -206,7 +206,7 @@ entry: define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) { entry: %resolveCall = inttoptr i64 244837814094590 to i8* - %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 8, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2) + %result = call i64 (i64, i32, i8*, i32, ...) 
@llvm.experimental.patchpoint.i64(i64 8, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2) %add = add i64 %result, 3 ret i64 %add } @@ -226,7 +226,7 @@ entry: ; CHECK-NEXT: .short 29 define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) { entry: - call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 11, i32 20, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) + call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 20, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) ret void } @@ -245,7 +245,7 @@ entry: ; CHECK-NEXT: .short 29 define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29) { entry: - call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29) + call void (i64, i32, ...) @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29) ret void } @@ -263,7 +263,7 @@ entry: ; CHECK-NEXT: .long 33 define void @liveConstant() { - tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 8, i32 33) + tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 15, i32 8, i32 33) ret void } @@ -280,7 +280,7 @@ define void @liveConstant() { ; CHECK-NEXT: .long -{{[0-9]+}} define void @clobberLR(i32 %a) { tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x31}"() nounwind - tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 8, i32 %a) + tail call void (i64, i32, ...) 
@llvm.experimental.stackmap(i64 16, i32 8, i32 %a) ret void } diff --git a/test/CodeGen/AArch64/arm64-vshuffle.ll b/test/CodeGen/AArch64/arm64-vshuffle.ll index 75e0d80..15ea21b 100644 --- a/test/CodeGen/AArch64/arm64-vshuffle.ll +++ b/test/CodeGen/AArch64/arm64-vshuffle.ll @@ -1,22 +1,8 @@ ; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -mcpu=cyclone | FileCheck %s -; The mask: -; CHECK: lCPI0_0: -; CHECK: .byte 2 ; 0x2 -; CHECK: .byte 255 ; 0xff -; CHECK: .byte 6 ; 0x6 -; CHECK: .byte 255 ; 0xff -; The second vector is legalized to undef and the elements of the first vector -; are used instead. -; CHECK: .byte 2 ; 0x2 -; CHECK: .byte 4 ; 0x4 -; CHECK: .byte 6 ; 0x6 -; CHECK: .byte 0 ; 0x0 ; CHECK: test1 -; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI0_0 -; CHECK: movi.8h v[[REG1:[0-9]+]], #0x1, lsl #8 -; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]] +; CHECK: movi d[[REG0:[0-9]+]], #0000000000000000 define <8 x i1> @test1() { entry: %Shuff = shufflevector <8 x i1> <i1 0, i1 1, i1 2, i1 3, i1 4, i1 5, i1 6, @@ -30,18 +16,16 @@ entry: ; CHECK: lCPI1_0: ; CHECK: .byte 0 ; 0x0 -; CHECK: .byte 255 ; 0xff -; CHECK: .byte 2 ; 0x2 -; CHECK: .byte 255 ; 0xff -; CHECK: .byte 10 ; 0xa -; CHECK: .byte 12 ; 0xc -; CHECK: .byte 14 ; 0xe -; CHECK: .byte 7 ; 0x7 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 1 ; 0x1 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 ; CHECK: test2 -; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI1_0@PAGEOFF] -; CHECK: adrp x[[REG2:[0-9]+]], lCPI1_1@PAGE -; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG2]], lCPI1_1@PAGEOFF] -; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]] +; CHECK: adrp x[[REG2:[0-9]+]], lCPI1_0@PAGE +; CHECK: ldr d[[REG1:[0-9]+]], [x[[REG2]], lCPI1_0@PAGEOFF] define <8 x i1>@test2() { bb: %Shuff = shufflevector <8 x i1> zeroinitializer, @@ -51,28 +35,8 @@ bb: ret <8 x i1> %Shuff } -; CHECK: lCPI2_0: -; CHECK: .byte 2 ; 0x2 -; CHECK: .byte 255 ; 0xff -; CHECK: .byte 6 ; 0x6 -; CHECK: .byte 255 ; 0xff -; CHECK: .byte 10 ; 0xa -; CHECK: .byte 12 ; 0xc -; CHECK: .byte 14 ; 0xe -; CHECK: .byte 0 ; 0x0 -; CHECK: .byte 2 ; 0x2 -; CHECK: .byte 255 ; 0xff -; CHECK: .byte 6 ; 0x6 -; CHECK: .byte 255 ; 0xff -; CHECK: .byte 10 ; 0xa -; CHECK: .byte 12 ; 0xc -; CHECK: .byte 14 ; 0xe -; CHECK: .byte 0 ; 0x0 ; CHECK: test3 -; CHECK: adrp x[[REG3:[0-9]+]], lCPI2_0@PAGE -; CHECK: ldr q[[REG0:[0-9]+]], [x[[REG3]], lCPI2_0@PAGEOFF] -; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG3]], lCPI2_1@PAGEOFF] -; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]] +; CHECK: movi.4s v{{[0-9]+}}, #0x1 define <16 x i1> @test3(i1* %ptr, i32 %v) { bb: %Shuff = shufflevector <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <16 x i1> undef, @@ -81,29 +45,26 @@ bb: i32 14, i32 0> ret <16 x i1> %Shuff } -; CHECK: lCPI3_1: +; CHECK: lCPI3_0: +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 ; CHECK: .byte 0 ; 0x0 ; CHECK: .byte 1 ; 0x1 -; CHECK: .byte 2 ; 0x2 -; CHECK: .byte 18 ; 0x12 -; CHECK: .byte 4 ; 0x4 -; CHECK: .byte 5 ; 0x5 -; CHECK: .byte 6 ; 0x6 -; CHECK: .byte 7 ; 0x7 -; CHECK: .byte 8 ; 0x8 -; CHECK: .byte 31 ; 0x1f -; CHECK: .byte 10 ; 0xa -; CHECK: .byte 30 ; 0x1e -; CHECK: .byte 12 ; 0xc -; CHECK: .byte 13 ; 0xd -; CHECK: .byte 14 ; 0xe -; CHECK: .byte 15 ; 0xf +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; 
CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 ; CHECK: _test4: -; CHECK: ldr q[[REG1:[0-9]+]] -; CHECK: movi.2d v[[REG0:[0-9]+]], #0000000000000000 -; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_1@PAGE -; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_1@PAGEOFF] -; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG0]], v[[REG1]] }, v[[REG2]] +; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_0@PAGE +; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_0@PAGEOFF] define <16 x i1> @test4(i1* %ptr, i32 %v) { bb: %Shuff = shufflevector <16 x i1> zeroinitializer, diff --git a/test/CodeGen/AArch64/bitcast.ll b/test/CodeGen/AArch64/bitcast.ll new file mode 100644 index 0000000..e88ea9e --- /dev/null +++ b/test/CodeGen/AArch64/bitcast.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s + +; PR23065: SCALAR_TO_VECTOR implies the top elements 1 to N-1 of the N-element vector are undefined. + +define <4 x i16> @foo1(<2 x i32> %a) { +; CHECK-LABEL: foo1: +; CHECK: movi d0, #0000000000000000 +; CHECK-NEXT: ret + + %1 = shufflevector <2 x i32> <i32 58712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2> +; Can't optimize the following bitcast to scalar_to_vector. + %2 = bitcast <2 x i32> %1 to <4 x i16> + %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + ret <4 x i16> %3 +} + +define <4 x i16> @foo2(<2 x i32> %a) { +; CHECK-LABEL: foo2: +; CHECK: movi d0, #0000000000000000 +; CHECK-NEXT: ret + + %1 = shufflevector <2 x i32> <i32 712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2> +; Can't optimize the following bitcast to scalar_to_vector. + %2 = bitcast <2 x i32> %1 to <4 x i16> + %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + ret <4 x i16> %3 +} diff --git a/test/CodeGen/AArch64/br-to-eh-lpad.ll b/test/CodeGen/AArch64/br-to-eh-lpad.ll index e948b87..f304ba4 100644 --- a/test/CodeGen/AArch64/br-to-eh-lpad.ll +++ b/test/CodeGen/AArch64/br-to-eh-lpad.ll @@ -30,12 +30,12 @@ invoke.cont7: unreachable if.end50.thread: - tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125) - tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 128) + tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125) + tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 128) unreachable invoke.cont33: - tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 119) + tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 119) unreachable invoke.cont41: @@ -51,7 +51,7 @@ lpad40: br label %finally.catchall finally.catchall: - tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125) + tail call void (i8*, ...) 
@printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125) unreachable } diff --git a/test/CodeGen/AArch64/concat_vector-scalar-combine.ll b/test/CodeGen/AArch64/concat_vector-scalar-combine.ll new file mode 100644 index 0000000..1c64af6 --- /dev/null +++ b/test/CodeGen/AArch64/concat_vector-scalar-combine.ll @@ -0,0 +1,125 @@ +; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +; Test the (concat_vectors (bitcast (scalar)), ..) pattern. + +define <8 x i8> @test_concat_scalar_v2i8_to_v8i8_dup(i32 %x) #0 { +entry: +; CHECK-LABEL: test_concat_scalar_v2i8_to_v8i8_dup: +; CHECK-NEXT: dup.4h v0, w0 +; CHECK-NEXT: ret + %t = trunc i32 %x to i16 + %0 = bitcast i16 %t to <2 x i8> + %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> + ret <8 x i8> %1 +} + +define <8 x i8> @test_concat_scalar_v4i8_to_v8i8_dup(i32 %x) #0 { +entry: +; CHECK-LABEL: test_concat_scalar_v4i8_to_v8i8_dup: +; CHECK-NEXT: dup.2s v0, w0 +; CHECK-NEXT: ret + %0 = bitcast i32 %x to <4 x i8> + %1 = shufflevector <4 x i8> %0, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> + ret <8 x i8> %1 +} + +define <8 x i16> @test_concat_scalar_v2i16_to_v8i16_dup(i32 %x) #0 { +entry: +; CHECK-LABEL: test_concat_scalar_v2i16_to_v8i16_dup: +; CHECK-NEXT: dup.4s v0, w0 +; CHECK-NEXT: ret + %0 = bitcast i32 %x to <2 x i16> + %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 0, i32 1, i32 0, i32 1> + ret <8 x i16> %1 +} + +define <8 x i8> @test_concat_scalars_2x_v2i8_to_v8i8(i32 %x, i32 %y) #0 { +entry: +; CHECK-LABEL: test_concat_scalars_2x_v2i8_to_v8i8: +; CHECK-NEXT: ins.h v0[0], w0 +; CHECK-NEXT: ins.h v0[1], w1 +; CHECK-NEXT: ins.h v0[3], w1 +; CHECK-NEXT: ret + %tx = trunc i32 %x to i16 + %ty = trunc i32 %y to i16 + %bx = bitcast i16 %tx to <2 x i8> + %by = bitcast i16 %ty to <2 x i8> + %r = shufflevector <2 x i8> %bx, <2 x i8> %by, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 2, i32 3> + ret <8 x i8> %r +} + +define <8 x i8> @test_concat_scalars_2x_v4i8_to_v8i8_dup(i32 %x, i32 %y) #0 { +entry: +; CHECK-LABEL: test_concat_scalars_2x_v4i8_to_v8i8_dup: +; CHECK-NEXT: fmov s0, w1 +; CHECK-NEXT: ins.s v0[1], w0 +; CHECK-NEXT: ret + %bx = bitcast i32 %x to <4 x i8> + %by = bitcast i32 %y to <4 x i8> + %r = shufflevector <4 x i8> %bx, <4 x i8> %by, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> + ret <8 x i8> %r +} + +define <8 x i16> @test_concat_scalars_2x_v2i16_to_v8i16_dup(i32 %x, i32 %y) #0 { +entry: +; CHECK-LABEL: test_concat_scalars_2x_v2i16_to_v8i16_dup: +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ins.s v0[1], w1 +; CHECK-NEXT: ins.s v0[2], w1 +; CHECK-NEXT: ins.s v0[3], w0 +; CHECK-NEXT: ret + %bx = bitcast i32 %x to <2 x i16> + %by = bitcast i32 %y to <2 x i16> + %r = shufflevector <2 x i16> %bx, <2 x i16> %by, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1> + ret <8 x i16> %r +} + +; Also make sure we minimize bitcasts. + +; This is a pretty artificial testcase: make sure we bitcast to floating-point +; if any of the scalars is floating-point. 
+define <8 x i8> @test_concat_scalars_mixed_2x_v2i8_to_v8i8(float %dummy, i32 %x, half %y) #0 { +entry: +; CHECK-LABEL: test_concat_scalars_mixed_2x_v2i8_to_v8i8: +; CHECK-NEXT: fmov s[[X:[0-9]+]], w0 +; CHECK-NEXT: ins.h v0[0], v[[X]][0] +; CHECK-NEXT: ins.h v0[1], v1[0] +; CHECK-NEXT: ins.h v0[2], v[[X]][0] +; CHECK-NEXT: ins.h v0[3], v1[0] +; CHECK-NEXT: ret + %t = trunc i32 %x to i16 + %0 = bitcast i16 %t to <2 x i8> + %y0 = bitcast half %y to <2 x i8> + %1 = shufflevector <2 x i8> %0, <2 x i8> %y0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> + ret <8 x i8> %1 +} + +define <2 x float> @test_concat_scalars_fp_2x_v2i8_to_v8i8(float %dummy, half %x, half %y) #0 { +entry: +; CHECK-LABEL: test_concat_scalars_fp_2x_v2i8_to_v8i8: +; CHECK-NEXT: ins.h v0[0], v1[0] +; CHECK-NEXT: ins.h v0[1], v2[0] +; CHECK-NEXT: ins.h v0[2], v1[0] +; CHECK-NEXT: ins.h v0[3], v2[0] +; CHECK-NEXT: ret + %0 = bitcast half %x to <2 x i8> + %y0 = bitcast half %y to <2 x i8> + %1 = shufflevector <2 x i8> %0, <2 x i8> %y0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> + %2 = bitcast <8 x i8> %1 to <2 x float> + ret <2 x float> %2 +} + +define <4 x float> @test_concat_scalar_fp_v2i16_to_v16i8_dup(float %x) #0 { +entry: +; CHECK-LABEL: test_concat_scalar_fp_v2i16_to_v16i8_dup: +; CHECK-NEXT: dup.4s v0, v0[0] +; CHECK-NEXT: ret + %0 = bitcast float %x to <2 x i16> + %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 0, i32 1, i32 0, i32 1> + %2 = bitcast <8 x i16> %1 to <4 x float> + ret <4 x float> %2 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/concat_vector-truncate-combine.ll b/test/CodeGen/AArch64/concat_vector-truncate-combine.ll index c510e27..ee52786 100644 --- a/test/CodeGen/AArch64/concat_vector-truncate-combine.ll +++ b/test/CodeGen/AArch64/concat_vector-truncate-combine.ll @@ -2,6 +2,8 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +; Test the (concat_vectors (trunc), (trunc)) pattern. + define <4 x i16> @test_concat_truncate_v2i64_to_v4i16(<2 x i64> %a, <2 x i64> %b) #0 { entry: ; CHECK-LABEL: test_concat_truncate_v2i64_to_v4i16: diff --git a/test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll b/test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll new file mode 100644 index 0000000..eb6c80d --- /dev/null +++ b/test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -mtriple aarch64-unknown-unknown -asm-verbose=false | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +; Test the (concat_vectors (bitcast (trunc (scalar))), undef..) pattern. 
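+; For example, the test below truncates the i32 argument to i16, bitcasts it
+; to <2 x i8>, and widens it with an undef-padded shuffle; only the low lanes
+; carry defined data, so the whole sequence should lower to a single fmov of
+; the scalar (see the CHECK lines that follow).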
+ +define <8 x i8> @test_concat_from_truncated_scalar(i32 %x) #0 { +entry: +; CHECK-LABEL: test_concat_from_truncated_scalar: +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ret + %t = trunc i32 %x to i16 + %0 = bitcast i16 %t to <2 x i8> + %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + ret <8 x i8> %1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/dag-combine-invaraints.ll b/test/CodeGen/AArch64/dag-combine-invaraints.ll index 3614133..ac2d057 100644 --- a/test/CodeGen/AArch64/dag-combine-invaraints.ll +++ b/test/CodeGen/AArch64/dag-combine-invaraints.ll @@ -20,7 +20,7 @@ main_: %DHSelect = select i1 %tmp8, i32 %tmp9, i32 %tmp10 store i32 %DHSelect, i32* %i32X, align 4 %tmp15 = load i32, i32* %i32X, align 4 - %tmp17 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str2, i32 0, i32 0), i32 %tmp15) + %tmp17 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str2, i32 0, i32 0), i32 %tmp15) ret i32 0 ; CHECK: main: diff --git a/test/CodeGen/AArch64/f16-instructions.ll b/test/CodeGen/AArch64/f16-instructions.ll new file mode 100644 index 0000000..be5e2e5 --- /dev/null +++ b/test/CodeGen/AArch64/f16-instructions.ll @@ -0,0 +1,765 @@ +; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: test_fadd: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_fadd(half %a, half %b) #0 { + %r = fadd half %a, %b + ret half %r +} + +; CHECK-LABEL: test_fsub: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fsub s0, s0, s1 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_fsub(half %a, half %b) #0 { + %r = fsub half %a, %b + ret half %r +} + +; CHECK-LABEL: test_fmul: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fmul s0, s0, s1 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_fmul(half %a, half %b) #0 { + %r = fmul half %a, %b + ret half %r +} + +; CHECK-LABEL: test_fdiv: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fdiv s0, s0, s1 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_fdiv(half %a, half %b) #0 { + %r = fdiv half %a, %b + ret half %r +} + +; CHECK-LABEL: test_frem: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: bl {{_?}}fmodf +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_frem(half %a, half %b) #0 { + %r = frem half %a, %b + ret half %r +} + +; CHECK-LABEL: test_store: +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret +define void @test_store(half %a, half* %b) #0 { + store half %a, half* %b + ret void +} + +; CHECK-LABEL: test_load: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret +define half @test_load(half* %a) #0 { + %r = load half, half* %a + ret half %r +} + + +declare half @test_callee(half %a, half %b) #0 + +; CHECK-LABEL: test_call: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
+; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: bl {{_?}}test_callee +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_call(half %a, half %b) #0 { + %r = call half @test_callee(half %a, half %b) + ret half %r +} + +; CHECK-LABEL: test_call_flipped: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: mov.16b v2, v0 +; CHECK-NEXT: mov.16b v0, v1 +; CHECK-NEXT: mov.16b v1, v2 +; CHECK-NEXT: bl {{_?}}test_callee +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_call_flipped(half %a, half %b) #0 { + %r = call half @test_callee(half %b, half %a) + ret half %r +} + +; CHECK-LABEL: test_tailcall_flipped: +; CHECK-NEXT: mov.16b v2, v0 +; CHECK-NEXT: mov.16b v0, v1 +; CHECK-NEXT: mov.16b v1, v2 +; CHECK-NEXT: b {{_?}}test_callee +define half @test_tailcall_flipped(half %a, half %b) #0 { + %r = tail call half @test_callee(half %b, half %a) + ret half %r +} + +; CHECK-LABEL: test_select: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: fcsel s0, s0, s1, ne +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_select(half %a, half %b, i1 zeroext %c) #0 { + %r = select i1 %c, half %a, half %b + ret half %r +} + +; CHECK-LABEL: test_select_cc: +; CHECK-DAG: fcvt s3, h3 +; CHECK-DAG: fcvt s2, h2 +; CHECK-DAG: fcvt s1, h1 +; CHECK-DAG: fcvt s0, h0 +; CHECK-DAG: fcmp s2, s3 +; CHECK-DAG: cset [[CC:w[0-9]+]], ne +; CHECK-DAG: cmp [[CC]], #0 +; CHECK-NEXT: fcsel s0, s0, s1, ne +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_select_cc(half %a, half %b, half %c, half %d) #0 { + %cc = fcmp une half %c, %d + %r = select i1 %cc, half %a, half %b + ret half %r +} + +; CHECK-LABEL: test_fcmp_une: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +define i1 @test_fcmp_une(half %a, half %b) #0 { + %r = fcmp une half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_ueq: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: orr [[TRUE:w[0-9]+]], wzr, #0x1 +; CHECK-NEXT: csel [[CC:w[0-9]+]], [[TRUE]], wzr, eq +; CHECK-NEXT: csel w0, [[TRUE]], [[CC]], vs +; CHECK-NEXT: ret +define i1 @test_fcmp_ueq(half %a, half %b) #0 { + %r = fcmp ueq half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_ugt: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: ret +define i1 @test_fcmp_ugt(half %a, half %b) #0 { + %r = fcmp ugt half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_uge: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, pl +; CHECK-NEXT: ret +define i1 @test_fcmp_uge(half %a, half %b) #0 { + %r = fcmp uge half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_ult: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, lt +; CHECK-NEXT: ret +define i1 @test_fcmp_ult(half %a, half %b) #0 { + %r = fcmp ult half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_ule: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, le +; CHECK-NEXT: ret +define i1 @test_fcmp_ule(half %a, half %b) #0 { + %r = fcmp ule half %a, %b + ret i1 %r +} + + +; CHECK-LABEL: test_fcmp_uno: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, vs +; CHECK-NEXT: ret +define i1 @test_fcmp_uno(half %a, half %b) 
#0 { + %r = fcmp uno half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_one: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: orr [[TRUE:w[0-9]+]], wzr, #0x1 +; CHECK-NEXT: csel [[CC:w[0-9]+]], [[TRUE]], wzr, mi +; CHECK-NEXT: csel w0, [[TRUE]], [[CC]], gt +; CHECK-NEXT: ret +define i1 @test_fcmp_one(half %a, half %b) #0 { + %r = fcmp one half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_oeq: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +define i1 @test_fcmp_oeq(half %a, half %b) #0 { + %r = fcmp oeq half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_ogt: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, gt +; CHECK-NEXT: ret +define i1 @test_fcmp_ogt(half %a, half %b) #0 { + %r = fcmp ogt half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_oge: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, ge +; CHECK-NEXT: ret +define i1 @test_fcmp_oge(half %a, half %b) #0 { + %r = fcmp oge half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_olt: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, mi +; CHECK-NEXT: ret +define i1 @test_fcmp_olt(half %a, half %b) #0 { + %r = fcmp olt half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_ole: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret +define i1 @test_fcmp_ole(half %a, half %b) #0 { + %r = fcmp ole half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_fcmp_ord: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, vc +; CHECK-NEXT: ret +define i1 @test_fcmp_ord(half %a, half %b) #0 { + %r = fcmp ord half %a, %b + ret i1 %r +} + +; CHECK-LABEL: test_br_cc: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: b.mi [[BRCC_ELSE:.?LBB[0-9_]+]] +; CHECK-NEXT: str wzr, [x0] +; CHECK-NEXT: ret +; CHECK-NEXT: [[BRCC_ELSE]]: +; CHECK-NEXT: str wzr, [x1] +; CHECK-NEXT: ret +define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 { + %c = fcmp uge half %a, %b + br i1 %c, label %then, label %else +then: + store i32 0, i32* %p1 + ret void +else: + store i32 0, i32* %p2 + ret void +} + +; CHECK-LABEL: test_phi: +; CHECK: mov x[[PTR:[0-9]+]], x0 +; CHECK: ldr h[[AB:[0-9]+]], [x[[PTR]]] +; CHECK: [[LOOP:LBB[0-9_]+]]: +; CHECK: mov.16b v[[R:[0-9]+]], v[[AB]] +; CHECK: ldr h[[AB]], [x[[PTR]]] +; CHECK: mov x0, x[[PTR]] +; CHECK: bl {{_?}}test_dummy +; CHECK: mov.16b v0, v[[R]] +; CHECK: ret +define half @test_phi(half* %p1) #0 { +entry: + %a = load half, half* %p1 + br label %loop +loop: + %r = phi half [%a, %entry], [%b, %loop] + %b = load half, half* %p1 + %c = call i1 @test_dummy(half* %p1) + br i1 %c, label %loop, label %return +return: + ret half %r +} +declare i1 @test_dummy(half* %p1) #0 + +; CHECK-LABEL: test_fptosi_i32: +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcvtzs w0, s0 +; CHECK-NEXT: ret +define i32 @test_fptosi_i32(half %a) #0 { + %r = fptosi half %a to i32 + ret i32 %r +} + +; CHECK-LABEL: test_fptosi_i64: +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcvtzs x0, s0 +; CHECK-NEXT: ret +define i64 @test_fptosi_i64(half %a) #0 { + %r = fptosi half %a to i64 + ret i64 %r +} + +; CHECK-LABEL: test_fptoui_i32: +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcvtzu w0, s0 +; 
CHECK-NEXT: ret +define i32 @test_fptoui_i32(half %a) #0 { + %r = fptoui half %a to i32 + ret i32 %r +} + +; CHECK-LABEL: test_fptoui_i64: +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcvtzu x0, s0 +; CHECK-NEXT: ret +define i64 @test_fptoui_i64(half %a) #0 { + %r = fptoui half %a to i64 + ret i64 %r +} + +; CHECK-LABEL: test_uitofp_i32: +; CHECK-NEXT: ucvtf s0, w0 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_uitofp_i32(i32 %a) #0 { + %r = uitofp i32 %a to half + ret half %r +} + +; CHECK-LABEL: test_uitofp_i64: +; CHECK-NEXT: ucvtf s0, x0 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_uitofp_i64(i64 %a) #0 { + %r = uitofp i64 %a to half + ret half %r +} + +; CHECK-LABEL: test_sitofp_i32: +; CHECK-NEXT: scvtf s0, w0 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_sitofp_i32(i32 %a) #0 { + %r = sitofp i32 %a to half + ret half %r +} + +; CHECK-LABEL: test_sitofp_i64: +; CHECK-NEXT: scvtf s0, x0 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_sitofp_i64(i64 %a) #0 { + %r = sitofp i64 %a to half + ret half %r +} + +; CHECK-LABEL: test_fptrunc_float: +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret + +define half @test_fptrunc_float(float %a) #0 { + %r = fptrunc float %a to half + ret half %r +} + +; CHECK-LABEL: test_fptrunc_double: +; CHECK-NEXT: fcvt h0, d0 +; CHECK-NEXT: ret +define half @test_fptrunc_double(double %a) #0 { + %r = fptrunc double %a to half + ret half %r +} + +; CHECK-LABEL: test_fpext_float: +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: ret +define float @test_fpext_float(half %a) #0 { + %r = fpext half %a to float + ret float %r +} + +; CHECK-LABEL: test_fpext_double: +; CHECK-NEXT: fcvt d0, h0 +; CHECK-NEXT: ret +define double @test_fpext_double(half %a) #0 { + %r = fpext half %a to double + ret double %r +} + + +; CHECK-LABEL: test_bitcast_halftoi16: +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +define i16 @test_bitcast_halftoi16(half %a) #0 { + %r = bitcast half %a to i16 + ret i16 %r +} + +; CHECK-LABEL: test_bitcast_i16tohalf: +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ret +define half @test_bitcast_i16tohalf(i16 %a) #0 { + %r = bitcast i16 %a to half + ret half %r +} + + +declare half @llvm.sqrt.f16(half %a) #0 +declare half @llvm.powi.f16(half %a, i32 %b) #0 +declare half @llvm.sin.f16(half %a) #0 +declare half @llvm.cos.f16(half %a) #0 +declare half @llvm.pow.f16(half %a, half %b) #0 +declare half @llvm.exp.f16(half %a) #0 +declare half @llvm.exp2.f16(half %a) #0 +declare half @llvm.log.f16(half %a) #0 +declare half @llvm.log10.f16(half %a) #0 +declare half @llvm.log2.f16(half %a) #0 +declare half @llvm.fma.f16(half %a, half %b, half %c) #0 +declare half @llvm.fabs.f16(half %a) #0 +declare half @llvm.minnum.f16(half %a, half %b) #0 +declare half @llvm.maxnum.f16(half %a, half %b) #0 +declare half @llvm.copysign.f16(half %a, half %b) #0 +declare half @llvm.floor.f16(half %a) #0 +declare half @llvm.ceil.f16(half %a) #0 +declare half @llvm.trunc.f16(half %a) #0 +declare half @llvm.rint.f16(half %a) #0 +declare half @llvm.nearbyint.f16(half %a) #0 +declare half @llvm.round.f16(half %a) #0 +declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0 + +; CHECK-LABEL: test_sqrt: +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fsqrt s0, s0 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_sqrt(half %a) #0 { + %r = call half @llvm.sqrt.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_powi: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
+; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl {{_?}}__powisf2 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_powi(half %a, i32 %b) #0 { + %r = call half @llvm.powi.f16(half %a, i32 %b) + ret half %r +} + +; CHECK-LABEL: test_sin: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl {{_?}}sinf +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_sin(half %a) #0 { + %r = call half @llvm.sin.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_cos: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl {{_?}}cosf +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_cos(half %a) #0 { + %r = call half @llvm.cos.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_pow: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: bl {{_?}}powf +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_pow(half %a, half %b) #0 { + %r = call half @llvm.pow.f16(half %a, half %b) + ret half %r +} + +; CHECK-LABEL: test_exp: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl {{_?}}expf +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_exp(half %a) #0 { + %r = call half @llvm.exp.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_exp2: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl {{_?}}exp2f +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_exp2(half %a) #0 { + %r = call half @llvm.exp2.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_log: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl {{_?}}logf +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_log(half %a) #0 { + %r = call half @llvm.log.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_log10: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl {{_?}}log10f +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_log10(half %a) #0 { + %r = call half @llvm.log10.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_log2: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
+; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl {{_?}}log2f +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_log2(half %a) #0 { + %r = call half @llvm.log2.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_fma: +; CHECK-NEXT: fcvt s2, h2 +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fmadd s0, s0, s1, s2 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_fma(half %a, half %b, half %c) #0 { + %r = call half @llvm.fma.f16(half %a, half %b, half %c) + ret half %r +} + +; CHECK-LABEL: test_fabs: +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fabs s0, s0 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_fabs(half %a) #0 { + %r = call half @llvm.fabs.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_minnum: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: bl {{_?}}fminf +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_minnum(half %a, half %b) #0 { + %r = call half @llvm.minnum.f16(half %a, half %b) + ret half %r +} + +; CHECK-LABEL: test_maxnum: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: bl {{_?}}fmaxf +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret +define half @test_maxnum(half %a, half %b) #0 { + %r = call half @llvm.maxnum.f16(half %a, half %b) + ret half %r +} + +; CHECK-LABEL: test_copysign: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: movi.4s v2, #0x80, lsl #24 +; CHECK-NEXT: bit.16b v0, v1, v2 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_copysign(half %a, half %b) #0 { + %r = call half @llvm.copysign.f16(half %a, half %b) + ret half %r +} + +; CHECK-LABEL: test_floor: +; CHECK-NEXT: fcvt s1, h0 +; CHECK-NEXT: frintm s0, s1 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: frintx s1, s1 +; CHECK-NEXT: ret +define half @test_floor(half %a) #0 { + %r = call half @llvm.floor.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_ceil: +; CHECK-NEXT: fcvt s1, h0 +; CHECK-NEXT: frintp s0, s1 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: frintx s1, s1 +; CHECK-NEXT: ret +define half @test_ceil(half %a) #0 { + %r = call half @llvm.ceil.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_trunc: +; CHECK-NEXT: fcvt s1, h0 +; CHECK-NEXT: frintz s0, s1 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: frintx s1, s1 +; CHECK-NEXT: ret +define half @test_trunc(half %a) #0 { + %r = call half @llvm.trunc.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_rint: +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: frintx s0, s0 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_rint(half %a) #0 { + %r = call half @llvm.rint.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_nearbyint: +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: frinti s0, s0 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_nearbyint(half %a) #0 { + %r = call half @llvm.nearbyint.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_round: +; CHECK-NEXT: fcvt s1, h0 +; CHECK-NEXT: frinta s0, s1 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: frintx s1, s1 +; CHECK-NEXT: ret +define half @test_round(half %a) #0 { + %r = call half @llvm.round.f16(half %a) + ret half %r +} + +; CHECK-LABEL: test_fmuladd: +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fmul s0, s0, s1 +; CHECK-NEXT: fcvt 
h0, s0 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcvt s1, h2 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +define half @test_fmuladd(half %a, half %b, half %c) #0 { + %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c) + ret half %r +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/fast-isel-int-ext5.ll b/test/CodeGen/AArch64/fast-isel-int-ext5.ll new file mode 100644 index 0000000..0f9ec62 --- /dev/null +++ b/test/CodeGen/AArch64/fast-isel-int-ext5.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=aarch64-apple-darwin -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: int_ext_opt +define i64 @int_ext_opt(i8* %addr, i1 %c1, i1 %c2) { +entry: + %0 = load i8, i8* %addr + br i1 %c1, label %bb1, label %bb2 + +bb1: + %1 = zext i8 %0 to i64 + br i1 %c2, label %bb2, label %exit + +bb2: + %2 = phi i64 [1, %entry], [%1, %bb1] + ret i64 %2 + +exit: + ret i64 0 +} diff --git a/test/CodeGen/AArch64/fold-constants.ll b/test/CodeGen/AArch64/fold-constants.ll new file mode 100644 index 0000000..2dd0d12 --- /dev/null +++ b/test/CodeGen/AArch64/fold-constants.ll @@ -0,0 +1,21 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s + +define i64 @dotests_616() { +; CHECK-LABEL: dotests_616 +; CHECK: movi d0, #0000000000000000 +; CHECK-NEXT: umov w8, v0.b[2] +; CHECK-NEXT: sbfx w8, w8, #0, #1 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %0 = bitcast <2 x i64> zeroinitializer to <8 x i16> + %1 = and <8 x i16> zeroinitializer, %0 + %2 = icmp ne <8 x i16> %1, zeroinitializer + %3 = extractelement <8 x i1> %2, i32 2 + %vgetq_lane285 = sext i1 %3 to i16 + %vset_lane = insertelement <4 x i16> undef, i16 %vgetq_lane285, i32 0 + %4 = bitcast <4 x i16> %vset_lane to <1 x i64> + %vget_lane = extractelement <1 x i64> %4, i32 0 + ret i64 %vget_lane +} diff --git a/test/CodeGen/AArch64/fp16-instructions.ll b/test/CodeGen/AArch64/fp16-instructions.ll deleted file mode 100644 index ba96694..0000000 --- a/test/CodeGen/AArch64/fp16-instructions.ll +++ /dev/null @@ -1,109 +0,0 @@ -; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s - -define half @add_h(half %a, half %b) { -entry: -; CHECK-LABEL: add_h: -; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0 -; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1 -; CHECK: fadd [[RES:s[0-9]+]], [[OP1]], [[OP2]] -; CHECK: fcvt h0, [[RES]] - %0 = fadd half %a, %b - ret half %0 -} - - -define half @sub_h(half %a, half %b) { -entry: -; CHECK-LABEL: sub_h: -; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0 -; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1 -; CHECK: fsub [[RES:s[0-9]+]], [[OP1]], [[OP2]] -; CHECK: fcvt h0, [[RES]] - %0 = fsub half %a, %b - ret half %0 -} - - -define half @mul_h(half %a, half %b) { -entry: -; CHECK-LABEL: mul_h: -; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0 -; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1 -; CHECK: fmul [[RES:s[0-9]+]], [[OP1]], [[OP2]] -; CHECK: fcvt h0, [[RES]] - %0 = fmul half %a, %b - ret half %0 -} - - -define half @div_h(half %a, half %b) { -entry: -; CHECK-LABEL: div_h: -; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0 -; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1 -; CHECK: fdiv [[RES:s[0-9]+]], [[OP1]], [[OP2]] -; CHECK: fcvt h0, [[RES]] - %0 = fdiv half %a, %b - ret half %0 -} - - -define half @load_h(half* %a) { -entry: -; CHECK-LABEL: load_h: -; CHECK: ldr h0, [x0] - %0 = load half, half* %a, align 4 - ret half %0 -} - - -define void @store_h(half* %a, half %b) { -entry: -; CHECK-LABEL: store_h: -; CHECK: str h0, [x0] - store half %b, half* %a, align 4 - ret void -} - 
-define half @s_to_h(float %a) { -; CHECK-LABEL: s_to_h: -; CHECK: fcvt h0, s0 - %1 = fptrunc float %a to half - ret half %1 -} - -define half @d_to_h(double %a) { -; CHECK-LABEL: d_to_h: -; CHECK: fcvt h0, d0 - %1 = fptrunc double %a to half - ret half %1 -} - -define float @h_to_s(half %a) { -; CHECK-LABEL: h_to_s: -; CHECK: fcvt s0, h0 - %1 = fpext half %a to float - ret float %1 -} - -define double @h_to_d(half %a) { -; CHECK-LABEL: h_to_d: -; CHECK: fcvt d0, h0 - %1 = fpext half %a to double - ret double %1 -} - -define half @bitcast_i_to_h(i16 %a) { -; CHECK-LABEL: bitcast_i_to_h: -; CHECK: fmov s0, w0 - %1 = bitcast i16 %a to half - ret half %1 -} - - -define i16 @bitcast_h_to_i(half %a) { -; CHECK-LABEL: bitcast_h_to_i: -; CHECK: fmov w0, s0 - %1 = bitcast half %a to i16 - ret i16 %1 -} diff --git a/test/CodeGen/AArch64/global-merge-1.ll b/test/CodeGen/AArch64/global-merge-1.ll index b404389..14b0430 100644 --- a/test/CodeGen/AArch64/global-merge-1.ll +++ b/test/CodeGen/AArch64/global-merge-1.ll @@ -1,11 +1,11 @@ -; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS -; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS +; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS +; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS @m = internal global i32 0, align 4 @n = internal global i32 0, align 4 diff --git a/test/CodeGen/AArch64/global-merge-2.ll b/test/CodeGen/AArch64/global-merge-2.ll index d5967b9..af68403 100644 --- a/test/CodeGen/AArch64/global-merge-2.ll +++ b/test/CodeGen/AArch64/global-merge-2.ll @@ -1,6 +1,6 @@ -; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS +; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS @x = global i32 0, align 4 @y = global i32 0, align 4 diff --git a/test/CodeGen/AArch64/global-merge-3.ll 
b/test/CodeGen/AArch64/global-merge-3.ll index 15035c0..9251083 100644 --- a/test/CodeGen/AArch64/global-merge-3.ll +++ b/test/CodeGen/AArch64/global-merge-3.ll @@ -1,6 +1,6 @@ -; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS +; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS @x = global [1000 x i32] zeroinitializer, align 1 @y = global [1000 x i32] zeroinitializer, align 1 diff --git a/test/CodeGen/AArch64/global-merge-4.ll b/test/CodeGen/AArch64/global-merge-4.ll index 8fb7747..bc6b68a 100644 --- a/test/CodeGen/AArch64/global-merge-4.ll +++ b/test/CodeGen/AArch64/global-merge-4.ll @@ -1,4 +1,4 @@ -; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -o - | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" target triple = "arm64-apple-ios7.0.0" diff --git a/test/CodeGen/AArch64/merge-store.ll b/test/CodeGen/AArch64/merge-store.ll new file mode 100644 index 0000000..18dbad4 --- /dev/null +++ b/test/CodeGen/AArch64/merge-store.ll @@ -0,0 +1,20 @@ +; RUN: llc -march aarch64 %s -o - | FileCheck %s + +@g0 = external global <3 x float>, align 16 +@g1 = external global <3 x float>, align 4 + +; CHECK: ldr s[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]]{{\]}}, #4 +; CHECK: ld1{{\.?s?}} { v[[R0]]{{\.?s?}} }[1], {{\[}}[[R1]]{{\]}} +; CHECK: str d[[R0]] + +define void @blam() { + %tmp4 = getelementptr inbounds <3 x float>, <3 x float>* @g1, i64 0, i64 0 + %tmp5 = load <3 x float>, <3 x float>* @g0, align 16 + %tmp6 = extractelement <3 x float> %tmp5, i64 0 + store float %tmp6, float* %tmp4 + %tmp7 = getelementptr inbounds float, float* %tmp4, i64 1 + %tmp8 = load <3 x float>, <3 x float>* @g0, align 16 + %tmp9 = extractelement <3 x float> %tmp8, i64 1 + store float %tmp9, float* %tmp7 + ret void; +} diff --git a/test/CodeGen/AArch64/print-mrs-system-register.ll b/test/CodeGen/AArch64/print-mrs-system-register.ll new file mode 100644 index 0000000..3411ed6 --- /dev/null +++ b/test/CodeGen/AArch64/print-mrs-system-register.ll @@ -0,0 +1,11 @@ +; RUN: llc -mtriple=arm64-apple-darwin %s -o - | FileCheck %s + +; CHECK: mrs x0, CPM_IOACC_CTL_EL3 + +define void @foo1() #0 { +entry: + tail call void asm sideeffect "mrs x0, cpm_ioacc_ctl_el3", ""() + ret void +} + +attributes #0 = { "target-cpu"="cyclone" } diff --git a/test/CodeGen/AArch64/sibling-call.ll b/test/CodeGen/AArch64/sibling-call.ll index 34d45d8..a68fdec 100644 --- a/test/CodeGen/AArch64/sibling-call.ll +++ b/test/CodeGen/AArch64/sibling-call.ll @@ -75,8 +75,8 @@ define void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { ; CHECK: ldr [[VAL0:x[0-9]+]], ; CHECK: ldr [[VAL1:x[0-9]+]], -; CHECK: str [[VAL1]], ; CHECK: str [[VAL0]], +; CHECK: str [[VAL1]], ; CHECK-NOT: add sp, sp, ; CHECK: b callee_stack16 diff --git 
a/test/CodeGen/AArch64/stackmap-liveness.ll b/test/CodeGen/AArch64/stackmap-liveness.ll
new file mode 100644
index 0000000..6b37aac
--- /dev/null
+++ b/test/CodeGen/AArch64/stackmap-liveness.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mtriple=aarch64-apple-darwin | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 1
+; Num LargeConstants
+; CHECK-NEXT: .long 0
+; Num Callsites
+; CHECK-NEXT: .long 1
+
+; Functions and stack size
+; CHECK-NEXT: .quad _stackmap_liveness
+; CHECK-NEXT: .quad 16
+
+; Test that the return register is recognized as a live-out.
+define i64 @stackmap_liveness(i1 %c) {
+; CHECK-LABEL: .long L{{.*}}-_stackmap_liveness
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 0
+; Padding
+; CHECK-NEXT: .short 0
+; Num LiveOut Entries: 2
+; CHECK-NEXT: .short 2
+; LiveOut Entry 0: X0
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; LiveOut Entry 1: SP
+; CHECK-NEXT: .short 31
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; Align
+; CHECK-NEXT: .align 3
+ %1 = select i1 %c, i64 1, i64 2
+ call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 32, i8* null, i32 0)
+ ret i64 %1
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+
diff --git a/test/CodeGen/AArch64/tailcall-explicit-sret.ll b/test/CodeGen/AArch64/tailcall-explicit-sret.ll
new file mode 100644
index 0000000..4d80f2a
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-explicit-sret.ll
@@ -0,0 +1,106 @@
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Check that we don't try to tail-call with a non-forwarded sret parameter.
+declare void @test_explicit_sret(i1024* sret) #0
+
+; This is the only OK case, where we forward the explicit sret pointer.
+
+; CHECK-LABEL: _test_tailcall_explicit_sret:
+; CHECK-NEXT: b _test_explicit_sret
+define void @test_tailcall_explicit_sret(i1024* sret %arg) #0 {
+ tail call void @test_explicit_sret(i1024* %arg)
+ ret void
+}
+
+; CHECK-LABEL: _test_call_explicit_sret:
+; CHECK-NOT: mov x8
+; CHECK: bl _test_explicit_sret
+; CHECK: ret
+define void @test_call_explicit_sret(i1024* sret %arg) #0 {
+ call void @test_explicit_sret(i1024* %arg)
+ ret void
+}
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_alloca_unused:
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK: ret
+define void @test_tailcall_explicit_sret_alloca_unused() #0 {
+ %l = alloca i1024, align 8
+ tail call void @test_explicit_sret(i1024* %l)
+ ret void
+}
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_alloca_dummyusers:
+; CHECK: ldr [[PTRLOAD1:x[0-9]+]], [x0]
+; CHECK: str [[PTRLOAD1]], [sp]
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK: ret
+define void @test_tailcall_explicit_sret_alloca_dummyusers(i1024* %ptr) #0 {
+ %l = alloca i1024, align 8
+ %r = load i1024, i1024* %ptr, align 8
+ store i1024 %r, i1024* %l, align 8
+ tail call void @test_explicit_sret(i1024* %l)
+ ret void
+}
+
+; This is too conservative, but doesn't really happen in practice.
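+; (The sret pointer passed below is a GEP off a caller-provided argument that
+; outlives this frame, so forwarding it in x8 and tail-calling ought to be
+; safe; we currently only forward the incoming sret argument itself.)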
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_gep:
+; CHECK: add x8, x0, #128
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK: ret
+define void @test_tailcall_explicit_sret_gep(i1024* %ptr) #0 {
+  %ptr2 = getelementptr i1024, i1024* %ptr, i32 1
+  tail call void @test_explicit_sret(i1024* %ptr2)
+  ret void
+}
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_alloca_returned:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_tailcall_explicit_sret_alloca_returned() #0 {
+  %l = alloca i1024, align 8
+  tail call void @test_explicit_sret(i1024* %l)
+  %r = load i1024, i1024* %l, align 8
+  ret i1024 %r
+}
+
+; CHECK-LABEL: _test_indirect_tailcall_explicit_sret_nosret_arg:
+; CHECK-DAG: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK-DAG: mov [[FPTR:x[0-9]+]], x0
+; CHECK: mov x0, sp
+; CHECK-NEXT: blr [[FPTR]]
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define void @test_indirect_tailcall_explicit_sret_nosret_arg(i1024* sret %arg, void (i1024*)* %f) #0 {
+  %l = alloca i1024, align 8
+  tail call void %f(i1024* %l)
+  %r = load i1024, i1024* %l, align 8
+  store i1024 %r, i1024* %arg, align 8
+  ret void
+}
+
+; CHECK-LABEL: _test_indirect_tailcall_explicit_sret_:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: blr x0
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define void @test_indirect_tailcall_explicit_sret_(i1024* sret %arg, i1024 ()* %f) #0 {
+  %ret = tail call i1024 %f()
+  store i1024 %ret, i1024* %arg, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/tailcall-implicit-sret.ll b/test/CodeGen/AArch64/tailcall-implicit-sret.ll
new file mode 100644
index 0000000..5d68059
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-implicit-sret.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Check that we don't try to tail-call with an sret-demoted return.
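+; (An i1024 return does not fit in registers, so it is demoted to an implicit
+; sret temporary in the caller's frame; that temporary must stay live across
+; the call, which is why no tail call is emitted in the cases below.)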
+
+declare i1024 @test_sret() #0
+
+; CHECK-LABEL: _test_call_sret:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_sret
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_call_sret() #0 {
+  %a = call i1024 @test_sret()
+  ret i1024 %a
+}
+
+; CHECK-LABEL: _test_tailcall_sret:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_sret
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_tailcall_sret() #0 {
+  %a = tail call i1024 @test_sret()
+  ret i1024 %a
+}
+
+; CHECK-LABEL: _test_indirect_tailcall_sret:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: blr x0
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_indirect_tailcall_sret(i1024 ()* %f) #0 {
+  %a = tail call i1024 %f()
+  ret i1024 %a
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/tailcall-mem-intrinsics.ll b/test/CodeGen/AArch64/tailcall-mem-intrinsics.ll
new file mode 100644
index 0000000..b970fb1
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-mem-intrinsics.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
+
+; CHECK-LABEL: tail_memcpy:
+; CHECK: b memcpy
+define void @tail_memcpy(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: tail_memmove:
+; CHECK: b memmove
+define void @tail_memmove(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+  tail call void @llvm.memmove.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: tail_memset:
+; CHECK: b memset
+define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 {
+entry:
+  tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/vcvt-oversize.ll b/test/CodeGen/AArch64/vcvt-oversize.ll
new file mode 100644
index 0000000..066a4b6
--- /dev/null
+++ b/test/CodeGen/AArch64/vcvt-oversize.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+define <8 x i8> @float_to_i8(<8 x float>* %in) {
+; CHECK-LABEL: float_to_i8:
+; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v0.4s, v0.4s
+; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v1.4s, v1.4s
+; CHECK-DAG: fcvtzu v[[LSB2:[0-9]+]].4s, v[[LSB]].4s
+; CHECK-DAG: fcvtzu v[[MSB2:[0-9]+]].4s, v[[MSB]].4s
+; CHECK-DAG: xtn v[[TMP:[0-9]+]].4h, v[[LSB2]].4s
+; CHECK-DAG: xtn2 v[[TMP]].8h, v[[MSB2]].4s
+; CHECK-DAG: xtn v0.8b, v[[TMP]].8h
+  %l = load <8 x float>, <8 x float>* %in
+  %scale = fmul <8 x float> %l, <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>
+  %conv = fptoui <8 x float> %scale to <8 x i8>
+  ret <8 x i8> %conv
+}
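+; Note: the <8 x float> operand arrives split across q0/q1; each half is
+; doubled (fadd x, x equals 2*x exactly in FP), converted with fcvtzu, and
+; the two <4 x i32> halves are narrowed back together via xtn/xtn2 and a
+; final xtn down to <8 x i8>.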