From bfc2d688b591c574c0cc788348c74545ce894efa Mon Sep 17 00:00:00 2001
From: Stephen Hines
Date: Fri, 17 Oct 2014 08:47:43 -0700
Subject: Bring in fixes for Cortex-A53 errata + build updates.

Bug: 18034609
Change-Id: I2cf0094eb9df801a84274ff29018431d75da89dd
---
 test/CodeGen/AArch64/a57-csel.ll              |  11 +
 .../AArch64/aarch64-a57-fp-load-balancing.ll  | 323 +++++++++++++
 .../AArch64/aarch64-fix-cortex-a53-835769.ll  | 534 +++++++++++++++++++++
 test/CodeGen/AArch64/remat.ll                 |  16 +
 test/CodeGen/X86/critical-anti-dep-breaker.ll |  28 ++
 5 files changed, 912 insertions(+)
 create mode 100644 test/CodeGen/AArch64/a57-csel.ll
 create mode 100644 test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
 create mode 100644 test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
 create mode 100644 test/CodeGen/AArch64/remat.ll
 create mode 100644 test/CodeGen/X86/critical-anti-dep-breaker.ll

diff --git a/test/CodeGen/AArch64/a57-csel.ll b/test/CodeGen/AArch64/a57-csel.ll
new file mode 100644
index 0000000..9d16d1a
--- /dev/null
+++ b/test/CodeGen/AArch64/a57-csel.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -mcpu=cortex-a57 -aarch64-enable-early-ifcvt=false | FileCheck %s
+
+; Check that the select is expanded into a branch sequence.
+define i64 @f(i64 %a, i64 %b, i64* %c, i64 %d, i64 %e) {
+  ; CHECK: cbz
+  %x0 = load i64* %c
+  %x1 = icmp eq i64 %x0, 0
+  %x2 = select i1 %x1, i64 %a, i64 %b
+  %x3 = add i64 %x2, %d
+  ret i64 %x3
+}
diff --git a/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll b/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
new file mode 100644
index 0000000..fb229fc
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
@@ -0,0 +1,323 @@
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
+
+; Test the AArch64A57FPLoadBalancing pass. This pass relies heavily on register allocation, so
+; our test strategy is to:
+;   * Force the pass to always perform register swapping even if the dest register is of the
+;     correct color already (-force-all)
+;   * Force the pass to ignore all hints it obtained from regalloc (-deterministic-balance),
+;     and run it twice, once where it always hints odd, and once where it always hints even.
+;
+; We then use regex magic to check that in the two cases the register allocation is
+; different; this is what gives us the testing coverage and distinguishes cases where
+; the pass has done some work versus accidental regalloc.
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; Non-overlapping groups - shouldn't need any changing at all.
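+; (Illustration, not part of the upstream test: a capture such as
+;   CHECK-EVEN: fmadd [[x:d[0-9]*[02468]]]
+; binds an even-numbered D register to [[x]], while the CHECK-ODD run
+; requires an odd-numbered one for the same instruction; both RUN lines can
+; only pass if the pass really re-colored each accumulator chain.)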
+ +; CHECK-LABEL: f1: +; CHECK-EVEN: fmadd [[x:d[0-9]*[02468]]] +; CHECK-ODD: fmadd [[x:d[0-9]*[13579]]] +; CHECK: fmadd [[x]] +; CHECK: fmsub [[x]] +; CHECK: fmadd [[x]] +; CHECK: str [[x]] + +define void @f1(double* nocapture readonly %p, double* nocapture %q) #0 { +entry: + %0 = load double* %p, align 8 + %arrayidx1 = getelementptr inbounds double* %p, i64 1 + %1 = load double* %arrayidx1, align 8 + %arrayidx2 = getelementptr inbounds double* %p, i64 2 + %2 = load double* %arrayidx2, align 8 + %arrayidx3 = getelementptr inbounds double* %p, i64 3 + %3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %p, i64 4 + %4 = load double* %arrayidx4, align 8 + %mul = fmul fast double %0, %1 + %add = fadd fast double %mul, %4 + %mul5 = fmul fast double %1, %2 + %add6 = fadd fast double %mul5, %add + %mul7 = fmul fast double %1, %3 + %sub = fsub fast double %add6, %mul7 + %mul8 = fmul fast double %2, %3 + %add9 = fadd fast double %mul8, %sub + store double %add9, double* %q, align 8 + %arrayidx11 = getelementptr inbounds double* %p, i64 5 + %5 = load double* %arrayidx11, align 8 + %arrayidx12 = getelementptr inbounds double* %p, i64 6 + %6 = load double* %arrayidx12, align 8 + %arrayidx13 = getelementptr inbounds double* %p, i64 7 + %7 = load double* %arrayidx13, align 8 + %mul15 = fmul fast double %6, %7 + %mul16 = fmul fast double %0, %5 + %add17 = fadd fast double %mul16, %mul15 + %mul18 = fmul fast double %5, %6 + %add19 = fadd fast double %mul18, %add17 + %arrayidx20 = getelementptr inbounds double* %q, i64 1 + store double %add19, double* %arrayidx20, align 8 + ret void +} + +; Overlapping groups - coloring needed. + +; CHECK-LABEL: f2: +; CHECK-EVEN: fmadd [[x:d[0-9]*[02468]]] +; CHECK-EVEN: fmul [[y:d[0-9]*[13579]]] +; CHECK-ODD: fmadd [[x:d[0-9]*[13579]]] +; CHECK-ODD: fmul [[y:d[0-9]*[02468]]] +; CHECK: fmadd [[x]] +; CHECK: fmadd [[y]] +; CHECK: fmsub [[x]] +; CHECK: fmadd [[y]] +; CHECK: fmadd [[x]] +; CHECK: stp [[x]], [[y]] + +define void @f2(double* nocapture readonly %p, double* nocapture %q) #0 { +entry: + %0 = load double* %p, align 8 + %arrayidx1 = getelementptr inbounds double* %p, i64 1 + %1 = load double* %arrayidx1, align 8 + %arrayidx2 = getelementptr inbounds double* %p, i64 2 + %2 = load double* %arrayidx2, align 8 + %arrayidx3 = getelementptr inbounds double* %p, i64 3 + %3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %p, i64 4 + %4 = load double* %arrayidx4, align 8 + %arrayidx5 = getelementptr inbounds double* %p, i64 5 + %5 = load double* %arrayidx5, align 8 + %arrayidx6 = getelementptr inbounds double* %p, i64 6 + %6 = load double* %arrayidx6, align 8 + %arrayidx7 = getelementptr inbounds double* %p, i64 7 + %7 = load double* %arrayidx7, align 8 + %mul = fmul fast double %0, %1 + %add = fadd fast double %mul, %7 + %mul8 = fmul fast double %5, %6 + %mul9 = fmul fast double %1, %2 + %add10 = fadd fast double %mul9, %add + %mul11 = fmul fast double %3, %4 + %add12 = fadd fast double %mul11, %mul8 + %mul13 = fmul fast double %1, %3 + %sub = fsub fast double %add10, %mul13 + %mul14 = fmul fast double %4, %5 + %add15 = fadd fast double %mul14, %add12 + %mul16 = fmul fast double %2, %3 + %add17 = fadd fast double %mul16, %sub + store double %add17, double* %q, align 8 + %arrayidx19 = getelementptr inbounds double* %q, i64 1 + store double %add15, double* %arrayidx19, align 8 + ret void +} + +; Dest register is live on block exit - fixup needed. 
+ +; CHECK-LABEL: f3: +; CHECK-EVEN: fmadd [[x:d[0-9]*[02468]]] +; CHECK-ODD: fmadd [[x:d[0-9]*[13579]]] +; CHECK: fmadd [[x]] +; CHECK: fmsub [[x]] +; CHECK: fmadd [[y:d[0-9]+]], {{.*}}, [[x]] +; CHECK: str [[y]] + +define void @f3(double* nocapture readonly %p, double* nocapture %q) #0 { +entry: + %0 = load double* %p, align 8 + %arrayidx1 = getelementptr inbounds double* %p, i64 1 + %1 = load double* %arrayidx1, align 8 + %arrayidx2 = getelementptr inbounds double* %p, i64 2 + %2 = load double* %arrayidx2, align 8 + %arrayidx3 = getelementptr inbounds double* %p, i64 3 + %3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %p, i64 4 + %4 = load double* %arrayidx4, align 8 + %mul = fmul fast double %0, %1 + %add = fadd fast double %mul, %4 + %mul5 = fmul fast double %1, %2 + %add6 = fadd fast double %mul5, %add + %mul7 = fmul fast double %1, %3 + %sub = fsub fast double %add6, %mul7 + %mul8 = fmul fast double %2, %3 + %add9 = fadd fast double %mul8, %sub + %cmp = fcmp oeq double %3, 0.000000e+00 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void bitcast (void (...)* @g to void ()*)() #2 + br label %if.end + +if.end: ; preds = %if.then, %entry + store double %add9, double* %q, align 8 + ret void +} + +declare void @g(...) #1 + +; Single precision version of f2. + +; CHECK-LABEL: f4: +; CHECK-EVEN: fmadd [[x:s[0-9]*[02468]]] +; CHECK-EVEN: fmul [[y:s[0-9]*[13579]]] +; CHECK-ODD: fmadd [[x:s[0-9]*[13579]]] +; CHECK-ODD: fmul [[y:s[0-9]*[02468]]] +; CHECK: fmadd [[x]] +; CHECK: fmadd [[y]] +; CHECK: fmsub [[x]] +; CHECK: fmadd [[y]] +; CHECK: fmadd [[x]] +; CHECK: stp [[x]], [[y]] + +define void @f4(float* nocapture readonly %p, float* nocapture %q) #0 { +entry: + %0 = load float* %p, align 4 + %arrayidx1 = getelementptr inbounds float* %p, i64 1 + %1 = load float* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds float* %p, i64 2 + %2 = load float* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds float* %p, i64 3 + %3 = load float* %arrayidx3, align 4 + %arrayidx4 = getelementptr inbounds float* %p, i64 4 + %4 = load float* %arrayidx4, align 4 + %arrayidx5 = getelementptr inbounds float* %p, i64 5 + %5 = load float* %arrayidx5, align 4 + %arrayidx6 = getelementptr inbounds float* %p, i64 6 + %6 = load float* %arrayidx6, align 4 + %arrayidx7 = getelementptr inbounds float* %p, i64 7 + %7 = load float* %arrayidx7, align 4 + %mul = fmul fast float %0, %1 + %add = fadd fast float %mul, %7 + %mul8 = fmul fast float %5, %6 + %mul9 = fmul fast float %1, %2 + %add10 = fadd fast float %mul9, %add + %mul11 = fmul fast float %3, %4 + %add12 = fadd fast float %mul11, %mul8 + %mul13 = fmul fast float %1, %3 + %sub = fsub fast float %add10, %mul13 + %mul14 = fmul fast float %4, %5 + %add15 = fadd fast float %mul14, %add12 + %mul16 = fmul fast float %2, %3 + %add17 = fadd fast float %mul16, %sub + store float %add17, float* %q, align 4 + %arrayidx19 = getelementptr inbounds float* %q, i64 1 + store float %add15, float* %arrayidx19, align 4 + ret void +} + +; Single precision version of f3 + +; CHECK-LABEL: f5: +; CHECK-EVEN: fmadd [[x:s[0-9]*[02468]]] +; CHECK-ODD: fmadd [[x:s[0-9]*[13579]]] +; CHECK: fmadd [[x]] +; CHECK: fmsub [[x]] +; CHECK: fmadd [[y:s[0-9]+]], {{.*}}, [[x]] +; CHECK: str [[y]] + +define void @f5(float* nocapture readonly %p, float* nocapture %q) #0 { +entry: + %0 = load float* %p, align 4 + %arrayidx1 = getelementptr inbounds float* %p, i64 1 + %1 = load float* %arrayidx1, align 4 + %arrayidx2 = 
getelementptr inbounds float* %p, i64 2 + %2 = load float* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds float* %p, i64 3 + %3 = load float* %arrayidx3, align 4 + %arrayidx4 = getelementptr inbounds float* %p, i64 4 + %4 = load float* %arrayidx4, align 4 + %mul = fmul fast float %0, %1 + %add = fadd fast float %mul, %4 + %mul5 = fmul fast float %1, %2 + %add6 = fadd fast float %mul5, %add + %mul7 = fmul fast float %1, %3 + %sub = fsub fast float %add6, %mul7 + %mul8 = fmul fast float %2, %3 + %add9 = fadd fast float %mul8, %sub + %cmp = fcmp oeq float %3, 0.000000e+00 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void bitcast (void (...)* @g to void ()*)() #2 + br label %if.end + +if.end: ; preds = %if.then, %entry + store float %add9, float* %q, align 4 + ret void +} + +; Test that regmask clobbering stops a chain sequence. + +; CHECK-LABEL: f6: +; CHECK-EVEN: fmadd [[x:d[0-9]*[02468]]] +; CHECK-ODD: fmadd [[x:d[0-9]*[13579]]] +; CHECK: fmadd [[x]] +; CHECK: fmsub [[x]] +; CHECK: fmadd d0, {{.*}}, [[x]] +; CHECK: bl hh +; CHECK: str d0 + +define void @f6(double* nocapture readonly %p, double* nocapture %q) #0 { +entry: + %0 = load double* %p, align 8 + %arrayidx1 = getelementptr inbounds double* %p, i64 1 + %1 = load double* %arrayidx1, align 8 + %arrayidx2 = getelementptr inbounds double* %p, i64 2 + %2 = load double* %arrayidx2, align 8 + %arrayidx3 = getelementptr inbounds double* %p, i64 3 + %3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %p, i64 4 + %4 = load double* %arrayidx4, align 8 + %mul = fmul fast double %0, %1 + %add = fadd fast double %mul, %4 + %mul5 = fmul fast double %1, %2 + %add6 = fadd fast double %mul5, %add + %mul7 = fmul fast double %1, %3 + %sub = fsub fast double %add6, %mul7 + %mul8 = fmul fast double %2, %3 + %add9 = fadd fast double %mul8, %sub + %call = tail call double @hh(double %add9) #2 + store double %call, double* %q, align 8 + ret void +} + +declare double @hh(double) #1 + +; Check that we correctly deal with repeated operands. +; The following testcase creates: +; %D1 = FADDDrr %D0, %D0 +; We'll get a crash if we naively look at the first operand, remove it +; from the substitution list then look at the second operand. 
+
+; CHECK: fmadd [[x:d[0-9]+]]
+; CHECK: fadd d1, [[x]], [[x]]
+
+define void @f7(double* nocapture readonly %p, double* nocapture %q) #0 {
+entry:
+  %0 = load double* %p, align 8
+  %arrayidx1 = getelementptr inbounds double* %p, i64 1
+  %1 = load double* %arrayidx1, align 8
+  %arrayidx2 = getelementptr inbounds double* %p, i64 2
+  %2 = load double* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds double* %p, i64 3
+  %3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %p, i64 4
+  %4 = load double* %arrayidx4, align 8
+  %mul = fmul fast double %0, %1
+  %add = fadd fast double %mul, %4
+  %mul5 = fmul fast double %1, %2
+  %add6 = fadd fast double %mul5, %add
+  %mul7 = fmul fast double %1, %3
+  %sub = fsub fast double %add6, %mul7
+  %mul8 = fmul fast double %2, %3
+  %add9 = fadd fast double %mul8, %sub
+  %add10 = fadd fast double %add9, %add9
+  call void @hhh(double 0.0, double %add10)
+  ret void
+}
+
+declare void @hhh(double, double)
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
diff --git a/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll b/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
new file mode 100644
index 0000000..64d91ee
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
@@ -0,0 +1,534 @@
+; REQUIRES: asserts
+; The regression tests need to check the order of emitted instructions, and
+; are therefore fragile and reliant on instruction scheduling. The test cases
+; have been minimized as much as possible, but most of them could still break
+; if the instruction scheduling heuristics for cortex-a53 change.
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-fix-cortex-a53-835769=1 -stats 2>&1 \
+; RUN:   | FileCheck %s --check-prefix CHECK
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-fix-cortex-a53-835769=0 -stats 2>&1 \
+; RUN:   | FileCheck %s --check-prefix CHECK-NOWORKAROUND
+; The following run lines are just to verify whether or not this pass runs by
+; default for the given CPUs. Given the fragility of the tests, this is only
+; run on a test case where the scheduler has no freedom at all to reschedule
+; the instructions, so the potentially massively different scheduling
+; heuristics will not break the test case.
+; RUN: llc < %s -mcpu=generic | FileCheck %s --check-prefix CHECK-BASIC-PASS-DISABLED +; RUN: llc < %s -mcpu=cortex-a53 | FileCheck %s --check-prefix CHECK-BASIC-PASS-DISABLED +; RUN: llc < %s -mcpu=cortex-a57 | FileCheck %s --check-prefix CHECK-BASIC-PASS-DISABLED +; RUN: llc < %s -mcpu=cyclone | FileCheck %s --check-prefix CHECK-BASIC-PASS-DISABLED + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +define i64 @f_load_madd_64(i64 %a, i64 %b, i64* nocapture readonly %c) #0 { +entry: + %0 = load i64* %c, align 8 + %mul = mul nsw i64 %0, %b + %add = add nsw i64 %mul, %a + ret i64 %add +} +; CHECK-LABEL: f_load_madd_64: +; CHECK: ldr +; CHECK-NEXT: nop +; CHECK-NEXT: madd +; CHECK-NOWORKAROUND-LABEL: f_load_madd_64: +; CHECK-NOWORKAROUND: ldr +; CHECK-NOWORKAROUND-NEXT: madd +; CHECK-BASIC-PASS-DISABLED-LABEL: f_load_madd_64: +; CHECK-BASIC-PASS-DISABLED: ldr +; CHECK-BASIC-PASS-DISABLED-NEXT: madd + + +define i32 @f_load_madd_32(i32 %a, i32 %b, i32* nocapture readonly %c) #0 { +entry: + %0 = load i32* %c, align 4 + %mul = mul nsw i32 %0, %b + %add = add nsw i32 %mul, %a + ret i32 %add +} +; CHECK-LABEL: f_load_madd_32: +; CHECK: ldr +; CHECK-NEXT: madd +; CHECK-NOWORKAROUND-LABEL: f_load_madd_32: +; CHECK-NOWORKAROUND: ldr +; CHECK-NOWORKAROUND-NEXT: madd + + +define i64 @f_load_msub_64(i64 %a, i64 %b, i64* nocapture readonly %c) #0 { +entry: + %0 = load i64* %c, align 8 + %mul = mul nsw i64 %0, %b + %sub = sub nsw i64 %a, %mul + ret i64 %sub +} +; CHECK-LABEL: f_load_msub_64: +; CHECK: ldr +; CHECK-NEXT: nop +; CHECK-NEXT: msub +; CHECK-NOWORKAROUND-LABEL: f_load_msub_64: +; CHECK-NOWORKAROUND: ldr +; CHECK-NOWORKAROUND-NEXT: msub + + +define i32 @f_load_msub_32(i32 %a, i32 %b, i32* nocapture readonly %c) #0 { +entry: + %0 = load i32* %c, align 4 + %mul = mul nsw i32 %0, %b + %sub = sub nsw i32 %a, %mul + ret i32 %sub +} +; CHECK-LABEL: f_load_msub_32: +; CHECK: ldr +; CHECK-NEXT: msub +; CHECK-NOWORKAROUND-LABEL: f_load_msub_32: +; CHECK-NOWORKAROUND: ldr +; CHECK-NOWORKAROUND-NEXT: msub + + +define i64 @f_load_mul_64(i64 %a, i64 %b, i64* nocapture readonly %c) #0 { +entry: + %0 = load i64* %c, align 8 + %mul = mul nsw i64 %0, %b + ret i64 %mul +} +; CHECK-LABEL: f_load_mul_64: +; CHECK: ldr +; CHECK-NEXT: mul +; CHECK-NOWORKAROUND-LABEL: f_load_mul_64: +; CHECK-NOWORKAROUND: ldr +; CHECK-NOWORKAROUND-NEXT: mul + + +define i32 @f_load_mul_32(i32 %a, i32 %b, i32* nocapture readonly %c) #0 { +entry: + %0 = load i32* %c, align 4 + %mul = mul nsw i32 %0, %b + ret i32 %mul +} +; CHECK-LABEL: f_load_mul_32: +; CHECK: ldr +; CHECK-NEXT: mul +; CHECK-NOWORKAROUND-LABEL: f_load_mul_32: +; CHECK-NOWORKAROUND: ldr +; CHECK-NOWORKAROUND-NEXT: mul + + +define i64 @f_load_mneg_64(i64 %a, i64 %b, i64* nocapture readonly %c) #0 { +entry: + %0 = load i64* %c, align 8 + %mul = sub i64 0, %b + %sub = mul i64 %0, %mul + ret i64 %sub +} +; CHECK-LABEL: f_load_mneg_64: +; CHECK-NOWORKAROUND-LABEL: f_load_mneg_64: +; FIXME: only add further checks here once LLVM actually produces +; neg instructions +; FIXME-CHECK: ldr +; FIXME-CHECK-NEXT: nop +; FIXME-CHECK-NEXT: mneg +; FIXME-CHECK-NOWORKAROUND: ldr +; FIXME-CHECK-NOWORKAROUND-NEXT: mneg + + +define i32 @f_load_mneg_32(i32 %a, i32 %b, i32* nocapture readonly %c) #0 { +entry: + %0 = load i32* %c, align 4 + %mul = sub i32 0, %b + %sub = mul i32 %0, %mul + ret i32 %sub +} +; CHECK-LABEL: f_load_mneg_32: +; CHECK-NOWORKAROUND-LABEL: f_load_mneg_32: +; FIXME: only add further checks here once LLVM actually produces 
+; neg instructions +; FIXME-CHECK: ldr +; FIXME-CHECK-NEXT: mneg +; FIXME-CHECK-NOWORKAROUND: ldr +; FIXME-CHECK-NOWORKAROUND-NEXT: mneg + + +define i64 @f_load_smaddl(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 { +entry: + %conv = sext i32 %b to i64 + %conv1 = sext i32 %c to i64 + %mul = mul nsw i64 %conv1, %conv + %add = add nsw i64 %mul, %a + %0 = load i32* %d, align 4 + %conv2 = sext i32 %0 to i64 + %add3 = add nsw i64 %add, %conv2 + ret i64 %add3 +} +; CHECK-LABEL: f_load_smaddl: +; CHECK: ldrsw +; CHECK-NEXT: nop +; CHECK-NEXT: smaddl +; CHECK-NOWORKAROUND-LABEL: f_load_smaddl: +; CHECK-NOWORKAROUND: ldrsw +; CHECK-NOWORKAROUND-NEXT: smaddl + + +define i64 @f_load_smsubl_64(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 { +entry: + %conv = sext i32 %b to i64 + %conv1 = sext i32 %c to i64 + %mul = mul nsw i64 %conv1, %conv + %sub = sub i64 %a, %mul + %0 = load i32* %d, align 4 + %conv2 = sext i32 %0 to i64 + %add = add nsw i64 %sub, %conv2 + ret i64 %add +} +; CHECK-LABEL: f_load_smsubl_64: +; CHECK: ldrsw +; CHECK-NEXT: nop +; CHECK-NEXT: smsubl +; CHECK-NOWORKAROUND-LABEL: f_load_smsubl_64: +; CHECK-NOWORKAROUND: ldrsw +; CHECK-NOWORKAROUND-NEXT: smsubl + + +define i64 @f_load_smull(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 { +entry: + %conv = sext i32 %b to i64 + %conv1 = sext i32 %c to i64 + %mul = mul nsw i64 %conv1, %conv + %0 = load i32* %d, align 4 + %conv2 = sext i32 %0 to i64 + %div = sdiv i64 %mul, %conv2 + ret i64 %div +} +; CHECK-LABEL: f_load_smull: +; CHECK: ldrsw +; CHECK-NEXT: smull +; CHECK-NOWORKAROUND-LABEL: f_load_smull: +; CHECK-NOWORKAROUND: ldrsw +; CHECK-NOWORKAROUND-NEXT: smull + + +define i64 @f_load_smnegl_64(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 { +entry: + %conv = sext i32 %b to i64 + %conv1 = sext i32 %c to i64 + %mul = sub nsw i64 0, %conv + %sub = mul i64 %conv1, %mul + %0 = load i32* %d, align 4 + %conv2 = sext i32 %0 to i64 + %div = sdiv i64 %sub, %conv2 + ret i64 %div +} +; CHECK-LABEL: f_load_smnegl_64: +; CHECK-NOWORKAROUND-LABEL: f_load_smnegl_64: +; FIXME: only add further checks here once LLVM actually produces +; smnegl instructions + + +define i64 @f_load_umaddl(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 { +entry: + %conv = zext i32 %b to i64 + %conv1 = zext i32 %c to i64 + %mul = mul i64 %conv1, %conv + %add = add i64 %mul, %a + %0 = load i32* %d, align 4 + %conv2 = zext i32 %0 to i64 + %add3 = add i64 %add, %conv2 + ret i64 %add3 +} +; CHECK-LABEL: f_load_umaddl: +; CHECK: ldr +; CHECK-NEXT: nop +; CHECK-NEXT: umaddl +; CHECK-NOWORKAROUND-LABEL: f_load_umaddl: +; CHECK-NOWORKAROUND: ldr +; CHECK-NOWORKAROUND-NEXT: umaddl + + +define i64 @f_load_umsubl_64(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 { +entry: + %conv = zext i32 %b to i64 + %conv1 = zext i32 %c to i64 + %mul = mul i64 %conv1, %conv + %sub = sub i64 %a, %mul + %0 = load i32* %d, align 4 + %conv2 = zext i32 %0 to i64 + %add = add i64 %sub, %conv2 + ret i64 %add +} +; CHECK-LABEL: f_load_umsubl_64: +; CHECK: ldr +; CHECK-NEXT: nop +; CHECK-NEXT: umsubl +; CHECK-NOWORKAROUND-LABEL: f_load_umsubl_64: +; CHECK-NOWORKAROUND: ldr +; CHECK-NOWORKAROUND-NEXT: umsubl + + +define i64 @f_load_umull(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 { +entry: + %conv = zext i32 %b to i64 + %conv1 = zext i32 %c to i64 + %mul = mul i64 %conv1, %conv + %0 = load i32* %d, align 4 + %conv2 = zext i32 %0 to i64 + %div = udiv i64 %mul, %conv2 + ret i64 %div +} +; CHECK-LABEL: f_load_umull: +; CHECK: ldr +; CHECK-NEXT: 
umull +; CHECK-NOWORKAROUND-LABEL: f_load_umull: +; CHECK-NOWORKAROUND: ldr +; CHECK-NOWORKAROUND-NEXT: umull + + +define i64 @f_load_umnegl_64(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 { +entry: + %conv = zext i32 %b to i64 + %conv1 = zext i32 %c to i64 + %mul = sub nsw i64 0, %conv + %sub = mul i64 %conv1, %mul + %0 = load i32* %d, align 4 + %conv2 = zext i32 %0 to i64 + %div = udiv i64 %sub, %conv2 + ret i64 %div +} +; CHECK-LABEL: f_load_umnegl_64: +; CHECK-NOWORKAROUND-LABEL: f_load_umnegl_64: +; FIXME: only add further checks here once LLVM actually produces +; umnegl instructions + + +define i64 @f_store_madd_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 { +entry: + %0 = load i64* %cp, align 8 + store i64 %a, i64* %e, align 8 + %mul = mul nsw i64 %0, %b + %add = add nsw i64 %mul, %a + ret i64 %add +} +; CHECK-LABEL: f_store_madd_64: +; CHECK: str +; CHECK-NEXT: nop +; CHECK-NEXT: madd +; CHECK-NOWORKAROUND-LABEL: f_store_madd_64: +; CHECK-NOWORKAROUND: str +; CHECK-NOWORKAROUND-NEXT: madd + + +define i32 @f_store_madd_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 { +entry: + %0 = load i32* %cp, align 4 + store i32 %a, i32* %e, align 4 + %mul = mul nsw i32 %0, %b + %add = add nsw i32 %mul, %a + ret i32 %add +} +; CHECK-LABEL: f_store_madd_32: +; CHECK: str +; CHECK-NEXT: madd +; CHECK-NOWORKAROUND-LABEL: f_store_madd_32: +; CHECK-NOWORKAROUND: str +; CHECK-NOWORKAROUND-NEXT: madd + + +define i64 @f_store_msub_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 { +entry: + %0 = load i64* %cp, align 8 + store i64 %a, i64* %e, align 8 + %mul = mul nsw i64 %0, %b + %sub = sub nsw i64 %a, %mul + ret i64 %sub +} +; CHECK-LABEL: f_store_msub_64: +; CHECK: str +; CHECK-NEXT: nop +; CHECK-NEXT: msub +; CHECK-NOWORKAROUND-LABEL: f_store_msub_64: +; CHECK-NOWORKAROUND: str +; CHECK-NOWORKAROUND-NEXT: msub + + +define i32 @f_store_msub_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 { +entry: + %0 = load i32* %cp, align 4 + store i32 %a, i32* %e, align 4 + %mul = mul nsw i32 %0, %b + %sub = sub nsw i32 %a, %mul + ret i32 %sub +} +; CHECK-LABEL: f_store_msub_32: +; CHECK: str +; CHECK-NEXT: msub +; CHECK-NOWORKAROUND-LABEL: f_store_msub_32: +; CHECK-NOWORKAROUND: str +; CHECK-NOWORKAROUND-NEXT: msub + + +define i64 @f_store_mul_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 { +entry: + %0 = load i64* %cp, align 8 + store i64 %a, i64* %e, align 8 + %mul = mul nsw i64 %0, %b + ret i64 %mul +} +; CHECK-LABEL: f_store_mul_64: +; CHECK: str +; CHECK-NEXT: mul +; CHECK-NOWORKAROUND-LABEL: f_store_mul_64: +; CHECK-NOWORKAROUND: str +; CHECK-NOWORKAROUND-NEXT: mul + + +define i32 @f_store_mul_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 { +entry: + %0 = load i32* %cp, align 4 + store i32 %a, i32* %e, align 4 + %mul = mul nsw i32 %0, %b + ret i32 %mul +} +; CHECK-LABEL: f_store_mul_32: +; CHECK: str +; CHECK-NEXT: mul +; CHECK-NOWORKAROUND-LABEL: f_store_mul_32: +; CHECK-NOWORKAROUND: str +; CHECK-NOWORKAROUND-NEXT: mul + + +define i64 @f_prefetch_madd_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 { +entry: + %0 = load i64* %cp, align 8 + %1 = bitcast i64* %e to i8* + tail call void @llvm.prefetch(i8* %1, i32 0, i32 0, i32 1) + %mul = mul nsw i64 %0, %b + %add = add nsw i64 %mul, %a + ret i64 %add +} +; CHECK-LABEL: f_prefetch_madd_64: +; CHECK: prfm +; CHECK-NEXT: nop +; CHECK-NEXT: madd +; CHECK-NOWORKAROUND-LABEL: 
f_prefetch_madd_64: +; CHECK-NOWORKAROUND: prfm +; CHECK-NOWORKAROUND-NEXT: madd + +declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) #2 + +define i32 @f_prefetch_madd_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 { +entry: + %0 = load i32* %cp, align 4 + %1 = bitcast i32* %e to i8* + tail call void @llvm.prefetch(i8* %1, i32 1, i32 0, i32 1) + %mul = mul nsw i32 %0, %b + %add = add nsw i32 %mul, %a + ret i32 %add +} +; CHECK-LABEL: f_prefetch_madd_32: +; CHECK: prfm +; CHECK-NEXT: madd +; CHECK-NOWORKAROUND-LABEL: f_prefetch_madd_32: +; CHECK-NOWORKAROUND: prfm +; CHECK-NOWORKAROUND-NEXT: madd + +define i64 @f_prefetch_msub_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 { +entry: + %0 = load i64* %cp, align 8 + %1 = bitcast i64* %e to i8* + tail call void @llvm.prefetch(i8* %1, i32 0, i32 1, i32 1) + %mul = mul nsw i64 %0, %b + %sub = sub nsw i64 %a, %mul + ret i64 %sub +} +; CHECK-LABEL: f_prefetch_msub_64: +; CHECK: prfm +; CHECK-NEXT: nop +; CHECK-NEXT: msub +; CHECK-NOWORKAROUND-LABEL: f_prefetch_msub_64: +; CHECK-NOWORKAROUND: prfm +; CHECK-NOWORKAROUND-NEXT: msub + +define i32 @f_prefetch_msub_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 { +entry: + %0 = load i32* %cp, align 4 + %1 = bitcast i32* %e to i8* + tail call void @llvm.prefetch(i8* %1, i32 1, i32 1, i32 1) + %mul = mul nsw i32 %0, %b + %sub = sub nsw i32 %a, %mul + ret i32 %sub +} +; CHECK-LABEL: f_prefetch_msub_32: +; CHECK: prfm +; CHECK-NEXT: msub +; CHECK-NOWORKAROUND-LABEL: f_prefetch_msub_32: +; CHECK-NOWORKAROUND: prfm +; CHECK-NOWORKAROUND-NEXT: msub + +define i64 @f_prefetch_mul_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 { +entry: + %0 = load i64* %cp, align 8 + %1 = bitcast i64* %e to i8* + tail call void @llvm.prefetch(i8* %1, i32 0, i32 3, i32 1) + %mul = mul nsw i64 %0, %b + ret i64 %mul +} +; CHECK-LABEL: f_prefetch_mul_64: +; CHECK: prfm +; CHECK-NEXT: mul +; CHECK-NOWORKAROUND-LABEL: f_prefetch_mul_64: +; CHECK-NOWORKAROUND: prfm +; CHECK-NOWORKAROUND-NEXT: mul + +define i32 @f_prefetch_mul_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 { +entry: + %0 = load i32* %cp, align 4 + %1 = bitcast i32* %e to i8* + tail call void @llvm.prefetch(i8* %1, i32 1, i32 3, i32 1) + %mul = mul nsw i32 %0, %b + ret i32 %mul +} +; CHECK-LABEL: f_prefetch_mul_32: +; CHECK: prfm +; CHECK-NEXT: mul +; CHECK-NOWORKAROUND-LABEL: f_prefetch_mul_32: +; CHECK-NOWORKAROUND: prfm +; CHECK-NOWORKAROUND-NEXT: mul + +define i64 @fall_through(i64 %a, i64 %b, i64* nocapture readonly %c) #0 { +entry: + %0 = load i64* %c, align 8 + br label %block1 + +block1: + %mul = mul nsw i64 %0, %b + %add = add nsw i64 %mul, %a + %tmp = ptrtoint i8* blockaddress(@fall_through, %block1) to i64 + %ret = add nsw i64 %tmp, %add + ret i64 %ret +} +; CHECK-LABEL: fall_through +; CHECK: ldr +; CHECK-NEXT: nop +; CHECK-NEXT: .Ltmp +; CHECK-NEXT: BB +; CHECK-NEXT: madd +; CHECK-NOWORKAROUND-LABEL: fall_through +; CHECK-NOWORKAROUND: ldr +; CHECK-NOWORKAROUND-NEXT: .Ltmp +; CHECK-NOWORKAROUND-NEXT: BB +; CHECK-NOWORKAROUND-NEXT: madd + +; No checks for this, just check it doesn't crash +define i32 @crash_check(i8** nocapture readnone %data) #0 { +entry: + br label %while.cond + +while.cond: + br label %while.cond +} + +attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" 
"unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + + +; CHECK-LABEL: ... Statistics Collected ... +; CHECK: 11 aarch64-fix-cortex-a53-835769 - Number of Nops added to work around erratum 835769 diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll new file mode 100644 index 0000000..32b3ed2 --- /dev/null +++ b/test/CodeGen/AArch64/remat.ll @@ -0,0 +1,16 @@ +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a57 -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a53 -o - %s | FileCheck %s + +%X = type { i64, i64, i64 } +declare void @f(%X*) +define void @t() { +entry: + %tmp = alloca %X + call void @f(%X* %tmp) +; CHECK: add x0, sp, #8 +; CHECK-NEXT-NOT: mov + call void @f(%X* %tmp) +; CHECK: add x0, sp, #8 +; CHECK-NEXT-NOT: mov + ret void +} diff --git a/test/CodeGen/X86/critical-anti-dep-breaker.ll b/test/CodeGen/X86/critical-anti-dep-breaker.ll new file mode 100644 index 0000000..32d3f49 --- /dev/null +++ b/test/CodeGen/X86/critical-anti-dep-breaker.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic -post-RA-scheduler=1 -break-anti-dependencies=critical | FileCheck %s + +; PR20308 ( http://llvm.org/bugs/show_bug.cgi?id=20308 ) +; The critical-anti-dependency-breaker must not use register def information from a kill inst. +; This test case expects such an instruction to appear as a comment with def info for RDI. +; There is an anti-dependency (WAR) hazard using RAX using default reg allocation and scheduling. +; The post-RA-scheduler and critical-anti-dependency breaker can eliminate that hazard using R10. +; That is the first free register that isn't used as a param in the call to "@Image". + +@PartClass = external global i32 +@NullToken = external global i64 + +; CHECK-LABEL: Part_Create: +; CHECK-DAG: # kill: RDI +; CHECK-DAG: movq PartClass@GOTPCREL(%rip), %r10 +define i32 @Part_Create(i64* %Anchor, i32 %TypeNum, i32 %F, i32 %Z, i32* %Status, i64* %PartTkn) { + %PartObj = alloca i64*, align 8 + %Vchunk = alloca i64, align 8 + %1 = load i64* @NullToken, align 4 + store i64 %1, i64* %Vchunk, align 8 + %2 = load i32* @PartClass, align 4 + call i32 @Image(i64* %Anchor, i32 %2, i32 0, i32 0, i32* %Status, i64* %PartTkn, i64** %PartObj) + call i32 @Create(i64* %Anchor) + ret i32 %2 +} + +declare i32 @Image(i64*, i32, i32, i32, i32*, i64*, i64**) +declare i32 @Create(i64*) -- cgit v1.1