From de5e5ec3045a73a06b1054417f9ac6c02929e9ce Mon Sep 17 00:00:00 2001 From: Hal Finkel Date: Wed, 1 Feb 2012 03:51:43 +0000 Subject: Add a basic-block autovectorization pass. This is the initial checkin of the basic-block autovectorization pass along with some supporting vectorization infrastructure. Special thanks to everyone who helped review this code over the last several months (especially Tobias Grosser). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149468 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Transforms/BBVectorize/cycle.ll | 112 ++++++++++++++++++++ test/Transforms/BBVectorize/dg.exp | 3 + test/Transforms/BBVectorize/ld1.ll | 41 ++++++++ test/Transforms/BBVectorize/loop1.ll | 93 +++++++++++++++++ test/Transforms/BBVectorize/req-depth.ll | 17 ++++ test/Transforms/BBVectorize/search-limit.ll | 46 +++++++++ test/Transforms/BBVectorize/simple-int.ll | 59 +++++++++++ test/Transforms/BBVectorize/simple-ldstr.ll | 110 ++++++++++++++++++++ test/Transforms/BBVectorize/simple.ll | 152 ++++++++++++++++++++++++++++ 9 files changed, 633 insertions(+) create mode 100644 test/Transforms/BBVectorize/cycle.ll create mode 100644 test/Transforms/BBVectorize/dg.exp create mode 100644 test/Transforms/BBVectorize/ld1.ll create mode 100644 test/Transforms/BBVectorize/loop1.ll create mode 100644 test/Transforms/BBVectorize/req-depth.ll create mode 100644 test/Transforms/BBVectorize/search-limit.ll create mode 100644 test/Transforms/BBVectorize/simple-int.ll create mode 100644 test/Transforms/BBVectorize/simple-ldstr.ll create mode 100644 test/Transforms/BBVectorize/simple.ll (limited to 'test') diff --git a/test/Transforms/BBVectorize/cycle.ll b/test/Transforms/BBVectorize/cycle.ll new file mode 100644 index 0000000..32a91ce --- /dev/null +++ b/test/Transforms/BBVectorize/cycle.ll @@ -0,0 +1,112 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt 
< %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s + +; This test checks the non-trivial pairing-induced cycle avoidance. Without this cycle avoidance, the algorithm would otherwise +; want to select the pairs: +; %div77 = fdiv double %sub74, %mul76.v.r1 <-> %div125 = fdiv double %mul121, %mul76.v.r2 (div125 depends on mul117) +; %add84 = fadd double %sub83, 2.000000e+00 <-> %add127 = fadd double %mul126, 1.000000e+00 (add127 depends on div77) +; %mul95 = fmul double %sub45.v.r1, %sub36.v.r1 <-> %mul88 = fmul double %sub36.v.r1, %sub87 (mul88 depends on add84) +; %mul117 = fmul double %sub39.v.r1, %sub116 <-> %mul97 = fmul double %mul96, %sub39.v.r1 (mul97 depends on mul95) +; and so a dependency cycle would be created. + +declare double @fabs(double) nounwind readnone +define void @test1(double %a, double %b, double %c, double %add80, double %mul1, double %mul2.v.r1, double %mul73, double %sub, double %sub65, double %F.0, i32 %n.0, double %Bnm3.0, double %Bnm2.0, double %Bnm1.0, double %Anm3.0, double %Anm2.0, double %Anm1.0) { +entry: + br label %go +go: + %conv = sitofp i32 %n.0 to double + %add35 = fadd double %conv, %a + %sub36 = fadd double %add35, -1.000000e+00 + %add38 = fadd double %conv, %b + %sub39 = fadd double %add38, -1.000000e+00 + %add41 = fadd double %conv, %c + %sub42 = fadd double %add41, -1.000000e+00 + %sub45 = fadd double %add35, -2.000000e+00 + %sub48 = fadd double %add38, -2.000000e+00 + %sub51 = fadd double %add41, -2.000000e+00 + %mul52 = shl nsw i32 %n.0, 1 + %sub53 = add nsw i32 %mul52, -1 + %conv54 = sitofp i32 %sub53 to double + %sub56 = add nsw i32 %mul52, -3 + %conv57 = sitofp i32 %sub56 to double + %sub59 = add nsw i32 %mul52, -5 + %conv60 = sitofp i32 %sub59 to double + %mul61 = mul nsw i32 %n.0, %n.0 + %conv62 = sitofp i32 %mul61 to double + %mul63 = fmul double %conv62, 3.000000e+00 + %mul67 = fmul double %sub65, %conv + %add68 = fadd double %mul63, %mul67 + %add69 = fadd double %add68, 
2.000000e+00 + %sub71 = fsub double %add69, %mul2.v.r1 + %sub74 = fsub double %sub71, %mul73 + %mul75 = fmul double %conv57, 2.000000e+00 + %mul76 = fmul double %mul75, %sub42 + %div77 = fdiv double %sub74, %mul76 + %mul82 = fmul double %add80, %conv + %sub83 = fsub double %mul63, %mul82 + %add84 = fadd double %sub83, 2.000000e+00 + %sub86 = fsub double %add84, %mul2.v.r1 + %sub87 = fsub double -0.000000e+00, %sub86 + %mul88 = fmul double %sub36, %sub87 + %mul89 = fmul double %mul88, %sub39 + %mul90 = fmul double %conv54, 4.000000e+00 + %mul91 = fmul double %mul90, %conv57 + %mul92 = fmul double %mul91, %sub51 + %mul93 = fmul double %mul92, %sub42 + %div94 = fdiv double %mul89, %mul93 + %mul95 = fmul double %sub45, %sub36 + %mul96 = fmul double %mul95, %sub48 + %mul97 = fmul double %mul96, %sub39 + %sub99 = fsub double %conv, %a + %sub100 = fadd double %sub99, -2.000000e+00 + %mul101 = fmul double %mul97, %sub100 + %sub103 = fsub double %conv, %b + %sub104 = fadd double %sub103, -2.000000e+00 + %mul105 = fmul double %mul101, %sub104 + %mul106 = fmul double %conv57, 8.000000e+00 + %mul107 = fmul double %mul106, %conv57 + %mul108 = fmul double %mul107, %conv60 + %sub111 = fadd double %add41, -3.000000e+00 + %mul112 = fmul double %mul108, %sub111 + %mul113 = fmul double %mul112, %sub51 + %mul114 = fmul double %mul113, %sub42 + %div115 = fdiv double %mul105, %mul114 + %sub116 = fsub double -0.000000e+00, %sub36 + %mul117 = fmul double %sub39, %sub116 + %sub119 = fsub double %conv, %c + %sub120 = fadd double %sub119, -1.000000e+00 + %mul121 = fmul double %mul117, %sub120 + %mul123 = fmul double %mul75, %sub51 + %mul124 = fmul double %mul123, %sub42 + %div125 = fdiv double %mul121, %mul124 + %mul126 = fmul double %div77, %sub + %add127 = fadd double %mul126, 1.000000e+00 + %mul128 = fmul double %add127, %Anm1.0 + %mul129 = fmul double %div94, %sub + %add130 = fadd double %div125, %mul129 + %mul131 = fmul double %add130, %sub + %mul132 = fmul double %mul131, %Anm2.0 + 
%add133 = fadd double %mul128, %mul132 + %mul134 = fmul double %div115, %mul1 + %mul135 = fmul double %mul134, %Anm3.0 + %add136 = fadd double %add133, %mul135 + %mul139 = fmul double %add127, %Bnm1.0 + %mul143 = fmul double %mul131, %Bnm2.0 + %add144 = fadd double %mul139, %mul143 + %mul146 = fmul double %mul134, %Bnm3.0 + %add147 = fadd double %add144, %mul146 + %div148 = fdiv double %add136, %add147 + %sub149 = fsub double %F.0, %div148 + %div150 = fdiv double %sub149, %F.0 + %call = tail call double @fabs(double %div150) nounwind readnone + %cmp = fcmp olt double %call, 0x3CB0000000000000 + %cmp152 = icmp sgt i32 %n.0, 20000 + %or.cond = or i1 %cmp, %cmp152 + br i1 %or.cond, label %done, label %go +done: + ret void +; CHECK: @test1 +; CHECK: go: +; CHECK-NEXT: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0 +; FIXME: When tree pruning is deterministic, include the entire output. +} diff --git a/test/Transforms/BBVectorize/dg.exp b/test/Transforms/BBVectorize/dg.exp new file mode 100644 index 0000000..f200589 --- /dev/null +++ b/test/Transforms/BBVectorize/dg.exp @@ -0,0 +1,3 @@ +load_lib llvm.exp + +RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp}]] diff --git a/test/Transforms/BBVectorize/ld1.ll b/test/Transforms/BBVectorize/ld1.ll new file mode 100644 index 0000000..cea225d --- /dev/null +++ b/test/Transforms/BBVectorize/ld1.ll @@ -0,0 +1,41 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s + +define double @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly { +entry: + %i0 = load double* %a, align 8 + %i1 = load double* %b, align 8 + %mul = fmul double %i0, %i1 + %i2 = load double* %c, align 8 + %add = fadd double %mul, %i2 + %arrayidx3 = getelementptr inbounds double* %a, i64 1 + %i3 = load 
double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %b, i64 1 + %i4 = load double* %arrayidx4, align 8 + %mul5 = fmul double %i3, %i4 + %arrayidx6 = getelementptr inbounds double* %c, i64 1 + %i5 = load double* %arrayidx6, align 8 + %add7 = fadd double %mul5, %i5 + %mul9 = fmul double %add, %i1 + %add11 = fadd double %mul9, %i2 + %mul13 = fmul double %add7, %i4 + %add15 = fadd double %mul13, %i5 + %mul16 = fmul double %add11, %add15 + ret double %mul16 +; CHECK: @test1 +; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* +; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* +; CHECK: %i2.v.i0 = bitcast double* %c to <2 x double>* +; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8 +; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8 +; CHECK: %mul = fmul <2 x double> %i0, %i1 +; CHECK: %i2 = load <2 x double>* %i2.v.i0, align 8 +; CHECK: %add = fadd <2 x double> %mul, %i2 +; CHECK: %mul9 = fmul <2 x double> %add, %i1 +; CHECK: %add11 = fadd <2 x double> %mul9, %i2 +; CHECK: %add11.v.r1 = extractelement <2 x double> %add11, i32 0 +; CHECK: %add11.v.r2 = extractelement <2 x double> %add11, i32 1 +; CHECK: %mul16 = fmul double %add11.v.r1, %add11.v.r2 +; CHECK: ret double %mul16 +} + diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll new file mode 100644 index 0000000..bebc91a --- /dev/null +++ b/test/Transforms/BBVectorize/loop1.ll @@ -0,0 +1,93 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s +; RUN: opt < %s -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL +; The second check covers the use of alias analysis (with 
loop unrolling). + +define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable { +entry: + br label %for.body +; CHECK: @test1 +; CHECK-UNRL: @test1 + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv + %0 = load double* %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv + %1 = load double* %arrayidx2, align 8 + %mul = fmul double %0, %0 + %mul3 = fmul double %0, %1 + %add = fadd double %mul, %mul3 + %add4 = fadd double %1, %1 + %add5 = fadd double %add4, %0 + %mul6 = fmul double %0, %add5 + %add7 = fadd double %add, %mul6 + %mul8 = fmul double %1, %1 + %add9 = fadd double %0, %0 + %add10 = fadd double %add9, %0 + %mul11 = fmul double %mul8, %add10 + %add12 = fadd double %add7, %mul11 + %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv + store double %add12, double* %arrayidx14, align 8 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 10 + br i1 %exitcond, label %for.end, label %for.body +; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +; CHECK: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv +; CHECK: %0 = load double* %arrayidx, align 8 +; CHECK: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv +; CHECK: %1 = load double* %arrayidx2, align 8 +; CHECK: %mul = fmul double %0, %0 +; CHECK: %mul3 = fmul double %0, %1 +; CHECK: %add = fadd double %mul, %mul3 +; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0 +; CHECK: %mul8 = fmul double %1, %1 +; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1 +; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2 +; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0 +; CHECK: 
%add5.v.i1.2 = insertelement <2 x double> %add5.v.i1.1, double %0, i32 1 +; CHECK: %add5 = fadd <2 x double> %add4, %add5.v.i1.2 +; CHECK: %mul6.v.i0.2 = insertelement <2 x double> %add5.v.i1.1, double %mul8, i32 1 +; CHECK: %mul6 = fmul <2 x double> %mul6.v.i0.2, %add5 +; CHECK: %mul6.v.r1 = extractelement <2 x double> %mul6, i32 0 +; CHECK: %mul6.v.r2 = extractelement <2 x double> %mul6, i32 1 +; CHECK: %add7 = fadd double %add, %mul6.v.r1 +; CHECK: %add12 = fadd double %add7, %mul6.v.r2 +; CHECK: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv +; CHECK: store double %add12, double* %arrayidx14, align 8 +; CHECK: %indvars.iv.next = add i64 %indvars.iv, 1 +; CHECK: %lftr.wideiv = trunc i64 %indvars.iv.next to i32 +; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, 10 +; CHECK: br i1 %exitcond, label %for.end, label %for.body +; CHECK-UNRL: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ] +; CHECK-UNRL: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv +; CHECK-UNRL: %0 = bitcast double* %arrayidx to <2 x double>* +; CHECK-UNRL: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv +; CHECK-UNRL: %1 = bitcast double* %arrayidx2 to <2 x double>* +; CHECK-UNRL: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv +; CHECK-UNRL: %2 = load <2 x double>* %0, align 8 +; CHECK-UNRL: %3 = load <2 x double>* %1, align 8 +; CHECK-UNRL: %mul = fmul <2 x double> %2, %2 +; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3 +; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3 +; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3 +; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2 +; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5 +; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6 +; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3 +; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2 +; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2 +; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10 +; CHECK-UNRL: %add12 = fadd <2 x 
double> %add7, %mul11 +; CHECK-UNRL: %4 = bitcast double* %arrayidx14 to <2 x double>* +; CHECK-UNRL: store <2 x double> %add12, <2 x double>* %4, align 8 +; CHECK-UNRL: %indvars.iv.next.1 = add i64 %indvars.iv, 2 +; CHECK-UNRL: %lftr.wideiv.1 = trunc i64 %indvars.iv.next.1 to i32 +; CHECK-UNRL: %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 10 +; CHECK-UNRL: br i1 %exitcond.1, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Transforms/BBVectorize/req-depth.ll b/test/Transforms/BBVectorize/req-depth.ll new file mode 100644 index 0000000..8c9cc3c --- /dev/null +++ b/test/Transforms/BBVectorize/req-depth.ll @@ -0,0 +1,17 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 3 -S | FileCheck %s -check-prefix=CHECK-RD3 +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 2 -S | FileCheck %s -check-prefix=CHECK-RD2 + +define double @test1(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 + %R = fmul double %Y1, %Y2 + ret double %R +; CHECK-RD3: @test1 +; CHECK-RD2: @test1 +; CHECK-RD3-NOT: <2 x double> +; CHECK-RD2: <2 x double> +} + diff --git a/test/Transforms/BBVectorize/search-limit.ll b/test/Transforms/BBVectorize/search-limit.ll new file mode 100644 index 0000000..d9945b5 --- /dev/null +++ b/test/Transforms/BBVectorize/search-limit.ll @@ -0,0 +1,46 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-search-limit=4 -instcombine -gvn -S | FileCheck %s 
-check-prefix=CHECK-SL4 + +define double @test1(double %A1, double %A2, double %B1, double %B2) { +; CHECK: @test1 +; CHECK-SL4: @test1 +; CHECK-SL4-NOT: <2 x double> +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 + %Z1 = fadd double %Y1, %B1 + ; Here we have a dependency chain: the short search limit will not + ; see past this chain and so will not see the second part of the + ; pair to vectorize. + %mul41 = fmul double %Z1, %Y2 + %sub48 = fsub double %Z1, %mul41 + %mul62 = fmul double %Z1, %sub48 + %sub69 = fsub double %Z1, %mul62 + %mul83 = fmul double %Z1, %sub69 + %sub90 = fsub double %Z1, %mul83 + %mul104 = fmul double %Z1, %sub90 + %sub111 = fsub double %Z1, %mul104 + %mul125 = fmul double %Z1, %sub111 + %sub132 = fsub double %Z1, %mul125 + %mul146 = fmul double %Z1, %sub132 + %sub153 = fsub double %Z1, %mul146 + ; end of chain. 
+ %Z2 = fadd double %Y2, %B2 +; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 + %R1 = fdiv double %Z1, %Z2 + %R = fmul double %R1, %sub153 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R1 = fdiv double %Z1.v.r1, %Z1.v.r2 + ret double %R +; CHECK: ret double %R +} + diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll new file mode 100644 index 0000000..b2ef27b --- /dev/null +++ b/test/Transforms/BBVectorize/simple-int.ll @@ -0,0 +1,59 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s + +declare double @llvm.fma.f64(double, double, double) +declare double @llvm.cos.f64(double) + +; Basic depth-3 chain with fma +define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = call double @llvm.fma.f64(double %X1, double %A1, double %C1) + %Y2 = call double @llvm.fma.f64(double %X2, double %A2, double %C2) + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test1 +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 +; CHECK: %Y1.v.i2.1 = insertelement <2 x double> undef, double %C1, i32 0 +; CHECK: %Y1.v.i2.2 = insertelement <2 x double> %Y1.v.i2.1, double %C2, i32 1 +; CHECK: %Y1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %X1, <2 x double> %X1.v.i0.2, <2 x double> 
%Y1.v.i2.2) +; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 +; CHECK: ret double %R +} + +; Basic depth-3 chain with cos +define double @test2(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = call double @llvm.cos.f64(double %X1) + %Y2 = call double @llvm.cos.f64(double %X2) + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test2 +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 +; CHECK: %Y1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %X1) +; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 +; CHECK: ret double %R +} + +; CHECK: declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) nounwind readonly + diff --git a/test/Transforms/BBVectorize/simple-ldstr.ll b/test/Transforms/BBVectorize/simple-ldstr.ll new file mode 100644 index 0000000..a5397ee --- /dev/null +++ b/test/Transforms/BBVectorize/simple-ldstr.ll @@ -0,0 +1,110 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s +; 
RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO + +; Simple 3-pair chain with loads and stores +define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly { +entry: + %i0 = load double* %a, align 8 + %i1 = load double* %b, align 8 + %mul = fmul double %i0, %i1 + %arrayidx3 = getelementptr inbounds double* %a, i64 1 + %i3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %b, i64 1 + %i4 = load double* %arrayidx4, align 8 + %mul5 = fmul double %i3, %i4 + store double %mul, double* %c, align 8 + %arrayidx5 = getelementptr inbounds double* %c, i64 1 + store double %mul5, double* %arrayidx5, align 8 + ret void +; CHECK: @test1 +; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* +; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* +; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8 +; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8 +; CHECK: %mul = fmul <2 x double> %i0, %i1 +; CHECK: %0 = bitcast double* %c to <2 x double>* +; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8 +; CHECK: ret void +; CHECK-AO: @test1 +; CHECK-AO-NOT: <2 x double> +} + +; Simple chain with extending loads and stores +define void @test2(float* %a, float* %b, double* %c) nounwind uwtable readonly { +entry: + %i0f = load float* %a, align 4 + %i0 = fpext float %i0f to double + %i1f = load float* %b, align 4 + %i1 = fpext float %i1f to double + %mul = fmul double %i0, %i1 + %arrayidx3 = getelementptr inbounds float* %a, i64 1 + %i3f = load float* %arrayidx3, align 4 + %i3 = fpext float %i3f to double + %arrayidx4 = getelementptr inbounds float* %b, i64 1 + %i4f = load float* %arrayidx4, align 4 + %i4 = fpext float %i4f to double + %mul5 = fmul double %i3, %i4 + store double %mul, double* %c, align 8 + %arrayidx5 = getelementptr inbounds double* %c, i64 1 + store double %mul5, double* %arrayidx5, align 8 + ret void +; CHECK: 
@test2 +; CHECK: %i0f.v.i0 = bitcast float* %a to <2 x float>* +; CHECK: %i1f.v.i0 = bitcast float* %b to <2 x float>* +; CHECK: %i0f = load <2 x float>* %i0f.v.i0, align 4 +; CHECK: %i0 = fpext <2 x float> %i0f to <2 x double> +; CHECK: %i1f = load <2 x float>* %i1f.v.i0, align 4 +; CHECK: %i1 = fpext <2 x float> %i1f to <2 x double> +; CHECK: %mul = fmul <2 x double> %i0, %i1 +; CHECK: %0 = bitcast double* %c to <2 x double>* +; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8 +; CHECK: ret void +; CHECK-AO: @test2 +; CHECK-AO-NOT: <2 x double> +} + +; Simple chain with loads and truncating stores +define void @test3(double* %a, double* %b, float* %c) nounwind uwtable readonly { +entry: + %i0 = load double* %a, align 8 + %i1 = load double* %b, align 8 + %mul = fmul double %i0, %i1 + %mulf = fptrunc double %mul to float + %arrayidx3 = getelementptr inbounds double* %a, i64 1 + %i3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %b, i64 1 + %i4 = load double* %arrayidx4, align 8 + %mul5 = fmul double %i3, %i4 + %mul5f = fptrunc double %mul5 to float + store float %mulf, float* %c, align 8 + %arrayidx5 = getelementptr inbounds float* %c, i64 1 + store float %mul5f, float* %arrayidx5, align 4 + ret void +; CHECK: @test3 +; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* +; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* +; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8 +; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8 +; CHECK: %mul = fmul <2 x double> %i0, %i1 +; CHECK: %mulf = fptrunc <2 x double> %mul to <2 x float> +; CHECK: %0 = bitcast float* %c to <2 x float>* +; CHECK: store <2 x float> %mulf, <2 x float>* %0, align 8 +; CHECK: ret void +; CHECK-AO: @test3 +; CHECK-AO: %i0 = load double* %a, align 8 +; CHECK-AO: %i1 = load double* %b, align 8 +; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0 +; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0 +; 
CHECK-AO: %arrayidx3 = getelementptr inbounds double* %a, i64 1 +; CHECK-AO: %i3 = load double* %arrayidx3, align 8 +; CHECK-AO: %arrayidx4 = getelementptr inbounds double* %b, i64 1 +; CHECK-AO: %i4 = load double* %arrayidx4, align 8 +; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1 +; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1 +; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2 +; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float> +; CHECK-AO: %0 = bitcast float* %c to <2 x float>* +; CHECK-AO: store <2 x float> %mulf, <2 x float>* %0, align 8 +; CHECK-AO: ret void +} diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll new file mode 100644 index 0000000..904d766 --- /dev/null +++ b/test/Transforms/BBVectorize/simple.ll @@ -0,0 +1,152 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s + +; Basic depth-3 chain +define double @test1(double %A1, double %A2, double %B1, double %B2) { +; CHECK: @test1 +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 +; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 + %R = fmul double %Z1, %Z2 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = 
extractelement <2 x double> %Z1, i32 1 +; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 + ret double %R +; CHECK: ret double %R +} + +; Basic depth-3 chain (last pair permuted) +define double @test2(double %A1, double %A2, double %B1, double %B2) { +; CHECK: @test2 +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 + %Z1 = fadd double %Y2, %B1 + %Z2 = fadd double %Y1, %B2 +; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 0> +; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2 + %R = fmul double %Z1, %Z2 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 + ret double %R +; CHECK: ret double %R +} + +; Basic depth-3 chain (last pair first splat) +define double @test3(double %A1, double %A2, double %B1, double %B2) { +; CHECK: @test3 +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 + %Z1 = fadd double %Y2, %B1 + %Z2 = fadd double %Y2, %B2 +; CHECK: %Z1.v.i0 = shufflevector <2 x 
double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 1> +; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2 + %R = fmul double %Z1, %Z2 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 + ret double %R +; CHECK: ret double %R +} + +; Basic depth-3 chain (last pair second splat) +define double @test4(double %A1, double %A2, double %B1, double %B2) { +; CHECK: @test4 +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y1, %B2 +; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> zeroinitializer +; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2 + %R = fmul double %Z1, %Z2 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 + ret double %R +; CHECK: ret double %R +} + +; Basic depth-3 chain +define <2 x float> @test5(<2 x float> %A1, <2 x float> %A2, <2 x float> %B1, <2 x float> %B2) { +; CHECK: @test5 +; CHECK: %X1.v.i1 = shufflevector <2 x float> %B1, <2 x float> %B2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK: %X1.v.i0 = shufflevector <2 x float> %A1, <2 x float> %A2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %X1 = fsub <2 x float> %A1, %B1 + %X2 = fsub <2 x float> %A2, %B2 +; CHECK: %X1 = fsub <4 x float> %X1.v.i0, %X1.v.i1 + %Y1 = fmul <2 x float> %X1, %A1 + %Y2 = fmul <2 x float> %X2, %A2 +; CHECK: %Y1 = fmul <4 x float> %X1, %X1.v.i0 + %Z1 = 
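fadd <2 x float> %Y1, %B1 + %Z2 = fadd <2 x float> %Y2, %B2 +; CHECK: %Z1 = fadd <4 x float> %Y1, %X1.v.i1 + %R = fmul <2 x float> %Z1, %Z2 +; CHECK: %Z1.v.r1 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> <i32 0, i32 1> +; CHECK: %Z1.v.r2 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> <i32 2, i32 3> +; CHECK: %R = fmul <2 x float> %Z1.v.r1, %Z1.v.r2 + ret <2 x float> %R +; CHECK: ret <2 x float> %R +} + +; Basic chain with shuffles +define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) { +; CHECK: @test6 +; CHECK: %X1.v.i1 = shufflevector <8 x i8> %B1, <8 x i8> %B2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK: %X1.v.i0 = shufflevector <8 x i8> %A1, <8 x i8> %A2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %X1 = sub <8 x i8> %A1, %B1 + %X2 = sub <8 x i8> %A2, %B2 +; CHECK: %X1 = sub <16 x i8> %X1.v.i0, %X1.v.i1 + %Y1 = mul <8 x i8> %X1, %A1 + %Y2 = mul <8 x i8> %X2, %A2 +; CHECK: %Y1 = mul <16 x i8> %X1, %X1.v.i0 + %Z1 = add <8 x i8> %Y1, %B1 + %Z2 = add <8 x i8> %Y2, %B2 +; CHECK: %Z1 = add <16 x i8> %Y1, %X1.v.i1 + %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3> + %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1> +; CHECK: %Z1.v.r2 = shufflevector <16 x i8> %Z1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK: %Q1.v.i1 = shufflevector <8 x i8> %Z1.v.r2, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK: %Q1 = shufflevector <16 x i8> %Z1, <16 x i8> %Q1.v.i1, <16 x i32> <i32 23, i32 16, i32 6, i32 1, i32 21, i32 18, i32 4, i32 3, i32 14, i32 15, i32 8, i32 9, i32 10, i32 12, i32 12, i32 9> + %R = mul <8 x i8> %Q1, %Q2 +; CHECK: %Q1.v.r1 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK: %Q1.v.r2 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK: %R = mul <8 x i8> %Q1.v.r1, %Q1.v.r2 + ret <8 x i8> %R +; CHECK: ret <8 x i8> %R +} + + -- cgit v1.1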