author     Stephen Hines <srhines@google.com>    2014-05-29 02:49:00 -0700
committer  Stephen Hines <srhines@google.com>    2014-05-29 02:49:00 -0700
commit     dce4a407a24b04eebc6a376f8e62b41aaa7b071f (patch)
tree       dcebc53f2b182f145a2e659393bf9a0472cedf23 /test/Transforms/SLPVectorizer/X86
parent     220b921aed042f9e520c26cffd8282a94c66c3d5 (diff)
Update LLVM for 3.5 rebase (r209712).
Change-Id: I149556c940fb7dc92d075273c87ff584f400941f
Diffstat (limited to 'test/Transforms/SLPVectorizer/X86')
-rw-r--r--  test/Transforms/SLPVectorizer/X86/align.ll                          27
-rw-r--r--  test/Transforms/SLPVectorizer/X86/call.ll                          128
-rw-r--r--  test/Transforms/SLPVectorizer/X86/consecutive-access.ll            175
-rw-r--r--  test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll           31
-rw-r--r--  test/Transforms/SLPVectorizer/X86/cse.ll                            30
-rw-r--r--  test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll    62
-rw-r--r--  test/Transforms/SLPVectorizer/X86/intrinsic.ll                      44
-rw-r--r--  test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll     36
-rw-r--r--  test/Transforms/SLPVectorizer/X86/value-bug.ll                      80
9 files changed, 612 insertions(+), 1 deletion(-)
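
Each of the new tests is driven by the lit RUN line at the top of the file, which pipes the output of opt through FileCheck. As a minimal sketch of running one of them by hand (a hypothetical manual invocation, assuming the freshly built opt and FileCheck binaries are on PATH and the working directory is the LLVM source root; lit normally substitutes %s with the path of the test file):

    opt < test/Transforms/SLPVectorizer/X86/align.ll -basicaa -slp-vectorizer -S \
        -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx \
      | FileCheck test/Transforms/SLPVectorizer/X86/align.ll

The flags mirror the RUN line of align.ll in the diff below; the other tests differ only in the options they pass to opt.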
diff --git a/test/Transforms/SLPVectorizer/X86/align.ll b/test/Transforms/SLPVectorizer/X86/align.ll
new file mode 100644
index 0000000..f586573
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/align.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Simple 3-pair chain with loads and stores
+; CHECK: test1
+define void @test1(double* %a, double* %b, double* %c) {
+entry:
+  %agg.tmp.i.i.sroa.0 = alloca [3 x double], align 16
+; CHECK: %[[V0:[0-9]+]] = load <2 x double>* %[[V2:[0-9]+]], align 8
+  %i0 = load double* %a
+  %i1 = load double* %b
+  %mul = fmul double %i0, %i1
+  %store1 = getelementptr inbounds [3 x double]* %agg.tmp.i.i.sroa.0, i64 0, i64 1
+  %store2 = getelementptr inbounds [3 x double]* %agg.tmp.i.i.sroa.0, i64 0, i64 2
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+; CHECK: store <2 x double> %[[V1:[0-9]+]], <2 x double>* %[[V2:[0-9]+]], align 8
+  store double %mul, double* %store1
+  store double %mul5, double* %store2, align 16
+; CHECK: ret
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/call.ll b/test/Transforms/SLPVectorizer/X86/call.ll
new file mode 100644
index 0000000..83d45c0
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/call.ll
@@ -0,0 +1,128 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-999 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+declare double @sin(double)
+declare double @cos(double)
+declare double @pow(double, double)
+declare double @exp2(double)
+declare i64 @round(i64)
+
+
+; CHECK: sin_libm
+; CHECK: call <2 x double> @llvm.sin.v2f64
+; CHECK: ret void
+define void @sin_libm(double* %a, double* %b, double* %c) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %call = tail call double @sin(double %mul) nounwind readnone
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %call5 = tail call double @sin(double %mul5) nounwind readnone
+  store double %call, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %call5, double* %arrayidx5, align 8
+  ret void
+}
+
+; CHECK: cos_libm
+; CHECK: call <2 x double> @llvm.cos.v2f64
+; CHECK: ret void
+define void @cos_libm(double* %a, double* %b, double* %c) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %call = tail call double @cos(double %mul) nounwind readnone
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %call5 = tail call double @cos(double %mul5) nounwind readnone
+  store double %call, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %call5, double* %arrayidx5, align 8
+  ret void
+}
+
+; CHECK: pow_libm
+; CHECK: call <2 x double> @llvm.pow.v2f64
+; CHECK: ret void
+define void @pow_libm(double* %a, double* %b, double* %c) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %call = tail call double @pow(double %mul,double %mul) nounwind readnone
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %call5 = tail call double @pow(double %mul5,double %mul5) nounwind readnone
+  store double %call, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %call5, double* %arrayidx5, align 8
+  ret void
+}
+
+
+; CHECK: exp2_libm
+; CHECK: call <2 x double> @llvm.exp2.v2f64
+; CHECK: ret void
+define void @exp2_libm(double* %a, double* %b, double* %c) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %call = tail call double @exp2(double %mul) nounwind readnone
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %call5 = tail call double @exp2(double %mul5) nounwind readnone
+  store double %call, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %call5, double* %arrayidx5, align 8
+  ret void
+}
+
+
+; Negative test case
+; CHECK: round_custom
+; CHECK-NOT: load <4 x i64>
+; CHECK: ret void
+define void @round_custom(i64* %a, i64* %b, i64* %c) {
+entry:
+  %i0 = load i64* %a, align 8
+  %i1 = load i64* %b, align 8
+  %mul = mul i64 %i0, %i1
+  %call = tail call i64 @round(i64 %mul) nounwind readnone
+  %arrayidx3 = getelementptr inbounds i64* %a, i64 1
+  %i3 = load i64* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds i64* %b, i64 1
+  %i4 = load i64* %arrayidx4, align 8
+  %mul5 = mul i64 %i3, %i4
+  %call5 = tail call i64 @round(i64 %mul5) nounwind readnone
+  store i64 %call, i64* %c, align 8
+  %arrayidx5 = getelementptr inbounds i64* %c, i64 1
+  store i64 %call5, i64* %arrayidx5, align 8
+  ret void
+}
+
+
+; CHECK: declare <2 x double> @llvm.sin.v2f64(<2 x double>) #0
+; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) #0
+; CHECK: declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>) #0
+; CHECK: declare <2 x double> @llvm.exp2.v2f64(<2 x double>) #0
+
+; CHECK: attributes #0 = { nounwind readnone }
+
diff --git a/test/Transforms/SLPVectorizer/X86/consecutive-access.ll b/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
new file mode 100644
index 0000000..f4f112f
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
@@ -0,0 +1,175 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+@A = common global [2000 x double] zeroinitializer, align 16
+@B = common global [2000 x double] zeroinitializer, align 16
+@C = common global [2000 x float] zeroinitializer, align 16
+@D = common global [2000 x float] zeroinitializer, align 16
+
+; Currently SCEV isn't smart enough to figure out that accesses
+; A[3*i], A[3*i+1] and A[3*i+2] are consecutive, but in future
+; that would hopefully be fixed. For now, check that this isn't
+; vectorized.
+; CHECK-LABEL: foo_3double
+; CHECK-NOT: x double>
+; Function Attrs: nounwind ssp uwtable
+define void @foo_3double(i32 %u) #0 {
+entry:
+  %u.addr = alloca i32, align 4
+  store i32 %u, i32* %u.addr, align 4
+  %mul = mul nsw i32 %u, 3
+  %idxprom = sext i32 %mul to i64
+  %arrayidx = getelementptr inbounds [2000 x double]* @A, i32 0, i64 %idxprom
+  %0 = load double* %arrayidx, align 8
+  %arrayidx4 = getelementptr inbounds [2000 x double]* @B, i32 0, i64 %idxprom
+  %1 = load double* %arrayidx4, align 8
+  %add5 = fadd double %0, %1
+  store double %add5, double* %arrayidx, align 8
+  %add11 = add nsw i32 %mul, 1
+  %idxprom12 = sext i32 %add11 to i64
+  %arrayidx13 = getelementptr inbounds [2000 x double]* @A, i32 0, i64 %idxprom12
+  %2 = load double* %arrayidx13, align 8
+  %arrayidx17 = getelementptr inbounds [2000 x double]* @B, i32 0, i64 %idxprom12
+  %3 = load double* %arrayidx17, align 8
+  %add18 = fadd double %2, %3
+  store double %add18, double* %arrayidx13, align 8
+  %add24 = add nsw i32 %mul, 2
+  %idxprom25 = sext i32 %add24 to i64
+  %arrayidx26 = getelementptr inbounds [2000 x double]* @A, i32 0, i64 %idxprom25
+  %4 = load double* %arrayidx26, align 8
+  %arrayidx30 = getelementptr inbounds [2000 x double]* @B, i32 0, i64 %idxprom25
+  %5 = load double* %arrayidx30, align 8
+  %add31 = fadd double %4, %5
+  store double %add31, double* %arrayidx26, align 8
+  ret void
+}
+
+; SCEV should be able to tell that accesses A[C1 + C2*i], A[C1 + C2*i], ...
+; A[C1 + C2*i] are consecutive, if C2 is a power of 2, and C2 > C1 > 0.
+; Thus, the following code should be vectorized.
+; CHECK-LABEL: foo_2double
+; CHECK: x double>
+; Function Attrs: nounwind ssp uwtable
+define void @foo_2double(i32 %u) #0 {
+entry:
+  %u.addr = alloca i32, align 4
+  store i32 %u, i32* %u.addr, align 4
+  %mul = mul nsw i32 %u, 2
+  %idxprom = sext i32 %mul to i64
+  %arrayidx = getelementptr inbounds [2000 x double]* @A, i32 0, i64 %idxprom
+  %0 = load double* %arrayidx, align 8
+  %arrayidx4 = getelementptr inbounds [2000 x double]* @B, i32 0, i64 %idxprom
+  %1 = load double* %arrayidx4, align 8
+  %add5 = fadd double %0, %1
+  store double %add5, double* %arrayidx, align 8
+  %add11 = add nsw i32 %mul, 1
+  %idxprom12 = sext i32 %add11 to i64
+  %arrayidx13 = getelementptr inbounds [2000 x double]* @A, i32 0, i64 %idxprom12
+  %2 = load double* %arrayidx13, align 8
+  %arrayidx17 = getelementptr inbounds [2000 x double]* @B, i32 0, i64 %idxprom12
+  %3 = load double* %arrayidx17, align 8
+  %add18 = fadd double %2, %3
+  store double %add18, double* %arrayidx13, align 8
+  ret void
+}
+
+; Similar to the previous test, but with different datatype.
+; CHECK-LABEL: foo_4float
+; CHECK: x float>
+; Function Attrs: nounwind ssp uwtable
+define void @foo_4float(i32 %u) #0 {
+entry:
+  %u.addr = alloca i32, align 4
+  store i32 %u, i32* %u.addr, align 4
+  %mul = mul nsw i32 %u, 4
+  %idxprom = sext i32 %mul to i64
+  %arrayidx = getelementptr inbounds [2000 x float]* @C, i32 0, i64 %idxprom
+  %0 = load float* %arrayidx, align 4
+  %arrayidx4 = getelementptr inbounds [2000 x float]* @D, i32 0, i64 %idxprom
+  %1 = load float* %arrayidx4, align 4
+  %add5 = fadd float %0, %1
+  store float %add5, float* %arrayidx, align 4
+  %add11 = add nsw i32 %mul, 1
+  %idxprom12 = sext i32 %add11 to i64
+  %arrayidx13 = getelementptr inbounds [2000 x float]* @C, i32 0, i64 %idxprom12
+  %2 = load float* %arrayidx13, align 4
+  %arrayidx17 = getelementptr inbounds [2000 x float]* @D, i32 0, i64 %idxprom12
+  %3 = load float* %arrayidx17, align 4
+  %add18 = fadd float %2, %3
+  store float %add18, float* %arrayidx13, align 4
+  %add24 = add nsw i32 %mul, 2
+  %idxprom25 = sext i32 %add24 to i64
+  %arrayidx26 = getelementptr inbounds [2000 x float]* @C, i32 0, i64 %idxprom25
+  %4 = load float* %arrayidx26, align 4
+  %arrayidx30 = getelementptr inbounds [2000 x float]* @D, i32 0, i64 %idxprom25
+  %5 = load float* %arrayidx30, align 4
+  %add31 = fadd float %4, %5
+  store float %add31, float* %arrayidx26, align 4
+  %add37 = add nsw i32 %mul, 3
+  %idxprom38 = sext i32 %add37 to i64
+  %arrayidx39 = getelementptr inbounds [2000 x float]* @C, i32 0, i64 %idxprom38
+  %6 = load float* %arrayidx39, align 4
+  %arrayidx43 = getelementptr inbounds [2000 x float]* @D, i32 0, i64 %idxprom38
+  %7 = load float* %arrayidx43, align 4
+  %add44 = fadd float %6, %7
+  store float %add44, float* %arrayidx39, align 4
+  ret void
+}
+
+; Similar to the previous tests, but now we are dealing with AddRec SCEV.
+; CHECK-LABEL: foo_loop
+; CHECK: x double>
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo_loop(double* %A, i32 %n) #0 {
+entry:
+  %A.addr = alloca double*, align 8
+  %n.addr = alloca i32, align 4
+  %sum = alloca double, align 8
+  %i = alloca i32, align 4
+  store double* %A, double** %A.addr, align 8
+  store i32 %n, i32* %n.addr, align 4
+  store double 0.000000e+00, double* %sum, align 8
+  store i32 0, i32* %i, align 4
+  %cmp1 = icmp slt i32 0, %n
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %0 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %1 = phi double [ 0.000000e+00, %for.body.lr.ph ], [ %add7, %for.body ]
+  %mul = mul nsw i32 %0, 2
+  %idxprom = sext i32 %mul to i64
+  %arrayidx = getelementptr inbounds double* %A, i64 %idxprom
+  %2 = load double* %arrayidx, align 8
+  %mul1 = fmul double 7.000000e+00, %2
+  %add = add nsw i32 %mul, 1
+  %idxprom3 = sext i32 %add to i64
+  %arrayidx4 = getelementptr inbounds double* %A, i64 %idxprom3
+  %3 = load double* %arrayidx4, align 8
+  %mul5 = fmul double 7.000000e+00, %3
+  %add6 = fadd double %mul1, %mul5
+  %add7 = fadd double %1, %add6
+  store double %add7, double* %sum, align 8
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %i, align 4
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  %split = phi double [ %add7, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  %.lcssa = phi double [ %split, %for.cond.for.end_crit_edge ], [ 0.000000e+00, %entry ]
+  %conv = fptosi double %.lcssa to i32
+  ret i32 %conv
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5.0 "}
diff --git a/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll b/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll
new file mode 100644
index 0000000..ed22574
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; We will keep trying to vectorize the basic block even we already find vectorized store.
+; CHECK: test1
+; CHECK: store <2 x double>
+; CHECK: ret
+define void @test1(double* %a, double* %b, double* %c, double* %d) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  %0 = bitcast double* %a to <4 x i32>*
+  %1 = load <4 x i32>* %0, align 8
+  %2 = bitcast double* %b to <4 x i32>*
+  %3 = load <4 x i32>* %2, align 8
+  %4 = mul <4 x i32> %1, %3
+  %5 = bitcast double* %d to <4 x i32>*
+  store <4 x i32> %4, <4 x i32>* %5, align 8
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/cse.ll b/test/Transforms/SLPVectorizer/X86/cse.ll
index bbfd6f2..d2ad7eb 100644
--- a/test/Transforms/SLPVectorizer/X86/cse.ll
+++ b/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -217,3 +217,33 @@ return:                                           ; preds = %entry, %if.end
   ret i32 0
 }
 
+%class.B.53.55 = type { %class.A.52.54, double }
+%class.A.52.54 = type { double, double, double }
+
+@a = external global double, align 8
+
+define void @PR19646(%class.B.53.55* %this) {
+entry:
+  br i1 undef, label %if.end13, label %if.end13
+
+sw.epilog7:                                       ; No predecessors!
+  %.in = getelementptr inbounds %class.B.53.55* %this, i64 0, i32 0, i32 1
+  %0 = load double* %.in, align 8
+  %add = fadd double undef, 0.000000e+00
+  %add6 = fadd double %add, %0
+  %1 = load double* @a, align 8
+  %add8 = fadd double %1, 0.000000e+00
+  %_dy = getelementptr inbounds %class.B.53.55* %this, i64 0, i32 0, i32 2
+  %2 = load double* %_dy, align 8
+  %add10 = fadd double %add8, %2
+  br i1 undef, label %if.then12, label %if.end13
+
+if.then12:                                        ; preds = %sw.epilog7
+  %3 = load double* undef, align 8
+  br label %if.end13
+
+if.end13:                                         ; preds = %if.then12, %sw.epilog7, %entry
+  %x.1 = phi double [ 0.000000e+00, %if.then12 ], [ %add6, %sw.epilog7 ], [ undef, %entry ], [ undef, %entry ]
+  %b.0 = phi double [ %3, %if.then12 ], [ %add10, %sw.epilog7 ], [ undef, %entry], [ undef, %entry ]
+  unreachable
+}
diff --git a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index 7537ea3..9eda29f 100644
--- a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -195,11 +195,35 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b,
   ret <4 x float> %rb
 }
 
+; Make sure that vectorization happens even if insertelements operations
+; must be rescheduled. The case here is from compiling Julia.
+define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @reschedule_extract(
+; CHECK: %1 = fadd <4 x float> %a, %b
+  %a0 = extractelement <4 x float> %a, i32 0
+  %b0 = extractelement <4 x float> %b, i32 0
+  %c0 = fadd float %a0, %b0
+  %v0 = insertelement <4 x float> undef, float %c0, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %b1 = extractelement <4 x float> %b, i32 1
+  %c1 = fadd float %a1, %b1
+  %v1 = insertelement <4 x float> %v0, float %c1, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %b2 = extractelement <4 x float> %b, i32 2
+  %c2 = fadd float %a2, %b2
+  %v2 = insertelement <4 x float> %v1, float %c2, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b3 = extractelement <4 x float> %b, i32 3
+  %c3 = fadd float %a3, %b3
+  %v3 = insertelement <4 x float> %v2, float %c3, i32 3
+  ret <4 x float> %v3
+}
+
 ; Check that cost model for vectorization takes credit for
 ; instructions that are erased.
 define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
 ; ZEROTHRESH-LABEL: @take_credit(
-; ZEROTHRESH-CHECK: %1 = fadd <4 x float> %a, %b
+; ZEROTHRESH: %1 = fadd <4 x float> %a, %b
   %a0 = extractelement <4 x float> %a, i32 0
   %b0 = extractelement <4 x float> %b, i32 0
   %c0 = fadd float %a0, %b0
@@ -219,4 +243,40 @@ define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %v3
 }
 
+; Make sure we handle multiple trees that feed one build vector correctly.
+define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
+entry:
+  %t0 = fadd double %w , 0.000000e+00
+  %t1 = fadd double %x , 1.000000e+00
+  %t2 = fadd double %y , 2.000000e+00
+  %t3 = fadd double %z , 3.000000e+00
+  %t4 = fmul double %t0, 1.000000e+00
+  %i1 = insertelement <4 x double> undef, double %t4, i32 3
+  %t5 = fmul double %t1, 1.000000e+00
+  %i2 = insertelement <4 x double> %i1, double %t5, i32 2
+  %t6 = fmul double %t2, 1.000000e+00
+  %i3 = insertelement <4 x double> %i2, double %t6, i32 1
+  %t7 = fmul double %t3, 1.000000e+00
+  %i4 = insertelement <4 x double> %i3, double %t7, i32 0
+  ret <4 x double> %i4
+}
+; CHECK-LABEL: @multi_tree
+; CHECK-DAG: %[[V0:.+]] = insertelement <2 x double> undef, double %w, i32 0
+; CHECK-DAG: %[[V1:.+]] = insertelement <2 x double> %[[V0]], double %x, i32 1
+; CHECK-DAG: %[[V2:.+]] = fadd <2 x double> %[[V1]], <double 0.000000e+00, double 1.000000e+00>
+; CHECK-DAG: %[[V3:.+]] = insertelement <2 x double> undef, double %y, i32 0
+; CHECK-DAG: %[[V4:.+]] = insertelement <2 x double> %[[V3]], double %z, i32 1
+; CHECK-DAG: %[[V5:.+]] = fadd <2 x double> %[[V4]], <double 2.000000e+00, double 3.000000e+00>
+; CHECK-DAG: %[[V6:.+]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, %[[V2]]
+; CHECK-DAG: %[[V7:.+]] = extractelement <2 x double> %[[V6]], i32 0
+; CHECK-DAG: %[[I1:.+]] = insertelement <4 x double> undef, double %[[V7]], i32 3
+; CHECK-DAG: %[[V8:.+]] = extractelement <2 x double> %[[V6]], i32 1
+; CHECK-DAG: %[[I2:.+]] = insertelement <4 x double> %[[I1]], double %[[V8]], i32 2
+; CHECK-DAG: %[[V9:.+]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, %[[V5]]
+; CHECK-DAG: %[[V10:.+]] = extractelement <2 x double> %[[V9]], i32 0
+; CHECK-DAG: %[[I3:.+]] = insertelement <4 x double> %i2, double %[[V10]], i32 1
+; CHECK-DAG: %[[V11:.+]] = extractelement <2 x double> %[[V9]], i32 1
+; CHECK-DAG: %[[I4:.+]] = insertelement <4 x double> %i3, double %[[V11]], i32 0
+; CHECK: ret <4 x double> %[[I4]]
+
 attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/SLPVectorizer/X86/intrinsic.ll b/test/Transforms/SLPVectorizer/X86/intrinsic.ll
index 2b7ee75..30c5093 100644
--- a/test/Transforms/SLPVectorizer/X86/intrinsic.ll
+++ b/test/Transforms/SLPVectorizer/X86/intrinsic.ll
@@ -71,5 +71,49 @@ entry:
   ret void
 }
 
+declare i32 @llvm.bswap.i32(i32) nounwind readnone
+define void @vec_bswap_i32(i32* %a, i32* %b, i32* %c) {
+entry:
+  %i0 = load i32* %a, align 4
+  %i1 = load i32* %b, align 4
+  %add1 = add i32 %i0, %i1
+  %call1 = tail call i32 @llvm.bswap.i32(i32 %add1) nounwind readnone
+
+  %arrayidx2 = getelementptr inbounds i32* %a, i32 1
+  %i2 = load i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32* %b, i32 1
+  %i3 = load i32* %arrayidx3, align 4
+  %add2 = add i32 %i2, %i3
+  %call2 = tail call i32 @llvm.bswap.i32(i32 %add2) nounwind readnone
+
+  %arrayidx4 = getelementptr inbounds i32* %a, i32 2
+  %i4 = load i32* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds i32* %b, i32 2
+  %i5 = load i32* %arrayidx5, align 4
+  %add3 = add i32 %i4, %i5
+  %call3 = tail call i32 @llvm.bswap.i32(i32 %add3) nounwind readnone
+
+  %arrayidx6 = getelementptr inbounds i32* %a, i32 3
+  %i6 = load i32* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds i32* %b, i32 3
+  %i7 = load i32* %arrayidx7, align 4
+  %add4 = add i32 %i6, %i7
+  %call4 = tail call i32 @llvm.bswap.i32(i32 %add4) nounwind readnone
+  store i32 %call1, i32* %c, align 4
+  %arrayidx8 = getelementptr inbounds i32* %c, i32 1
+  store i32 %call2, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32* %c, i32 2
+  store i32 %call3, i32* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds i32* %c, i32 3
+  store i32 %call4, i32* %arrayidx10, align 4
+  ret void
+
+; CHECK-LABEL: @vec_bswap_i32(
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: call <4 x i32> @llvm.bswap.v4i32
+; CHECK: store <4 x i32>
+; CHECK: ret
+}
diff --git a/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll b/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll
new file mode 100644
index 0000000..b250735
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -slp-vectorizer -o - -S -slp-threshold=-1000
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx--nvidiacl"
+
+; CTLZ cannot be vectorized currently because the second argument is a scalar
+; for both the scalar and vector forms of the intrinsic. In the future it
+; should be possible to vectorize such functions.
+; Test causes an assert if LLVM tries to vectorize CTLZ.
+
+define <2 x i8> @cltz_test(<2 x i8> %x) #0 {
+entry:
+  %0 = extractelement <2 x i8> %x, i32 0
+  %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
+  %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0
+  %1 = extractelement <2 x i8> %x, i32 1
+  %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
+  %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1
+  ret <2 x i8> %vecinit2
+}
+
+define <2 x i8> @cltz_test2(<2 x i8> %x) #1 {
+entry:
+  %0 = extractelement <2 x i8> %x, i32 0
+  %1 = extractelement <2 x i8> %x, i32 1
+  %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
+  %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
+  %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0
+  %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1
+  ret <2 x i8> %vecinit2
+}
+
+declare i8 @llvm.ctlz.i8(i8, i1) #3
+
+attributes #0 = { alwaysinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
diff --git a/test/Transforms/SLPVectorizer/X86/value-bug.ll b/test/Transforms/SLPVectorizer/X86/value-bug.ll
new file mode 100644
index 0000000..64d2ae1
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/value-bug.ll
@@ -0,0 +1,80 @@
+; RUN: opt -slp-vectorizer < %s -S -mtriple="x86_64-grtev3-linux-gnu" -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-grtev3-linux-gnu"
+
+; We used to crash on this example because we were building a constant
+; expression during vectorization and the vectorizer expects instructions
+; as elements of the vectorized tree.
+; CHECK-LABEL: @test
+; PR19621
+
+define void @test() {
+bb279:
+  br label %bb283
+
+bb283:
+  %Av.sroa.8.0 = phi float [ undef, %bb279 ], [ %tmp315, %exit ]
+  %Av.sroa.5.0 = phi float [ undef, %bb279 ], [ %tmp319, %exit ]
+  %Av.sroa.3.0 = phi float [ undef, %bb279 ], [ %tmp307, %exit ]
+  %Av.sroa.0.0 = phi float [ undef, %bb279 ], [ %tmp317, %exit ]
+  br label %bb284
+
+bb284:
+  %tmp7.i = fpext float %Av.sroa.3.0 to double
+  %tmp8.i = fsub double %tmp7.i, undef
+  %tmp9.i = fsub double %tmp8.i, undef
+  %tmp17.i = fpext float %Av.sroa.8.0 to double
+  %tmp19.i = fsub double %tmp17.i, undef
+  %tmp20.i = fsub double %tmp19.i, undef
+  br label %bb21.i
+
+bb21.i:
+  br i1 undef, label %bb22.i, label %exit
+
+bb22.i:
+  %tmp24.i = fadd double undef, %tmp9.i
+  %tmp26.i = fadd double undef, %tmp20.i
+  br label %bb32.i
+
+bb32.i:
+  %xs.0.i = phi double [ %tmp24.i, %bb22.i ], [ 0.000000e+00, %bb32.i ]
+  %ys.0.i = phi double [ %tmp26.i, %bb22.i ], [ 0.000000e+00, %bb32.i ]
+  br i1 undef, label %bb32.i, label %bb21.i
+
+exit:
+  %tmp303 = fpext float %Av.sroa.0.0 to double
+  %tmp304 = fmul double %tmp303, undef
+  %tmp305 = fadd double undef, %tmp304
+  %tmp306 = fadd double %tmp305, undef
+  %tmp307 = fptrunc double %tmp306 to float
+  %tmp311 = fpext float %Av.sroa.5.0 to double
+  %tmp312 = fmul double %tmp311, 0.000000e+00
+  %tmp313 = fadd double undef, %tmp312
+  %tmp314 = fadd double %tmp313, undef
+  %tmp315 = fptrunc double %tmp314 to float
+  %tmp317 = fptrunc double undef to float
+  %tmp319 = fptrunc double undef to float
+  br label %bb283
+}
+
+; Make sure that we probably handle constant folded vectorized trees. The
+; vectorizer starts at the type (%t2, %t3) and wil constant fold the tree.
+; The code that handles insertelement instructions must handle this.
+define <4 x double> @constant_folding() {
+entry:
+  %t0 = fadd double 1.000000e+00 , 0.000000e+00
+  %t1 = fadd double 1.000000e+00 , 1.000000e+00
+  %t2 = fmul double %t0, 1.000000e+00
+  %i1 = insertelement <4 x double> undef, double %t2, i32 1
+  %t3 = fmul double %t1, 1.000000e+00
+  %i2 = insertelement <4 x double> %i1, double %t3, i32 0
+  ret <4 x double> %i2
+}
+
+; CHECK-LABEL: @constant_folding
+; CHECK: %[[V0:.+]] = extractelement <2 x double> <double 1.000000e+00, double 2.000000e+00>, i32 0
+; CHECK: %[[V1:.+]] = insertelement <4 x double> undef, double %[[V0]], i32 1
+; CHECK: %[[V2:.+]] = extractelement <2 x double> <double 1.000000e+00, double 2.000000e+00>, i32 1
+; CHECK: %[[V3:.+]] = insertelement <4 x double> %[[V1]], double %[[V2]], i32 0
+; CHECK: ret <4 x double> %[[V3]]