aboutsummaryrefslogtreecommitdiffstats
path: root/test/Transforms/SLPVectorizer/X86
diff options
context:
space:
mode:
authorStephen Hines <srhines@google.com>2014-05-29 02:49:00 -0700
committerStephen Hines <srhines@google.com>2014-05-29 02:49:00 -0700
commitdce4a407a24b04eebc6a376f8e62b41aaa7b071f (patch)
treedcebc53f2b182f145a2e659393bf9a0472cedf23 /test/Transforms/SLPVectorizer/X86
parent220b921aed042f9e520c26cffd8282a94c66c3d5 (diff)
downloadexternal_llvm-dce4a407a24b04eebc6a376f8e62b41aaa7b071f.zip
external_llvm-dce4a407a24b04eebc6a376f8e62b41aaa7b071f.tar.gz
external_llvm-dce4a407a24b04eebc6a376f8e62b41aaa7b071f.tar.bz2
Update LLVM for 3.5 rebase (r209712).
Change-Id: I149556c940fb7dc92d075273c87ff584f400941f
Diffstat (limited to 'test/Transforms/SLPVectorizer/X86')
-rw-r--r--test/Transforms/SLPVectorizer/X86/align.ll27
-rw-r--r--test/Transforms/SLPVectorizer/X86/call.ll128
-rw-r--r--test/Transforms/SLPVectorizer/X86/consecutive-access.ll175
-rw-r--r--test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll31
-rw-r--r--test/Transforms/SLPVectorizer/X86/cse.ll30
-rw-r--r--test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll62
-rw-r--r--test/Transforms/SLPVectorizer/X86/intrinsic.ll44
-rw-r--r--test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll36
-rw-r--r--test/Transforms/SLPVectorizer/X86/value-bug.ll80
9 files changed, 612 insertions, 1 deletions
diff --git a/test/Transforms/SLPVectorizer/X86/align.ll b/test/Transforms/SLPVectorizer/X86/align.ll
new file mode 100644
index 0000000..f586573
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/align.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Simple 3-pair chain with loads and stores
+; CHECK: test1
+define void @test1(double* %a, double* %b, double* %c) {
+entry:
+ %agg.tmp.i.i.sroa.0 = alloca [3 x double], align 16
+; CHECK: %[[V0:[0-9]+]] = load <2 x double>* %[[V2:[0-9]+]], align 8
+ %i0 = load double* %a
+ %i1 = load double* %b
+ %mul = fmul double %i0, %i1
+ %store1 = getelementptr inbounds [3 x double]* %agg.tmp.i.i.sroa.0, i64 0, i64 1
+ %store2 = getelementptr inbounds [3 x double]* %agg.tmp.i.i.sroa.0, i64 0, i64 2
+ %arrayidx3 = getelementptr inbounds double* %a, i64 1
+ %i3 = load double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double* %b, i64 1
+ %i4 = load double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+; CHECK: store <2 x double> %[[V1:[0-9]+]], <2 x double>* %[[V2:[0-9]+]], align 8
+ store double %mul, double* %store1
+ store double %mul5, double* %store2, align 16
+; CHECK: ret
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/call.ll b/test/Transforms/SLPVectorizer/X86/call.ll
new file mode 100644
index 0000000..83d45c0
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/call.ll
@@ -0,0 +1,128 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-999 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+declare double @sin(double)
+declare double @cos(double)
+declare double @pow(double, double)
+declare double @exp2(double)
+declare i64 @round(i64)
+
+
+; CHECK: sin_libm
+; CHECK: call <2 x double> @llvm.sin.v2f64
+; CHECK: ret void
+define void @sin_libm(double* %a, double* %b, double* %c) {
+entry:
+ %i0 = load double* %a, align 8
+ %i1 = load double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %call = tail call double @sin(double %mul) nounwind readnone
+ %arrayidx3 = getelementptr inbounds double* %a, i64 1
+ %i3 = load double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double* %b, i64 1
+ %i4 = load double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ %call5 = tail call double @sin(double %mul5) nounwind readnone
+ store double %call, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double* %c, i64 1
+ store double %call5, double* %arrayidx5, align 8
+ ret void
+}
+
+; CHECK: cos_libm
+; CHECK: call <2 x double> @llvm.cos.v2f64
+; CHECK: ret void
+define void @cos_libm(double* %a, double* %b, double* %c) {
+entry:
+ %i0 = load double* %a, align 8
+ %i1 = load double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %call = tail call double @cos(double %mul) nounwind readnone
+ %arrayidx3 = getelementptr inbounds double* %a, i64 1
+ %i3 = load double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double* %b, i64 1
+ %i4 = load double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ %call5 = tail call double @cos(double %mul5) nounwind readnone
+ store double %call, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double* %c, i64 1
+ store double %call5, double* %arrayidx5, align 8
+ ret void
+}
+
+; CHECK: pow_libm
+; CHECK: call <2 x double> @llvm.pow.v2f64
+; CHECK: ret void
+define void @pow_libm(double* %a, double* %b, double* %c) {
+entry:
+ %i0 = load double* %a, align 8
+ %i1 = load double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %call = tail call double @pow(double %mul,double %mul) nounwind readnone
+ %arrayidx3 = getelementptr inbounds double* %a, i64 1
+ %i3 = load double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double* %b, i64 1
+ %i4 = load double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ %call5 = tail call double @pow(double %mul5,double %mul5) nounwind readnone
+ store double %call, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double* %c, i64 1
+ store double %call5, double* %arrayidx5, align 8
+ ret void
+}
+
+
+; CHECK: exp2_libm
+; CHECK: call <2 x double> @llvm.exp2.v2f64
+; CHECK: ret void
+define void @exp2_libm(double* %a, double* %b, double* %c) {
+entry:
+ %i0 = load double* %a, align 8
+ %i1 = load double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %call = tail call double @exp2(double %mul) nounwind readnone
+ %arrayidx3 = getelementptr inbounds double* %a, i64 1
+ %i3 = load double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double* %b, i64 1
+ %i4 = load double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ %call5 = tail call double @exp2(double %mul5) nounwind readnone
+ store double %call, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double* %c, i64 1
+ store double %call5, double* %arrayidx5, align 8
+ ret void
+}
+
+
+; Negative test case
+; CHECK: round_custom
+; CHECK-NOT: load <4 x i64>
+; CHECK: ret void
+define void @round_custom(i64* %a, i64* %b, i64* %c) {
+entry:
+ %i0 = load i64* %a, align 8
+ %i1 = load i64* %b, align 8
+ %mul = mul i64 %i0, %i1
+ %call = tail call i64 @round(i64 %mul) nounwind readnone
+ %arrayidx3 = getelementptr inbounds i64* %a, i64 1
+ %i3 = load i64* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds i64* %b, i64 1
+ %i4 = load i64* %arrayidx4, align 8
+ %mul5 = mul i64 %i3, %i4
+ %call5 = tail call i64 @round(i64 %mul5) nounwind readnone
+ store i64 %call, i64* %c, align 8
+ %arrayidx5 = getelementptr inbounds i64* %c, i64 1
+ store i64 %call5, i64* %arrayidx5, align 8
+ ret void
+}
+
+
+; CHECK: declare <2 x double> @llvm.sin.v2f64(<2 x double>) #0
+; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) #0
+; CHECK: declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>) #0
+; CHECK: declare <2 x double> @llvm.exp2.v2f64(<2 x double>) #0
+
+; CHECK: attributes #0 = { nounwind readnone }
+
diff --git a/test/Transforms/SLPVectorizer/X86/consecutive-access.ll b/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
new file mode 100644
index 0000000..f4f112f
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
@@ -0,0 +1,175 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+@A = common global [2000 x double] zeroinitializer, align 16
+@B = common global [2000 x double] zeroinitializer, align 16
+@C = common global [2000 x float] zeroinitializer, align 16
+@D = common global [2000 x float] zeroinitializer, align 16
+
+; Currently SCEV isn't smart enough to figure out that accesses
+; A[3*i], A[3*i+1] and A[3*i+2] are consecutive, but in future
+; that would hopefully be fixed. For now, check that this isn't
+; vectorized.
+; CHECK-LABEL: foo_3double
+; CHECK-NOT: x double>
+; Function Attrs: nounwind ssp uwtable
+define void @foo_3double(i32 %u) #0 {
+entry:
+ %u.addr = alloca i32, align 4
+ store i32 %u, i32* %u.addr, align 4
+ %mul = mul nsw i32 %u, 3
+ %idxprom = sext i32 %mul to i64
+ %arrayidx = getelementptr inbounds [2000 x double]* @A, i32 0, i64 %idxprom
+ %0 = load double* %arrayidx, align 8
+ %arrayidx4 = getelementptr inbounds [2000 x double]* @B, i32 0, i64 %idxprom
+ %1 = load double* %arrayidx4, align 8
+ %add5 = fadd double %0, %1
+ store double %add5, double* %arrayidx, align 8
+ %add11 = add nsw i32 %mul, 1
+ %idxprom12 = sext i32 %add11 to i64
+ %arrayidx13 = getelementptr inbounds [2000 x double]* @A, i32 0, i64 %idxprom12
+ %2 = load double* %arrayidx13, align 8
+ %arrayidx17 = getelementptr inbounds [2000 x double]* @B, i32 0, i64 %idxprom12
+ %3 = load double* %arrayidx17, align 8
+ %add18 = fadd double %2, %3
+ store double %add18, double* %arrayidx13, align 8
+ %add24 = add nsw i32 %mul, 2
+ %idxprom25 = sext i32 %add24 to i64
+ %arrayidx26 = getelementptr inbounds [2000 x double]* @A, i32 0, i64 %idxprom25
+ %4 = load double* %arrayidx26, align 8
+ %arrayidx30 = getelementptr inbounds [2000 x double]* @B, i32 0, i64 %idxprom25
+ %5 = load double* %arrayidx30, align 8
+ %add31 = fadd double %4, %5
+ store double %add31, double* %arrayidx26, align 8
+ ret void
+}
+
+; SCEV should be able to tell that accesses A[C1 + C2*i], A[C1 + C2*i], ...
+; A[C1 + C2*i] are consecutive, if C2 is a power of 2, and C2 > C1 > 0.
+; Thus, the following code should be vectorized.
+; CHECK-LABEL: foo_2double
+; CHECK: x double>
+; Function Attrs: nounwind ssp uwtable
+define void @foo_2double(i32 %u) #0 {
+entry:
+ %u.addr = alloca i32, align 4
+ store i32 %u, i32* %u.addr, align 4
+ %mul = mul nsw i32 %u, 2
+ %idxprom = sext i32 %mul to i64
+ %arrayidx = getelementptr inbounds [2000 x double]* @A, i32 0, i64 %idxprom
+ %0 = load double* %arrayidx, align 8
+ %arrayidx4 = getelementptr inbounds [2000 x double]* @B, i32 0, i64 %idxprom
+ %1 = load double* %arrayidx4, align 8
+ %add5 = fadd double %0, %1
+ store double %add5, double* %arrayidx, align 8
+ %add11 = add nsw i32 %mul, 1
+ %idxprom12 = sext i32 %add11 to i64
+ %arrayidx13 = getelementptr inbounds [2000 x double]* @A, i32 0, i64 %idxprom12
+ %2 = load double* %arrayidx13, align 8
+ %arrayidx17 = getelementptr inbounds [2000 x double]* @B, i32 0, i64 %idxprom12
+ %3 = load double* %arrayidx17, align 8
+ %add18 = fadd double %2, %3
+ store double %add18, double* %arrayidx13, align 8
+ ret void
+}
+
+; Similar to the previous test, but with different datatype.
+; CHECK-LABEL: foo_4float
+; CHECK: x float>
+; Function Attrs: nounwind ssp uwtable
+define void @foo_4float(i32 %u) #0 {
+entry:
+ %u.addr = alloca i32, align 4
+ store i32 %u, i32* %u.addr, align 4
+ %mul = mul nsw i32 %u, 4
+ %idxprom = sext i32 %mul to i64
+ %arrayidx = getelementptr inbounds [2000 x float]* @C, i32 0, i64 %idxprom
+ %0 = load float* %arrayidx, align 4
+ %arrayidx4 = getelementptr inbounds [2000 x float]* @D, i32 0, i64 %idxprom
+ %1 = load float* %arrayidx4, align 4
+ %add5 = fadd float %0, %1
+ store float %add5, float* %arrayidx, align 4
+ %add11 = add nsw i32 %mul, 1
+ %idxprom12 = sext i32 %add11 to i64
+ %arrayidx13 = getelementptr inbounds [2000 x float]* @C, i32 0, i64 %idxprom12
+ %2 = load float* %arrayidx13, align 4
+ %arrayidx17 = getelementptr inbounds [2000 x float]* @D, i32 0, i64 %idxprom12
+ %3 = load float* %arrayidx17, align 4
+ %add18 = fadd float %2, %3
+ store float %add18, float* %arrayidx13, align 4
+ %add24 = add nsw i32 %mul, 2
+ %idxprom25 = sext i32 %add24 to i64
+ %arrayidx26 = getelementptr inbounds [2000 x float]* @C, i32 0, i64 %idxprom25
+ %4 = load float* %arrayidx26, align 4
+ %arrayidx30 = getelementptr inbounds [2000 x float]* @D, i32 0, i64 %idxprom25
+ %5 = load float* %arrayidx30, align 4
+ %add31 = fadd float %4, %5
+ store float %add31, float* %arrayidx26, align 4
+ %add37 = add nsw i32 %mul, 3
+ %idxprom38 = sext i32 %add37 to i64
+ %arrayidx39 = getelementptr inbounds [2000 x float]* @C, i32 0, i64 %idxprom38
+ %6 = load float* %arrayidx39, align 4
+ %arrayidx43 = getelementptr inbounds [2000 x float]* @D, i32 0, i64 %idxprom38
+ %7 = load float* %arrayidx43, align 4
+ %add44 = fadd float %6, %7
+ store float %add44, float* %arrayidx39, align 4
+ ret void
+}
+
+; Similar to the previous tests, but now we are dealing with AddRec SCEV.
+; CHECK-LABEL: foo_loop
+; CHECK: x double>
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo_loop(double* %A, i32 %n) #0 {
+entry:
+ %A.addr = alloca double*, align 8
+ %n.addr = alloca i32, align 4
+ %sum = alloca double, align 8
+ %i = alloca i32, align 4
+ store double* %A, double** %A.addr, align 8
+ store i32 %n, i32* %n.addr, align 4
+ store double 0.000000e+00, double* %sum, align 8
+ store i32 0, i32* %i, align 4
+ %cmp1 = icmp slt i32 0, %n
+ br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %0 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %1 = phi double [ 0.000000e+00, %for.body.lr.ph ], [ %add7, %for.body ]
+ %mul = mul nsw i32 %0, 2
+ %idxprom = sext i32 %mul to i64
+ %arrayidx = getelementptr inbounds double* %A, i64 %idxprom
+ %2 = load double* %arrayidx, align 8
+ %mul1 = fmul double 7.000000e+00, %2
+ %add = add nsw i32 %mul, 1
+ %idxprom3 = sext i32 %add to i64
+ %arrayidx4 = getelementptr inbounds double* %A, i64 %idxprom3
+ %3 = load double* %arrayidx4, align 8
+ %mul5 = fmul double 7.000000e+00, %3
+ %add6 = fadd double %mul1, %mul5
+ %add7 = fadd double %1, %add6
+ store double %add7, double* %sum, align 8
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* %i, align 4
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge: ; preds = %for.body
+ %split = phi double [ %add7, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+ %.lcssa = phi double [ %split, %for.cond.for.end_crit_edge ], [ 0.000000e+00, %entry ]
+ %conv = fptosi double %.lcssa to i32
+ ret i32 %conv
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5.0 "}
diff --git a/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll b/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll
new file mode 100644
index 0000000..ed22574
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; We will keep trying to vectorize the basic block even we already find vectorized store.
+; CHECK: test1
+; CHECK: store <2 x double>
+; CHECK: ret
+define void @test1(double* %a, double* %b, double* %c, double* %d) {
+entry:
+ %i0 = load double* %a, align 8
+ %i1 = load double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %arrayidx3 = getelementptr inbounds double* %a, i64 1
+ %i3 = load double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double* %b, i64 1
+ %i4 = load double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ store double %mul, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double* %c, i64 1
+ store double %mul5, double* %arrayidx5, align 8
+ %0 = bitcast double* %a to <4 x i32>*
+ %1 = load <4 x i32>* %0, align 8
+ %2 = bitcast double* %b to <4 x i32>*
+ %3 = load <4 x i32>* %2, align 8
+ %4 = mul <4 x i32> %1, %3
+ %5 = bitcast double* %d to <4 x i32>*
+ store <4 x i32> %4, <4 x i32>* %5, align 8
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/cse.ll b/test/Transforms/SLPVectorizer/X86/cse.ll
index bbfd6f2..d2ad7eb 100644
--- a/test/Transforms/SLPVectorizer/X86/cse.ll
+++ b/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -217,3 +217,33 @@ return: ; preds = %entry, %if.end
ret i32 0
}
+%class.B.53.55 = type { %class.A.52.54, double }
+%class.A.52.54 = type { double, double, double }
+
+@a = external global double, align 8
+
+define void @PR19646(%class.B.53.55* %this) {
+entry:
+ br i1 undef, label %if.end13, label %if.end13
+
+sw.epilog7: ; No predecessors!
+ %.in = getelementptr inbounds %class.B.53.55* %this, i64 0, i32 0, i32 1
+ %0 = load double* %.in, align 8
+ %add = fadd double undef, 0.000000e+00
+ %add6 = fadd double %add, %0
+ %1 = load double* @a, align 8
+ %add8 = fadd double %1, 0.000000e+00
+ %_dy = getelementptr inbounds %class.B.53.55* %this, i64 0, i32 0, i32 2
+ %2 = load double* %_dy, align 8
+ %add10 = fadd double %add8, %2
+ br i1 undef, label %if.then12, label %if.end13
+
+if.then12: ; preds = %sw.epilog7
+ %3 = load double* undef, align 8
+ br label %if.end13
+
+if.end13: ; preds = %if.then12, %sw.epilog7, %entry
+ %x.1 = phi double [ 0.000000e+00, %if.then12 ], [ %add6, %sw.epilog7 ], [ undef, %entry ], [ undef, %entry ]
+ %b.0 = phi double [ %3, %if.then12 ], [ %add10, %sw.epilog7 ], [ undef, %entry], [ undef, %entry ]
+ unreachable
+}
diff --git a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index 7537ea3..9eda29f 100644
--- a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -195,11 +195,35 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b,
ret <4 x float> %rb
}
+; Make sure that vectorization happens even if insertelements operations
+; must be rescheduled. The case here is from compiling Julia.
+define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @reschedule_extract(
+; CHECK: %1 = fadd <4 x float> %a, %b
+ %a0 = extractelement <4 x float> %a, i32 0
+ %b0 = extractelement <4 x float> %b, i32 0
+ %c0 = fadd float %a0, %b0
+ %v0 = insertelement <4 x float> undef, float %c0, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %b1 = extractelement <4 x float> %b, i32 1
+ %c1 = fadd float %a1, %b1
+ %v1 = insertelement <4 x float> %v0, float %c1, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %b2 = extractelement <4 x float> %b, i32 2
+ %c2 = fadd float %a2, %b2
+ %v2 = insertelement <4 x float> %v1, float %c2, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b3 = extractelement <4 x float> %b, i32 3
+ %c3 = fadd float %a3, %b3
+ %v3 = insertelement <4 x float> %v2, float %c3, i32 3
+ ret <4 x float> %v3
+}
+
; Check that cost model for vectorization takes credit for
; instructions that are erased.
define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
; ZEROTHRESH-LABEL: @take_credit(
-; ZEROTHRESH-CHECK: %1 = fadd <4 x float> %a, %b
+; ZEROTHRESH: %1 = fadd <4 x float> %a, %b
%a0 = extractelement <4 x float> %a, i32 0
%b0 = extractelement <4 x float> %b, i32 0
%c0 = fadd float %a0, %b0
@@ -219,4 +243,40 @@ define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %v3
}
+; Make sure we handle multiple trees that feed one build vector correctly.
+define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
+entry:
+ %t0 = fadd double %w , 0.000000e+00
+ %t1 = fadd double %x , 1.000000e+00
+ %t2 = fadd double %y , 2.000000e+00
+ %t3 = fadd double %z , 3.000000e+00
+ %t4 = fmul double %t0, 1.000000e+00
+ %i1 = insertelement <4 x double> undef, double %t4, i32 3
+ %t5 = fmul double %t1, 1.000000e+00
+ %i2 = insertelement <4 x double> %i1, double %t5, i32 2
+ %t6 = fmul double %t2, 1.000000e+00
+ %i3 = insertelement <4 x double> %i2, double %t6, i32 1
+ %t7 = fmul double %t3, 1.000000e+00
+ %i4 = insertelement <4 x double> %i3, double %t7, i32 0
+ ret <4 x double> %i4
+}
+; CHECK-LABEL: @multi_tree
+; CHECK-DAG: %[[V0:.+]] = insertelement <2 x double> undef, double %w, i32 0
+; CHECK-DAG: %[[V1:.+]] = insertelement <2 x double> %[[V0]], double %x, i32 1
+; CHECK-DAG: %[[V2:.+]] = fadd <2 x double> %[[V1]], <double 0.000000e+00, double 1.000000e+00>
+; CHECK-DAG: %[[V3:.+]] = insertelement <2 x double> undef, double %y, i32 0
+; CHECK-DAG: %[[V4:.+]] = insertelement <2 x double> %[[V3]], double %z, i32 1
+; CHECK-DAG: %[[V5:.+]] = fadd <2 x double> %[[V4]], <double 2.000000e+00, double 3.000000e+00>
+; CHECK-DAG: %[[V6:.+]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, %[[V2]]
+; CHECK-DAG: %[[V7:.+]] = extractelement <2 x double> %[[V6]], i32 0
+; CHECK-DAG: %[[I1:.+]] = insertelement <4 x double> undef, double %[[V7]], i32 3
+; CHECK-DAG: %[[V8:.+]] = extractelement <2 x double> %[[V6]], i32 1
+; CHECK-DAG: %[[I2:.+]] = insertelement <4 x double> %[[I1]], double %[[V8]], i32 2
+; CHECK-DAG: %[[V9:.+]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, %[[V5]]
+; CHECK-DAG: %[[V10:.+]] = extractelement <2 x double> %[[V9]], i32 0
+; CHECK-DAG: %[[I3:.+]] = insertelement <4 x double> %i2, double %[[V10]], i32 1
+; CHECK-DAG: %[[V11:.+]] = extractelement <2 x double> %[[V9]], i32 1
+; CHECK-DAG: %[[I4:.+]] = insertelement <4 x double> %i3, double %[[V11]], i32 0
+; CHECK: ret <4 x double> %[[I4]]
+
attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/SLPVectorizer/X86/intrinsic.ll b/test/Transforms/SLPVectorizer/X86/intrinsic.ll
index 2b7ee75..30c5093 100644
--- a/test/Transforms/SLPVectorizer/X86/intrinsic.ll
+++ b/test/Transforms/SLPVectorizer/X86/intrinsic.ll
@@ -71,5 +71,49 @@ entry:
ret void
}
+declare i32 @llvm.bswap.i32(i32) nounwind readnone
+define void @vec_bswap_i32(i32* %a, i32* %b, i32* %c) {
+entry:
+ %i0 = load i32* %a, align 4
+ %i1 = load i32* %b, align 4
+ %add1 = add i32 %i0, %i1
+ %call1 = tail call i32 @llvm.bswap.i32(i32 %add1) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds i32* %a, i32 1
+ %i2 = load i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32* %b, i32 1
+ %i3 = load i32* %arrayidx3, align 4
+ %add2 = add i32 %i2, %i3
+ %call2 = tail call i32 @llvm.bswap.i32(i32 %add2) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds i32* %a, i32 2
+ %i4 = load i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i32* %b, i32 2
+ %i5 = load i32* %arrayidx5, align 4
+ %add3 = add i32 %i4, %i5
+ %call3 = tail call i32 @llvm.bswap.i32(i32 %add3) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds i32* %a, i32 3
+ %i6 = load i32* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds i32* %b, i32 3
+ %i7 = load i32* %arrayidx7, align 4
+ %add4 = add i32 %i6, %i7
+ %call4 = tail call i32 @llvm.bswap.i32(i32 %add4) nounwind readnone
+ store i32 %call1, i32* %c, align 4
+ %arrayidx8 = getelementptr inbounds i32* %c, i32 1
+ store i32 %call2, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32* %c, i32 2
+ store i32 %call3, i32* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds i32* %c, i32 3
+ store i32 %call4, i32* %arrayidx10, align 4
+ ret void
+
+; CHECK-LABEL: @vec_bswap_i32(
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: call <4 x i32> @llvm.bswap.v4i32
+; CHECK: store <4 x i32>
+; CHECK: ret
+}
diff --git a/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll b/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll
new file mode 100644
index 0000000..b250735
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -slp-vectorizer -o - -S -slp-threshold=-1000
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx--nvidiacl"
+
+; CTLZ cannot be vectorized currently because the second argument is a scalar
+; for both the scalar and vector forms of the intrinsic. In the future it
+; should be possible to vectorize such functions.
+; Test causes an assert if LLVM tries to vectorize CTLZ.
+
+define <2 x i8> @cltz_test(<2 x i8> %x) #0 {
+entry:
+ %0 = extractelement <2 x i8> %x, i32 0
+ %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
+ %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0
+ %1 = extractelement <2 x i8> %x, i32 1
+ %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
+ %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1
+ ret <2 x i8> %vecinit2
+}
+
+define <2 x i8> @cltz_test2(<2 x i8> %x) #1 {
+entry:
+ %0 = extractelement <2 x i8> %x, i32 0
+ %1 = extractelement <2 x i8> %x, i32 1
+ %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
+ %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
+ %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0
+ %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1
+ ret <2 x i8> %vecinit2
+}
+
+declare i8 @llvm.ctlz.i8(i8, i1) #3
+
+attributes #0 = { alwaysinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
diff --git a/test/Transforms/SLPVectorizer/X86/value-bug.ll b/test/Transforms/SLPVectorizer/X86/value-bug.ll
new file mode 100644
index 0000000..64d2ae1
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/value-bug.ll
@@ -0,0 +1,80 @@
+; RUN: opt -slp-vectorizer < %s -S -mtriple="x86_64-grtev3-linux-gnu" -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-grtev3-linux-gnu"
+
+; We used to crash on this example because we were building a constant
+; expression during vectorization and the vectorizer expects instructions
+; as elements of the vectorized tree.
+; CHECK-LABEL: @test
+; PR19621
+
+define void @test() {
+bb279:
+ br label %bb283
+
+bb283:
+ %Av.sroa.8.0 = phi float [ undef, %bb279 ], [ %tmp315, %exit ]
+ %Av.sroa.5.0 = phi float [ undef, %bb279 ], [ %tmp319, %exit ]
+ %Av.sroa.3.0 = phi float [ undef, %bb279 ], [ %tmp307, %exit ]
+ %Av.sroa.0.0 = phi float [ undef, %bb279 ], [ %tmp317, %exit ]
+ br label %bb284
+
+bb284:
+ %tmp7.i = fpext float %Av.sroa.3.0 to double
+ %tmp8.i = fsub double %tmp7.i, undef
+ %tmp9.i = fsub double %tmp8.i, undef
+ %tmp17.i = fpext float %Av.sroa.8.0 to double
+ %tmp19.i = fsub double %tmp17.i, undef
+ %tmp20.i = fsub double %tmp19.i, undef
+ br label %bb21.i
+
+bb21.i:
+ br i1 undef, label %bb22.i, label %exit
+
+bb22.i:
+ %tmp24.i = fadd double undef, %tmp9.i
+ %tmp26.i = fadd double undef, %tmp20.i
+ br label %bb32.i
+
+bb32.i:
+ %xs.0.i = phi double [ %tmp24.i, %bb22.i ], [ 0.000000e+00, %bb32.i ]
+ %ys.0.i = phi double [ %tmp26.i, %bb22.i ], [ 0.000000e+00, %bb32.i ]
+ br i1 undef, label %bb32.i, label %bb21.i
+
+exit:
+ %tmp303 = fpext float %Av.sroa.0.0 to double
+ %tmp304 = fmul double %tmp303, undef
+ %tmp305 = fadd double undef, %tmp304
+ %tmp306 = fadd double %tmp305, undef
+ %tmp307 = fptrunc double %tmp306 to float
+ %tmp311 = fpext float %Av.sroa.5.0 to double
+ %tmp312 = fmul double %tmp311, 0.000000e+00
+ %tmp313 = fadd double undef, %tmp312
+ %tmp314 = fadd double %tmp313, undef
+ %tmp315 = fptrunc double %tmp314 to float
+ %tmp317 = fptrunc double undef to float
+ %tmp319 = fptrunc double undef to float
+ br label %bb283
+}
+
+; Make sure that we probably handle constant folded vectorized trees. The
+; vectorizer starts at the type (%t2, %t3) and wil constant fold the tree.
+; The code that handles insertelement instructions must handle this.
+define <4 x double> @constant_folding() {
+entry:
+ %t0 = fadd double 1.000000e+00 , 0.000000e+00
+ %t1 = fadd double 1.000000e+00 , 1.000000e+00
+ %t2 = fmul double %t0, 1.000000e+00
+ %i1 = insertelement <4 x double> undef, double %t2, i32 1
+ %t3 = fmul double %t1, 1.000000e+00
+ %i2 = insertelement <4 x double> %i1, double %t3, i32 0
+ ret <4 x double> %i2
+}
+
+; CHECK-LABEL: @constant_folding
+; CHECK: %[[V0:.+]] = extractelement <2 x double> <double 1.000000e+00, double 2.000000e+00>, i32 0
+; CHECK: %[[V1:.+]] = insertelement <4 x double> undef, double %[[V0]], i32 1
+; CHECK: %[[V2:.+]] = extractelement <2 x double> <double 1.000000e+00, double 2.000000e+00>, i32 1
+; CHECK: %[[V3:.+]] = insertelement <4 x double> %[[V1]], double %[[V2]], i32 0
+; CHECK: ret <4 x double> %[[V3]]