From 89008539a322b9ce1d66837b3342d0be312bcbba Mon Sep 17 00:00:00 2001
From: Yi Jiang <yjiang@apple.com>
Date: Tue, 3 Sep 2013 17:26:04 +0000
Subject: In this patch we are trying to do two things: 1) If the width of
 vectorization list candidate is bigger than vector reg width, we will break
 it down to fit the vector reg. 2) We do not vectorize the width which is not
 power of two.

The performance result shows it will help some spec benchmarks. mesa improved 6.97% and ammp improved 1.54%.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189830 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/SLPVectorizer/X86/phi.ll           | 95 ++++++++++++++++++++--
 .../SLPVectorizer/X86/phi_overalignedtype.ll       | 45 ++++++++++
 test/Transforms/SLPVectorizer/X86/rgb_phi.ll       | 14 ++--
 3 files changed, 141 insertions(+), 13 deletions(-)
 create mode 100644 test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll

(limited to 'test/Transforms')

diff --git a/test/Transforms/SLPVectorizer/X86/phi.ll b/test/Transforms/SLPVectorizer/X86/phi.ll
index f77e945..9cc4891 100644
--- a/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -95,11 +95,92 @@ for.end:                                          ; preds = %for.body
   ret i32 0
 }
 
+; float foo3(float *A) {
+;
+;   float R = A[0];
+;   float G = A[1];
+;   float B = A[2];
+;   float Y = A[3];
+;   float P = A[4];
+;   for (int i=0; i < 121; i+=3) {
+;     R+=A[i+0]*7;
+;     G+=A[i+1]*8;
+;     B+=A[i+2]*9;
+;     Y+=A[i+3]*10;
+;     P+=A[i+4]*11;
+;   }
+;
+;   return R+G+B+Y+P;
+; }
+
+;CHECK: foo3
+;CHECK: phi <4 x float>
+;CHECK: fmul <4 x float>
+;CHECK: fadd <4 x float>
+;CHECK-NOT: phi <5 x float>
+;CHECK-NOT: fmul <5 x float>
+;CHECK-NOT: fadd <5 x float>
+
+define float @foo3(float* nocapture readonly %A) #0 {
+entry:
+  %0 = load float* %A, align 4
+  %arrayidx1 = getelementptr inbounds float* %A, i64 1
+  %1 = load float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float* %A, i64 2
+  %2 = load float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float* %A, i64 3
+  %3 = load float* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds float* %A, i64 4
+  %4 = load float* %arrayidx4, align 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %5 = phi float [ %1, %entry ], [ %11, %for.body ]
+  %6 = phi float [ %0, %entry ], [ %9, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %P.056 = phi float [ %4, %entry ], [ %add26, %for.body ]
+  %Y.055 = phi float [ %3, %entry ], [ %add21, %for.body ]
+  %B.054 = phi float [ %2, %entry ], [ %add16, %for.body ]
+  %G.053 = phi float [ %1, %entry ], [ %add11, %for.body ]
+  %R.052 = phi float [ %0, %entry ], [ %add6, %for.body ]
+  %mul = fmul float %6, 7.000000e+00
+  %add6 = fadd float %R.052, %mul
+  %mul10 = fmul float %5, 8.000000e+00
+  %add11 = fadd float %G.053, %mul10
+  %7 = add nsw i64 %indvars.iv, 2
+  %arrayidx14 = getelementptr inbounds float* %A, i64 %7
+  %8 = load float* %arrayidx14, align 4
+  %mul15 = fmul float %8, 9.000000e+00
+  %add16 = fadd float %B.054, %mul15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3
+  %arrayidx19 = getelementptr inbounds float* %A, i64 %indvars.iv.next
+  %9 = load float* %arrayidx19, align 4
+  %mul20 = fmul float %9, 1.000000e+01
+  %add21 = fadd float %Y.055, %mul20
+  %10 = add nsw i64 %indvars.iv, 4
+  %arrayidx24 = getelementptr inbounds float* %A, i64 %10
+  %11 = load float* %arrayidx24, align 4
+  %mul25 = fmul float %11, 1.100000e+01
+  %add26 = fadd float %P.056, %mul25
+  %12 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %12, 121
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add28 = fadd float %add6, %add11
+  %add29 = fadd float %add28, %add16
+  %add30 = fadd float %add29, %add21
+  %add31 = fadd float %add30, %add26
+  ret float %add31
+}
+
 define void @test(x86_fp80* %i1, x86_fp80* %i2, x86_fp80* %o) {
 ; CHECK-LABEL: @test(
 ;
 ; Test that we correctly recognize the discontiguous memory in arrays where the
 ; size is less than the alignment, and through various different GEP formations.
+;
+; We disable the vectorization of x86_fp80 for now. 
 
 entry:
   %i1.0 = load x86_fp80* %i1, align 16
@@ -107,8 +188,8 @@ entry:
   %i1.1 = load x86_fp80* %i1.gep1, align 16
 ; CHECK: load x86_fp80*
 ; CHECK: load x86_fp80*
-; CHECK: insertelement <2 x x86_fp80>
-; CHECK: insertelement <2 x x86_fp80>
+; CHECK-NOT: insertelement <2 x x86_fp80>
+; CHECK_NOT: insertelement <2 x x86_fp80>
   br i1 undef, label %then, label %end
 
 then:
@@ -118,16 +199,16 @@ then:
   %i2.1 = load x86_fp80* %i2.gep1, align 16
 ; CHECK: load x86_fp80*
 ; CHECK: load x86_fp80*
-; CHECK: insertelement <2 x x86_fp80>
-; CHECK: insertelement <2 x x86_fp80>
+; CHECK-NOT: insertelement <2 x x86_fp80>
+; CHECK-NOT: insertelement <2 x x86_fp80>
   br label %end
 
 end:
   %phi0 = phi x86_fp80 [ %i1.0, %entry ], [ %i2.0, %then ]
   %phi1 = phi x86_fp80 [ %i1.1, %entry ], [ %i2.1, %then ]
-; CHECK: phi <2 x x86_fp80>
-; CHECK: extractelement <2 x x86_fp80>
-; CHECK: extractelement <2 x x86_fp80>
+; CHECK-NOT: phi <2 x x86_fp80>
+; CHECK-NOT: extractelement <2 x x86_fp80>
+; CHECK-NOT: extractelement <2 x x86_fp80>
   store x86_fp80 %phi0, x86_fp80* %o, align 16
   %o.gep1 = getelementptr inbounds x86_fp80* %o, i64 1
   store x86_fp80 %phi1, x86_fp80* %o.gep1, align 16
diff --git a/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll b/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
new file mode 100644
index 0000000..520e672
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+; We purposely over-align f64 to 128bit here. 
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:128:128-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+
+define void @test(double* %i1, double* %i2, double* %o) {
+; CHECK-LABEL: @test(
+;
+; Test that we correctly recognize the discontiguous memory in arrays where the
+; size is less than the alignment, and through various different GEP formations.
+
+entry:
+  %i1.0 = load double* %i1, align 16
+  %i1.gep1 = getelementptr double* %i1, i64 1
+  %i1.1 = load double* %i1.gep1, align 16
+; CHECK: load double*
+; CHECK: load double*
+; CHECK: insertelement <2 x double>
+; CHECK: insertelement <2 x double>
+  br i1 undef, label %then, label %end
+
+then:
+  %i2.gep0 = getelementptr inbounds double* %i2, i64 0
+  %i2.0 = load double* %i2.gep0, align 16
+  %i2.gep1 = getelementptr inbounds double* %i2, i64 1
+  %i2.1 = load double* %i2.gep1, align 16
+; CHECK: load double*
+; CHECK: load double*
+; CHECK: insertelement <2 x double>
+; CHECK: insertelement <2 x double>
+  br label %end
+
+end:
+  %phi0 = phi double [ %i1.0, %entry ], [ %i2.0, %then ]
+  %phi1 = phi double [ %i1.1, %entry ], [ %i2.1, %then ]
+; CHECK: phi <2 x double>
+; CHECK: extractelement <2 x double>
+; CHECK: extractelement <2 x double>
+  store double %phi0, double* %o, align 16
+  %o.gep1 = getelementptr inbounds double* %o, i64 1
+  store double %phi1, double* %o.gep1, align 16
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
index 3235fd9..6aea5d3 100644
--- a/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
+++ b/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
@@ -3,6 +3,8 @@
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
 target triple = "i386-apple-macosx10.9.0"
 
+; We disable the vectorization of <3 x float> for now
+
 ; float foo(float *A) {
 ;
 ;   float R = A[0];
@@ -19,14 +21,14 @@ target triple = "i386-apple-macosx10.9.0"
 
 ;CHECK-LABEL: @foo(
 ;CHECK: br
-;CHECK: phi <3 x float>
-;CHECK: fmul <3 x float>
-;CHECK: fadd <3 x float>
+;CHECK-NOT: phi <3 x float>
+;CHECK-NOT: fmul <3 x float>
+;CHECK-NOT: fadd <3 x float>
 ; At the moment we don't sink extractelements.
 ;CHECK: br
-;CHECK: extractelement
-;CHECK: extractelement
-;CHECK: extractelement
+;CHECK-NOT: extractelement
+;CHECK-NOT: extractelement
+;CHECK-NOT: extractelement
 ;CHECK: ret
 
 define float @foo(float* nocapture readonly %A) {
-- 
cgit v1.1