diff options
Diffstat (limited to 'test/Analysis/CostModel')
-rw-r--r-- | test/Analysis/CostModel/ARM/cast.ll | 34 | ||||
-rw-r--r-- | test/Analysis/CostModel/ARM64/lit.local.cfg | 3 | ||||
-rw-r--r-- | test/Analysis/CostModel/ARM64/select.ll | 38 | ||||
-rw-r--r-- | test/Analysis/CostModel/ARM64/store.ll | 22 | ||||
-rw-r--r-- | test/Analysis/CostModel/PowerPC/ext.ll | 21 | ||||
-rw-r--r-- | test/Analysis/CostModel/PowerPC/load_store.ll | 5 | ||||
-rw-r--r-- | test/Analysis/CostModel/X86/cast.ll | 97 | ||||
-rw-r--r-- | test/Analysis/CostModel/X86/cmp.ll | 4 | ||||
-rw-r--r-- | test/Analysis/CostModel/X86/scalarize.ll | 41 | ||||
-rw-r--r-- | test/Analysis/CostModel/X86/vshift-cost.ll | 167 |
10 files changed, 391 insertions, 41 deletions
diff --git a/test/Analysis/CostModel/ARM/cast.ll b/test/Analysis/CostModel/ARM/cast.ll index 0cdd61c..662110f 100644 --- a/test/Analysis/CostModel/ARM/cast.ll +++ b/test/Analysis/CostModel/ARM/cast.ll @@ -221,9 +221,9 @@ define i32 @casts() { %r96 = fptoui <2 x float> undef to <2 x i32> ; CHECK: cost of 1 {{.*}} fptosi %r97 = fptosi <2 x float> undef to <2 x i32> - ; CHECK: cost of 24 {{.*}} fptoui + ; CHECK: cost of 28 {{.*}} fptoui %r98 = fptoui <2 x float> undef to <2 x i64> - ; CHECK: cost of 24 {{.*}} fptosi + ; CHECK: cost of 28 {{.*}} fptosi %r99 = fptosi <2 x float> undef to <2 x i64> ; CHECK: cost of 8 {{.*}} fptoui @@ -242,9 +242,9 @@ define i32 @casts() { %r106 = fptoui <2 x double> undef to <2 x i32> ; CHECK: cost of 2 {{.*}} fptosi %r107 = fptosi <2 x double> undef to <2 x i32> - ; CHECK: cost of 24 {{.*}} fptoui + ; CHECK: cost of 28 {{.*}} fptoui %r108 = fptoui <2 x double> undef to <2 x i64> - ; CHECK: cost of 24 {{.*}} fptosi + ; CHECK: cost of 28 {{.*}} fptosi %r109 = fptosi <2 x double> undef to <2 x i64> ; CHECK: cost of 16 {{.*}} fptoui @@ -263,9 +263,9 @@ define i32 @casts() { %r116 = fptoui <4 x float> undef to <4 x i32> ; CHECK: cost of 1 {{.*}} fptosi %r117 = fptosi <4 x float> undef to <4 x i32> - ; CHECK: cost of 48 {{.*}} fptoui + ; CHECK: cost of 56 {{.*}} fptoui %r118 = fptoui <4 x float> undef to <4 x i64> - ; CHECK: cost of 48 {{.*}} fptosi + ; CHECK: cost of 56 {{.*}} fptosi %r119 = fptosi <4 x float> undef to <4 x i64> ; CHECK: cost of 16 {{.*}} fptoui @@ -284,9 +284,9 @@ define i32 @casts() { %r126 = fptoui <4 x double> undef to <4 x i32> ; CHECK: cost of 16 {{.*}} fptosi %r127 = fptosi <4 x double> undef to <4 x i32> - ; CHECK: cost of 48 {{.*}} fptoui + ; CHECK: cost of 56 {{.*}} fptoui %r128 = fptoui <4 x double> undef to <4 x i64> - ; CHECK: cost of 48 {{.*}} fptosi + ; CHECK: cost of 56 {{.*}} fptosi %r129 = fptosi <4 x double> undef to <4 x i64> ; CHECK: cost of 32 {{.*}} fptoui @@ -305,9 +305,9 @@ define i32 @casts() { 
%r136 = fptoui <8 x float> undef to <8 x i32> ; CHECK: cost of 2 {{.*}} fptosi %r137 = fptosi <8 x float> undef to <8 x i32> - ; CHECK: cost of 96 {{.*}} fptoui + ; CHECK: cost of 112 {{.*}} fptoui %r138 = fptoui <8 x float> undef to <8 x i64> - ; CHECK: cost of 96 {{.*}} fptosi + ; CHECK: cost of 112 {{.*}} fptosi %r139 = fptosi <8 x float> undef to <8 x i64> ; CHECK: cost of 32 {{.*}} fptoui @@ -326,9 +326,9 @@ define i32 @casts() { %r146 = fptoui <8 x double> undef to <8 x i32> ; CHECK: cost of 32 {{.*}} fptosi %r147 = fptosi <8 x double> undef to <8 x i32> - ; CHECK: cost of 96 {{.*}} fptoui + ; CHECK: cost of 112 {{.*}} fptoui %r148 = fptoui <8 x double> undef to <8 x i64> - ; CHECK: cost of 96 {{.*}} fptosi + ; CHECK: cost of 112 {{.*}} fptosi %r149 = fptosi <8 x double> undef to <8 x i64> ; CHECK: cost of 64 {{.*}} fptoui @@ -347,9 +347,9 @@ define i32 @casts() { %r156 = fptoui <16 x float> undef to <16 x i32> ; CHECK: cost of 4 {{.*}} fptosi %r157 = fptosi <16 x float> undef to <16 x i32> - ; CHECK: cost of 192 {{.*}} fptoui + ; CHECK: cost of 224 {{.*}} fptoui %r158 = fptoui <16 x float> undef to <16 x i64> - ; CHECK: cost of 192 {{.*}} fptosi + ; CHECK: cost of 224 {{.*}} fptosi %r159 = fptosi <16 x float> undef to <16 x i64> ; CHECK: cost of 64 {{.*}} fptoui @@ -368,9 +368,9 @@ define i32 @casts() { %r166 = fptoui <16 x double> undef to <16 x i32> ; CHECK: cost of 64 {{.*}} fptosi %r167 = fptosi <16 x double> undef to <16 x i32> - ; CHECK: cost of 192 {{.*}} fptoui + ; CHECK: cost of 224 {{.*}} fptoui %r168 = fptoui <16 x double> undef to <16 x i64> - ; CHECK: cost of 192 {{.*}} fptosi + ; CHECK: cost of 224 {{.*}} fptosi %r169 = fptosi <16 x double> undef to <16 x i64> ; CHECK: cost of 8 {{.*}} uitofp @@ -528,7 +528,7 @@ define i32 @casts() { %r242 = uitofp <16 x i8> undef to <16 x double> ; CHECK: cost of 64 {{.*}} sitofp %r243 = sitofp <16 x i8> undef to <16 x double> - ; C4ECK: cost of 64 {{.*}} uitofp + ; CHECK: cost of 64 {{.*}} uitofp %r244 = 
uitofp <16 x i16> undef to <16 x double> ; CHECK: cost of 64 {{.*}} sitofp %r245 = sitofp <16 x i16> undef to <16 x double> diff --git a/test/Analysis/CostModel/ARM64/lit.local.cfg b/test/Analysis/CostModel/ARM64/lit.local.cfg new file mode 100644 index 0000000..84ac981 --- /dev/null +++ b/test/Analysis/CostModel/ARM64/lit.local.cfg @@ -0,0 +1,3 @@ +targets = set(config.root.targets_to_build.split()) +if not 'ARM64' in targets: + config.unsupported = True diff --git a/test/Analysis/CostModel/ARM64/select.ll b/test/Analysis/CostModel/ARM64/select.ll new file mode 100644 index 0000000..216dc5d --- /dev/null +++ b/test/Analysis/CostModel/ARM64/select.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" + +; CHECK-LABEL: select +define void @select() { + ; Scalar values + ; CHECK: cost of 1 {{.*}} select + %v1 = select i1 undef, i8 undef, i8 undef + ; CHECK: cost of 1 {{.*}} select + %v2 = select i1 undef, i16 undef, i16 undef + ; CHECK: cost of 1 {{.*}} select + %v3 = select i1 undef, i32 undef, i32 undef + ; CHECK: cost of 1 {{.*}} select + %v4 = select i1 undef, i64 undef, i64 undef + ; CHECK: cost of 1 {{.*}} select + %v5 = select i1 undef, float undef, float undef + ; CHECK: cost of 1 {{.*}} select + %v6 = select i1 undef, double undef, double undef + + ; Vector values - check for vectors that have a high cost because they end up + ; scalarized. 
+ ; CHECK: cost of 320 {{.*}} select + %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef + + ; CHECK: cost of 160 {{.*}} select + %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef + ; CHECK: cost of 320 {{.*}} select + %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef + + ; CHECK: cost of 80 {{.*}} select + %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef + ; CHECK: cost of 160 {{.*}} select + %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef + ; CHECK: cost of 320 {{.*}} select + %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef + + ret void +} diff --git a/test/Analysis/CostModel/ARM64/store.ll b/test/Analysis/CostModel/ARM64/store.ll new file mode 100644 index 0000000..0c9883c --- /dev/null +++ b/test/Analysis/CostModel/ARM64/store.ll @@ -0,0 +1,22 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +; CHECK-LABEL: store +define void @store() { + ; Stores of <2 x i64> should be expensive because we don't split them and + ; unaligned 16b stores have bad performance. + ; CHECK: cost of 12 {{.*}} store + store <2 x i64> undef, <2 x i64> * undef + + ; We scalarize the loads/stores because there is no vector register name for + ; these types (they get extended to v.4h/v.2s). 
+ ; CHECK: cost of 16 {{.*}} store + store <2 x i8> undef, <2 x i8> * undef + ; CHECK: cost of 64 {{.*}} store + store <4 x i8> undef, <4 x i8> * undef + ; CHECK: cost of 16 {{.*}} load + load <2 x i8> * undef + ; CHECK: cost of 64 {{.*}} load + load <4 x i8> * undef + + ret void +} diff --git a/test/Analysis/CostModel/PowerPC/ext.ll b/test/Analysis/CostModel/PowerPC/ext.ll new file mode 100644 index 0000000..daaa8f5 --- /dev/null +++ b/test/Analysis/CostModel/PowerPC/ext.ll @@ -0,0 +1,21 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define void @exts() { + + ; CHECK: cost of 1 {{.*}} sext + %v1 = sext i16 undef to i32 + + ; CHECK: cost of 1 {{.*}} sext + %v2 = sext <2 x i16> undef to <2 x i32> + + ; CHECK: cost of 1 {{.*}} sext + %v3 = sext <4 x i16> undef to <4 x i32> + + ; CHECK: cost of 216 {{.*}} sext + %v4 = sext <8 x i16> undef to <8 x i32> + + ret void +} + diff --git a/test/Analysis/CostModel/PowerPC/load_store.ll b/test/Analysis/CostModel/PowerPC/load_store.ll index c77cce9..8145a1d 100644 --- a/test/Analysis/CostModel/PowerPC/load_store.ll +++ b/test/Analysis/CostModel/PowerPC/load_store.ll @@ -29,6 +29,11 @@ define i32 @loads(i32 %arg) { ; CHECK: cost of 4 {{.*}} load load i128* undef, align 4 + ; FIXME: There actually are sub-vector Altivec loads, and so we could handle + ; this with a small expense, but we don't currently. 
+ ; CHECK: cost of 60 {{.*}} load + load <4 x i16>* undef, align 2 + ret i32 undef } diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll index f3c1283..7f97b17 100644 --- a/test/Analysis/CostModel/X86/cast.ll +++ b/test/Analysis/CostModel/X86/cast.ll @@ -1,10 +1,11 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" define i32 @add(i32 %arg) { - +; CHECK-LABEL: for function 'add' ; -- Same size registeres -- ;CHECK: cost of 1 {{.*}} zext %A = zext <4 x i1> undef to <4 x i32> @@ -33,57 +34,106 @@ define i32 @add(i32 %arg) { } define i32 @zext_sext(<8 x i1> %in) { - ;CHECK: cost of 6 {{.*}} zext +; CHECK-AVX2-LABEL: for function 'zext_sext' +; CHECK-AVX-LABEL: for function 'zext_sext' + ;CHECK-AVX2: cost of 3 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext %Z = zext <8 x i1> %in to <8 x i32> - ;CHECK: cost of 9 {{.*}} sext + ;CHECK-AVX2: cost of 3 {{.*}} sext + ;CHECK-AVX: cost of 7 {{.*}} sext %S = sext <8 x i1> %in to <8 x i32> - ;CHECK: cost of 1 {{.*}} zext + ;CHECK-AVX2: cost of 1 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext %A1 = zext <16 x i8> undef to <16 x i16> - ;CHECK: cost of 1 {{.*}} sext + ;CHECK-AVX2: cost of 1 {{.*}} sext + ;CHECK-AVX: cost of 4 {{.*}} sext %A2 = sext <16 x i8> undef to <16 x i16> - ;CHECK: cost of 1 {{.*}} sext + ;CHECK-AVX2: cost of 1 {{.*}} sext + ;CHECK-AVX: cost of 4 {{.*}} sext %A = sext <8 x i16> undef to <8 x i32> - ;CHECK: cost 
of 1 {{.*}} zext + ;CHECK-AVX2: cost of 1 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext %B = zext <8 x i16> undef to <8 x i32> - ;CHECK: cost of 1 {{.*}} sext + ;CHECK-AVX2: cost of 1 {{.*}} sext + ;CHECK-AVX: cost of 4 {{.*}} sext %C = sext <4 x i32> undef to <4 x i64> - ;CHECK: cost of 6 {{.*}} sext - %C1 = sext <4 x i8> undef to <4 x i64> - ;CHECK: cost of 6 {{.*}} sext - %C2 = sext <4 x i16> undef to <4 x i64> - ;CHECK: cost of 1 {{.*}} zext + ;CHECK-AVX2: cost of 3 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext + %C.v8i8.z = zext <8 x i8> undef to <8 x i32> + ;CHECK-AVX2: cost of 3 {{.*}} sext + ;CHECK-AVX: cost of 7 {{.*}} sext + %C.v8i8.s = sext <8 x i8> undef to <8 x i32> + ;CHECK-AVX2: cost of 3 {{.*}} zext + ;CHECK-AVX: cost of 3 {{.*}} zext + %C.v4i16.z = zext <4 x i16> undef to <4 x i64> + ;CHECK-AVX2: cost of 3 {{.*}} sext + ;CHECK-AVX: cost of 6 {{.*}} sext + %C.v4i16.s = sext <4 x i16> undef to <4 x i64> + + ;CHECK-AVX2: cost of 3 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext + %C.v4i8.z = zext <4 x i8> undef to <4 x i64> + ;CHECK-AVX2: cost of 3 {{.*}} sext + ;CHECK-AVX: cost of 6 {{.*}} sext + %C.v4i8.s = sext <4 x i8> undef to <4 x i64> + + ;CHECK-AVX2: cost of 1 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext %D = zext <4 x i32> undef to <4 x i64> - ;CHECK: cost of 1 {{.*}} trunc + ;CHECK-AVX2: cost of 2 {{.*}} trunc + ;CHECK-AVX: cost of 4 {{.*}} trunc %E = trunc <4 x i64> undef to <4 x i32> - ;CHECK: cost of 1 {{.*}} trunc + ;CHECK-AVX2: cost of 2 {{.*}} trunc + ;CHECK-AVX: cost of 5 {{.*}} trunc %F = trunc <8 x i32> undef to <8 x i16> - ;CHECK: cost of 2 {{.*}} trunc + ;CHECK-AVX2: cost of 4 {{.*}} trunc + ;CHECK-AVX: cost of 4 {{.*}} trunc %F1 = trunc <16 x i16> undef to <16 x i8> - - ;CHECK: cost of 3 {{.*}} trunc + ;CHECK-AVX2: cost of 2 {{.*}} trunc + ;CHECK-AVX: cost of 4 {{.*}} trunc + %F2 = trunc <8 x i32> undef to <8 x i8> + ;CHECK-AVX2: cost of 2 {{.*}} trunc + ;CHECK-AVX: cost of 4 {{.*}} trunc + %F3 = trunc <4 x i64> undef to 
<4 x i8> + + ;CHECK-AVX2: cost of 4 {{.*}} trunc + ;CHECK-AVX: cost of 9 {{.*}} trunc %G = trunc <8 x i64> undef to <8 x i32> ret i32 undef } define i32 @masks8(<8 x i1> %in) { - ;CHECK: cost of 6 {{.*}} zext +; CHECK-AVX2-LABEL: for function 'masks8' +; CHECK-AVX-LABEL: for function 'masks8' + + ;CHECK-AVX2: cost of 3 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext %Z = zext <8 x i1> %in to <8 x i32> - ;CHECK: cost of 9 {{.*}} sext + ;CHECK-AVX2: cost of 3 {{.*}} sext + ;CHECK-AVX: cost of 7 {{.*}} sext %S = sext <8 x i1> %in to <8 x i32> ret i32 undef } define i32 @masks4(<4 x i1> %in) { - ;CHECK: cost of 8 {{.*}} sext +; CHECK-AVX2-LABEL: for function 'masks4' +; CHECK-AVX-LABEL: for function 'masks4' + + ;CHECK-AVX2: cost of 3 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext + %Z = zext <4 x i1> %in to <4 x i64> + ;CHECK-AVX2: cost of 3 {{.*}} sext + ;CHECK-AVX: cost of 6 {{.*}} sext %S = sext <4 x i1> %in to <4 x i64> ret i32 undef } define void @sitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) { +; CHECK-LABEL: for function 'sitofp4' ; CHECK: cost of 3 {{.*}} sitofp %A1 = sitofp <4 x i1> %a to <4 x float> ; CHECK: cost of 3 {{.*}} sitofp @@ -107,6 +157,7 @@ define void @sitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) { } define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { +; CHECK-LABEL: for function 'sitofp8' ; CHECK: cost of 8 {{.*}} sitofp %A1 = sitofp <8 x i1> %a to <8 x float> @@ -122,6 +173,7 @@ define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { } define void @uitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) { +; CHECK-LABEL: for function 'uitofp4' ; CHECK: cost of 7 {{.*}} uitofp %A1 = uitofp <4 x i1> %a to <4 x float> ; CHECK: cost of 7 {{.*}} uitofp @@ -145,6 +197,7 @@ define void @uitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) { } define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { +; CHECK-LABEL: for function 'uitofp8' ; 
CHECK: cost of 6 {{.*}} uitofp %A1 = uitofp <8 x i1> %a to <8 x float> diff --git a/test/Analysis/CostModel/X86/cmp.ll b/test/Analysis/CostModel/X86/cmp.ll index 713b374..9f2bdb3 100644 --- a/test/Analysis/CostModel/X86/cmp.ll +++ b/test/Analysis/CostModel/X86/cmp.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck --check-prefix=AVX1 %s -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core-avx2 | FileCheck --check-prefix=AVX2 %s +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck -check-prefix=CHECK -check-prefix=AVX1 %s +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core-avx2 | FileCheck -check-prefix=CHECK -check-prefix=AVX2 %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Analysis/CostModel/X86/scalarize.ll b/test/Analysis/CostModel/X86/scalarize.ll new file mode 100644 index 0000000..fc25fcb --- /dev/null +++ b/test/Analysis/CostModel/X86/scalarize.ll @@ -0,0 +1,41 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=i386 -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK32 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK64 + +; Test vector scalarization costs. 
+; RUN: llc < %s -march=x86 -mcpu=i386 +; RUN: llc < %s -march=x86 -mcpu=yonah + +%i4 = type <4 x i32> +%i8 = type <2 x i64> + +;;; TEST HANDLING OF VARIOUS VECTOR SIZES + +declare %i4 @llvm.bswap.v4i32(%i4) +declare %i8 @llvm.bswap.v2i64(%i8) + +declare %i4 @llvm.ctpop.v4i32(%i4) +declare %i8 @llvm.ctpop.v2i64(%i8) + +; CHECK32-LABEL: test_scalarized_intrinsics +; CHECK64-LABEL: test_scalarized_intrinsics +define void @test_scalarized_intrinsics() { + %r1 = add %i8 undef, undef + +; CHECK32: cost of 12 {{.*}}bswap.v4i32 +; CHECK64: cost of 12 {{.*}}bswap.v4i32 + %r2 = call %i4 @llvm.bswap.v4i32(%i4 undef) +; CHECK32: cost of 10 {{.*}}bswap.v2i64 +; CHECK64: cost of 6 {{.*}}bswap.v2i64 + %r3 = call %i8 @llvm.bswap.v2i64(%i8 undef) + +; CHECK32: cost of 12 {{.*}}ctpop.v4i32 +; CHECK64: cost of 12 {{.*}}ctpop.v4i32 + %r4 = call %i4 @llvm.ctpop.v4i32(%i4 undef) +; CHECK32: cost of 10 {{.*}}ctpop.v2i64 +; CHECK64: cost of 6 {{.*}}ctpop.v2i64 + %r5 = call %i8 @llvm.ctpop.v2i64(%i8 undef) + +; CHECK32: ret +; CHECK64: ret + ret void +} diff --git a/test/Analysis/CostModel/X86/vshift-cost.ll b/test/Analysis/CostModel/X86/vshift-cost.ll new file mode 100644 index 0000000..84d7246 --- /dev/null +++ b/test/Analysis/CostModel/X86/vshift-cost.ll @@ -0,0 +1,167 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 + + +; Verify the cost of vector shift left instructions. 
+ +; We always emit a single pmullw in the case of v8i16 vector shifts by +; non-uniform constant. + +define <8 x i16> @test1(<8 x i16> %a) { + %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11> + ret <8 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test1': +; CHECK: Found an estimated cost of 1 for instruction: %shl + + +define <8 x i16> @test2(<8 x i16> %a) { + %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1> + ret <8 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test2': +; CHECK: Found an estimated cost of 1 for instruction: %shl + + +; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction. +; Make sure that the estimated cost is always 1 except for the case where +; we only have SSE2 support. With SSE2, we are forced to special lower the +; v4i32 mul as a 2x shuffle, 2x pmuludq, 2x shuffle. + +define <4 x i32> @test3(<4 x i32> %a) { + %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3> + ret <4 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test3': +; SSE2: Found an estimated cost of 6 for instruction: %shl +; SSE41: Found an estimated cost of 1 for instruction: %shl +; AVX: Found an estimated cost of 1 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +define <4 x i32> @test4(<4 x i32> %a) { + %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1> + ret <4 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test4': +; SSE2: Found an estimated cost of 6 for instruction: %shl +; SSE41: Found an estimated cost of 1 for instruction: %shl +; AVX: Found an estimated cost of 1 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; On AVX2 we are able to lower the following shift into a single +; vpsllvq. Therefore, the expected cost is only 1. +; In all other cases, this shift is scalarized as the target does not support +; vpsllv instructions. 
+ +define <2 x i64> @test5(<2 x i64> %a) { + %shl = shl <2 x i64> %a, <i64 2, i64 3> + ret <2 x i64> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test5': +; SSE2: Found an estimated cost of 20 for instruction: %shl +; SSE41: Found an estimated cost of 20 for instruction: %shl +; AVX: Found an estimated cost of 20 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; v16i16 and v8i32 shift left by non-uniform constant are lowered into +; vector multiply instructions. With AVX (but not AVX2), the vector multiply +; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert. +; +; With AVX2, instruction vpmullw works with 256bit quantities and +; therefore there is no need to split the resulting vector multiply into +; a sequence of two multiplies. +; +; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice +; the cost computed in the case of 'test1'. That is because the backend +; simply emits 2 pmullw with no extract/insert. + + +define <16 x i16> @test6(<16 x i16> %a) { + %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11> + ret <16 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test6': +; SSE2: Found an estimated cost of 2 for instruction: %shl +; SSE41: Found an estimated cost of 2 for instruction: %shl +; AVX: Found an estimated cost of 4 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice +; the cost computed in the case of 'test3'. That is because the multiply +; is type-legalized into two v4i32 vector multiplies. 
+ +define <8 x i32> @test7(<8 x i32> %a) { + %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3> + ret <8 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test7': +; SSE2: Found an estimated cost of 12 for instruction: %shl +; SSE41: Found an estimated cost of 2 for instruction: %shl +; AVX: Found an estimated cost of 4 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; On AVX2 we are able to lower the following shift into a single +; vpsllvq. Therefore, the expected cost is only 1. +; In all other cases, this shift is scalarized as the target does not support +; vpsllv instructions. + +define <4 x i64> @test8(<4 x i64> %a) { + %shl = shl <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4> + ret <4 x i64> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test8': +; SSE2: Found an estimated cost of 40 for instruction: %shl +; SSE41: Found an estimated cost of 40 for instruction: %shl +; AVX: Found an estimated cost of 40 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; Same as 'test6', with the difference that the cost is double. + +define <32 x i16> @test9(<32 x i16> %a) { + %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11> + ret <32 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test9': +; SSE2: Found an estimated cost of 4 for instruction: %shl +; SSE41: Found an estimated cost of 4 for instruction: %shl +; AVX: Found an estimated cost of 8 for instruction: %shl +; AVX2: Found an estimated cost of 2 for instruction: %shl + + +; Same as 'test7', except that now the cost is double. 
+ +define <16 x i32> @test10(<16 x i32> %a) { + %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3> + ret <16 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test10': +; SSE2: Found an estimated cost of 24 for instruction: %shl +; SSE41: Found an estimated cost of 4 for instruction: %shl +; AVX: Found an estimated cost of 8 for instruction: %shl +; AVX2: Found an estimated cost of 2 for instruction: %shl + + +; On AVX2 we are able to lower the following shift into a sequence of +; two vpsllvq instructions. Therefore, the expected cost is only 2. +; In all other cases, this shift is scalarized as we don't have vpsllv +; instructions. + +define <8 x i64> @test11(<8 x i64> %a) { + %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3> + ret <8 x i64> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test11': +; SSE2: Found an estimated cost of 80 for instruction: %shl +; SSE41: Found an estimated cost of 80 for instruction: %shl +; AVX: Found an estimated cost of 80 for instruction: %shl +; AVX2: Found an estimated cost of 2 for instruction: %shl + + |