From 36b56886974eae4f9c5ebc96befd3e7bfe5de338 Mon Sep 17 00:00:00 2001 From: Stephen Hines Date: Wed, 23 Apr 2014 16:57:46 -0700 Subject: Update to LLVM 3.5a. Change-Id: Ifadecab779f128e62e430c2b4f6ddd84953ed617 --- test/Analysis/CostModel/X86/cast.ll | 97 +++++++++++++---- test/Analysis/CostModel/X86/cmp.ll | 4 +- test/Analysis/CostModel/X86/scalarize.ll | 41 +++++++ test/Analysis/CostModel/X86/vshift-cost.ll | 167 +++++++++++++++++++++++++++++ 4 files changed, 285 insertions(+), 24 deletions(-) create mode 100644 test/Analysis/CostModel/X86/scalarize.ll create mode 100644 test/Analysis/CostModel/X86/vshift-cost.ll (limited to 'test/Analysis/CostModel/X86') diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll index f3c1283..7f97b17 100644 --- a/test/Analysis/CostModel/X86/cast.ll +++ b/test/Analysis/CostModel/X86/cast.ll @@ -1,10 +1,11 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" define i32 @add(i32 %arg) { - +; CHECK-LABEL: for function 'add' ; -- Same size registeres -- ;CHECK: cost of 1 {{.*}} zext %A = zext <4 x i1> undef to <4 x i32> @@ -33,57 +34,106 @@ define i32 @add(i32 %arg) { } define i32 @zext_sext(<8 x i1> %in) { - ;CHECK: cost of 6 {{.*}} zext +; CHECK-AVX2-LABEL: for function 'zext_sext' +; CHECK-AVX-LABEL: for function 'zext_sext' + ;CHECK-AVX2: cost of 3 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext %Z = zext <8 x i1> %in to <8 x i32> - ;CHECK: cost of 9 {{.*}} sext + ;CHECK-AVX2: cost of 3 {{.*}} sext + ;CHECK-AVX: cost of 7 {{.*}} sext %S = sext <8 x i1> %in to <8 x i32> - ;CHECK: cost of 1 {{.*}} zext + ;CHECK-AVX2: cost of 1 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext %A1 = zext <16 x i8> undef to <16 x i16> - ;CHECK: cost of 1 {{.*}} sext + ;CHECK-AVX2: cost of 1 {{.*}} sext + ;CHECK-AVX: cost of 4 {{.*}} sext %A2 = sext <16 x i8> undef to <16 x i16> - ;CHECK: cost of 1 {{.*}} sext + ;CHECK-AVX2: cost of 1 {{.*}} sext + ;CHECK-AVX: cost of 4 {{.*}} sext %A = sext <8 x i16> undef to <8 x i32> - ;CHECK: cost of 1 {{.*}} zext + ;CHECK-AVX2: cost of 1 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext %B = zext <8 x i16> undef to <8 x i32> - ;CHECK: cost of 1 {{.*}} sext + ;CHECK-AVX2: cost of 1 {{.*}} sext + ;CHECK-AVX: cost of 4 {{.*}} sext %C = sext <4 x i32> undef to <4 x i64> - ;CHECK: cost of 6 {{.*}} sext - %C1 = sext <4 x i8> undef to <4 x i64> - ;CHECK: cost of 6 {{.*}} sext - %C2 = sext <4 x i16> undef to <4 x i64> - ;CHECK: cost of 1 {{.*}} zext + ;CHECK-AVX2: cost of 3 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext + %C.v8i8.z = zext <8 x i8> undef to <8 x i32> + ;CHECK-AVX2: cost of 3 {{.*}} sext + ;CHECK-AVX: cost of 7 {{.*}} sext + %C.v8i8.s = sext <8 x i8> undef to <8 x i32> + ;CHECK-AVX2: cost of 3 {{.*}} zext + ;CHECK-AVX: cost of 3 {{.*}} zext + %C.v4i16.z = zext <4 x i16> undef to <4 x i64> + ;CHECK-AVX2: cost of 3 {{.*}} sext + ;CHECK-AVX: cost of 6 {{.*}} sext + %C.v4i16.s = sext <4 x i16> undef to <4 x i64> + + ;CHECK-AVX2: cost of 3 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext + %C.v4i8.z = zext <4 x i8> undef to <4 x i64> + ;CHECK-AVX2: cost of 3 {{.*}} sext + ;CHECK-AVX: cost of 6 {{.*}} sext + %C.v4i8.s = sext <4 x i8> undef to <4 x i64> + + ;CHECK-AVX2: cost of 1 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext %D = zext <4 x i32> undef to <4 x i64> - ;CHECK: cost of 1 {{.*}} trunc + ;CHECK-AVX2: cost of 2 {{.*}} trunc + ;CHECK-AVX: cost of 4 {{.*}} trunc %E = trunc <4 x i64> undef to <4 x i32> - ;CHECK: cost of 1 {{.*}} trunc + ;CHECK-AVX2: cost of 2 {{.*}} trunc + ;CHECK-AVX: cost of 5 {{.*}} trunc %F = trunc <8 x i32> undef to <8 x i16> - ;CHECK: cost of 2 {{.*}} trunc + ;CHECK-AVX2: cost of 4 {{.*}} trunc + ;CHECK-AVX: cost of 4 {{.*}} trunc %F1 = trunc <16 x i16> undef to <16 x i8> - - ;CHECK: cost of 3 {{.*}} trunc + ;CHECK-AVX2: cost of 2 {{.*}} trunc + ;CHECK-AVX: cost of 4 {{.*}} trunc + %F2 = trunc <8 x i32> undef to <8 x i8> + ;CHECK-AVX2: cost of 2 {{.*}} trunc + ;CHECK-AVX: cost of 4 {{.*}} trunc + %F3 = trunc <4 x i64> undef to <4 x i8> + + ;CHECK-AVX2: cost of 4 {{.*}} trunc + ;CHECK-AVX: cost of 9 {{.*}} trunc %G = trunc <8 x i64> undef to <8 x i32> ret i32 undef } define i32 @masks8(<8 x i1> %in) { - ;CHECK: cost of 6 {{.*}} zext +; CHECK-AVX2-LABEL: for function 'masks8' +; CHECK-AVX-LABEL: for function 'masks8' + + ;CHECK-AVX2: cost of 3 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext %Z = zext <8 x i1> %in to <8 x i32> - ;CHECK: cost of 9 {{.*}} sext + ;CHECK-AVX2: cost of 3 {{.*}} sext + ;CHECK-AVX: cost of 7 {{.*}} sext %S = sext <8 x i1> %in to <8 x i32> ret i32 undef } define i32 @masks4(<4 x i1> %in) { - ;CHECK: cost of 8 {{.*}} sext +; CHECK-AVX2-LABEL: for function 'masks4' +; CHECK-AVX-LABEL: for function 'masks4' + + ;CHECK-AVX2: cost of 3 {{.*}} zext + ;CHECK-AVX: cost of 4 {{.*}} zext + %Z = zext <4 x i1> %in to <4 x i64> + ;CHECK-AVX2: cost of 3 {{.*}} sext + ;CHECK-AVX: cost of 6 {{.*}} sext %S = sext <4 x i1> %in to <4 x i64> ret i32 undef } define void @sitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) { +; CHECK-LABEL: for function 'sitofp4' ; CHECK: cost of 3 {{.*}} sitofp %A1 = sitofp <4 x i1> %a to <4 x float> ; CHECK: cost of 3 {{.*}} sitofp @@ -107,6 +157,7 @@ define void @sitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) { } define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { +; CHECK-LABEL: for function 'sitofp8' ; CHECK: cost of 8 {{.*}} sitofp %A1 = sitofp <8 x i1> %a to <8 x float> @@ -122,6 +173,7 @@ define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { } define void @uitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) { +; CHECK-LABEL: for function 'uitofp4' ; CHECK: cost of 7 {{.*}} uitofp %A1 = uitofp <4 x i1> %a to <4 x float> ; CHECK: cost of 7 {{.*}} uitofp @@ -145,6 +197,7 @@ define void @uitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) { } define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { +; CHECK-LABEL: for function 'uitofp8' ; CHECK: cost of 6 {{.*}} uitofp %A1 = uitofp <8 x i1> %a to <8 x float> diff --git a/test/Analysis/CostModel/X86/cmp.ll b/test/Analysis/CostModel/X86/cmp.ll index 713b374..9f2bdb3 100644 --- a/test/Analysis/CostModel/X86/cmp.ll +++ b/test/Analysis/CostModel/X86/cmp.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck --check-prefix=AVX1 %s -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core-avx2 | FileCheck --check-prefix=AVX2 %s +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck -check-prefix=CHECK -check-prefix=AVX1 %s +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core-avx2 | FileCheck -check-prefix=CHECK -check-prefix=AVX2 %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Analysis/CostModel/X86/scalarize.ll b/test/Analysis/CostModel/X86/scalarize.ll new file mode 100644 index 0000000..fc25fcb --- /dev/null +++ b/test/Analysis/CostModel/X86/scalarize.ll @@ -0,0 +1,41 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=i386 -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK32 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK64 + +; Test vector scalarization costs. +; RUN: llc < %s -march=x86 -mcpu=i386 +; RUN: llc < %s -march=x86 -mcpu=yonah + +%i4 = type <4 x i32> +%i8 = type <2 x i64> + +;;; TEST HANDLING OF VARIOUS VECTOR SIZES + +declare %i4 @llvm.bswap.v4i32(%i4) +declare %i8 @llvm.bswap.v2i64(%i8) + +declare %i4 @llvm.ctpop.v4i32(%i4) +declare %i8 @llvm.ctpop.v2i64(%i8) + +; CHECK32-LABEL: test_scalarized_intrinsics +; CHECK64-LABEL: test_scalarized_intrinsics +define void @test_scalarized_intrinsics() { + %r1 = add %i8 undef, undef + +; CHECK32: cost of 12 {{.*}}bswap.v4i32 +; CHECK64: cost of 12 {{.*}}bswap.v4i32 + %r2 = call %i4 @llvm.bswap.v4i32(%i4 undef) +; CHECK32: cost of 10 {{.*}}bswap.v2i64 +; CHECK64: cost of 6 {{.*}}bswap.v2i64 + %r3 = call %i8 @llvm.bswap.v2i64(%i8 undef) + +; CHECK32: cost of 12 {{.*}}ctpop.v4i32 +; CHECK64: cost of 12 {{.*}}ctpop.v4i32 + %r4 = call %i4 @llvm.ctpop.v4i32(%i4 undef) +; CHECK32: cost of 10 {{.*}}ctpop.v2i64 +; CHECK64: cost of 6 {{.*}}ctpop.v2i64 + %r5 = call %i8 @llvm.ctpop.v2i64(%i8 undef) + +; CHECK32: ret +; CHECK64: ret + ret void +} diff --git a/test/Analysis/CostModel/X86/vshift-cost.ll b/test/Analysis/CostModel/X86/vshift-cost.ll new file mode 100644 index 0000000..84d7246 --- /dev/null +++ b/test/Analysis/CostModel/X86/vshift-cost.ll @@ -0,0 +1,167 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 + + +; Verify the cost of vector shift left instructions. + +; We always emit a single pmullw in the case of v8i16 vector shifts by +; non-uniform constant. + +define <8 x i16> @test1(<8 x i16> %a) { + %shl = shl <8 x i16> %a, + ret <8 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test1': +; CHECK: Found an estimated cost of 1 for instruction: %shl + + +define <8 x i16> @test2(<8 x i16> %a) { + %shl = shl <8 x i16> %a, + ret <8 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test2': +; CHECK: Found an estimated cost of 1 for instruction: %shl + + +; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction. +; Make sure that the estimated cost is always 1 except for the case where +; we only have SSE2 support. With SSE2, we are forced to special lower the +; v4i32 mul as a 2x shuffle, 2x pmuludq, 2x shuffle. + +define <4 x i32> @test3(<4 x i32> %a) { + %shl = shl <4 x i32> %a, + ret <4 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test3': +; SSE2: Found an estimated cost of 6 for instruction: %shl +; SSE41: Found an estimated cost of 1 for instruction: %shl +; AVX: Found an estimated cost of 1 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +define <4 x i32> @test4(<4 x i32> %a) { + %shl = shl <4 x i32> %a, + ret <4 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test4': +; SSE2: Found an estimated cost of 6 for instruction: %shl +; SSE41: Found an estimated cost of 1 for instruction: %shl +; AVX: Found an estimated cost of 1 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; On AVX2 we are able to lower the following shift into a single +; vpsllvq. Therefore, the expected cost is only 1. +; In all other cases, this shift is scalarized as the target does not support +; vpsllv instructions. + +define <2 x i64> @test5(<2 x i64> %a) { + %shl = shl <2 x i64> %a, + ret <2 x i64> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test5': +; SSE2: Found an estimated cost of 20 for instruction: %shl +; SSE41: Found an estimated cost of 20 for instruction: %shl +; AVX: Found an estimated cost of 20 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; v16i16 and v8i32 shift left by non-uniform constant are lowered into +; vector multiply instructions. With AVX (but not AVX2), the vector multiply +; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert. +; +; With AVX2, instruction vpmullw works with 256bit quantities and +; therefore there is no need to split the resulting vector multiply into +; a sequence of two multiply. +; +; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice +; the cost computed in the case of 'test1'. That is because the backend +; simply emits 2 pmullw with no extract/insert. + + +define <16 x i16> @test6(<16 x i16> %a) { + %shl = shl <16 x i16> %a, + ret <16 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test6': +; SSE2: Found an estimated cost of 2 for instruction: %shl +; SSE41: Found an estimated cost of 2 for instruction: %shl +; AVX: Found an estimated cost of 4 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice +; the cost computed in the case of 'test3'. That is because the multiply +; is type-legalized into two 4i32 vector multiply. + +define <8 x i32> @test7(<8 x i32> %a) { + %shl = shl <8 x i32> %a, + ret <8 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test7': +; SSE2: Found an estimated cost of 12 for instruction: %shl +; SSE41: Found an estimated cost of 2 for instruction: %shl +; AVX: Found an estimated cost of 4 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; On AVX2 we are able to lower the following shift into a single +; vpsllvq. Therefore, the expected cost is only 1. +; In all other cases, this shift is scalarized as the target does not support +; vpsllv instructions. + +define <4 x i64> @test8(<4 x i64> %a) { + %shl = shl <4 x i64> %a, + ret <4 x i64> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test8': +; SSE2: Found an estimated cost of 40 for instruction: %shl +; SSE41: Found an estimated cost of 40 for instruction: %shl +; AVX: Found an estimated cost of 40 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; Same as 'test6', with the difference that the cost is double. + +define <32 x i16> @test9(<32 x i16> %a) { + %shl = shl <32 x i16> %a, + ret <32 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test9': +; SSE2: Found an estimated cost of 4 for instruction: %shl +; SSE41: Found an estimated cost of 4 for instruction: %shl +; AVX: Found an estimated cost of 8 for instruction: %shl +; AVX2: Found an estimated cost of 2 for instruction: %shl + + +; Same as 'test7', except that now the cost is double. + +define <16 x i32> @test10(<16 x i32> %a) { + %shl = shl <16 x i32> %a, + ret <16 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test10': +; SSE2: Found an estimated cost of 24 for instruction: %shl +; SSE41: Found an estimated cost of 4 for instruction: %shl +; AVX: Found an estimated cost of 8 for instruction: %shl +; AVX2: Found an estimated cost of 2 for instruction: %shl + + +; On AVX2 we are able to lower the following shift into a sequence of +; two vpsllvq instructions. Therefore, the expected cost is only 2. +; In all other cases, this shift is scalarized as we don't have vpsllv +; instructions. + +define <8 x i64> @test11(<8 x i64> %a) { + %shl = shl <8 x i64> %a, + ret <8 x i64> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test11': +; SSE2: Found an estimated cost of 80 for instruction: %shl +; SSE41: Found an estimated cost of 80 for instruction: %shl +; AVX: Found an estimated cost of 80 for instruction: %shl +; AVX2: Found an estimated cost of 2 for instruction: %shl + + -- cgit v1.1