From 11f2bf7f15a7d3b3df500f3f3e76355c888c23c7 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sat, 26 Jan 2013 11:44:21 +0000 Subject: X86: Do splat promotion later, so the optimizer can chew on it first. This catches many cases where we can emit a more efficient shuffle for a specific mask or when the mask contains undefs. Once the splat is lowered to unpacks we can't do that anymore. There is a possibility of moving the promotion after pshufb matching, but I'm not sure if pshufb with a mask loaded from memory is faster than 3 shuffles, so I avoided that for now. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@173569 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/avx-splat.ll | 4 ++-- test/CodeGen/X86/vec_splat-3.ll | 20 +++++--------------- 2 files changed, 7 insertions(+), 17 deletions(-) (limited to 'test') diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll index 67e4b40..5c01c2c 100644 --- a/test/CodeGen/X86/avx-splat.ll +++ b/test/CodeGen/X86/avx-splat.ll @@ -3,8 +3,8 @@ ; CHECK: vpunpcklbw %xmm ; CHECK-NEXT: vpunpckhbw %xmm +; CHECK-NEXT: vpshufd $85 ; CHECK-NEXT: vinsertf128 $1 -; CHECK-NEXT: vpermilps $85 define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp { entry: %shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> @@ -12,8 +12,8 @@ entry: } ; CHECK: vpunpckhwd %xmm +; CHECK-NEXT: vpshufd $85 ; CHECK-NEXT: vinsertf128 $1 -; CHECK-NEXT: vpermilps $85 define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp { entry: %shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll index 293ed48..cf0ecf4 100644 --- a/test/CodeGen/X86/vec_splat-3.ll +++ b/test/CodeGen/X86/vec_splat-3.ll @@ -1,14 +1,12 @@ ; RUN: llc <%s -march=x86 -mcpu=penryn -mattr=sse41 | FileCheck %s ; Splat test for v8i16 -; Should generate with pshufd with masks $0, $85, $170, $255 (each mask is used twice) define <8 x i16> @shuf_8i16_0(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> ret <8 x i16> %tmp6 ; CHECK: shuf_8i16_0: -; CHECK: punpcklwd -; CHECK-NEXT: pshufd $0 +; CHECK: pshuflw $0 } define <8 x i16> @shuf_8i16_1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { @@ -16,8 +14,7 @@ define <8 x i16> @shuf_8i16_1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { ret <8 x i16> %tmp6 ; CHECK: shuf_8i16_1: -; CHECK: punpcklwd -; CHECK-NEXT: pshufd $85 +; CHECK: pshuflw $5 } define <8 x i16> @shuf_8i16_2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { @@ -34,8 +31,7 @@ define <8 x i16> @shuf_8i16_3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { ret <8 x i16> %tmp6 ; CHECK: shuf_8i16_3: -; CHECK: punpcklwd -; CHECK-NEXT: pshufd $-1 +; CHECK: pshuflw $15 } define <8 x i16> @shuf_8i16_4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { @@ -43,8 +39,7 @@ define <8 x i16> @shuf_8i16_4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { ret <8 x i16> %tmp6 ; CHECK: shuf_8i16_4: -; CHECK: punpckhwd -; CHECK-NEXT: pshufd $0 +; CHECK: movhlps } define <8 x i16> @shuf_8i16_5(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { @@ -65,7 +60,6 @@ define <8 x i16> @shuf_8i16_6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { ; CHECK-NEXT: pshufd $-86 } - define <8 x i16> @shuf_8i16_7(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> ret <8 x i16> %tmp6 @@ -75,8 +69,6 @@ define <8 x i16> @shuf_8i16_7(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { ; CHECK-NEXT: pshufd $-1 } -; Should generate with pshufd with masks $0, $85, $170, $255 (each mask is used 4 times) - ; Splat test for v16i8 define <16 x i8> @shuf_16i8_8(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> @@ -124,9 +116,7 @@ define <16 x i8> @shuf_16i8_12(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK: shuf_16i8_12: -; CHECK: punpcklbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $0 +; CHECK: pshufd $5 } define <16 x i8> @shuf_16i8_13(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { -- cgit v1.1