aboutsummaryrefslogtreecommitdiffstats
path: root/test/CodeGen/X86
diff options
context:
space:
mode:
authorJim Grosbach <grosbach@apple.com>2013-10-17 02:58:06 +0000
committerJim Grosbach <grosbach@apple.com>2013-10-17 02:58:06 +0000
commit49af380e3b007c678a7e4354efff601fd30a6681 (patch)
tree474f05b17ce86208494cad35eea93ed9fdcee955 /test/CodeGen/X86
parent9198657e1eb13ab717248cfacb2b504fd7e72719 (diff)
downloadexternal_llvm-49af380e3b007c678a7e4354efff601fd30a6681.zip
external_llvm-49af380e3b007c678a7e4354efff601fd30a6681.tar.gz
external_llvm-49af380e3b007c678a7e4354efff601fd30a6681.tar.bz2
x86: Move bitcasts outside concat_vector.
Consider the following: typedef unsigned short ushort4U __attribute__((ext_vector_type(4), aligned(2))); typedef unsigned short ushort4 __attribute__((ext_vector_type(4))); typedef unsigned short ushort8 __attribute__((ext_vector_type(8))); typedef int int4 __attribute__((ext_vector_type(4))); int4 __bbase_cvt_int(ushort4 v) { ushort8 a; a.lo = v; return _mm_cvtepu16_epi32(a); } This generates the, not unreasonable, IR: define <4 x i32> @foo0(double %v.coerce) nounwind ssp { %tmp = bitcast double %v.coerce to <4 x i16> %tmp1 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> %tmp2 = tail call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp1) ret <4 x i32> %tmp2 } The problem is when type legalization gets hold of the v4i16. It legalizes that by spilling to the stack, then doing a zero-extending load. Things go even more silly from there, ending up with something like: _foo0: movsd %xmm0, -8(%rsp) <== Spill to the stack. movq -8(%rsp), %xmm0 <== Reload it right back out. pmovzxwd %xmm0, %xmm1 <== Here's what we actually asked for. pblendw $1, %xmm1, %xmm0 <== We don't need this at all pmovzxwd %xmm0, %xmm0 <== We already did this ret The v8i8 to v8i16 zext intrinsic gives even worse results, with two table lookups via pshufb instructions(!!). To avoid all that, we can move the bitcasting until after we've formed the wider (legal) vector type. Then our normal codegen flows along nicely and we get the expected: _foo0: pmovzxwd %xmm0, %xmm0 ret rdar://15245794 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@192866 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen/X86')
-rw-r--r--test/CodeGen/X86/pmovext.ll25
1 files changed, 24 insertions, 1 deletions
diff --git a/test/CodeGen/X86/pmovext.ll b/test/CodeGen/X86/pmovext.ll
index b85b4c3..f0e468f 100644
--- a/test/CodeGen/X86/pmovext.ll
+++ b/test/CodeGen/X86/pmovext.ll
@@ -18,5 +18,28 @@ define void @intrin_pmov(i16* noalias %dest, i8* noalias %src) nounwind uwtable
}
declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
-
declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+
+; rdar://15245794
+
+define <4 x i32> @foo0(double %v.coerce) nounwind ssp {
+; CHECK-LABEL: foo0
+; CHECK: pmovzxwd %xmm0, %xmm0
+; CHECK-NEXT: ret
+ %tmp = bitcast double %v.coerce to <4 x i16>
+ %tmp1 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %tmp2 = tail call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp1) nounwind
+ ret <4 x i32> %tmp2
+}
+
+define <8 x i16> @foo1(double %v.coerce) nounwind ssp {
+; CHECK-LABEL: foo1
+; CHECK: pmovzxbw %xmm0, %xmm0
+; CHECK-NEXT: ret
+ %tmp = bitcast double %v.coerce to <8 x i8>
+ %tmp1 = shufflevector <8 x i8> %tmp, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %tmp2 = tail call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone