about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--  lib/Target/X86/README-SSE.txt  102
-rw-r--r--  test/CodeGen/X86/vec_set-8.ll  10
2 files changed, 35 insertions, 77 deletions
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index b4fc53a..cadfc20 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -456,6 +456,18 @@ icc generates:
So icc is smart enough to know that B is in memory so it doesn't load it and
store it back to stack.
+This should be fixed by eliminating the llvm.x86.sse2.loadl.pd intrinsic,
+lowering it to a load+insertelement instead. We already match the load+shuffle
+as movlpd, so this should be easy. We already get optimal code for:
+
+define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) {
+entry:
+ %tmp2 = load <2 x double>* %A, align 16
+ %tmp8 = insertelement <2 x double> %tmp2, double %B, i32 0
+ store <2 x double> %tmp8, <2 x double>* %r, align 16
+ ret void
+}
+
//===---------------------------------------------------------------------===//
__m128d test1( __m128d A, __m128d B) {
@@ -476,10 +488,10 @@ Don't know if unpckhpd is faster. But it is shorter.
This code generates ugly code, probably due to costs being off or something:
-void %test(float* %P, <4 x float>* %P2 ) {
+define void @test(float* %P, <4 x float>* %P2 ) {
%xFloat0.688 = load float* %P
- %loadVector37.712 = load <4 x float>* %P2
- %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
+ %tmp = load <4 x float>* %P2
+ %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
store <4 x float> %inFloat3.713, <4 x float>* %P2
ret void
}
@@ -487,17 +499,16 @@ void %test(float* %P, <4 x float>* %P2 ) {
Generates:
_test:
- pxor %xmm0, %xmm0
- movd %xmm0, %eax ;; EAX = 0!
- movl 8(%esp), %ecx
- movaps (%ecx), %xmm0
- pinsrw $6, %eax, %xmm0
- shrl $16, %eax ;; EAX = 0 again!
- pinsrw $7, %eax, %xmm0
- movaps %xmm0, (%ecx)
- ret
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ pxor %xmm1, %xmm1
+ movaps %xmm0, %xmm2
+ shufps $50, %xmm1, %xmm2
+ shufps $132, %xmm2, %xmm0
+ movaps %xmm0, (%eax)
+ ret
-It would be better to generate:
+Would it be better to generate:
_test:
movl 8(%esp), %ecx
@@ -508,7 +519,7 @@ _test:
movaps %xmm0, (%ecx)
ret
-or use pxor (to make a zero vector) and shuffle (to insert it).
+?
//===---------------------------------------------------------------------===//
@@ -576,32 +587,6 @@ swizzle:
//===---------------------------------------------------------------------===//
-This code:
-
-#include <emmintrin.h>
-__m128i test(long long i) { return _mm_cvtsi64x_si128(i); }
-
-Should turn into a single 'movq %rdi, %xmm0' instruction. Instead, we
-get this (on x86-64):
-
-_test:
- movd %rdi, %xmm1
- xorps %xmm0, %xmm0
- movsd %xmm1, %xmm0
- ret
-
-The LLVM IR is:
-
-target triple = "x86_64-apple-darwin8"
-define <2 x i64> @test(i64 %i) {
-entry:
- %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
- %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
- ret <2 x i64> %tmp11
-}
-
-//===---------------------------------------------------------------------===//
-
These functions should produce the same code:
#include <emmintrin.h>
@@ -671,43 +656,6 @@ beneficial because it prevents the load from being folded into the multiply.
//===---------------------------------------------------------------------===//
-In this loop:
-
-bb49: ; preds = %bb49, %bb49.preheader
- %indvar = phi i32 [ 0, %bb49.preheader ], [ %indvar.next, %bb49 ] ; <i32> [#uses=2]
- %dp.089.0.rec = shl i32 %indvar, 3 ; <i32> [#uses=2]
- %dp.089.0 = getelementptr i32* %tmp89, i32 %dp.089.0.rec ; <i32*> [#uses=1]
- %tmp5051 = bitcast i32* %dp.089.0 to <2 x i64>* ; <<2 x i64>*> [#uses=1]
- store <2 x i64> zeroinitializer, <2 x i64>* %tmp5051, align 16
- %dp.089.0.sum105 = or i32 %dp.089.0.rec, 4 ; <i32> [#uses=1]
- %tmp56 = getelementptr i32* %tmp89, i32 %dp.089.0.sum105 ; <i32*> [#uses=1]
- %tmp5657 = bitcast i32* %tmp56 to <2 x i64>* ; <<2 x i64>*> [#uses=1]
- store <2 x i64> zeroinitializer, <2 x i64>* %tmp5657, align 16
- %indvar.next = add i32 %indvar, 1 ; <i32> [#uses=2]
- %exitcond = icmp eq i32 %indvar.next, %tmp98 ; <i1> [#uses=1]
- br i1 %exitcond, label %bb72, label %bb49
-
-we get:
-
-LBB5_6: # bb49.preheader
- shlw $2, %si
- decw %si
- movzwl %si, %eax
- incl %eax
- xorl %ecx, %ecx
-LBB5_7: # bb49
- xorps %xmm0, %xmm0 # (1)
- movaps %xmm0, (%edx)
- movaps %xmm0, 16(%edx)
- addl $32, %edx
- incl %ecx
- cmpl %eax, %ecx
- jne LBB4_7 # bb47
-
-The instruction at (1) can be moved out of the main body of the loop.
-
-//===---------------------------------------------------------------------===//
-
These functions:
#include <xmmintrin.h>
diff --git a/test/CodeGen/X86/vec_set-8.ll b/test/CodeGen/X86/vec_set-8.ll
new file mode 100644
index 0000000..cca436b
--- /dev/null
+++ b/test/CodeGen/X86/vec_set-8.ll
@@ -0,0 +1,10 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | not grep movsd
+; RUN: llvm-as < %s | llc -march=x86-64 | grep {movd.*%rdi,.*%xmm0}
+
+define <2 x i64> @test(i64 %i) nounwind {
+entry:
+ %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
+ %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
+ ret <2 x i64> %tmp11
+}
+