author    Chris Lattner <sabre@nondot.org>    2009-03-08 03:04:26 +0000
committer Chris Lattner <sabre@nondot.org>    2009-03-08 03:04:26 +0000
commit    ff9dcee534bf8108dd7b6a77f45db8f17e4125c6 (patch)
tree      e0f8974ee0fc43e36c5cade991fcd2ac91816d73
parent    b34487dcbac342512502894abee21d1dc874c767 (diff)
add a note.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@66360 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--  lib/Target/X86/README.txt  40
1 files changed, 40 insertions, 0 deletions
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index c4746d0..5f28579 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -1840,3 +1840,43 @@ _f:
ret
//===---------------------------------------------------------------------===//
+
+memcpy/memmove do not lower to SSE copies when possible. A silly example is:
+define <16 x float> @foo(<16 x float> %A) nounwind {
+ %tmp = alloca <16 x float>, align 16
+ %tmp2 = alloca <16 x float>, align 16
+ store <16 x float> %A, <16 x float>* %tmp
+ %s = bitcast <16 x float>* %tmp to i8*
+ %s2 = bitcast <16 x float>* %tmp2 to i8*
+ call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
+ %R = load <16 x float>* %tmp2
+ ret <16 x float> %R
+}
+
+declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+which compiles to:
+
+_foo:
+ subl $140, %esp
+ movaps %xmm3, 112(%esp)
+ movaps %xmm2, 96(%esp)
+ movaps %xmm1, 80(%esp)
+ movaps %xmm0, 64(%esp)
+ movl 60(%esp), %eax
+ movl %eax, 124(%esp)
+ movl 56(%esp), %eax
+ movl %eax, 120(%esp)
+ movl 52(%esp), %eax
+ <many many more 32-bit copies>
+ movaps (%esp), %xmm0
+ movaps 16(%esp), %xmm1
+ movaps 32(%esp), %xmm2
+ movaps 48(%esp), %xmm3
+ addl $140, %esp
+ ret
+
+On Nehalem, it may even be cheaper to just use movups when unaligned than to
+fall back to lower-granularity chunks.
+
+//===---------------------------------------------------------------------===//
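
The sixteen 32-bit load/store pairs in the middle of that output are exactly the
copies that could be done with SSE. A rough sketch of what the lowered copy could
look like instead (the scratch register and stack offsets are illustrative, not
actual generated output):

	movaps	(%esp), %xmm4
	movaps	%xmm4, 64(%esp)
	movaps	16(%esp), %xmm4
	movaps	%xmm4, 80(%esp)
	movaps	32(%esp), %xmm4
	movaps	%xmm4, 96(%esp)
	movaps	48(%esp), %xmm4
	movaps	%xmm4, 112(%esp)

i.e. eight 16-byte aligned moves in place of thirty-two 32-bit ones.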
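
For the unaligned case the note mentions, the same shape with movups is plausible
on Nehalem (again only a sketch, with hypothetical offsets), rather than falling
back to smaller chunks:

	movups	(%esp), %xmm4
	movups	%xmm4, 64(%esp)
	movups	16(%esp), %xmm4
	movups	%xmm4, 80(%esp)

and so on for the remaining 32 bytes.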