diff options
author | Evan Cheng <evan.cheng@apple.com> | 2012-12-10 23:21:26 +0000 |
---|---|---|
committer | Evan Cheng <evan.cheng@apple.com> | 2012-12-10 23:21:26 +0000 |
commit | 376642ed620ecae05b68c7bc81f79aeb2065abe0 (patch) | |
tree | 9757b2568050b3ab58af15c234df3bc9f66202b0 /test/CodeGen | |
parent | 2b475922e6169098606006a69d765160caa77848 (diff) | |
download | external_llvm-376642ed620ecae05b68c7bc81f79aeb2065abe0.zip external_llvm-376642ed620ecae05b68c7bc81f79aeb2065abe0.tar.gz external_llvm-376642ed620ecae05b68c7bc81f79aeb2065abe0.tar.bz2 |
Some enhancements for memcpy / memset inline expansion.
1. Teach it to use overlapping unaligned load / store to copy / set the trailing
bytes. e.g. On x86, use two pairs of movups / movaps for 17 - 31 byte copies.
2. Use f64 for memcpy / memset on targets where i64 is not legal but f64 is. e.g.
x86 and ARM.
3. When memcpy from a constant string, do *not* replace the load with a constant
if it's not possible to materialize an integer immediate with a single
instruction (requires a new target hook: TLI.isIntImmLegal()).
4. Use unaligned load / stores more aggressively if target hooks indicate they
are "fast".
5. Update ARM target hooks to use unaligned load / stores. e.g. vld1.8 / vst1.8.
Also increase the threshold to something reasonable (8 for memset, 4 pairs
for memcpy).
This significantly improves Dhrystone, up to 50% on ARM iOS devices.
rdar://12760078
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169791 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen')
-rw-r--r-- | test/CodeGen/ARM/2011-10-26-memset-with-neon.ll | 8 | ||||
-rw-r--r-- | test/CodeGen/ARM/memcpy-inline.ll | 109 | ||||
-rw-r--r-- | test/CodeGen/ARM/memset-inline.ll | 30 | ||||
-rw-r--r-- | test/CodeGen/ARM/reg_asc_order.ll | 16 | ||||
-rw-r--r-- | test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll | 7 | ||||
-rw-r--r-- | test/CodeGen/X86/memcpy-2.ll | 12 |
6 files changed, 143 insertions, 39 deletions
diff --git a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll index 6e0ef96..f563eee 100644 --- a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll +++ b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll @@ -1,13 +1,5 @@ ; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s -; Should trigger a NEON store. -; CHECK: vstr -define void @f_0_12(i8* nocapture %c) nounwind optsize { -entry: - call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false) - ret void -} - ; Trigger multiple NEON stores. ; CHECK: vst1.64 ; CHECK-NEXT: vst1.64 diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll index dc77282..d846e5c 100644 --- a/test/CodeGen/ARM/memcpy-inline.ll +++ b/test/CodeGen/ARM/memcpy-inline.ll @@ -1,18 +1,115 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -disable-post-ra | FileCheck %s - -; CHECK: ldrd -; CHECK: strd -; CHECK: ldrb +; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } @src = external global %struct.x @dst = external global %struct.x -define i32 @t() { +@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1 +@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1 +@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1 +@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1 +@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1 +@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1 +@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16 + +define i32 @t0() { entry: +; CHECK: t0: +; CHECK: vldr [[REG1:d[0-9]+]], +; CHECK: vstr [[REG1]], call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds (%struct.x* @dst, i32 0, i32 0), i8* 
getelementptr inbounds (%struct.x* @src, i32 0, i32 0), i32 11, i32 8, i1 false) ret i32 0 } +define void @t1(i8* nocapture %C) nounwind { +entry: +; CHECK: t1: +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: adds r0, #15 +; CHECK: adds r1, #15 +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8]* @.str1, i64 0, i64 0), i64 31, i32 1, i1 false) + ret void +} + +define void @t2(i8* nocapture %C) nounwind { +entry: +; CHECK: t2: +; CHECK: ldr [[REG2:r[0-9]+]], [r1, #32] +; CHECK: str [[REG2]], [r0, #32] +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: adds r0, #16 +; CHECK: adds r1, #16 +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false) + ret void +} + +define void @t3(i8* nocapture %C) nounwind { +entry: +; CHECK: t3: +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: adds r0, #16 +; CHECK: adds r1, #16 +; CHECK: vld1.8 {d{{[0-9]+}}}, [r1] +; CHECK: vst1.8 {d{{[0-9]+}}}, [r0] + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false) + ret void +} + +define void @t4(i8* nocapture %C) nounwind { +entry: +; CHECK: t4: +; CHECK: vld1.8 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1] +; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0] + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false) + ret void +} + +define void @t5(i8* nocapture %C) nounwind { +entry: +; CHECK: t5: +; CHECK: movs [[REG5:r[0-9]+]], #0 +; CHECK: strb 
[[REG5]], [r0, #6] +; CHECK: movw [[REG6:r[0-9]+]], #21587 +; CHECK: strh [[REG6]], [r0, #4] +; CHECK: ldr [[REG7:r[0-9]+]], +; CHECK: str [[REG7]] + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false) + ret void +} + +define void @t6() nounwind { +entry: +; CHECK: t6: +; CHECK: vld1.8 {[[REG8:d[0-9]+]]}, [r0] +; CHECK: vstr [[REG8]], [r1] +; CHECK: adds r1, #6 +; CHECK: adds r0, #6 +; CHECK: vld1.8 +; CHECK: vst1.16 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([512 x i8]* @spool.splbuf, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8]* @.str6, i64 0, i64 0), i64 14, i32 1, i1 false) + ret void +} + +%struct.Foo = type { i32, i32, i32, i32 } + +define void @t7(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind { +entry: +; CHECK: t7 +; CHECK: vld1.32 +; CHECK: vst1.32 + %0 = bitcast %struct.Foo* %a to i8* + %1 = bitcast %struct.Foo* %b to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false) + ret void +} + declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll new file mode 100644 index 0000000..ee8c364 --- /dev/null +++ b/test/CodeGen/ARM/memset-inline.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s + +define void @t1(i8* nocapture %c) nounwind optsize { +entry: +; CHECK: t1: +; CHECK: movs r1, #0 +; CHECK: str r1, [r0] +; CHECK: str r1, [r0, #4] +; CHECK: str r1, [r0, #8] + call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false) + ret void +} + +define void @t2() nounwind ssp { +entry: +; CHECK: t2: +; CHECK: add.w r1, r0, #10 +; CHECK: vmov.i32 {{q[0-9]+}}, #0x0 +; CHECK: vst1.16 {d{{[0-9]+}}, 
d{{[0-9]+}}}, [r1] +; CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] + %buf = alloca [26 x i8], align 1 + %0 = getelementptr inbounds [26 x i8]* %buf, i32 0, i32 0 + call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false) + call void @something(i8* %0) nounwind + ret void +} + +declare void @something(i8*) nounwind +declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind diff --git a/test/CodeGen/ARM/reg_asc_order.ll b/test/CodeGen/ARM/reg_asc_order.ll deleted file mode 100644 index d1d0ee5..0000000 --- a/test/CodeGen/ARM/reg_asc_order.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -; Check that memcpy gets lowered to ldm/stm, at least in this very smple case. - -%struct.Foo = type { i32, i32, i32, i32 } - -define void @_Z10CopyStructP3FooS0_(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind { -entry: -;CHECK: ldm -;CHECK: stm - %0 = bitcast %struct.Foo* %a to i8* - %1 = bitcast %struct.Foo* %b to i8* - tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false) - ret void -} - -declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind diff --git a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll index 94075e7..c2d9d84 100644 --- a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll +++ b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll @@ -6,15 +6,16 @@ define void @t(i32 %count) ssp nounwind { entry: ; CHECK: t: -; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip) -; CHECK: movups L_str(%rip), %xmm0 +; CHECK: movups L_str+12(%rip), %xmm0 +; CHECK: movups L_str(%rip), %xmm1 %tmp0 = alloca [60 x i8], align 1 %tmp1 = getelementptr inbounds [60 x i8]* %tmp0, i64 0, i64 0 br label %bb1 bb1: ; CHECK: LBB0_1: -; CHECK: movaps %xmm0, (%rsp) +; CHECK: movups %xmm0, 12(%rsp) +; CHECK: movaps %xmm1, (%rsp) %tmp2 = 
phi i32 [ %tmp3, %bb1 ], [ 0, %entry ] call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp1, i8* getelementptr inbounds ([28 x i8]* @str, i64 0, i64 0), i64 28, i32 1, i1 false) %tmp3 = add i32 %tmp2, 1 diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll index 7a2bbc4..dcc8f0d 100644 --- a/test/CodeGen/X86/memcpy-2.ll +++ b/test/CodeGen/X86/memcpy-2.ll @@ -10,18 +10,18 @@ define void @t1(i32 %argc, i8** %argv) nounwind { entry: ; SSE2: t1: +; SSE2: movsd _.str+16, %xmm0 +; SSE2: movsd %xmm0, 16(%esp) ; SSE2: movaps _.str, %xmm0 ; SSE2: movaps %xmm0 -; SSE2: movb $0 -; SSE2: movl $0 -; SSE2: movl $0 +; SSE2: movb $0, 24(%esp) ; SSE1: t1: +; SSE1: fldl _.str+16 +; SSE1: fstpl 16(%esp) ; SSE1: movaps _.str, %xmm0 ; SSE1: movaps %xmm0 -; SSE1: movb $0 -; SSE1: movl $0 -; SSE1: movl $0 +; SSE1: movb $0, 24(%esp) ; NOSSE: t1: ; NOSSE: movb $0 |