Experimental support for aligned NEON spills.

ARM targets with NEON units have access to aligned vector loads and stores that are potentially faster than unaligned operations. Add support for spilling the callee-saved NEON registers to an aligned stack area using 16-byte aligned NEON loads and store. This feature is off by default, controlled by an -align-neon-spills command line option. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@147211 91177308-0d34-0410-b5e6-96231b3b80d8
author: Jakob Stoklund Olesen <stoklund@2pi.dk> 2011-12-23 00:36:18 +0000
committer: Jakob Stoklund Olesen <stoklund@2pi.dk> 2011-12-23 00:36:18 +0000
commit: f06f6f50e9844b88cfbb9fb896fff9c3a752966b (patch)
tree: 346f10d968d528bc683511a5e7a2ae271e23f9a6 /test/CodeGen
parent: f4aea8f34946d4d2b101b8e3c6db95c18be80173 (diff)
download: external_llvm-f06f6f50e9844b88cfbb9fb896fff9c3a752966b.zip
external_llvm-f06f6f50e9844b88cfbb9fb896fff9c3a752966b.tar.gz
external_llvm-f06f6f50e9844b88cfbb9fb896fff9c3a752966b.tar.bz2
1 files changed, 73 insertions, 0 deletions
diff --git a/test/CodeGen/Thumb2/aligned-spill.ll b/test/CodeGen/Thumb2/aligned-spill.ll
index 1a987d0..948c8ce 100644
--- a/test/CodeGen/Thumb2/aligned-spill.ll
+++ b/test/CodeGen/Thumb2/aligned-spill.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mcpu=cortex-a8 -align-neon-spills | FileCheck %s --check-prefix=NEON
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios"
 
@@ -19,4 +20,76 @@ entry:
   ret void
 }
 
+; NEON: f
+; NEON: push {r4, r7, lr}
+; NEON: sub.w r4, sp, #64
+; NEON: bic r4, r4, #15
+; Stack pointer must be updated before the spills.
+; NEON: mov sp, r4
+; NEON: vst1.64 {d8, d9, d10, d11}, [r4, :128]!
+; NEON: vst1.64 {d12, d13, d14, d15}, [r4, :128]
+; Stack pointer adjustment for the stack frame contents.
+; This could legally happen before the spills.
+; Since the spill slot is only 8 bytes, technically it would be fine to only
+; subtract #8 here. That would leave sp less aligned than some stack slots,
+; and would probably blow MFI's mind.
+; NEON: sub sp, #16
+; The epilog is free to use another scratch register than r4.
+; NEON: add r[[R4:[0-9]+]], sp, #16
+; NEON: vld1.64 {d8, d9, d10, d11}, [r[[R4]], :128]!
+; NEON: vld1.64 {d12, d13, d14, d15}, [r[[R4]], :128]
+; The stack pointer restore must happen after the reloads.
+; NEON: mov sp,
+; NEON: pop
+
 declare void @g()
+
+; Spill 7 d-registers.
+define void @f7(double* nocapture %p) nounwind ssp {
+entry:
+  tail call void asm sideeffect "", "~{d8},~{d9},~{d10},~{d11},~{d12},~{d13},~{d14}"() nounwind
+  ret void
+}
+
+; NEON: f7
+; NEON: push {r4, r7, lr}
+; NEON: sub.w r4, sp, #56
+; NEON: bic r4, r4, #15
+; Stack pointer must be updated before the spills.
+; NEON: mov sp, r4
+; NEON: vst1.64 {d8, d9, d10, d11}, [r4, :128]!
+; NEON: vst1.64 {d12, d13}, [r4, :128]
+; NEON: vstr d14, [r4, #16]
+; Epilog
+; NEON: vld1.64 {d8, d9, d10, d11},
+; NEON: vld1.64 {d12, d13},
+; NEON: vldr d14,
+; The stack pointer restore must happen after the reloads.
+; NEON: mov sp,
+; NEON: pop
+
+; Spill 7 d-registers, leave a hole.
+define void @f3plus4(double* nocapture %p) nounwind ssp {
+entry:
+  tail call void asm sideeffect "", "~{d8},~{d9},~{d10},~{d12},~{d13},~{d14},~{d15}"() nounwind
+  ret void
+}
+
+; Aligned spilling only works for contiguous ranges starting from d8.
+; The rest goes to the standard vpush instructions.
+; NEON: f3plus4
+; NEON: push {r4, r7, lr}
+; NEON: vpush {d12, d13, d14, d15}
+; NEON: sub.w r4, sp, #24
+; NEON: bic r4, r4, #15
+; Stack pointer must be updated before the spills.
+; NEON: mov sp, r4
+; NEON: vst1.64 {d8, d9}, [r4, :128]
+; NEON: vstr d10, [r4, #16]
+; Epilog
+; NEON: vld1.64 {d8, d9},
+; NEON: vldr d10, [{{.*}}, #16]
+; The stack pointer restore must happen after the reloads.
+; NEON: mov sp,
+; NEON: vpop {d12, d13, d14, d15}
+; NEON: pop
author	Jakob Stoklund Olesen <stoklund@2pi.dk>	2011-12-23 00:36:18 +0000
committer	Jakob Stoklund Olesen <stoklund@2pi.dk>	2011-12-23 00:36:18 +0000
commit	f06f6f50e9844b88cfbb9fb896fff9c3a752966b (patch)
tree	346f10d968d528bc683511a5e7a2ae271e23f9a6 /test/CodeGen
parent	f4aea8f34946d4d2b101b8e3c6db95c18be80173 (diff)
download	external_llvm-f06f6f50e9844b88cfbb9fb896fff9c3a752966b.zip external_llvm-f06f6f50e9844b88cfbb9fb896fff9c3a752966b.tar.gz external_llvm-f06f6f50e9844b88cfbb9fb896fff9c3a752966b.tar.bz2