aboutsummaryrefslogtreecommitdiffstats
path: root/test/CodeGen/PowerPC/unal-altivec.ll
diff options
context:
space:
mode:
author: Hal Finkel <hfinkel@anl.gov> 2013-05-24 23:00:14 +0000
committer: Hal Finkel <hfinkel@anl.gov> 2013-05-24 23:00:14 +0000
commit: 80d10ded8cd4f34b87d82b03d6f63328ea337b26 (patch)
tree: 238b9081baf073661f9ae2bfc3ed8e22f7935110 /test/CodeGen/PowerPC/unal-altivec.ll
parent: 3b77151a61d2985ad5b29ee3d05b34d553322c2a (diff)
downloadexternal_llvm-80d10ded8cd4f34b87d82b03d6f63328ea337b26.zip
external_llvm-80d10ded8cd4f34b87d82b03d6f63328ea337b26.tar.gz
external_llvm-80d10ded8cd4f34b87d82b03d6f63328ea337b26.tar.bz2
PPC: Initial support for permutation-based unaligned Altivec loads
Altivec only directly supports aligned loads, but the loads have a strange property: If given an unaligned address, they truncate the address to the next lower aligned address, and load from there. This property, along with an extra load and some special-purpose permutation-control instructions that generate the appropriate permutations from the original unaligned address, allows efficient lowering of unaligned loads. This code uses the trick explained in the Apple Velocity Engine optimization overview document to prevent the needed extra load from possibly causing a page fault if the original address happens to be aligned. As noted in the FIXMEs, there are several additional optimizations that can be performed to reduce the cost of these loads even more. These will be implemented in future commits. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182691 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen/PowerPC/unal-altivec.ll')
-rw-r--r--test/CodeGen/PowerPC/unal-altivec.ll45
1 files changed, 45 insertions, 0 deletions
diff --git a/test/CodeGen/PowerPC/unal-altivec.ll b/test/CodeGen/PowerPC/unal-altivec.ll
new file mode 100644
index 0000000..f89f299
--- /dev/null
+++ b/test/CodeGen/PowerPC/unal-altivec.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mcpu=g5 | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" ; "E" = big-endian; 64-bit pointers; 128-bit vectors are 128-bit aligned
+target triple = "powerpc64-unknown-linux-gnu" ; 64-bit PowerPC Linux target
+
+define void @foo(float* noalias nocapture %a, float* noalias nocapture %b) #0 { ; a[i] = b[i] + 1.0 for i in [0, 16000), processed 8 floats per iteration
+vector.ph: ; entry block: only branches into the loop
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; element index: 0 on entry, %index.next on the back edge
+ %0 = getelementptr inbounds float* %b, i64 %index ; &b[index]
+ %1 = bitcast float* %0 to <4 x float>* ; reinterpret as a pointer to a 4-float vector
+ %wide.load = load <4 x float>* %1, align 4 ; vector load with only 4-byte alignment, i.e. an unaligned 16-byte load
+ %.sum11 = or i64 %index, 4 ; index + 4 (index is always a multiple of 8, so or behaves as add here)
+ %2 = getelementptr float* %b, i64 %.sum11 ; &b[index + 4]
+ %3 = bitcast float* %2 to <4 x float>*
+ %wide.load8 = load <4 x float>* %3, align 4 ; second 4-byte-aligned vector load
+ %4 = fadd <4 x float> %wide.load, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> ; add 1.0 to every lane of the first vector
+ %5 = fadd <4 x float> %wide.load8, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> ; add 1.0 to every lane of the second vector
+ %6 = getelementptr inbounds float* %a, i64 %index ; &a[index]
+ %7 = bitcast float* %6 to <4 x float>*
+ store <4 x float> %4, <4 x float>* %7, align 4 ; vector store with only 4-byte alignment
+ %.sum12 = or i64 %index, 4 ; index + 4 again, same computation as %.sum11
+ %8 = getelementptr float* %a, i64 %.sum12 ; &a[index + 4]
+ %9 = bitcast float* %8 to <4 x float>*
+ store <4 x float> %5, <4 x float>* %9, align 4
+ %index.next = add i64 %index, 8 ; advance by 8 floats (two 4-wide vectors)
+ %10 = icmp eq i64 %index.next, 16000 ; loop ends once 16000 elements have been processed
+ br i1 %10, label %for.end, label %vector.body
+; Expected output below: a constant-vector lvx, then lvsl plus two lvx feeding a vperm (the permuted unaligned load), whose result goes into vaddfp with the constant.
+; CHECK: @foo
+; CHECK: lvx [[CNST:[0-9]+]],
+; CHECK-DAG: lvsl [[PC:[0-9]+]], [[B1:[0-9]+]], [[B2:[0-9]+]]
+; CHECK-DAG: lvx [[LD1:[0-9]+]], [[B1]], [[B2]]
+; CHECK-DAG: add [[B3:[0-9]+]], [[B1]], [[B2]]
+; CHECK-DAG: lvx [[LD2:[0-9]+]], [[B3]],
+; CHECK-DAG: vperm [[R1:[0-9]+]], [[LD1]], [[LD2]], [[PC]]
+; CHECK: vaddfp {{[0-9]+}}, [[R1]], [[CNST]]
+; CHECK: blr
+
+for.end: ; preds = %vector.body
+ ret void
+}
+
+attributes #0 = { nounwind } ; attribute group #0, referenced by the definition of @foo