author | Hal Finkel <hfinkel@anl.gov> | 2013-05-24 23:00:14 +0000 |
---|---|---|
committer | Hal Finkel <hfinkel@anl.gov> | 2013-05-24 23:00:14 +0000 |
commit | 80d10ded8cd4f34b87d82b03d6f63328ea337b26 (patch) | |
tree | 238b9081baf073661f9ae2bfc3ed8e22f7935110 /test/CodeGen/PowerPC/unal-altivec.ll | |
parent | 3b77151a61d2985ad5b29ee3d05b34d553322c2a (diff) | |
PPC: Initial support for permutation-based unaligned Altivec loads
Altivec only directly supports aligned loads, but the loads have a strange
property: If given an unaligned address, they truncate the address to the next
lower aligned address, and load from there. This property, along with an extra
load and some special-purpose permutation-control instructions that generate
the appropriate permutations from the original unaligned address, allows
efficient lowering of unaligned loads. This code uses the trick explained in the
Apple Velocity Engine optimization overview document to prevent the needed
extra load from possibly causing a page fault if the original address happens
to be aligned.
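
As a sketch of the recipe described above (not part of this commit): with the standard AltiVec intrinsics from <altivec.h>, a permutation-based unaligned load can be written roughly as follows. The helper name load_unaligned is purely illustrative.

```c
#include <altivec.h>

/* Illustrative sketch of the Apple Velocity Engine unaligned-load recipe;
 * the function name is hypothetical and not part of this commit. */
static vector float load_unaligned(const float *src) {
  /* Two aligned loads: lvx ignores the low address bits, so these fetch the
   * quadword containing src and the quadword containing src + 15 bytes.
   * Using an offset of 15 (vector length - 1) rather than 16 means that when
   * src is already 16-byte aligned, both loads hit the same quadword, so the
   * extra load cannot fault on the following page. */
  vector float msq = vec_ld(0, src);
  vector float lsq = vec_ld(15, src);

  /* lvsl builds a permute-control vector from the misalignment of src. */
  vector unsigned char mask = vec_lvsl(0, src);

  /* vperm selects the 16 bytes starting at src out of the two aligned loads. */
  return vec_perm(msq, lsq, mask);
}
```

The resulting lvx/lvsl/vperm sequence is what the CHECK lines in the test below look for.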
As noted in the FIXMEs, there are several additional optimizations that can be
performed to reduce the cost of these loads even more. These will be
implemented in future commits.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182691 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen/PowerPC/unal-altivec.ll')
-rw-r--r-- | test/CodeGen/PowerPC/unal-altivec.ll | 45 |
1 file changed, 45 insertions, 0 deletions
diff --git a/test/CodeGen/PowerPC/unal-altivec.ll b/test/CodeGen/PowerPC/unal-altivec.ll
new file mode 100644
index 0000000..f89f299
--- /dev/null
+++ b/test/CodeGen/PowerPC/unal-altivec.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mcpu=g5 | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @foo(float* noalias nocapture %a, float* noalias nocapture %b) #0 {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds float* %b, i64 %index
+  %1 = bitcast float* %0 to <4 x float>*
+  %wide.load = load <4 x float>* %1, align 4
+  %.sum11 = or i64 %index, 4
+  %2 = getelementptr float* %b, i64 %.sum11
+  %3 = bitcast float* %2 to <4 x float>*
+  %wide.load8 = load <4 x float>* %3, align 4
+  %4 = fadd <4 x float> %wide.load, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+  %5 = fadd <4 x float> %wide.load8, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+  %6 = getelementptr inbounds float* %a, i64 %index
+  %7 = bitcast float* %6 to <4 x float>*
+  store <4 x float> %4, <4 x float>* %7, align 4
+  %.sum12 = or i64 %index, 4
+  %8 = getelementptr float* %a, i64 %.sum12
+  %9 = bitcast float* %8 to <4 x float>*
+  store <4 x float> %5, <4 x float>* %9, align 4
+  %index.next = add i64 %index, 8
+  %10 = icmp eq i64 %index.next, 16000
+  br i1 %10, label %for.end, label %vector.body
+
+; CHECK: @foo
+; CHECK: lvx [[CNST:[0-9]+]],
+; CHECK-DAG: lvsl [[PC:[0-9]+]], [[B1:[0-9]+]], [[B2:[0-9]+]]
+; CHECK-DAG: lvx [[LD1:[0-9]+]], [[B1]], [[B2]]
+; CHECK-DAG: add [[B3:[0-9]+]], [[B1]], [[B2]]
+; CHECK-DAG: lvx [[LD2:[0-9]+]], [[B3]],
+; CHECK-DAG: vperm [[R1:[0-9]+]], [[LD1]], [[LD2]], [[PC]]
+; CHECK: vaddfp {{[0-9]+}}, [[R1]], [[CNST]]
+; CHECK: blr
+
+for.end:                                          ; preds = %vector.body
+  ret void
+}
+
+attributes #0 = { nounwind }
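
For context, the vectorized IR in this test corresponds to a simple scalar loop along these lines; the C below is a reconstruction for illustration only and is not a file from the commit.

```c
/* Hypothetical scalar source for the vectorized loop in unal-altivec.ll:
 * 16000 float additions, which the vectorizer turns into the unaligned
 * <4 x float> loads, fadds, and stores checked above. */
void foo(float *restrict a, float *restrict b) {
  for (int i = 0; i < 16000; ++i)
    a[i] = b[i] + 1.0f;
}
```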