diff options
author | Evan Cheng <evan.cheng@apple.com> | 2011-03-31 19:38:48 +0000 |
---|---|---|
committer | Evan Cheng <evan.cheng@apple.com> | 2011-03-31 19:38:48 +0000 |
commit | 463d358f1dfdd28a6900f2f109a160be71d2a8ef (patch) | |
tree | 59c104ce5689ba7b4afe0fabc7d067a3c2703c8b /test/CodeGen | |
parent | a52d7da1d8c424276f79b80c89ed045166083730 (diff) | |
download | external_llvm-463d358f1dfdd28a6900f2f109a160be71d2a8ef.zip external_llvm-463d358f1dfdd28a6900f2f109a160be71d2a8ef.tar.gz external_llvm-463d358f1dfdd28a6900f2f109a160be71d2a8ef.tar.bz2 |
Distribute (A + B) * C to (A * C) + (B * C) to make use of NEON multiplier
accumulator forwarding:
vadd d3, d0, d1
vmul d3, d3, d2
=>
vmul d3, d0, d2
vmla d3, d1, d2
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@128665 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen')
-rw-r--r-- | test/CodeGen/ARM/vmul.ll | 28 |
1 files changed, 27 insertions, 1 deletions
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll index 80ba9be..1fd6581 100644 --- a/test/CodeGen/ARM/vmul.ll +++ b/test/CodeGen/ARM/vmul.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s +; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vmuli8: @@ -466,3 +466,29 @@ entry: declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind + +; Take advantage of the Cortex-A8 multiplier accumulator forward. + +%struct.uint8x8_t = type { <8 x i8> } + +define void @distribue2(%struct.uint8x8_t* nocapture %dst, i8* %src, i32 %mul) nounwind { +entry: +; CHECK: distribue2 +; CHECK-NOT: vadd.i8 +; CHECK: vmul.i8 +; CHECK: vmla.i8 + %0 = trunc i32 %mul to i8 + %1 = insertelement <8 x i8> undef, i8 %0, i32 0 + %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1) + %4 = bitcast <16 x i8> %3 to <2 x double> + %5 = extractelement <2 x double> %4, i32 1 + %6 = bitcast double %5 to <8 x i8> + %7 = extractelement <2 x double> %4, i32 0 + %8 = bitcast double %7 to <8 x i8> + %9 = add <8 x i8> %6, %8 + %10 = mul <8 x i8> %9, %2 + %11 = getelementptr inbounds %struct.uint8x8_t* %dst, i32 0, i32 0 + store <8 x i8> %10, <8 x i8>* %11, align 8 + ret void +} |