Distribute (A + B) * C to (A * C) + (B * C) to make use of NEON multiplier

accumulator forwarding:

    vadd d3, d0, d1
    vmul d3, d3, d2
=>
    vmul d3, d0, d2
    vmla d3, d1, d2

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@128665 91177308-0d34-0410-b5e6-96231b3b80d8
author: Evan Cheng <evan.cheng@apple.com> 2011-03-31 19:38:48 +0000
committer: Evan Cheng <evan.cheng@apple.com> 2011-03-31 19:38:48 +0000
commit: 463d358f1dfdd28a6900f2f109a160be71d2a8ef (patch)
tree: 59c104ce5689ba7b4afe0fabc7d067a3c2703c8b /test/CodeGen
parent: a52d7da1d8c424276f79b80c89ed045166083730 (diff)
download: external_llvm-463d358f1dfdd28a6900f2f109a160be71d2a8ef.zip
external_llvm-463d358f1dfdd28a6900f2f109a160be71d2a8ef.tar.gz
external_llvm-463d358f1dfdd28a6900f2f109a160be71d2a8ef.tar.bz2
1 files changed, 27 insertions, 1 deletions
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index 80ba9be..1fd6581 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
 
 define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vmuli8:
@@ -466,3 +466,29 @@ entry:
 declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly
 
 declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
+
+; Take advantage of the Cortex-A8 multiplier accumulator forwarding.
+
+%struct.uint8x8_t = type { <8 x i8> }
+
+define void @distribue2(%struct.uint8x8_t* nocapture %dst, i8* %src, i32 %mul) nounwind { ; stores (hi + lo) * splat(%mul) as <8 x i8> into %dst, where hi/lo are the two halves of 16 bytes loaded from %src
+entry: ; the FileCheck lines below expect vmul.i8 followed by vmla.i8, with no vadd.i8 ahead of the vmul.i8
+; CHECK: distribue2
+; CHECK-NOT: vadd.i8
+; CHECK: vmul.i8
+; CHECK: vmla.i8
+  %0 = trunc i32 %mul to i8  ; keep only the low 8 bits of the scalar multiplier
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0  ; place that byte in lane 0
+  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer  ; all-zero mask: every lane takes lane 0, i.e. a splat (the C operand)
+  %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1)  ; load a <16 x i8> from %src; NOTE(review): the i32 1 is presumably the alignment operand - confirm against the intrinsic definition
+  %4 = bitcast <16 x i8> %3 to <2 x double>  ; reinterpret as two 64-bit elements so each half can be extracted
+  %5 = extractelement <2 x double> %4, i32 1  ; element 1 of the loaded vector
+  %6 = bitcast double %5 to <8 x i8>  ; view it as <8 x i8> (the A operand)
+  %7 = extractelement <2 x double> %4, i32 0  ; element 0 of the loaded vector
+  %8 = bitcast double %7 to <8 x i8>  ; view it as <8 x i8> (the B operand)
+  %9 = add <8 x i8> %6, %8  ; A + B - the add the test expects not to appear as vadd.i8
+  %10 = mul <8 x i8> %9, %2  ; (A + B) * C - the pattern expected to become vmul.i8 + vmla.i8
+  %11 = getelementptr inbounds %struct.uint8x8_t* %dst, i32 0, i32 0  ; address of the struct's only field, the <8 x i8>
+  store <8 x i8> %10, <8 x i8>* %11, align 8  ; write the result to %dst
+  ret void
+}
author	Evan Cheng <evan.cheng@apple.com>	2011-03-31 19:38:48 +0000
committer	Evan Cheng <evan.cheng@apple.com>	2011-03-31 19:38:48 +0000
commit	463d358f1dfdd28a6900f2f109a160be71d2a8ef (patch)
tree	59c104ce5689ba7b4afe0fabc7d067a3c2703c8b /test/CodeGen
parent	a52d7da1d8c424276f79b80c89ed045166083730 (diff)
download	external_llvm-463d358f1dfdd28a6900f2f109a160be71d2a8ef.zip external_llvm-463d358f1dfdd28a6900f2f109a160be71d2a8ef.tar.gz external_llvm-463d358f1dfdd28a6900f2f109a160be71d2a8ef.tar.bz2