diff options
author | Evan Cheng <evan.cheng@apple.com> | 2011-03-29 01:56:09 +0000 |
---|---|---|
committer | Evan Cheng <evan.cheng@apple.com> | 2011-03-29 01:56:09 +0000 |
commit | 78fe9ababead2168f7196c6a47402cf499a0aaf7 (patch) | |
tree | 625da1ee1c53c784a40e7160f7ef3faf6ea52fc6 /test/CodeGen | |
parent | 79abc9dd4a306d4ec42d09e2673a94abd225bcdc (diff) | |
download | external_llvm-78fe9ababead2168f7196c6a47402cf499a0aaf7.zip external_llvm-78fe9ababead2168f7196c6a47402cf499a0aaf7.tar.gz external_llvm-78fe9ababead2168f7196c6a47402cf499a0aaf7.tar.bz2 |
Optimizing (zext A + zext B) * C to (VMULL A, C) + (VMULL B, C) during
isel lowering to fold the zero-extends and take advantage of no-stall
back-to-back vmul + vmla:
vmull q0, d4, d6
vmlal q0, d5, d6
is faster than
vaddl q0, d4, d5
vmovl q1, d6
vmul q0, q0, q1
This allows us to use vmull + vmlal for:
f = vmull_u8( vget_high_u8(s), c);
f = vmlal_u8(f, vget_low_u8(s), c);
rdar://9197392
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@128444 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen')
-rw-r--r-- | test/CodeGen/ARM/vmul.ll | 29 |
1 files changed, 29 insertions, 0 deletions
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll index ee033ca..585394e 100644 --- a/test/CodeGen/ARM/vmul.ll +++ b/test/CodeGen/ARM/vmul.ll @@ -339,3 +339,32 @@ define <2 x i64> @vmull_extvec_u32(<2 x i32> %arg) nounwind { %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234> ret <2 x i64> %tmp4 } + +; rdar://9197392 +define void @distribue(i16* %dst, i8* %src, i32 %mul) nounwind { +entry: +; CHECK: distribue: +; CHECK: vmull.u8 [[REG1:(q[0-9]+)]], d{{.*}}, [[REG2:(d[0-9]+)]] +; CHECK: vmlal.u8 [[REG1]], d{{.*}}, [[REG2]] + %0 = trunc i32 %mul to i8 + %1 = insertelement <8 x i8> undef, i8 %0, i32 0 + %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1) + %4 = bitcast <16 x i8> %3 to <2 x double> + %5 = extractelement <2 x double> %4, i32 1 + %6 = bitcast double %5 to <8 x i8> + %7 = zext <8 x i8> %6 to <8 x i16> + %8 = zext <8 x i8> %2 to <8 x i16> + %9 = extractelement <2 x double> %4, i32 0 + %10 = bitcast double %9 to <8 x i8> + %11 = zext <8 x i8> %10 to <8 x i16> + %12 = add <8 x i16> %7, %11 + %13 = mul <8 x i16> %12, %8 + %14 = bitcast i16* %dst to i8* + tail call void @llvm.arm.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2) + ret void +} + +declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly + +declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind |