diff options
author | Weiming Zhao <weimingz@codeaurora.org> | 2013-09-25 23:12:06 +0000 |
---|---|---|
committer | Weiming Zhao <weimingz@codeaurora.org> | 2013-09-25 23:12:06 +0000 |
commit | 541681c8485c18b564970c80180a798b2c1663e8 (patch) | |
tree | caf8c901cb8c2f1025ae709169dc8d0f44384f26 | |
parent | 498ffb8a568992d613e654ddec69b04d350aec20 (diff) | |
download | external_llvm-541681c8485c18b564970c80180a798b2c1663e8.zip external_llvm-541681c8485c18b564970c80180a798b2c1663e8.tar.gz external_llvm-541681c8485c18b564970c80180a798b2c1663e8.tar.bz2 |
Fix PR 17368: disable vector mul distribution for square of add/sub for ARM
Generally, it is desirable to distribute (a + b) * c to a*c + b*c for
ARM with VMLx forwarding, where a, b and c are vectors.
However, for (a + b)*(a + b), distribution will result in one extra
instruction.
With distribution:
x = a + b (add)
y = a * x (mul)
z = y + b * x (mla)
Without distribution:
x = a + b (add)
z = x * x (mul)
This patch checks if a mul is a square of add/sub. If yes, skip
distribution.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@191410 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Target/ARM/ARMISelLowering.cpp | 10 | ||||
-rw-r--r-- | test/CodeGen/ARM/vmul.ll | 11 |
2 files changed, 21 insertions, 0 deletions
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index c83f7b1..773b710 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -8342,6 +8342,13 @@ static SDValue PerformSUBCombine(SDNode *N, /// is faster than /// vadd d3, d0, d1 /// vmul d3, d3, d2 +// However, for (A + B) * (A + B), +// vadd d2, d0, d1 +// vmul d3, d0, d2 +// vmla d3, d1, d2 +// is slower than +// vadd d2, d0, d1 +// vmul d3, d2, d2 static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { @@ -8361,6 +8368,9 @@ static SDValue PerformVMULCombine(SDNode *N, std::swap(N0, N1); } + if (N0 == N1) + return SDValue(); + EVT VT = N->getValueType(0); SDLoc DL(N); SDValue N00 = N0->getOperand(0); diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll index 5e5e99b..de329ac 100644 --- a/test/CodeGen/ARM/vmul.ll +++ b/test/CodeGen/ARM/vmul.ll @@ -515,6 +515,17 @@ entry: ret void } +define <8 x i8> @no_distribute(<8 x i8> %a, <8 x i8> %b) nounwind { +entry: +; CHECK: no_distribute +; CHECK: vadd.i8 +; CHECK: vmul.i8 +; CHECK-NOT: vmla.i8 + %0 = add <8 x i8> %a, %b + %1 = mul <8x i8> %0, %0 + ret <8 x i8> %1 +} + ; If one operand has a zero-extend and the other a sign-extend, vmull ; cannot be used. define i16 @vmullWithInconsistentExtensions(<8 x i8> %vec) { |