diff options
author | Arnold Schwaighofer <aschwaighofer@apple.com> | 2013-02-20 21:33:32 +0000 |
---|---|---|
committer | Arnold Schwaighofer <aschwaighofer@apple.com> | 2013-02-20 21:33:32 +0000 |
commit | c46e2df74cf75a33742f57d2b4d6c6fcf73bced9 (patch) | |
tree | 2527c3b68b24b2e2fb335d6c2988c382ee9e950a | |
parent | 64f3e763cd8e4f32f91ae5b44ac4bd9986afddf2 (diff) | |
download | external_llvm-c46e2df74cf75a33742f57d2b4d6c6fcf73bced9.zip external_llvm-c46e2df74cf75a33742f57d2b4d6c6fcf73bced9.tar.gz external_llvm-c46e2df74cf75a33742f57d2b4d6c6fcf73bced9.tar.bz2 |
DAGCombiner: Fold pointless truncate, bitcast, buildvector series
(2xi32) (truncate ((2xi64) bitcast (buildvector i32 a, i32 x, i32 b, i32 y)))
can be folded into a (2xi32) (buildvector i32 a, i32 b).
Such a DAG would cause uneccessary vdup instructions followed by vmovn
instructions.
We generate this code on ARM NEON for a setcc olt, 2xf64, 2xf64. For example, in
the vectorized version of the code below.
double A[N];
double B[N];
void test_double_compare_to_double() {
int i;
for(i=0;i<N;i++)
A[i] = (double)(A[i] < B[i]);
}
radar://13191881
Fixes bug 15283.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@175670 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 32 | ||||
-rw-r--r-- | test/CodeGen/ARM/neon_cmp.ll | 15 |
2 files changed, 47 insertions, 0 deletions
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 9d40ff7..2777d7c 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -5361,6 +5361,38 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { } } + // Fold a series of buildvector, bitcast, and truncate if possible. + // For example fold + // (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to + // (2xi32 (buildvector x, y)). + if (Level == AfterLegalizeVectorOps && VT.isVector() && + N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() && + N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && + N0.getOperand(0).hasOneUse()) { + + SDValue BuildVect = N0.getOperand(0); + EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType(); + EVT TruncVecEltTy = VT.getVectorElementType(); + + // Check that the element types match. + if (BuildVectEltTy == TruncVecEltTy) { + // Now we only need to compute the offset of the truncated elements. + unsigned BuildVecNumElts = BuildVect.getNumOperands(); + unsigned TruncVecNumElts = VT.getVectorNumElements(); + unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts; + + assert((BuildVecNumElts % TruncVecNumElts) == 0 && + "Invalid number of elements"); + + SmallVector<SDValue, 8> Opnds; + for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset) + Opnds.push_back(BuildVect.getOperand(i)); + + return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT, &Opnds[0], + Opnds.size()); + } + } + // See if we can simplify the input to this truncate through knowledge that // only the low bits are being used. // For example "trunc (or (shl x, 8), y)" // -> trunc y diff --git a/test/CodeGen/ARM/neon_cmp.ll b/test/CodeGen/ARM/neon_cmp.ll new file mode 100644 index 0000000..046b5da --- /dev/null +++ b/test/CodeGen/ARM/neon_cmp.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s +; bug 15283 +; radar://13191881 +; CHECK: vfcmp +define void @vfcmp(<2 x double>* %a, <2 x double>* %b) { + %wide.load = load <2 x double>* %a, align 4 + %wide.load2 = load <2 x double>* %b, align 4 +; CHECK-NOT: vdup.32 +; CHECK-NOT: vmovn.i64 + %v1 = fcmp olt <2 x double> %wide.load, %wide.load2 + %v2 = zext <2 x i1> %v1 to <2 x i32> + %v3 = sitofp <2 x i32> %v2 to <2 x double> + store <2 x double> %v3, <2 x double>* %b, align 4 + ret void +} |