aboutsummaryrefslogtreecommitdiffstats
path: root/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
diff options
context:
space:
mode:
authorBenjamin Kramer <benny.kra@googlemail.com>2011-01-15 20:30:30 +0000
committerBenjamin Kramer <benny.kra@googlemail.com>2011-01-15 20:30:30 +0000
commitb6516aeef12a05aa47515f76e18fc426d85babbd (patch)
tree673d40370dbf9d6dfec81d45a5fbe5543185b99b /lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
parent248f9f280756a1d6eb20a97ee4ab457ec18170f6 (diff)
downloadexternal_llvm-b6516aeef12a05aa47515f76e18fc426d85babbd.zip
external_llvm-b6516aeef12a05aa47515f76e18fc426d85babbd.tar.gz
external_llvm-b6516aeef12a05aa47515f76e18fc426d85babbd.tar.bz2
Reimplement CTPOP legalization with the "best" algorithm from
http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel In a silly microbenchmark on a 65 nm core2 this is 1.5x faster than the old code in 32 bit mode and about 2x faster in 64 bit mode. It's also a lot shorter, especially when counting 64 bit population on a 32 bit target. I hope this is fast enough to replace Kernighan-style counting loops even when the input is rather sparse. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@123547 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/CodeGen/SelectionDAG/LegalizeDAG.cpp')
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeDAG.cpp63
1 files changed, 45 insertions, 18 deletions
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index cbe3845..3a29424 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2388,6 +2388,17 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, DebugLoc dl) {
}
}
+/// SplatByte - Distribute ByteVal over NumBits bits.
+static APInt SplatByte(unsigned NumBits, uint8_t ByteVal) {
+ APInt Val = APInt(NumBits, ByteVal);
+ unsigned Shift = 8;
+ for (unsigned i = NumBits; i > 8; i >>= 1) {
+ Val = (Val << Shift) | Val;
+ Shift <<= 1;
+ }
+ return Val;
+}
+
/// ExpandBitCount - Expand the specified bitcount instruction into operations.
///
SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
@@ -2395,26 +2406,42 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
switch (Opc) {
default: assert(0 && "Cannot expand this yet!");
case ISD::CTPOP: {
- static const uint64_t mask[6] = {
- 0x5555555555555555ULL, 0x3333333333333333ULL,
- 0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
- 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL
- };
EVT VT = Op.getValueType();
EVT ShVT = TLI.getShiftAmountTy();
- unsigned len = VT.getSizeInBits();
- for (unsigned i = 0; (1U << i) <= (len / 2); ++i) {
- //x = (x & mask[i][len/8]) + (x >> (1 << i) & mask[i][len/8])
- unsigned EltSize = VT.isVector() ?
- VT.getVectorElementType().getSizeInBits() : len;
- SDValue Tmp2 = DAG.getConstant(APInt(EltSize, mask[i]), VT);
- SDValue Tmp3 = DAG.getConstant(1ULL << i, ShVT);
- Op = DAG.getNode(ISD::ADD, dl, VT,
- DAG.getNode(ISD::AND, dl, VT, Op, Tmp2),
- DAG.getNode(ISD::AND, dl, VT,
- DAG.getNode(ISD::SRL, dl, VT, Op, Tmp3),
- Tmp2));
- }
+ unsigned Len = VT.getSizeInBits();
+
+ // This is the "best" algorithm from
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+
+ SDValue Mask55 = DAG.getConstant(SplatByte(Len, 0x55), VT);
+ SDValue Mask33 = DAG.getConstant(SplatByte(Len, 0x33), VT);
+ SDValue Mask0F = DAG.getConstant(SplatByte(Len, 0x0F), VT);
+ SDValue Mask01 = DAG.getConstant(SplatByte(Len, 0x01), VT);
+
+ // v = v - ((v >> 1) & 0x55555555...)
+ Op = DAG.getNode(ISD::SUB, dl, VT, Op,
+ DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRL, dl, VT, Op,
+ DAG.getConstant(1, ShVT)),
+ Mask55));
+ // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+ Op = DAG.getNode(ISD::ADD, dl, VT,
+ DAG.getNode(ISD::AND, dl, VT, Op, Mask33),
+ DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRL, dl, VT, Op,
+ DAG.getConstant(2, ShVT)),
+ Mask33));
+ // v = (v + (v >> 4)) & 0x0F0F0F0F...
+ Op = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::ADD, dl, VT, Op,
+ DAG.getNode(ISD::SRL, dl, VT, Op,
+ DAG.getConstant(4, ShVT))),
+ Mask0F);
+ // v = (v * 0x01010101...) >> (Len - 8)
+ Op = DAG.getNode(ISD::SRL, dl, VT,
+ DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
+ DAG.getConstant(Len - 8, ShVT));
+
return Op;
}
case ISD::CTLZ: {