From e7a1e3ee8279f12d0f2b49fb198d577949795c88 Mon Sep 17 00:00:00 2001 From: Michael Gottesman Date: Sun, 17 Nov 2013 03:25:24 +0000 Subject: [block-freq] Add BlockFrequency::scale that returns a remainder from the division and make the private scale in BlockFrequency more performant. This change is the first in a series of changes improving LLVM's Block Frequency propogation implementation to not lose probability mass in branchy code when propogating block frequency information from a basic block to its successors. This patch is a simple infrastructure improvement that does not actually modify the block frequency algorithm. The specific changes are: 1. Changes the division algorithm used when scaling block frequencies by branch probabilities to a short division algorithm. This gives us the remainder for free as well as provides a nice speed boost. When I benched the old routine and the new routine on a Sandy Bridge iMac with disabled turbo mode performing 8192 iterations on an array of length 32768, I saw ~600% increase in speed in mean/median performance. 2. Exposes a scale method that returns a remainder. This is important so we can ensure that when we scale a block frequency by some branch probability BP = N/D, the remainder from the division by D can be retrieved and propagated to other children to ensure no probability mass is lost (more to come on this). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194950 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/BlockFrequency.cpp | 98 ++++++++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 38 deletions(-) (limited to 'lib') diff --git a/lib/Support/BlockFrequency.cpp b/lib/Support/BlockFrequency.cpp index 5e45e46..00efe90 100644 --- a/lib/Support/BlockFrequency.cpp +++ b/lib/Support/BlockFrequency.cpp @@ -19,52 +19,69 @@ using namespace llvm; /// Multiply FREQ by N and store result in W array. -static void mult96bit(uint64_t freq, uint32_t N, uint64_t W[2]) { +static void mult96bit(uint64_t freq, uint32_t N, uint32_t W[3]) { uint64_t u0 = freq & UINT32_MAX; uint64_t u1 = freq >> 32; - // Represent 96-bit value as w[2]:w[1]:w[0]; - uint32_t w[3] = { 0, 0, 0 }; - + // Represent 96-bit value as W[2]:W[1]:W[0]; uint64_t t = u0 * N; uint64_t k = t >> 32; - w[0] = t; + W[0] = t; t = u1 * N + k; - w[1] = t; - w[2] = t >> 32; - - // W[1] - higher bits. - // W[0] - lower bits. - W[0] = w[0] + ((uint64_t) w[1] << 32); - W[1] = w[2]; + W[1] = t; + W[2] = t >> 32; } - -/// Divide 96-bit value stored in W array by D. -/// Return 64-bit quotient, saturated to UINT64_MAX on overflow. -static uint64_t div96bit(uint64_t W[2], uint32_t D) { - uint64_t y = W[0]; - uint64_t x = W[1]; - unsigned i; - - assert(x != 0 && "This is really a 64-bit division"); - - // This long division algorithm automatically saturates on overflow. - for (i = 0; i < 64 && x; ++i) { - uint32_t t = -((x >> 31) & 1); // Splat bit 31 to bits 0-31. - x = (x << 1) | (y >> 63); - y = y << 1; - if ((x | t) >= D) { - x -= D; - ++y; +/// Divide 96-bit value stored in W[2]:W[1]:W[0] by D. Since our word size is a +/// 32 bit unsigned integer, we can use a short division algorithm. +static uint64_t divrem96bit(uint32_t W[3], uint32_t D, uint32_t *Rout) { + // We assume that W[2] is non-zero since if W[2] is not then the user should + // just use hardware division. + assert(W[2] && "This routine assumes that W[2] is non-zero since if W[2] is " + "zero, the caller should just use 64/32 hardware."); + uint32_t Q[3] = { 0, 0, 0 }; + + // The generalized short division algorithm sets i to m + n - 1, where n is + // the number of words in the divisior and m is the number of words by which + // the divident exceeds the divisor (i.e. m + n == the length of the dividend + // in words). Due to our assumption that W[2] is non-zero, we know that the + // dividend is of length 3 implying since n is 1 that m = 2. Thus we set i to + // m + n - 1 = 2 + 1 - 1 = 2. + uint32_t R = 0; + for (int i = 2; i >= 0; --i) { + uint64_t PartialD = uint64_t(R) << 32 | W[i]; + if (PartialD == 0) { + Q[i] = 0; + R = 0; + } else if (PartialD < D) { + Q[i] = 0; + R = uint32_t(PartialD); + } else if (PartialD == D) { + Q[i] = 1; + R = 0; + } else { + Q[i] = uint32_t(PartialD / D); + R = uint32_t(PartialD - (Q[i] * D)); } } - return y << (64 - i); -} + // If Q[2] is non-zero, then we overflowed. + uint64_t Result; + if (Q[2]) { + Result = UINT64_MAX; + R = D; + } else { + // Form the final uint64_t result, avoiding endianness issues. + Result = uint64_t(Q[0]) | (uint64_t(Q[1]) << 32); + } + + if (Rout) + *Rout = R; + return Result; +} -void BlockFrequency::scale(uint32_t N, uint32_t D) { +uint32_t BlockFrequency::scale(uint32_t N, uint32_t D) { assert(D != 0 && "Division by zero"); // Calculate Frequency * N. @@ -75,15 +92,16 @@ void BlockFrequency::scale(uint32_t N, uint32_t D) { // If the product fits in 64 bits, just use built-in division. if (MulHi <= UINT32_MAX && MulRes >= MulLo) { Frequency = MulRes / D; - return; + return MulRes % D; } // Product overflowed, use 96-bit operations. - // 96-bit value represented as W[1]:W[0]. - uint64_t W[2]; + // 96-bit value represented as W[2]:W[1]:W[0]. + uint32_t W[3]; + uint32_t R; mult96bit(Frequency, N, W); - Frequency = div96bit(W, D); - return; + Frequency = divrem96bit(W, D, &R); + return R; } BlockFrequency &BlockFrequency::operator*=(const BranchProbability &Prob) { @@ -127,6 +145,10 @@ BlockFrequency::operator+(const BlockFrequency &Prob) const { return Freq; } +uint32_t BlockFrequency::scale(const BranchProbability &Prob) { + return scale(Prob.getNumerator(), Prob.getDenominator()); +} + void BlockFrequency::print(raw_ostream &OS) const { // Convert fixed-point number to decimal. OS << Frequency / getEntryFrequency() << "."; -- cgit v1.1