Improve dynamic audio resampler filter generation

Improve dynamic audio resampler filter generation speed by 2x. The resulting filters should be the same (excepting roundoff). Also added check for upsampling sample rate changes to share previously generated filters. Modify the profiling to improve output format and sampling result reliability. Change-Id: I9aa6b914fd552a63f79dd4a95945df2f8275772a Signed-off-by: Andy Hung <hunga@google.com>
author: Andy Hung <hunga@google.com> 2014-01-03 12:30:41 -0800
committer: Andy Hung <hunga@google.com> 2014-01-08 17:54:27 -0800
commit: 6582f2b14a21e630654c5522ef9ad64e80d5058d (patch)
tree: 69dee04c065feec40569f144281f36e885b7d15b /services/audioflinger/AudioResamplerFirGen.h
parent: 0bcfa88149e2404b34d13c622e3921e1b846cdf8 (diff)
download: frameworks_av-6582f2b14a21e630654c5522ef9ad64e80d5058d.zip
frameworks_av-6582f2b14a21e630654c5522ef9ad64e80d5058d.tar.gz
frameworks_av-6582f2b14a21e630654c5522ef9ad64e80d5058d.tar.bz2
1 files changed, 428 insertions, 51 deletions
diff --git a/services/audioflinger/AudioResamplerFirGen.h b/services/audioflinger/AudioResamplerFirGen.h
index fa6ee3e..fac3001 100644
--- a/services/audioflinger/AudioResamplerFirGen.h
+++ b/services/audioflinger/AudioResamplerFirGen.h
@@ -20,19 +20,105 @@
 namespace android {
 
 /*
- * Sinc function is the traditional variant.
+ * generates a sine wave at equal steps.
  *
- * TODO: Investigate optimizations (regular sampling grid, NEON vector accelerations)
- * TODO: Remove comparison at 0 and trap at a higher level.
+ * As most of our functions use sine or cosine at equal steps,
+ * it is very efficient to compute them that way (single multiply and subtract),
+ * rather than invoking the math library sin() or cos() each time.
+ *
+ * SineGen uses Goertzel's Algorithm (as a generator not a filter)
+ * to calculate sine(wstart + n * wstep) or cosine(wstart + n * wstep)
+ * by stepping through 0, 1, ... n.
+ *
+ * e^i(wstart+wstep) = 2cos(wstep) * e^i(wstart) - e^i(wstart-wstep)
+ *
+ * or looking at just the imaginary sine term, as the cosine follows identically:
+ *
+ * sin(wstart+wstep) = 2cos(wstep) * sin(wstart) - sin(wstart-wstep)
+ *
+ * Goertzel's algorithm is more efficient than the angle addition formula,
+ * e^i(wstart+wstep) = e^i(wstart) * e^i(wstep), which takes up to
+ * 4 multiplies and 2 adds (or 3* and 3+) and requires both sine and
+ * cosine generation due to the complex * complex multiply (full rotation).
+ *
+ * See: http://en.wikipedia.org/wiki/Goertzel_algorithm
  *
  */
 
-static inline double sinc(double x) {
-    if (fabs(x) < FLT_MIN) {
-        return 1.;
+class SineGen {
+public:
+    SineGen(double wstart, double wstep, bool cosine = false) {
+        if (cosine) {
+            mCurrent = cos(wstart);
+            mPrevious = cos(wstart - wstep);
+        } else {
+            mCurrent = sin(wstart);
+            mPrevious = sin(wstart - wstep);
+        }
+        mTwoCos = 2.*cos(wstep);
     }
-    return sin(x) / x;
-}
+    SineGen(double expNow, double expPrev, double twoCosStep) {
+        mCurrent = expNow;
+        mPrevious = expPrev;
+        mTwoCos = twoCosStep;
+    }
+    inline double value() const {
+        return mCurrent;
+    }
+    inline void advance() {
+        double tmp = mCurrent;
+        mCurrent = mCurrent*mTwoCos - mPrevious;
+        mPrevious = tmp;
+    }
+    inline double valueAdvance() {
+        double tmp = mCurrent;
+        mCurrent = mCurrent*mTwoCos - mPrevious;
+        mPrevious = tmp;
+        return tmp;
+    }
+
+private:
+    double mCurrent; // current value of sine/cosine
+    double mPrevious; // previous value of sine/cosine
+    double mTwoCos; // stepping factor
+};
+
+/*
+ * generates a series of sine generators, phase offset by fixed steps.
+ *
+ * This is used to generate polyphase sine generators, one per polyphase
+ * in the filter code below.
+ *
+ * The SineGen returned by value() starts at innerStart = outerStart + n*outerStep;
+ * increments by innerStep.
+ *
+ */
+
+class SineGenGen {
+public:
+    SineGenGen(double outerStart, double outerStep, double innerStep, bool cosine = false)
+            : mSineInnerCur(outerStart, outerStep, cosine),
+              mSineInnerPrev(outerStart-innerStep, outerStep, cosine)
+    {
+        mTwoCos = 2.*cos(innerStep);
+    }
+    inline SineGen value() {
+        return SineGen(mSineInnerCur.value(), mSineInnerPrev.value(), mTwoCos);
+    }
+    inline void advance() {
+        mSineInnerCur.advance();
+        mSineInnerPrev.advance();
+    }
+    inline SineGen valueAdvance() {
+        return SineGen(mSineInnerCur.valueAdvance(), mSineInnerPrev.valueAdvance(), mTwoCos);
+    }
+
+private:
+    SineGen mSineInnerCur; // generate the inner sine values (stepped by outerStep).
+    SineGen mSineInnerPrev; // generate the inner sine previous values
+                            // (behind by innerStep, stepped by outerStep).
+    double mTwoCos; // the inner stepping factor for the returned SineGen.
+};
 
 static inline double sqr(double x) {
     return x * x;
@@ -46,8 +132,7 @@ static inline double sqr(double x) {
  * The other variant is a non-noise shaped version for
  * S32 coefficients (noise shaping doesn't gain much).
  *
- * Caution: No bounds saturation is applied, but isn't needed in
- * this case.
+ * Caution: No bounds saturation is applied, but isn't needed in this case.
  *
  * @param x is the value to round.
  *
@@ -56,12 +141,15 @@ static inline double sqr(double x) {
  * FIR coefficients generated here are never that close to 1.0 to pose an overflow condition).
  *
  * @param err is the previous error (actual - rounded) for the previous rounding op.
+ * For 16b coefficients this can improve stopband dB performance by up to 2dB.
+ *
+ * Many variants exist for the noise shaping: http://en.wikipedia.org/wiki/Noise_shaping
  *
  */
 
 static inline int64_t toint(double x, int64_t maxval, double& err) {
     double val = x * maxval;
-    double ival = floor(val + 0.5 + err*0.17);
+    double ival = floor(val + 0.5 + err*0.2);
     err = val - ival;
     return static_cast<int64_t>(ival);
 }
@@ -74,7 +162,8 @@ static inline int64_t toint(double x, int64_t maxval) {
  * Modified Bessel function of the first kind
  * http://en.wikipedia.org/wiki/Bessel_function
  *
- * The formulas are taken from Abramowitz and Stegun:
+ * The formulas are taken from Abramowitz and Stegun,
+ * _Handbook of Mathematical Functions_ (links below):
  *
  * http://people.math.sfu.ca/~cbm/aands/page_375.htm
  * http://people.math.sfu.ca/~cbm/aands/page_378.htm
@@ -92,11 +181,13 @@ static inline int64_t toint(double x, int64_t maxval) {
  * We use a bit of template math here, constexpr would probably be
  * more appropriate for a C++11 compiler.
  *
+ * For the intermediate range 3.75 < x < 15, we use minimax polynomial fit.
+ *
  */
 
 template <int N>
 struct I0Term {
-    static const double value = I0Term<N-1>::value/ (4. * N * N);
+    static const double value = I0Term<N-1>::value / (4. * N * N);
 };
 
 template <>
@@ -114,68 +205,275 @@ struct I0ATerm<0> { // 1/sqrt(2*PI);
     static const double value = 0.398942280401432677939946059934381868475858631164934657665925;
 };
 
+#if USE_HORNERS_METHOD
+/* Polynomial evaluation of A + Bx + Cx^2 + Dx^3 + ...
+ * using Horner's Method: http://en.wikipedia.org/wiki/Horner's_method
+ *
+ * This has fewer multiplications than Estrin's method below, but has back to back
+ * floating point dependencies.
+ *
+ * On ARM this appears to work slower, so USE_HORNERS_METHOD is not default enabled.
+ */
+
+inline double Poly2(double A, double B, double x) {
+    return A + x * B;
+}
+
+inline double Poly4(double A, double B, double C, double D, double x) {
+    return A + x * (B + x * (C + x * (D)));
+}
+
+inline double Poly7(double A, double B, double C, double D, double E, double F, double G,
+        double x) {
+    return A + x * (B + x * (C + x * (D + x * (E + x * (F + x * (G))))));
+}
+
+inline double Poly9(double A, double B, double C, double D, double E, double F, double G,
+        double H, double I, double x) {
+    return A + x * (B + x * (C + x * (D + x * (E + x * (F + x * (G + x * (H + x * (I))))))));
+}
+
+#else
+/* Polynomial evaluation of A + Bx + Cx^2 + Dx^3 + ...
+ * using Estrin's Method: http://en.wikipedia.org/wiki/Estrin's_scheme
+ *
+ * This is typically faster, perhaps gains about 5-10% overall on ARM processors
+ * over Horner's method above.
+ */
+
+inline double Poly2(double A, double B, double x) {
+    return A + B * x;
+}
+
+inline double Poly3(double A, double B, double C, double x, double x2) {
+    return Poly2(A, B, x) + C * x2;
+}
+
+inline double Poly3(double A, double B, double C, double x) {
+    return Poly2(A, B, x) + C * x * x;
+}
+
+inline double Poly4(double A, double B, double C, double D, double x, double x2) {
+    return Poly2(A, B, x) + Poly2(C, D, x) * x2; // same as poly2(poly2, poly2, x2);
+}
+
+inline double Poly4(double A, double B, double C, double D, double x) {
+    return Poly4(A, B, C, D, x, x * x);
+}
+
+inline double Poly7(double A, double B, double C, double D, double E, double F, double G,
+        double x) {
+    double x2 = x * x;
+    return Poly4(A, B, C, D, x, x2) + Poly3(E, F, G, x, x2) * (x2 * x2);
+}
+
+inline double Poly8(double A, double B, double C, double D, double E, double F, double G,
+        double H, double x, double x2, double x4) {
+    return Poly4(A, B, C, D, x, x2) + Poly4(E, F, G, H, x, x2) * x4;
+}
+
+inline double Poly9(double A, double B, double C, double D, double E, double F, double G,
+        double H, double I, double x) {
+    double x2 = x * x;
+#if 1
+    // It does not seem faster to explicitly decompose Poly8 into Poly4, but
+    // could depend on compiler floating point scheduling.
+    double x4 = x2 * x2;
+    return Poly8(A, B, C, D, E, F, G, H, x, x2, x4) + I * (x4 * x4);
+#else
+    double val = Poly4(A, B, C, D, x, x2);
+    double x4 = x2 * x2;
+    return val + Poly4(E, F, G, H, x, x2) * x4 + I * (x4 * x4);
+#endif
+}
+#endif
+
 static inline double I0(double x) {
-    if (x < 3.75) { // TODO: Estrin's method instead of Horner's method?
+    if (x < 3.75) {
+        x *= x;
+        return Poly7(I0Term<0>::value, I0Term<1>::value,
+                I0Term<2>::value, I0Term<3>::value,
+                I0Term<4>::value, I0Term<5>::value,
+                I0Term<6>::value, x); // e < 1.6e-7
+    }
+    if (1) {
+        /*
+         * Series expansion coefs are easy to calculate, but are expanded around 0,
+         * so error is unequal over the interval 0 < x < 3.75, the error being
+         * significantly better near 0.
+         *
+         * A better solution is to use precise minimax polynomial fits.
+         *
+         * We use a slightly more complicated solution for 3.75 < x < 15, based on
+         * the tables in Blair and Edwards, "Stable Rational Minimax Approximations
+         * to the Modified Bessel Functions I0(x) and I1(x)", Chalk Hill Nuclear Laboratory,
+         * AECL-4928.
+         *
+         * http://www.iaea.org/inis/collection/NCLCollectionStore/_Public/06/178/6178667.pdf
+         *
+         * See Table 11 for 0 < x < 15; e < 10^(-7.13).
+         *
+         * Note: Beta cannot exceed 15 (hence Stopband cannot exceed 144dB = 24b).
+         *
+         * This speeds up overall computation by about 40% over using the else clause below,
+         * which requires sqrt and exp.
+         *
+         */
+
         x *= x;
-        return I0Term<0>::value + x*(
-                I0Term<1>::value + x*(
-                I0Term<2>::value + x*(
-                I0Term<3>::value + x*(
-                I0Term<4>::value + x*(
-                I0Term<5>::value + x*(
-                I0Term<6>::value)))))); // e < 1.6e-7
+        double num = Poly9(-0.13544938430e9, -0.33153754512e8,
+                -0.19406631946e7, -0.48058318783e5,
+                -0.63269783360e3, -0.49520779070e1,
+                -0.24970910370e-1, -0.74741159550e-4,
+                -0.18257612460e-6, x);
+        double y = x - 225.; // reflection around 15 (squared)
+        double den = Poly4(-0.34598737196e8, 0.23852643181e6,
+                -0.70699387620e3, 0.10000000000e1, y);
+        return num / den;
+
+#if IO_EXTENDED_BETA
+        /* Table 42 for x > 15; e < 10^(-8.11).
+         * This is used for Beta>15, but is disabled here as
+         * we never use Beta that high.
+         *
+         * NOTE: This should be enabled only for x > 15.
+         */
+
+        double y = 1./x;
+        double z = y - (1./15);
+        double num = Poly2(0.415079861746e1, -0.5149092496e1, z);
+        double den = Poly3(0.103150763823e2, -0.14181687413e2,
+                0.1000000000e1, z);
+        return exp(x) * sqrt(y) * num / den;
+#endif
+    } else {
+        /*
+         * NOT USED, but reference for large Beta.
+         *
+         * Abramowitz and Stegun asymptotic formula.
+         * works for x > 3.75.
+         */
+        double y = 1./x;
+        return exp(x) * sqrt(y) *
+                // note: reciprocal squareroot may be easier!
+                // http://en.wikipedia.org/wiki/Fast_inverse_square_root
+                Poly9(I0ATerm<0>::value, I0ATerm<1>::value,
+                        I0ATerm<2>::value, I0ATerm<3>::value,
+                        I0ATerm<4>::value, I0ATerm<5>::value,
+                        I0ATerm<6>::value, I0ATerm<7>::value,
+                        I0ATerm<8>::value, y); // (... e) < 1.9e-7
     }
-    // a bit ugly here - perhaps we expand the top series
-    // to permit computation to x < 20 (a reasonable range)
-    double y = 1./x;
-    return exp(x) * sqrt(y) * (
-            // note: reciprocal squareroot may be easier!
-            // http://en.wikipedia.org/wiki/Fast_inverse_square_root
-            I0ATerm<0>::value + y*(
-            I0ATerm<1>::value + y*(
-            I0ATerm<2>::value + y*(
-            I0ATerm<3>::value + y*(
-            I0ATerm<4>::value + y*(
-            I0ATerm<5>::value + y*(
-            I0ATerm<6>::value + y*(
-            I0ATerm<7>::value + y*(
-            I0ATerm<8>::value))))))))); // (... e) < 1.9e-7
 }
 
 /*
  * calculates the transition bandwidth for a Kaiser filter
  *
- * Formula 3.2.8, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48
+ * Formula 3.2.8, Vaidyanathan, _Multirate Systems and Filter Banks_, p. 48
+ * Formula 7.76, Oppenheim and Schafer, _Discrete-time Signal Processing, 3e_, p. 542
  *
  * @param halfNumCoef is half the number of coefficients per filter phase.
+ *
  * @param stopBandAtten is the stop band attenuation desired.
+ *
  * @return the transition bandwidth in normalized frequency (0 <= f <= 0.5)
  */
 static inline double firKaiserTbw(int halfNumCoef, double stopBandAtten) {
-    return (stopBandAtten - 7.95)/(2.*14.36*halfNumCoef);
+    return (stopBandAtten - 7.95)/((2.*14.36)*halfNumCoef);
 }
 
 /*
- * calculates the fir transfer response.
+ * calculates the fir transfer response of the overall polyphase filter at w.
+ *
+ * Calculates the DTFT transfer coefficient H(w) for 0 <= w <= PI, utilizing the
+ * fact that h[n] is symmetric (cosines only, no complex arithmetic).
+ *
+ * We use Goertzel's algorithm to accelerate the computation to essentially
+ * a single multiply and 2 adds per filter coefficient h[].
  *
- * calculates the transfer coefficient H(w) for 0 <= w <= PI.
- * Be careful be careful to consider the fact that this is an interpolated filter
- * of length L, so normalizing H(w)/L is probably what you expect.
+ * Be careful be careful to consider that h[n] is the overall polyphase filter,
+ * with L phases, so rescaling H(w)/L is probably what you expect for "unity gain",
+ * as you only use one of the polyphases at a time.
  */
 template <typename T>
 static inline double firTransfer(const T* coef, int L, int halfNumCoef, double w) {
-    double accum = static_cast<double>(coef[0])*0.5;
-    coef += halfNumCoef;    // skip first row.
+    double accum = static_cast<double>(coef[0])*0.5;  // "center coefficient" from first bank
+    coef += halfNumCoef;    // skip first filterbank (picked up by the last filterbank).
+#if SLOW_FIRTRANSFER
+    /* Original code for reference.  This is equivalent to the code below, but slower. */
     for (int i=1 ; i<=L ; ++i) {
         for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) {
             accum += cos(ix*w)*static_cast<double>(*coef++);
         }
     }
+#else
+    /*
+     * Our overall filter is stored striped by polyphases, not a contiguous h[n].
+     * We could fetch coefficients in a non-contiguous fashion
+     * but that will not scale to vector processing.
+     *
+     * We apply Goertzel's algorithm directly to each polyphase filter bank instead of
+     * using cosine generation/multiplication, thereby saving one multiply per inner loop.
+     *
+     * See: http://en.wikipedia.org/wiki/Goertzel_algorithm
+     * Also: Oppenheim and Schafer, _Discrete Time Signal Processing, 3e_, p. 720.
+     *
+     * We use the basic recursion to incorporate the cosine steps into real sequence x[n]:
+     * s[n] = x[n] + (2cosw)*s[n-1] + s[n-2]
+     *
+     * y[n] = s[n] - e^(iw)s[n-1]
+     *      = sum_{k=-\infty}^{n} x[k]e^(-iw(n-k))
+     *      = e^(-iwn) sum_{k=0}^{n} x[k]e^(iwk)
+     *
+     * The summation contains the frequency steps we want multiplied by the source
+     * (similar to a DTFT).
+     *
+     * Using symmetry, and just the real part (be careful, this must happen
+     * after any internal complex multiplications), the polyphase filterbank
+     * transfer function is:
+     *
+     * Hpp[n, w, w_0] = sum_{k=0}^{n} x[k] * cos(wk + w_0)
+     *                = Re{ e^(iwn + iw_0) y[n]}
+     *                = cos(wn+w_0) * s[n] - cos(w(n+1)+w_0) * s[n-1]
+     *
+     * using the fact that s[n] of real x[n] is real.
+     *
+     */
+    double dcos = 2. * cos(L*w);
+    int start = ((halfNumCoef)*L + 1);
+    SineGen cc((start - L) * w, w, true); // cosine
+    SineGen cp(start * w, w, true); // cosine
+    for (int i=1 ; i<=L ; ++i) {
+        double sc = 0;
+        double sp = 0;
+        for (int j=0 ; j<halfNumCoef ; ++j) {
+            double tmp = sc;
+            sc  = static_cast<double>(*coef++) + dcos*sc - sp;
+            sp = tmp;
+        }
+        // If we are awfully clever, we can apply Goertzel's algorithm
+        // again on the sc and sp sequences returned here.
+        accum += cc.valueAdvance() * sc - cp.valueAdvance() * sp;
+    }
+#endif
     return accum*2.;
 }
 
 /*
- * returns the minimum and maximum |H(f)| bounds
+ * evaluates the minimum and maximum |H(f)| bound in a band region.
+ *
+ * This is usually done with equally spaced increments in the target band in question.
+ * The passband is often very small, and sampled that way. The stopband is often much
+ * larger.
+ *
+ * We use the fact that the overall polyphase filter has an additional bank at the end
+ * for interpolation; hence it is overspecified for the H(f) computation.  Thus the
+ * first polyphase is never actually checked, excepting its first term.
+ *
+ * In this code we use the firTransfer() evaluator above, which uses Goertzel's
+ * algorithm to calculate the transfer function at each point.
+ *
+ * TODO: An alternative with equal spacing is the FFT/DFT.  An alternative with unequal
+ * spacing is a chirp transform.
  *
  * @param coef is the designed polyphase filter banks
  *
@@ -231,7 +529,63 @@ static void testFir(const T* coef, int L, int halfNumCoef,
 }
 
 /*
- * Calculates the polyphase filter banks based on a windowed sinc function.
+ * evaluates the |H(f)| lowpass band characteristics.
+ *
+ * This function tests the lowpass characteristics for the overall polyphase filter,
+ * and is used to verify the design.  For this case, fp should be set to the
+ * passband normalized frequency from 0 to 0.5 for the overall filter (thus it
+ * is the designed polyphase bank value / L).  Likewise for fs.
+ *
+ * @param coef is the designed polyphase filter banks
+ *
+ * @param L is the number of phases (for interpolation)
+ *
+ * @param halfNumCoef should be half the number of coefficients for a single
+ * polyphase.
+ *
+ * @param fp is the passband normalized frequency, 0 < fp < fs < 0.5.
+ *
+ * @param fs is the stopband normalized frequency, 0 < fp < fs < 0.5.
+ *
+ * @param passSteps is the number of passband sampling steps.
+ *
+ * @param stopSteps is the number of stopband sampling steps.
+ *
+ * @param passMin is the minimum value in the passband
+ *
+ * @param passMax is the maximum value in the passband (useful for scaling).  This should
+ * be less than 1., to avoid sine wave test overflow.
+ *
+ * @param passRipple is the passband ripple.  Typically this should be less than 0.1 for
+ * an audio filter.  Generally speaker/headphone device characteristics will dominate
+ * the passband term.
+ *
+ * @param stopMax is the maximum value in the stopband.
+ *
+ * @param stopRipple is the stopband ripple, also known as stopband attenuation.
+ * Typically this should be greater than ~80dB for low quality, and greater than
+ * ~100dB for full 16b quality, otherwise aliasing may become noticeable.
+ *
+ */
+template <typename T>
+static void testFir(const T* coef, int L, int halfNumCoef,
+        double fp, double fs, int passSteps, int stopSteps,
+        double &passMin, double &passMax, double &passRipple,
+        double &stopMax, double &stopRipple) {
+    double fmin, fmax;
+    testFir(coef, L, halfNumCoef, 0., fp, passSteps, fmin, fmax);
+    double d1 = (fmax - fmin)/2.;
+    passMin = fmin;
+    passMax = fmax;
+    passRipple = -20.*log10(1. - d1); // passband ripple
+    testFir(coef, L, halfNumCoef, fs, 0.5, stopSteps, fmin, fmax);
+    // fmin is really not important for the stopband.
+    stopMax = fmax;
+    stopRipple = -20.*log10(fmax); // stopband ripple/attenuation
+}
+
+/*
+ * Calculates the overall polyphase filter based on a windowed sinc function.
  *
  * The windowed sinc is an odd length symmetric filter of exactly L*halfNumCoef*2+1
  * taps for the entire kernel.  This is then decomposed into L+1 polyphase filterbanks.
@@ -239,6 +593,9 @@ static void testFir(const T* coef, int L, int halfNumCoef,
  * of the first bank shifted by one sample), and is unnecessary if one does
  * not do interpolation.
  *
+ * We use the last filterbank for some transfer function calculation purposes,
+ * so it needs to be generated anyways.
+ *
  * @param coef is the caller allocated space for coefficients.  This should be
  * exactly (L+1)*halfNumCoef in size.
  *
@@ -261,7 +618,8 @@ template <typename T>
 static inline void firKaiserGen(T* coef, int L, int halfNumCoef,
         double stopBandAtten, double fcr, double atten) {
     //
-    // Formula 3.2.5, 3.2.7, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48
+    // Formula 3.2.5, 3.2.7, Vaidyanathan, _Multirate Systems and Filter Banks_, p. 48
+    // Formula 7.75, Oppenheim and Schafer, _Discrete-time Signal Processing, 3e_, p. 542
     //
     // See also: http://melodi.ee.washington.edu/courses/ee518/notes/lec17.pdf
     //
@@ -284,13 +642,32 @@ static inline void firKaiserGen(T* coef, int L, int halfNumCoef,
 
     const int N = L * halfNumCoef; // non-negative half
     const double beta = 0.1102 * (stopBandAtten - 8.7); // >= 50dB always
-    const double yscale = 2. * atten * fcr / I0(beta);
-    const double xstep = 2. * M_PI * fcr / L;
+    const double xstep = (2. * M_PI) * fcr / L;
     const double xfrac = 1. / N;
-    double err = 0; // for noise shaping on int16_t coefficients
+    const double yscale = atten * L / (I0(beta) * M_PI);
+
+    // We use sine generators, which computes sines on regular step intervals.
+    // This speeds up overall computation about 40% from computing the sine directly.
+
+    SineGenGen sgg(0., xstep, L*xstep); // generates sine generators (one per polyphase)
+
     for (int i=0 ; i<=L ; ++i) { // generate an extra set of coefs for interpolation
+
+        // computation for a single polyphase of the overall filter.
+        SineGen sg = sgg.valueAdvance(); // current sine generator for "j" inner loop.
+        double err = 0; // for noise shaping on int16_t coefficients (over each polyphase)
+
         for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) {
-            double y = I0(beta * sqrt(1.0 - sqr(ix * xfrac))) * sinc(ix * xstep) * yscale;
+            double y;
+            if (CC_LIKELY(ix)) {
+                double x = static_cast<double>(ix);
+
+                // sine generator: sg.valueAdvance() returns sin(ix*xstep);
+                y = I0(beta * sqrt(1.0 - sqr(x * xfrac))) * yscale * sg.valueAdvance() / x;
+            } else {
+                y = 2. * atten * fcr; // center of filter, sinc(0) = 1.
+                sg.advance();
+            }
 
             // (caution!) float version does not need rounding
             if (is_same<T, int16_t>::value) { // int16_t needs noise shaping
author	Andy Hung <hunga@google.com>	2014-01-03 12:30:41 -0800
committer	Andy Hung <hunga@google.com>	2014-01-08 17:54:27 -0800
commit	6582f2b14a21e630654c5522ef9ad64e80d5058d (patch)
tree	69dee04c065feec40569f144281f36e885b7d15b /services/audioflinger/AudioResamplerFirGen.h
parent	0bcfa88149e2404b34d13c622e3921e1b846cdf8 (diff)
download	frameworks_av-6582f2b14a21e630654c5522ef9ad64e80d5058d.zip frameworks_av-6582f2b14a21e630654c5522ef9ad64e80d5058d.tar.gz frameworks_av-6582f2b14a21e630654c5522ef9ad64e80d5058d.tar.bz2