diff options
author | Andy Hung <hunga@google.com> | 2014-01-10 19:28:54 +0000 |
---|---|---|
committer | Android (Google) Code Review <android-gerrit@google.com> | 2014-01-10 19:28:54 +0000 |
commit | 03f696e3eb89c0642f9a2b761ff2cfc6af003d16 (patch) | |
tree | 94a7d75134823a31a86d702e2e6443112b8e6a29 | |
parent | 6340829ffb9ff7586ffb88a214f56340d4c70d60 (diff) | |
parent | 6582f2b14a21e630654c5522ef9ad64e80d5058d (diff) | |
download | frameworks_av-03f696e3eb89c0642f9a2b761ff2cfc6af003d16.zip frameworks_av-03f696e3eb89c0642f9a2b761ff2cfc6af003d16.tar.gz frameworks_av-03f696e3eb89c0642f9a2b761ff2cfc6af003d16.tar.bz2 |
Merge "Improve dynamic audio resampler filter generation"
-rw-r--r-- | services/audioflinger/AudioResamplerDyn.cpp | 55 | ||||
-rw-r--r-- | services/audioflinger/AudioResamplerDyn.h | 3 | ||||
-rw-r--r-- | services/audioflinger/AudioResamplerFirGen.h | 479 | ||||
-rw-r--r-- | services/audioflinger/test-resample.cpp | 139 |
4 files changed, 586 insertions, 90 deletions
diff --git a/services/audioflinger/AudioResamplerDyn.cpp b/services/audioflinger/AudioResamplerDyn.cpp index 1e38d3e..984548d 100644 --- a/services/audioflinger/AudioResamplerDyn.cpp +++ b/services/audioflinger/AudioResamplerDyn.cpp @@ -161,7 +161,8 @@ void AudioResamplerDyn::Constants::set( AudioResamplerDyn::AudioResamplerDyn(int bitDepth, int inChannelCount, int32_t sampleRate, src_quality quality) : AudioResampler(bitDepth, inChannelCount, sampleRate, quality), - mResampleType(0), mFilterSampleRate(0), mCoefBuffer(NULL) + mResampleType(0), mFilterSampleRate(0), mFilterQuality(DEFAULT_QUALITY), + mCoefBuffer(NULL) { mVolumeSimd[0] = mVolumeSimd[1] = 0; mConstants.set(128, 8, mSampleRate, mSampleRate); // TODO: set better @@ -213,19 +214,16 @@ void AudioResamplerDyn::createKaiserFir(Constants &c, double stopBandAtten, // test the filter and report results double fp = (fcr - tbw/2)/c.mL; double fs = (fcr + tbw/2)/c.mL; - double fmin, fmax; - testFir(buf, c.mL, c.mHalfNumCoefs, 0., fp, 100, fmin, fmax); - double d1 = (fmax - fmin)/2.; - double ap = -20.*log10(1. - d1); // passband ripple - printf("passband(%lf, %lf): %.8lf %.8lf %.8lf\n", 0., fp, (fmax + fmin)/2., d1, ap); - testFir(buf, c.mL, c.mHalfNumCoefs, fs, 0.5, 100, fmin, fmax); - double d2 = fmax; - double as = -20.*log10(d2); // stopband attenuation - printf("stopband(%lf, %lf): %.8lf %.8lf %.3lf\n", fs, 0.5, (fmax + fmin)/2., d2, as); + double passMin, passMax, passRipple; + double stopMax, stopRipple; + testFir(buf, c.mL, c.mHalfNumCoefs, fp, fs, /*passSteps*/ 1000, /*stopSteps*/ 100000, + passMin, passMax, passRipple, stopMax, stopRipple); + printf("passband(%lf, %lf): %.8lf %.8lf %.8lf\n", 0., fp, passMin, passMax, passRipple); + printf("stopband(%lf, %lf): %.8lf %.3lf\n", fs, 0.5, stopMax, stopRipple); #endif } -// recursive gcd (TODO: verify tail recursion elimination should make this iterate) +// recursive gcd. Using objdump, it appears the tail recursion is converted to a while loop. static int gcd(int n, int m) { if (m == 0) { return n; @@ -233,7 +231,16 @@ static int gcd(int n, int m) { return gcd(m, n % m); } -static bool isClose(int32_t newSampleRate, int32_t prevSampleRate, int32_t filterSampleRate) { +static bool isClose(int32_t newSampleRate, int32_t prevSampleRate, + int32_t filterSampleRate, int32_t outSampleRate) { + + // different upsampling ratios do not need a filter change. + if (filterSampleRate != 0 + && filterSampleRate < outSampleRate + && newSampleRate < outSampleRate) + return true; + + // check design criteria again if downsampling is detected. int pdiff = absdiff(newSampleRate, prevSampleRate); int adiff = absdiff(newSampleRate, filterSampleRate); @@ -255,8 +262,10 @@ void AudioResamplerDyn::setSampleRate(int32_t inSampleRate) { // TODO: Add precalculated Equiripple filters - if (!isClose(inSampleRate, oldSampleRate, mFilterSampleRate)) { + if (mFilterQuality != getQuality() || + !isClose(inSampleRate, oldSampleRate, mFilterSampleRate, mSampleRate)) { mFilterSampleRate = inSampleRate; + mFilterQuality = getQuality(); // Begin Kaiser Filter computation // @@ -270,12 +279,12 @@ void AudioResamplerDyn::setSampleRate(int32_t inSampleRate) { double stopBandAtten; double tbwCheat = 1.; // how much we "cheat" into aliasing int halfLength; - if (getQuality() == DYN_HIGH_QUALITY) { + if (mFilterQuality == DYN_HIGH_QUALITY) { // 32b coefficients, 64 length useS32 = true; stopBandAtten = 98.; halfLength = 32; - } else if (getQuality() == DYN_LOW_QUALITY) { + } else if (mFilterQuality == DYN_LOW_QUALITY) { // 16b coefficients, 16-32 length useS32 = false; stopBandAtten = 80.; @@ -289,8 +298,9 @@ void AudioResamplerDyn::setSampleRate(int32_t inSampleRate) { } else { tbwCheat = 1.03; } - } else { // medium quality + } else { // DYN_MED_QUALITY // 16b coefficients, 32-64 length + // note: > 64 length filters with 16b coefs can have quantization noise problems useS32 = false; stopBandAtten = 84.; if (mSampleRate >= inSampleRate * 4) { @@ -315,9 +325,20 @@ void AudioResamplerDyn::setSampleRate(int32_t inSampleRate) { int phases = mSampleRate / gcd(mSampleRate, inSampleRate); - while (phases<63) { // too few phases, allow room for interpolation + // TODO: Once dynamic sample rate change is an option, the code below + // should be modified to execute only when dynamic sample rate change is enabled. + // + // as above, #phases less than 63 is too few phases for accurate linear interpolation. + // we increase the phases to compensate, but more phases means more memory per + // filter and more time to compute the filter. + // + // if we know that the filter will be used for dynamic sample rate changes, + // that would allow us skip this part for fixed sample rate resamplers. + // + while (phases<63) { phases *= 2; // this code only needed to support dynamic rate changes } + if (phases>=256) { // too many phases, always interpolate phases = 127; } diff --git a/services/audioflinger/AudioResamplerDyn.h b/services/audioflinger/AudioResamplerDyn.h index 85a01ab..df1fdbe 100644 --- a/services/audioflinger/AudioResamplerDyn.h +++ b/services/audioflinger/AudioResamplerDyn.h @@ -113,7 +113,8 @@ private: Constants mConstants; // current set of coefficient parameters int32_t __attribute__ ((aligned (8))) mVolumeSimd[2]; int32_t mResampleType; // contains the resample type. - int32_t mFilterSampleRate; // designed sample rate for the filter + int32_t mFilterSampleRate; // designed filter sample rate. + src_quality mFilterQuality; // designed filter quality. void* mCoefBuffer; // if a filter is created, this is not null }; diff --git a/services/audioflinger/AudioResamplerFirGen.h b/services/audioflinger/AudioResamplerFirGen.h index fa6ee3e..fac3001 100644 --- a/services/audioflinger/AudioResamplerFirGen.h +++ b/services/audioflinger/AudioResamplerFirGen.h @@ -20,19 +20,105 @@ namespace android { /* - * Sinc function is the traditional variant. + * generates a sine wave at equal steps. * - * TODO: Investigate optimizations (regular sampling grid, NEON vector accelerations) - * TODO: Remove comparison at 0 and trap at a higher level. + * As most of our functions use sine or cosine at equal steps, + * it is very efficient to compute them that way (single multiply and subtract), + * rather than invoking the math library sin() or cos() each time. + * + * SineGen uses Goertzel's Algorithm (as a generator not a filter) + * to calculate sine(wstart + n * wstep) or cosine(wstart + n * wstep) + * by stepping through 0, 1, ... n. + * + * e^i(wstart+wstep) = 2cos(wstep) * e^i(wstart) - e^i(wstart-wstep) + * + * or looking at just the imaginary sine term, as the cosine follows identically: + * + * sin(wstart+wstep) = 2cos(wstep) * sin(wstart) - sin(wstart-wstep) + * + * Goertzel's algorithm is more efficient than the angle addition formula, + * e^i(wstart+wstep) = e^i(wstart) * e^i(wstep), which takes up to + * 4 multiplies and 2 adds (or 3* and 3+) and requires both sine and + * cosine generation due to the complex * complex multiply (full rotation). + * + * See: http://en.wikipedia.org/wiki/Goertzel_algorithm * */ -static inline double sinc(double x) { - if (fabs(x) < FLT_MIN) { - return 1.; +class SineGen { +public: + SineGen(double wstart, double wstep, bool cosine = false) { + if (cosine) { + mCurrent = cos(wstart); + mPrevious = cos(wstart - wstep); + } else { + mCurrent = sin(wstart); + mPrevious = sin(wstart - wstep); + } + mTwoCos = 2.*cos(wstep); } - return sin(x) / x; -} + SineGen(double expNow, double expPrev, double twoCosStep) { + mCurrent = expNow; + mPrevious = expPrev; + mTwoCos = twoCosStep; + } + inline double value() const { + return mCurrent; + } + inline void advance() { + double tmp = mCurrent; + mCurrent = mCurrent*mTwoCos - mPrevious; + mPrevious = tmp; + } + inline double valueAdvance() { + double tmp = mCurrent; + mCurrent = mCurrent*mTwoCos - mPrevious; + mPrevious = tmp; + return tmp; + } + +private: + double mCurrent; // current value of sine/cosine + double mPrevious; // previous value of sine/cosine + double mTwoCos; // stepping factor +}; + +/* + * generates a series of sine generators, phase offset by fixed steps. + * + * This is used to generate polyphase sine generators, one per polyphase + * in the filter code below. + * + * The SineGen returned by value() starts at innerStart = outerStart + n*outerStep; + * increments by innerStep. + * + */ + +class SineGenGen { +public: + SineGenGen(double outerStart, double outerStep, double innerStep, bool cosine = false) + : mSineInnerCur(outerStart, outerStep, cosine), + mSineInnerPrev(outerStart-innerStep, outerStep, cosine) + { + mTwoCos = 2.*cos(innerStep); + } + inline SineGen value() { + return SineGen(mSineInnerCur.value(), mSineInnerPrev.value(), mTwoCos); + } + inline void advance() { + mSineInnerCur.advance(); + mSineInnerPrev.advance(); + } + inline SineGen valueAdvance() { + return SineGen(mSineInnerCur.valueAdvance(), mSineInnerPrev.valueAdvance(), mTwoCos); + } + +private: + SineGen mSineInnerCur; // generate the inner sine values (stepped by outerStep). + SineGen mSineInnerPrev; // generate the inner sine previous values + // (behind by innerStep, stepped by outerStep). + double mTwoCos; // the inner stepping factor for the returned SineGen. +}; static inline double sqr(double x) { return x * x; @@ -46,8 +132,7 @@ static inline double sqr(double x) { * The other variant is a non-noise shaped version for * S32 coefficients (noise shaping doesn't gain much). * - * Caution: No bounds saturation is applied, but isn't needed in - * this case. + * Caution: No bounds saturation is applied, but isn't needed in this case. * * @param x is the value to round. * @@ -56,12 +141,15 @@ static inline double sqr(double x) { * FIR coefficients generated here are never that close to 1.0 to pose an overflow condition). * * @param err is the previous error (actual - rounded) for the previous rounding op. + * For 16b coefficients this can improve stopband dB performance by up to 2dB. + * + * Many variants exist for the noise shaping: http://en.wikipedia.org/wiki/Noise_shaping * */ static inline int64_t toint(double x, int64_t maxval, double& err) { double val = x * maxval; - double ival = floor(val + 0.5 + err*0.17); + double ival = floor(val + 0.5 + err*0.2); err = val - ival; return static_cast<int64_t>(ival); } @@ -74,7 +162,8 @@ static inline int64_t toint(double x, int64_t maxval) { * Modified Bessel function of the first kind * http://en.wikipedia.org/wiki/Bessel_function * - * The formulas are taken from Abramowitz and Stegun: + * The formulas are taken from Abramowitz and Stegun, + * _Handbook of Mathematical Functions_ (links below): * * http://people.math.sfu.ca/~cbm/aands/page_375.htm * http://people.math.sfu.ca/~cbm/aands/page_378.htm @@ -92,11 +181,13 @@ static inline int64_t toint(double x, int64_t maxval) { * We use a bit of template math here, constexpr would probably be * more appropriate for a C++11 compiler. * + * For the intermediate range 3.75 < x < 15, we use minimax polynomial fit. + * */ template <int N> struct I0Term { - static const double value = I0Term<N-1>::value/ (4. * N * N); + static const double value = I0Term<N-1>::value / (4. * N * N); }; template <> @@ -114,68 +205,275 @@ struct I0ATerm<0> { // 1/sqrt(2*PI); static const double value = 0.398942280401432677939946059934381868475858631164934657665925; }; +#if USE_HORNERS_METHOD +/* Polynomial evaluation of A + Bx + Cx^2 + Dx^3 + ... + * using Horner's Method: http://en.wikipedia.org/wiki/Horner's_method + * + * This has fewer multiplications than Estrin's method below, but has back to back + * floating point dependencies. + * + * On ARM this appears to work slower, so USE_HORNERS_METHOD is not default enabled. + */ + +inline double Poly2(double A, double B, double x) { + return A + x * B; +} + +inline double Poly4(double A, double B, double C, double D, double x) { + return A + x * (B + x * (C + x * (D))); +} + +inline double Poly7(double A, double B, double C, double D, double E, double F, double G, + double x) { + return A + x * (B + x * (C + x * (D + x * (E + x * (F + x * (G)))))); +} + +inline double Poly9(double A, double B, double C, double D, double E, double F, double G, + double H, double I, double x) { + return A + x * (B + x * (C + x * (D + x * (E + x * (F + x * (G + x * (H + x * (I)))))))); +} + +#else +/* Polynomial evaluation of A + Bx + Cx^2 + Dx^3 + ... + * using Estrin's Method: http://en.wikipedia.org/wiki/Estrin's_scheme + * + * This is typically faster, perhaps gains about 5-10% overall on ARM processors + * over Horner's method above. + */ + +inline double Poly2(double A, double B, double x) { + return A + B * x; +} + +inline double Poly3(double A, double B, double C, double x, double x2) { + return Poly2(A, B, x) + C * x2; +} + +inline double Poly3(double A, double B, double C, double x) { + return Poly2(A, B, x) + C * x * x; +} + +inline double Poly4(double A, double B, double C, double D, double x, double x2) { + return Poly2(A, B, x) + Poly2(C, D, x) * x2; // same as poly2(poly2, poly2, x2); +} + +inline double Poly4(double A, double B, double C, double D, double x) { + return Poly4(A, B, C, D, x, x * x); +} + +inline double Poly7(double A, double B, double C, double D, double E, double F, double G, + double x) { + double x2 = x * x; + return Poly4(A, B, C, D, x, x2) + Poly3(E, F, G, x, x2) * (x2 * x2); +} + +inline double Poly8(double A, double B, double C, double D, double E, double F, double G, + double H, double x, double x2, double x4) { + return Poly4(A, B, C, D, x, x2) + Poly4(E, F, G, H, x, x2) * x4; +} + +inline double Poly9(double A, double B, double C, double D, double E, double F, double G, + double H, double I, double x) { + double x2 = x * x; +#if 1 + // It does not seem faster to explicitly decompose Poly8 into Poly4, but + // could depend on compiler floating point scheduling. + double x4 = x2 * x2; + return Poly8(A, B, C, D, E, F, G, H, x, x2, x4) + I * (x4 * x4); +#else + double val = Poly4(A, B, C, D, x, x2); + double x4 = x2 * x2; + return val + Poly4(E, F, G, H, x, x2) * x4 + I * (x4 * x4); +#endif +} +#endif + static inline double I0(double x) { - if (x < 3.75) { // TODO: Estrin's method instead of Horner's method? + if (x < 3.75) { + x *= x; + return Poly7(I0Term<0>::value, I0Term<1>::value, + I0Term<2>::value, I0Term<3>::value, + I0Term<4>::value, I0Term<5>::value, + I0Term<6>::value, x); // e < 1.6e-7 + } + if (1) { + /* + * Series expansion coefs are easy to calculate, but are expanded around 0, + * so error is unequal over the interval 0 < x < 3.75, the error being + * significantly better near 0. + * + * A better solution is to use precise minimax polynomial fits. + * + * We use a slightly more complicated solution for 3.75 < x < 15, based on + * the tables in Blair and Edwards, "Stable Rational Minimax Approximations + * to the Modified Bessel Functions I0(x) and I1(x)", Chalk Hill Nuclear Laboratory, + * AECL-4928. + * + * http://www.iaea.org/inis/collection/NCLCollectionStore/_Public/06/178/6178667.pdf + * + * See Table 11 for 0 < x < 15; e < 10^(-7.13). + * + * Note: Beta cannot exceed 15 (hence Stopband cannot exceed 144dB = 24b). + * + * This speeds up overall computation by about 40% over using the else clause below, + * which requires sqrt and exp. + * + */ + x *= x; - return I0Term<0>::value + x*( - I0Term<1>::value + x*( - I0Term<2>::value + x*( - I0Term<3>::value + x*( - I0Term<4>::value + x*( - I0Term<5>::value + x*( - I0Term<6>::value)))))); // e < 1.6e-7 + double num = Poly9(-0.13544938430e9, -0.33153754512e8, + -0.19406631946e7, -0.48058318783e5, + -0.63269783360e3, -0.49520779070e1, + -0.24970910370e-1, -0.74741159550e-4, + -0.18257612460e-6, x); + double y = x - 225.; // reflection around 15 (squared) + double den = Poly4(-0.34598737196e8, 0.23852643181e6, + -0.70699387620e3, 0.10000000000e1, y); + return num / den; + +#if IO_EXTENDED_BETA + /* Table 42 for x > 15; e < 10^(-8.11). + * This is used for Beta>15, but is disabled here as + * we never use Beta that high. + * + * NOTE: This should be enabled only for x > 15. + */ + + double y = 1./x; + double z = y - (1./15); + double num = Poly2(0.415079861746e1, -0.5149092496e1, z); + double den = Poly3(0.103150763823e2, -0.14181687413e2, + 0.1000000000e1, z); + return exp(x) * sqrt(y) * num / den; +#endif + } else { + /* + * NOT USED, but reference for large Beta. + * + * Abramowitz and Stegun asymptotic formula. + * works for x > 3.75. + */ + double y = 1./x; + return exp(x) * sqrt(y) * + // note: reciprocal squareroot may be easier! + // http://en.wikipedia.org/wiki/Fast_inverse_square_root + Poly9(I0ATerm<0>::value, I0ATerm<1>::value, + I0ATerm<2>::value, I0ATerm<3>::value, + I0ATerm<4>::value, I0ATerm<5>::value, + I0ATerm<6>::value, I0ATerm<7>::value, + I0ATerm<8>::value, y); // (... e) < 1.9e-7 } - // a bit ugly here - perhaps we expand the top series - // to permit computation to x < 20 (a reasonable range) - double y = 1./x; - return exp(x) * sqrt(y) * ( - // note: reciprocal squareroot may be easier! - // http://en.wikipedia.org/wiki/Fast_inverse_square_root - I0ATerm<0>::value + y*( - I0ATerm<1>::value + y*( - I0ATerm<2>::value + y*( - I0ATerm<3>::value + y*( - I0ATerm<4>::value + y*( - I0ATerm<5>::value + y*( - I0ATerm<6>::value + y*( - I0ATerm<7>::value + y*( - I0ATerm<8>::value))))))))); // (... e) < 1.9e-7 } /* * calculates the transition bandwidth for a Kaiser filter * - * Formula 3.2.8, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48 + * Formula 3.2.8, Vaidyanathan, _Multirate Systems and Filter Banks_, p. 48 + * Formula 7.76, Oppenheim and Schafer, _Discrete-time Signal Processing, 3e_, p. 542 * * @param halfNumCoef is half the number of coefficients per filter phase. + * * @param stopBandAtten is the stop band attenuation desired. + * * @return the transition bandwidth in normalized frequency (0 <= f <= 0.5) */ static inline double firKaiserTbw(int halfNumCoef, double stopBandAtten) { - return (stopBandAtten - 7.95)/(2.*14.36*halfNumCoef); + return (stopBandAtten - 7.95)/((2.*14.36)*halfNumCoef); } /* - * calculates the fir transfer response. + * calculates the fir transfer response of the overall polyphase filter at w. + * + * Calculates the DTFT transfer coefficient H(w) for 0 <= w <= PI, utilizing the + * fact that h[n] is symmetric (cosines only, no complex arithmetic). + * + * We use Goertzel's algorithm to accelerate the computation to essentially + * a single multiply and 2 adds per filter coefficient h[]. * - * calculates the transfer coefficient H(w) for 0 <= w <= PI. - * Be careful be careful to consider the fact that this is an interpolated filter - * of length L, so normalizing H(w)/L is probably what you expect. + * Be careful be careful to consider that h[n] is the overall polyphase filter, + * with L phases, so rescaling H(w)/L is probably what you expect for "unity gain", + * as you only use one of the polyphases at a time. */ template <typename T> static inline double firTransfer(const T* coef, int L, int halfNumCoef, double w) { - double accum = static_cast<double>(coef[0])*0.5; - coef += halfNumCoef; // skip first row. + double accum = static_cast<double>(coef[0])*0.5; // "center coefficient" from first bank + coef += halfNumCoef; // skip first filterbank (picked up by the last filterbank). +#if SLOW_FIRTRANSFER + /* Original code for reference. This is equivalent to the code below, but slower. */ for (int i=1 ; i<=L ; ++i) { for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) { accum += cos(ix*w)*static_cast<double>(*coef++); } } +#else + /* + * Our overall filter is stored striped by polyphases, not a contiguous h[n]. + * We could fetch coefficients in a non-contiguous fashion + * but that will not scale to vector processing. + * + * We apply Goertzel's algorithm directly to each polyphase filter bank instead of + * using cosine generation/multiplication, thereby saving one multiply per inner loop. + * + * See: http://en.wikipedia.org/wiki/Goertzel_algorithm + * Also: Oppenheim and Schafer, _Discrete Time Signal Processing, 3e_, p. 720. + * + * We use the basic recursion to incorporate the cosine steps into real sequence x[n]: + * s[n] = x[n] + (2cosw)*s[n-1] + s[n-2] + * + * y[n] = s[n] - e^(iw)s[n-1] + * = sum_{k=-\infty}^{n} x[k]e^(-iw(n-k)) + * = e^(-iwn) sum_{k=0}^{n} x[k]e^(iwk) + * + * The summation contains the frequency steps we want multiplied by the source + * (similar to a DTFT). + * + * Using symmetry, and just the real part (be careful, this must happen + * after any internal complex multiplications), the polyphase filterbank + * transfer function is: + * + * Hpp[n, w, w_0] = sum_{k=0}^{n} x[k] * cos(wk + w_0) + * = Re{ e^(iwn + iw_0) y[n]} + * = cos(wn+w_0) * s[n] - cos(w(n+1)+w_0) * s[n-1] + * + * using the fact that s[n] of real x[n] is real. + * + */ + double dcos = 2. * cos(L*w); + int start = ((halfNumCoef)*L + 1); + SineGen cc((start - L) * w, w, true); // cosine + SineGen cp(start * w, w, true); // cosine + for (int i=1 ; i<=L ; ++i) { + double sc = 0; + double sp = 0; + for (int j=0 ; j<halfNumCoef ; ++j) { + double tmp = sc; + sc = static_cast<double>(*coef++) + dcos*sc - sp; + sp = tmp; + } + // If we are awfully clever, we can apply Goertzel's algorithm + // again on the sc and sp sequences returned here. + accum += cc.valueAdvance() * sc - cp.valueAdvance() * sp; + } +#endif return accum*2.; } /* - * returns the minimum and maximum |H(f)| bounds + * evaluates the minimum and maximum |H(f)| bound in a band region. + * + * This is usually done with equally spaced increments in the target band in question. + * The passband is often very small, and sampled that way. The stopband is often much + * larger. + * + * We use the fact that the overall polyphase filter has an additional bank at the end + * for interpolation; hence it is overspecified for the H(f) computation. Thus the + * first polyphase is never actually checked, excepting its first term. + * + * In this code we use the firTransfer() evaluator above, which uses Goertzel's + * algorithm to calculate the transfer function at each point. + * + * TODO: An alternative with equal spacing is the FFT/DFT. An alternative with unequal + * spacing is a chirp transform. * * @param coef is the designed polyphase filter banks * @@ -231,7 +529,63 @@ static void testFir(const T* coef, int L, int halfNumCoef, } /* - * Calculates the polyphase filter banks based on a windowed sinc function. + * evaluates the |H(f)| lowpass band characteristics. + * + * This function tests the lowpass characteristics for the overall polyphase filter, + * and is used to verify the design. For this case, fp should be set to the + * passband normalized frequency from 0 to 0.5 for the overall filter (thus it + * is the designed polyphase bank value / L). Likewise for fs. + * + * @param coef is the designed polyphase filter banks + * + * @param L is the number of phases (for interpolation) + * + * @param halfNumCoef should be half the number of coefficients for a single + * polyphase. + * + * @param fp is the passband normalized frequency, 0 < fp < fs < 0.5. + * + * @param fs is the stopband normalized frequency, 0 < fp < fs < 0.5. + * + * @param passSteps is the number of passband sampling steps. + * + * @param stopSteps is the number of stopband sampling steps. + * + * @param passMin is the minimum value in the passband + * + * @param passMax is the maximum value in the passband (useful for scaling). This should + * be less than 1., to avoid sine wave test overflow. + * + * @param passRipple is the passband ripple. Typically this should be less than 0.1 for + * an audio filter. Generally speaker/headphone device characteristics will dominate + * the passband term. + * + * @param stopMax is the maximum value in the stopband. + * + * @param stopRipple is the stopband ripple, also known as stopband attenuation. + * Typically this should be greater than ~80dB for low quality, and greater than + * ~100dB for full 16b quality, otherwise aliasing may become noticeable. + * + */ +template <typename T> +static void testFir(const T* coef, int L, int halfNumCoef, + double fp, double fs, int passSteps, int stopSteps, + double &passMin, double &passMax, double &passRipple, + double &stopMax, double &stopRipple) { + double fmin, fmax; + testFir(coef, L, halfNumCoef, 0., fp, passSteps, fmin, fmax); + double d1 = (fmax - fmin)/2.; + passMin = fmin; + passMax = fmax; + passRipple = -20.*log10(1. - d1); // passband ripple + testFir(coef, L, halfNumCoef, fs, 0.5, stopSteps, fmin, fmax); + // fmin is really not important for the stopband. + stopMax = fmax; + stopRipple = -20.*log10(fmax); // stopband ripple/attenuation +} + +/* + * Calculates the overall polyphase filter based on a windowed sinc function. * * The windowed sinc is an odd length symmetric filter of exactly L*halfNumCoef*2+1 * taps for the entire kernel. This is then decomposed into L+1 polyphase filterbanks. @@ -239,6 +593,9 @@ static void testFir(const T* coef, int L, int halfNumCoef, * of the first bank shifted by one sample), and is unnecessary if one does * not do interpolation. * + * We use the last filterbank for some transfer function calculation purposes, + * so it needs to be generated anyways. + * * @param coef is the caller allocated space for coefficients. This should be * exactly (L+1)*halfNumCoef in size. * @@ -261,7 +618,8 @@ template <typename T> static inline void firKaiserGen(T* coef, int L, int halfNumCoef, double stopBandAtten, double fcr, double atten) { // - // Formula 3.2.5, 3.2.7, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48 + // Formula 3.2.5, 3.2.7, Vaidyanathan, _Multirate Systems and Filter Banks_, p. 48 + // Formula 7.75, Oppenheim and Schafer, _Discrete-time Signal Processing, 3e_, p. 542 // // See also: http://melodi.ee.washington.edu/courses/ee518/notes/lec17.pdf // @@ -284,13 +642,32 @@ static inline void firKaiserGen(T* coef, int L, int halfNumCoef, const int N = L * halfNumCoef; // non-negative half const double beta = 0.1102 * (stopBandAtten - 8.7); // >= 50dB always - const double yscale = 2. * atten * fcr / I0(beta); - const double xstep = 2. * M_PI * fcr / L; + const double xstep = (2. * M_PI) * fcr / L; const double xfrac = 1. / N; - double err = 0; // for noise shaping on int16_t coefficients + const double yscale = atten * L / (I0(beta) * M_PI); + + // We use sine generators, which computes sines on regular step intervals. + // This speeds up overall computation about 40% from computing the sine directly. + + SineGenGen sgg(0., xstep, L*xstep); // generates sine generators (one per polyphase) + for (int i=0 ; i<=L ; ++i) { // generate an extra set of coefs for interpolation + + // computation for a single polyphase of the overall filter. + SineGen sg = sgg.valueAdvance(); // current sine generator for "j" inner loop. + double err = 0; // for noise shaping on int16_t coefficients (over each polyphase) + for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) { - double y = I0(beta * sqrt(1.0 - sqr(ix * xfrac))) * sinc(ix * xstep) * yscale; + double y; + if (CC_LIKELY(ix)) { + double x = static_cast<double>(ix); + + // sine generator: sg.valueAdvance() returns sin(ix*xstep); + y = I0(beta * sqrt(1.0 - sqr(x * xfrac))) * yscale * sg.valueAdvance() / x; + } else { + y = 2. * atten * fcr; // center of filter, sinc(0) = 1. + sg.advance(); + } // (caution!) float version does not need rounding if (is_same<T, int16_t>::value) { // int16_t needs noise shaping diff --git a/services/audioflinger/test-resample.cpp b/services/audioflinger/test-resample.cpp index 3f2ce55..66fcd90 100644 --- a/services/audioflinger/test-resample.cpp +++ b/services/audioflinger/test-resample.cpp @@ -57,7 +57,8 @@ static int usage(const char* name) { int main(int argc, char* argv[]) { const char* const progname = argv[0]; - bool profiling = false; + bool profileResample = false; + bool profileFilter = false; bool writeHeader = false; int channels = 1; int input_freq = 0; @@ -65,10 +66,13 @@ int main(int argc, char* argv[]) { AudioResampler::src_quality quality = AudioResampler::DEFAULT_QUALITY; int ch; - while ((ch = getopt(argc, argv, "phvsq:i:o:")) != -1) { + while ((ch = getopt(argc, argv, "pfhvsq:i:o:")) != -1) { switch (ch) { case 'p': - profiling = true; + profileResample = true; + break; + case 'f': + profileFilter = true; break; case 'h': writeHeader = true; @@ -230,27 +234,108 @@ int main(int argc, char* argv[]) { size_t output_size = 2 * 4 * ((int64_t) input_frames * output_freq) / input_freq; output_size &= ~7; // always stereo, 32-bits - void* output_vaddr = malloc(output_size); - AudioResampler* resampler = AudioResampler::create(16, channels, - output_freq, quality); - size_t out_frames = output_size/8; - resampler->setSampleRate(input_freq); - resampler->setVolume(0x1000, 0x1000); - - if (profiling) { - const int looplimit = 100; + if (profileFilter) { + // Check how fast sample rate changes are that require filter changes. + // The delta sample rate changes must indicate a downsampling ratio, + // and must be larger than 10% changes. + // + // On fast devices, filters should be generated between 0.1ms - 1ms. + // (single threaded). + AudioResampler* resampler = AudioResampler::create(16, channels, + 8000, quality); + int looplimit = 100; timespec start, end; clock_gettime(CLOCK_MONOTONIC, &start); for (int i = 0; i < looplimit; ++i) { - resampler->resample((int*) output_vaddr, out_frames, &provider); - provider.reset(); // reset only provider as benchmarking + resampler->setSampleRate(9000); + resampler->setSampleRate(12000); + resampler->setSampleRate(20000); + resampler->setSampleRate(30000); } clock_gettime(CLOCK_MONOTONIC, &end); int64_t start_ns = start.tv_sec * 1000000000LL + start.tv_nsec; int64_t end_ns = end.tv_sec * 1000000000LL + end.tv_nsec; int64_t time = end_ns - start_ns; - printf("time(ns):%lld channels:%d quality:%d\n", time, channels, quality); - printf("%f Mspl/s\n", out_frames * looplimit / (time / 1e9) / 1e6); + printf("%.2f sample rate changes with filter calculation/sec\n", + looplimit * 4 / (time / 1e9)); + + // Check how fast sample rate changes are without filter changes. + // This should be very fast, probably 0.1us - 1us per sample rate + // change. + resampler->setSampleRate(1000); + looplimit = 1000; + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < looplimit; ++i) { + resampler->setSampleRate(1000+i); + } + clock_gettime(CLOCK_MONOTONIC, &end); + start_ns = start.tv_sec * 1000000000LL + start.tv_nsec; + end_ns = end.tv_sec * 1000000000LL + end.tv_nsec; + time = end_ns - start_ns; + printf("%.2f sample rate changes without filter calculation/sec\n", + looplimit / (time / 1e9)); + resampler->reset(); + delete resampler; + } + + void* output_vaddr = malloc(output_size); + AudioResampler* resampler = AudioResampler::create(16, channels, + output_freq, quality); + size_t out_frames = output_size/8; + + /* set volume precision to 12 bits, so the volume scale is 1<<12. + * This means the "integer" part fits in the Q19.12 precision + * representation of output int32_t. + * + * Generally 0 < volumePrecision <= 14 (due to the limits of + * int16_t values for Volume). volumePrecision cannot be 0 due + * to rounding and shifts. + */ + const int volumePrecision = 12; // in bits + + resampler->setSampleRate(input_freq); + resampler->setVolume(1 << volumePrecision, 1 << volumePrecision); + + if (profileResample) { + /* + * For profiling on mobile devices, upon experimentation + * it is better to run a few trials with a shorter loop limit, + * and take the minimum time. + * + * Long tests can cause CPU temperature to build up and thermal throttling + * to reduce CPU frequency. + * + * For frequency checks (index=0, or 1, etc.): + * "cat /sys/devices/system/cpu/cpu${index}/cpufreq/scaling_*_freq" + * + * For temperature checks (index=0, or 1, etc.): + * "cat /sys/class/thermal/thermal_zone${index}/temp" + * + * Another way to avoid thermal throttling is to fix the CPU frequency + * at a lower level which prevents excessive temperatures. + */ + const int trials = 4; + const int looplimit = 4; + timespec start, end; + int64_t time; + + for (int n = 0; n < trials; ++n) { + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < looplimit; ++i) { + resampler->resample((int*) output_vaddr, out_frames, &provider); + provider.reset(); // during benchmarking reset only the provider + } + clock_gettime(CLOCK_MONOTONIC, &end); + int64_t start_ns = start.tv_sec * 1000000000LL + start.tv_nsec; + int64_t end_ns = end.tv_sec * 1000000000LL + end.tv_nsec; + int64_t diff_ns = end_ns - start_ns; + if (n == 0 || diff_ns < time) { + time = diff_ns; // save the best out of our trials. + } + } + // Mfrms/s is "Millions of output frames per second". + printf("quality: %d channels: %d msec: %lld Mfrms/s: %.2lf\n", + quality, channels, time/1000000, out_frames * looplimit / (time / 1e9) / 1e6); resampler->reset(); } @@ -266,19 +351,31 @@ int main(int argc, char* argv[]) { if (gVerbose) { printf("reset() complete\n"); } + delete resampler; + resampler = NULL; // mono takes left channel only // stereo right channel is half amplitude of stereo left channel (due to input creation) int32_t* out = (int32_t*) output_vaddr; int16_t* convert = (int16_t*) malloc(out_frames * channels * sizeof(int16_t)); + // round to half towards zero and saturate at int16 (non-dithered) + const int roundVal = (1<<(volumePrecision-1)) - 1; // volumePrecision > 0 + for (size_t i = 0; i < out_frames; i++) { for (int j = 0; j < channels; j++) { - int32_t s = out[i * 2 + j] >> 12; - if (s > 32767) - s = 32767; - else if (s < -32768) - s = -32768; + int32_t s = out[i * 2 + j] + roundVal; // add offset here + if (s < 0) { + s = (s + 1) >> volumePrecision; // round to 0 + if (s < -32768) { + s = -32768; + } + } else { + s = s >> volumePrecision; + if (s > 32767) { + s = 32767; + } + } convert[i * channels + j] = int16_t(s); } } |