diff options
Diffstat (limited to 'services/audioflinger/AudioResamplerSinc.cpp')
-rw-r--r-- | services/audioflinger/AudioResamplerSinc.cpp | 255 |
1 files changed, 204 insertions, 51 deletions
diff --git a/services/audioflinger/AudioResamplerSinc.cpp b/services/audioflinger/AudioResamplerSinc.cpp index 8d9168b..7d3681c 100644 --- a/services/audioflinger/AudioResamplerSinc.cpp +++ b/services/audioflinger/AudioResamplerSinc.cpp @@ -17,6 +17,7 @@ #define LOG_TAG "AudioResamplerSinc" //#define LOG_NDEBUG 0 +#include <malloc.h> #include <string.h> #include <stdlib.h> #include <dlfcn.h> @@ -29,6 +30,20 @@ #include "AudioResamplerSinc.h" + +#if defined(__arm__) && !defined(__thumb__) +#define USE_INLINE_ASSEMBLY (true) +#else +#define USE_INLINE_ASSEMBLY (false) +#endif + +#if USE_INLINE_ASSEMBLY && defined(__ARM_NEON__) +#define USE_NEON (true) +#else +#define USE_NEON (false) +#endif + + namespace android { // ---------------------------------------------------------------------------- @@ -338,12 +353,16 @@ void AudioResamplerSinc::init_routine() return; } - readResampleCoefficients = (readCoefficientsFn) dlsym(resampleCoeffLib, - "readResamplerCoefficients"); - readResampleFirNumCoeffFn readResampleFirNumCoeff = (readResampleFirNumCoeffFn) + readResampleFirNumCoeffFn readResampleFirNumCoeff; + readResampleFirLerpIntBitsFn readResampleFirLerpIntBits; + + readResampleCoefficients = (readCoefficientsFn) + dlsym(resampleCoeffLib, "readResamplerCoefficients"); + readResampleFirNumCoeff = (readResampleFirNumCoeffFn) dlsym(resampleCoeffLib, "readResampleFirNumCoeff"); - readResampleFirLerpIntBitsFn readResampleFirLerpIntBits = (readResampleFirLerpIntBitsFn) + readResampleFirLerpIntBits = (readResampleFirLerpIntBitsFn) dlsym(resampleCoeffLib, "readResampleFirLerpIntBits"); + if (!readResampleCoefficients || !readResampleFirNumCoeff || !readResampleFirLerpIntBits) { readResampleCoefficients = NULL; dlclose(resampleCoeffLib); @@ -353,15 +372,14 @@ void AudioResamplerSinc::init_routine() } c = &veryHighQualityConstants; - // we have 16 coefs samples per zero-crossing c->coefsBits = readResampleFirLerpIntBits(); - ALOGV("coefsBits = %d", c->coefsBits); c->cShift = kNumPhaseBits - c->coefsBits; c->cMask = ((1<<c->coefsBits)-1) << c->cShift; c->pShift = kNumPhaseBits - c->coefsBits - pLerpBits; c->pMask = ((1<<pLerpBits)-1) << c->pShift; // number of zero-crossing on each side c->halfNumCoefs = readResampleFirNumCoeff(); + ALOGV("coefsBits = %d", c->coefsBits); ALOGV("halfNumCoefs = %d", c->halfNumCoefs); // note that we "leak" resampleCoeffLib until the process exits } @@ -371,7 +389,7 @@ void AudioResamplerSinc::init_routine() static inline int32_t mulRL(int left, int32_t in, uint32_t vRL) { -#if defined(__arm__) && !defined(__thumb__) +#if USE_INLINE_ASSEMBLY int32_t out; if (left) { asm( "smultb %[out], %[in], %[vRL] \n" @@ -394,7 +412,7 @@ int32_t mulRL(int left, int32_t in, uint32_t vRL) static inline int32_t mulAdd(int16_t in, int32_t v, int32_t a) { -#if defined(__arm__) && !defined(__thumb__) +#if USE_INLINE_ASSEMBLY int32_t out; asm( "smlawb %[out], %[v], %[in], %[a] \n" : [out]"=r"(out) @@ -409,7 +427,7 @@ int32_t mulAdd(int16_t in, int32_t v, int32_t a) static inline int32_t mulAddRL(int left, uint32_t inRL, int32_t v, int32_t a) { -#if defined(__arm__) && !defined(__thumb__) +#if USE_INLINE_ASSEMBLY int32_t out; if (left) { asm( "smlawb %[out], %[v], %[inRL], %[a] \n" @@ -434,7 +452,7 @@ int32_t mulAddRL(int left, uint32_t inRL, int32_t v, int32_t a) AudioResamplerSinc::AudioResamplerSinc(int bitDepth, int inChannelCount, int32_t sampleRate, src_quality quality) : AudioResampler(bitDepth, inChannelCount, sampleRate, quality), - mState(0) + mState(0), mImpulse(0), mRingFull(0), mFirCoefs(0) { /* * Layout of the state buffer for 32 tap: @@ -457,39 +475,34 @@ AudioResamplerSinc::AudioResamplerSinc(int bitDepth, if (ok != 0) { ALOGE("%s pthread_once failed: %d", __func__, ok); } - mConstants = (quality == VERY_HIGH_QUALITY) ? &veryHighQualityConstants : &highQualityConstants; + mConstants = (quality == VERY_HIGH_QUALITY) ? + &veryHighQualityConstants : &highQualityConstants; } -AudioResamplerSinc::~AudioResamplerSinc() -{ - delete[] mState; +AudioResamplerSinc::~AudioResamplerSinc() { + free(mState); } void AudioResamplerSinc::init() { - const Constants *c = mConstants; - - const size_t numCoefs = 2*c->halfNumCoefs; + const Constants& c(*mConstants); + const size_t numCoefs = 2 * c.halfNumCoefs; const size_t stateSize = numCoefs * mChannelCount * 2; - mState = new int16_t[stateSize]; + mState = (int16_t*)memalign(32, stateSize*sizeof(int16_t)); memset(mState, 0, sizeof(int16_t)*stateSize); - mImpulse = mState + (c->halfNumCoefs-1)*mChannelCount; + mImpulse = mState + (c.halfNumCoefs-1)*mChannelCount; mRingFull = mImpulse + (numCoefs+1)*mChannelCount; } void AudioResamplerSinc::resample(int32_t* out, size_t outFrameCount, AudioBufferProvider* provider) { - // FIXME store current state (up or down sample) and only load the coefs when the state // changes. Or load two pointers one for up and one for down in the init function. // Not critical now since the read functions are fast, but would be important if read was slow. if (mConstants == &veryHighQualityConstants && readResampleCoefficients) { - ALOGV("get coefficient from libmm-audio resampler library"); - mFirCoefs = (mInSampleRate <= mSampleRate) ? readResampleCoefficients(true) : - readResampleCoefficients(false); + mFirCoefs = readResampleCoefficients( mInSampleRate <= mSampleRate ); } else { - ALOGV("Use default coefficients"); mFirCoefs = (mInSampleRate <= mSampleRate) ? mFirCoefsUp : mFirCoefsDown; } @@ -502,7 +515,6 @@ void AudioResamplerSinc::resample(int32_t* out, size_t outFrameCount, resample<2>(out, outFrameCount, provider); break; } - } @@ -510,7 +522,8 @@ template<int CHANNELS> void AudioResamplerSinc::resample(int32_t* out, size_t outFrameCount, AudioBufferProvider* provider) { - const Constants *c = mConstants; + const Constants& c(*mConstants); + const size_t headOffset = c.halfNumCoefs*CHANNELS; int16_t* impulse = mImpulse; uint32_t vRL = mVolumeRL; size_t inputIndex = mInputIndex; @@ -545,11 +558,11 @@ void AudioResamplerSinc::resample(int32_t* out, size_t outFrameCount, } } } - int16_t *in = mBuffer.i16; + int16_t const * const in = mBuffer.i16; const size_t frameCount = mBuffer.frameCount; // Always read-in the first samples from the input buffer - int16_t* head = impulse + c->halfNumCoefs*CHANNELS; + int16_t* head = impulse + headOffset; for (size_t i=0 ; i<CHANNELS ; i++) { head[i] = in[inputIndex*CHANNELS + i]; } @@ -597,16 +610,17 @@ void AudioResamplerSinc::read( int16_t*& impulse, uint32_t& phaseFraction, const int16_t* in, size_t inputIndex) { - const Constants *c = mConstants; - const uint32_t phaseIndex = phaseFraction >> kNumPhaseBits; impulse += CHANNELS; phaseFraction -= 1LU<<kNumPhaseBits; + + const Constants& c(*mConstants); if (CC_UNLIKELY(impulse >= mRingFull)) { - const size_t stateSize = (c->halfNumCoefs*2)*CHANNELS; + const size_t stateSize = (c.halfNumCoefs*2)*CHANNELS; memcpy(mState, mState+stateSize, sizeof(int16_t)*stateSize); impulse -= stateSize; } - int16_t* head = impulse + c->halfNumCoefs*CHANNELS; + + int16_t* head = impulse + c.halfNumCoefs*CHANNELS; for (size_t i=0 ; i<CHANNELS ; i++) { head[i] = in[inputIndex*CHANNELS + i]; } @@ -616,39 +630,178 @@ template<int CHANNELS> void AudioResamplerSinc::filterCoefficient( int32_t& l, int32_t& r, uint32_t phase, const int16_t *samples, uint32_t vRL) { - const Constants *c = mConstants; - // compute the index of the coefficient on the positive side and // negative side - uint32_t indexP = ( phase & c->cMask) >> c->cShift; - uint32_t indexN = (-phase & c->cMask) >> c->cShift; - uint32_t lerpP = ( phase & c->pMask) >> c->pShift; - uint32_t lerpN = (-phase & c->pMask) >> c->pShift; + const Constants& c(*mConstants); + uint32_t indexP = ( phase & c.cMask) >> c.cShift; + uint32_t indexN = (-phase & c.cMask) >> c.cShift; + uint32_t lerpP = ( phase & c.pMask) >> c.pShift; + uint32_t lerpN = (-phase & c.pMask) >> c.pShift; if ((indexP == 0) && (lerpP == 0)) { - indexN = c->cMask >> c->cShift; - lerpN = c->pMask >> c->pShift; + indexN = c.cMask >> c.cShift; + lerpN = c.pMask >> c.pShift; } - const size_t offset = c->halfNumCoefs; + const size_t offset = c.halfNumCoefs; indexP *= offset; indexN *= offset; - int32_t const* const coefs = mFirCoefs; - int32_t const* coefsP = coefs + indexP; - int32_t const* coefsN = coefs + indexN; + int32_t const* coefsP = mFirCoefs + indexP; + int32_t const* coefsN = mFirCoefs + indexN; int16_t const* sP = samples; int16_t const* sN = samples + CHANNELS; l = 0; r = 0; size_t count = offset; - for (size_t i=0 ; i<count ; i++) { - interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP); - sP -= CHANNELS; - interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN); - sN += CHANNELS; + + if (!USE_NEON) { + for (size_t i=0 ; i<count ; i++) { + interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP); + sP -= CHANNELS; + interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN); + sN += CHANNELS; + } + l = 2 * mulRL(1, l, vRL); + r = 2 * mulRL(0, r, vRL); + } else if (CHANNELS == 1) { + int32_t const* coefsP1 = coefsP + offset; + int32_t const* coefsN1 = coefsN + offset; + sP -= CHANNELS*3; + asm ( + "vmov.32 d2[0], %[lerpP] \n" // load the positive phase + "vmov.32 d2[1], %[lerpN] \n" // load the negative phase + "veor q0, q0 \n" // result, initialize to 0 + + "1: \n" + "vld1.16 { d4}, [%[sP]] \n" // load 4 16-bits stereo samples + "vld1.32 { q8}, [%[coefsP0]]! \n" // load 4 32-bits coefs + "vld1.32 { q9}, [%[coefsP1]]! \n" // load 4 32-bits coefs for interpolation + "vld1.16 { d6}, [%[sN]]! \n" // load 4 16-bits stereo samples + "vld1.32 {q10}, [%[coefsN0]]! \n" // load 4 32-bits coefs + "vld1.32 {q11}, [%[coefsN1]]! \n" // load 4 32-bits coefs for interpolation + + "vrev64.16 d4, d4 \n" // reverse 2 frames of the positive side + + "vsub.s32 q9, q9, q8 \n" // interpolate (step1) 1st set of coefs + "vsub.s32 q11, q11, q10 \n" // interpolate (step1) 2nd set of coets + "vshll.s16 q12, d4, #15 \n" // extend samples to 31 bits + + "vqrdmulh.s32 q9, q9, d2[0] \n" // interpolate (step2) 1st set of coefs + "vqrdmulh.s32 q11, q11, d2[1] \n" // interpolate (step3) 2nd set of coefs + "vshll.s16 q14, d6, #15 \n" // extend samples to 31 bits + + "vadd.s32 q8, q8, q9 \n" // interpolate (step3) 1st set + "vadd.s32 q10, q10, q11 \n" // interpolate (step4) 2nd set + "subs %[count], %[count], #4 \n" // update loop counter + + "vqrdmulh.s32 q12, q12, q8 \n" // multiply samples by interpolated coef + "vqrdmulh.s32 q14, q14, q10 \n" // multiply samples by interpolated coef + "sub %[sP], %[sP], #8 \n" // move pointer to next set of samples + + "vadd.s32 q0, q0, q12 \n" // accumulate result + "vadd.s32 q0, q0, q14 \n" // accumulate result + + "bne 1b \n" // loop + + "vpadd.s32 d0, d0, d1 \n" // add all 4 partial sums + "vpadd.s32 d0, d0, d0 \n" // together + + "vmov.s32 %[l], d0[0] \n" // save result in ARM register + + : [l] "=r" (l), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsP1] "+r" (coefsP1), + [coefsN0] "+r" (coefsN), + [coefsN1] "+r" (coefsN1), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [lerpP] "r" (lerpP<<16), + [lerpN] "r" (lerpN<<16), + [vRL] "r" (vRL) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", + "q12", "q14" + ); + l = 2 * mulRL(1, l, vRL); + r = l; + } else if (CHANNELS == 2) { + int32_t const* coefsP1 = coefsP + offset; + int32_t const* coefsN1 = coefsN + offset; + sP -= CHANNELS*3; + asm ( + "vmov.32 d2[0], %[lerpP] \n" // load the positive phase + "vmov.32 d2[1], %[lerpN] \n" // load the negative phase + "veor q0, q0 \n" // result, initialize to 0 + "veor q4, q4 \n" // result, initialize to 0 + + "1: \n" + "vld2.16 {d4,d5}, [%[sP]] \n" // load 4 16-bits stereo samples + "vld1.32 { q8}, [%[coefsP0]]! \n" // load 4 32-bits coefs + "vld1.32 { q9}, [%[coefsP1]]! \n" // load 4 32-bits coefs for interpolation + "vld2.16 {d6,d7}, [%[sN]]! \n" // load 4 16-bits stereo samples + "vld1.32 {q10}, [%[coefsN0]]! \n" // load 4 32-bits coefs + "vld1.32 {q11}, [%[coefsN1]]! \n" // load 4 32-bits coefs for interpolation + + "vrev64.16 d4, d4 \n" // reverse 2 frames of the positive side + "vrev64.16 d5, d5 \n" // reverse 2 frames of the positive side + + "vsub.s32 q9, q9, q8 \n" // interpolate (step1) 1st set of coefs + "vsub.s32 q11, q11, q10 \n" // interpolate (step1) 2nd set of coets + "vshll.s16 q12, d4, #15 \n" // extend samples to 31 bits + "vshll.s16 q13, d5, #15 \n" // extend samples to 31 bits + + "vqrdmulh.s32 q9, q9, d2[0] \n" // interpolate (step2) 1st set of coefs + "vqrdmulh.s32 q11, q11, d2[1] \n" // interpolate (step3) 2nd set of coefs + "vshll.s16 q14, d6, #15 \n" // extend samples to 31 bits + "vshll.s16 q15, d7, #15 \n" // extend samples to 31 bits + + "vadd.s32 q8, q8, q9 \n" // interpolate (step3) 1st set + "vadd.s32 q10, q10, q11 \n" // interpolate (step4) 2nd set + "subs %[count], %[count], #4 \n" // update loop counter + + "vqrdmulh.s32 q12, q12, q8 \n" // multiply samples by interpolated coef + "vqrdmulh.s32 q13, q13, q8 \n" // multiply samples by interpolated coef + "vqrdmulh.s32 q14, q14, q10 \n" // multiply samples by interpolated coef + "vqrdmulh.s32 q15, q15, q10 \n" // multiply samples by interpolated coef + "sub %[sP], %[sP], #16 \n" // move pointer to next set of samples + + "vadd.s32 q0, q0, q12 \n" // accumulate result + "vadd.s32 q4, q4, q13 \n" // accumulate result + "vadd.s32 q0, q0, q14 \n" // accumulate result + "vadd.s32 q4, q4, q15 \n" // accumulate result + + "bne 1b \n" // loop + + "vpadd.s32 d0, d0, d1 \n" // add all 4 partial sums + "vpadd.s32 d8, d8, d9 \n" // add all 4 partial sums + "vpadd.s32 d0, d0, d0 \n" // together + "vpadd.s32 d8, d8, d8 \n" // together + + "vmov.s32 %[l], d0[0] \n" // save result in ARM register + "vmov.s32 %[r], d8[0] \n" // save result in ARM register + + : [l] "=r" (l), + [r] "=r" (r), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsP1] "+r" (coefsP1), + [coefsN0] "+r" (coefsN), + [coefsN1] "+r" (coefsN1), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [lerpP] "r" (lerpP<<16), + [lerpN] "r" (lerpN<<16), + [vRL] "r" (vRL) + : "cc", "memory", + "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15" + ); + l = 2 * mulRL(1, l, vRL); + r = 2 * mulRL(0, r, vRL); } - l = 2 * mulRL(1, l, vRL); - r = 2 * mulRL(0, r, vRL); } template<int CHANNELS> |