1 files changed, 204 insertions, 51 deletions
diff --git a/services/audioflinger/AudioResamplerSinc.cpp b/services/audioflinger/AudioResamplerSinc.cpp
index 8d9168b..7d3681c 100644
--- a/services/audioflinger/AudioResamplerSinc.cpp
+++ b/services/audioflinger/AudioResamplerSinc.cpp
@@ -17,6 +17,7 @@
 #define LOG_TAG "AudioResamplerSinc"
 //#define LOG_NDEBUG 0
 
+#include <malloc.h>
 #include <string.h>
 #include <stdlib.h>
 #include <dlfcn.h>
@@ -29,6 +30,20 @@
 #include "AudioResamplerSinc.h"
 
 
+
+#if defined(__arm__) && !defined(__thumb__)
+#define USE_INLINE_ASSEMBLY (true)
+#else
+#define USE_INLINE_ASSEMBLY (false)
+#endif
+
+#if USE_INLINE_ASSEMBLY && defined(__ARM_NEON__)
+#define USE_NEON (true)
+#else
+#define USE_NEON (false)
+#endif
+
+
 namespace android {
 // ----------------------------------------------------------------------------
 
@@ -338,12 +353,16 @@ void AudioResamplerSinc::init_routine()
         return;
     }
 
-    readResampleCoefficients = (readCoefficientsFn) dlsym(resampleCoeffLib,
-            "readResamplerCoefficients");
-    readResampleFirNumCoeffFn readResampleFirNumCoeff = (readResampleFirNumCoeffFn)
+    readResampleFirNumCoeffFn readResampleFirNumCoeff;
+    readResampleFirLerpIntBitsFn readResampleFirLerpIntBits;
+
+    readResampleCoefficients = (readCoefficientsFn)
+            dlsym(resampleCoeffLib, "readResamplerCoefficients");
+    readResampleFirNumCoeff = (readResampleFirNumCoeffFn)
             dlsym(resampleCoeffLib, "readResampleFirNumCoeff");
-    readResampleFirLerpIntBitsFn readResampleFirLerpIntBits = (readResampleFirLerpIntBitsFn)
+    readResampleFirLerpIntBits = (readResampleFirLerpIntBitsFn)
             dlsym(resampleCoeffLib, "readResampleFirLerpIntBits");
+
     if (!readResampleCoefficients || !readResampleFirNumCoeff || !readResampleFirLerpIntBits) {
         readResampleCoefficients = NULL;
         dlclose(resampleCoeffLib);
@@ -353,15 +372,14 @@ void AudioResamplerSinc::init_routine()
     }
 
     c = &veryHighQualityConstants;
-    // we have 16 coefs samples per zero-crossing
     c->coefsBits = readResampleFirLerpIntBits();
-    ALOGV("coefsBits = %d", c->coefsBits);
     c->cShift = kNumPhaseBits - c->coefsBits;
     c->cMask = ((1<<c->coefsBits)-1) << c->cShift;
     c->pShift = kNumPhaseBits - c->coefsBits - pLerpBits;
     c->pMask = ((1<<pLerpBits)-1) << c->pShift;
     // number of zero-crossing on each side
     c->halfNumCoefs = readResampleFirNumCoeff();
+    ALOGV("coefsBits = %d", c->coefsBits);
     ALOGV("halfNumCoefs = %d", c->halfNumCoefs);
     // note that we "leak" resampleCoeffLib until the process exits
 }
@@ -371,7 +389,7 @@ void AudioResamplerSinc::init_routine()
 static inline
 int32_t mulRL(int left, int32_t in, uint32_t vRL)
 {
-#if defined(__arm__) && !defined(__thumb__)
+#if USE_INLINE_ASSEMBLY
     int32_t out;
     if (left) {
         asm( "smultb %[out], %[in], %[vRL] \n"
@@ -394,7 +412,7 @@ int32_t mulRL(int left, int32_t in, uint32_t vRL)
 static inline
 int32_t mulAdd(int16_t in, int32_t v, int32_t a)
 {
-#if defined(__arm__) && !defined(__thumb__)
+#if USE_INLINE_ASSEMBLY
     int32_t out;
     asm( "smlawb %[out], %[v], %[in], %[a] \n"
          : [out]"=r"(out)
@@ -409,7 +427,7 @@ int32_t mulAdd(int16_t in, int32_t v, int32_t a)
 static inline
 int32_t mulAddRL(int left, uint32_t inRL, int32_t v, int32_t a)
 {
-#if defined(__arm__) && !defined(__thumb__)
+#if USE_INLINE_ASSEMBLY
     int32_t out;
     if (left) {
         asm( "smlawb %[out], %[v], %[inRL], %[a] \n"
@@ -434,7 +452,7 @@ int32_t mulAddRL(int left, uint32_t inRL, int32_t v, int32_t a)
 AudioResamplerSinc::AudioResamplerSinc(int bitDepth,
         int inChannelCount, int32_t sampleRate, src_quality quality)
     : AudioResampler(bitDepth, inChannelCount, sampleRate, quality),
-    mState(0)
+    mState(0), mImpulse(0), mRingFull(0), mFirCoefs(0)
 {
     /*
      * Layout of the state buffer for 32 tap:
@@ -457,39 +475,34 @@ AudioResamplerSinc::AudioResamplerSinc(int bitDepth,
     if (ok != 0) {
         ALOGE("%s pthread_once failed: %d", __func__, ok);
     }
-    mConstants = (quality == VERY_HIGH_QUALITY) ? &veryHighQualityConstants : &highQualityConstants;
+    mConstants = (quality == VERY_HIGH_QUALITY) ?
+            &veryHighQualityConstants : &highQualityConstants;
 }
 
 
-AudioResamplerSinc::~AudioResamplerSinc()
-{
-    delete[] mState;
+AudioResamplerSinc::~AudioResamplerSinc() {
+    free(mState);
 }
 
 void AudioResamplerSinc::init() {
-    const Constants *c = mConstants;
-
-    const size_t numCoefs = 2*c->halfNumCoefs;
+    const Constants& c(*mConstants);
+    const size_t numCoefs = 2 * c.halfNumCoefs;
     const size_t stateSize = numCoefs * mChannelCount * 2;
-    mState = new int16_t[stateSize];
+    mState = (int16_t*)memalign(32, stateSize*sizeof(int16_t));
     memset(mState, 0, sizeof(int16_t)*stateSize);
-    mImpulse = mState + (c->halfNumCoefs-1)*mChannelCount;
+    mImpulse  = mState   + (c.halfNumCoefs-1)*mChannelCount;
     mRingFull = mImpulse + (numCoefs+1)*mChannelCount;
 }
 
 void AudioResamplerSinc::resample(int32_t* out, size_t outFrameCount,
             AudioBufferProvider* provider)
 {
-
     // FIXME store current state (up or down sample) and only load the coefs when the state
     // changes. Or load two pointers one for up and one for down in the init function.
     // Not critical now since the read functions are fast, but would be important if read was slow.
     if (mConstants == &veryHighQualityConstants && readResampleCoefficients) {
-        ALOGV("get coefficient from libmm-audio resampler library");
-        mFirCoefs = (mInSampleRate <= mSampleRate) ? readResampleCoefficients(true) :
-                readResampleCoefficients(false);
+        mFirCoefs = readResampleCoefficients( mInSampleRate <= mSampleRate );
     } else {
-        ALOGV("Use default coefficients");
         mFirCoefs = (mInSampleRate <= mSampleRate) ? mFirCoefsUp : mFirCoefsDown;
     }
 
@@ -502,7 +515,6 @@ void AudioResamplerSinc::resample(int32_t* out, size_t outFrameCount,
         resample<2>(out, outFrameCount, provider);
         break;
     }
-
 }
 
 
@@ -510,7 +522,8 @@ template<int CHANNELS>
 void AudioResamplerSinc::resample(int32_t* out, size_t outFrameCount,
         AudioBufferProvider* provider)
 {
-    const Constants *c = mConstants;
+    const Constants& c(*mConstants);
+    const size_t headOffset = c.halfNumCoefs*CHANNELS;
     int16_t* impulse = mImpulse;
     uint32_t vRL = mVolumeRL;
     size_t inputIndex = mInputIndex;
@@ -545,11 +558,11 @@ void AudioResamplerSinc::resample(int32_t* out, size_t outFrameCount,
                 }
             }
         }
-        int16_t *in = mBuffer.i16;
+        int16_t const * const in = mBuffer.i16;
         const size_t frameCount = mBuffer.frameCount;
 
         // Always read-in the first samples from the input buffer
-        int16_t* head = impulse + c->halfNumCoefs*CHANNELS;
+        int16_t* head = impulse + headOffset;
         for (size_t i=0 ; i<CHANNELS ; i++) {
             head[i] = in[inputIndex*CHANNELS + i];
         }
@@ -597,16 +610,17 @@ void AudioResamplerSinc::read(
         int16_t*& impulse, uint32_t& phaseFraction,
         const int16_t* in, size_t inputIndex)
 {
-    const Constants *c = mConstants;
-    const uint32_t phaseIndex = phaseFraction >> kNumPhaseBits;
     impulse += CHANNELS;
     phaseFraction -= 1LU<<kNumPhaseBits;
+
+    const Constants& c(*mConstants);
     if (CC_UNLIKELY(impulse >= mRingFull)) {
-        const size_t stateSize = (c->halfNumCoefs*2)*CHANNELS;
+        const size_t stateSize = (c.halfNumCoefs*2)*CHANNELS;
         memcpy(mState, mState+stateSize, sizeof(int16_t)*stateSize);
         impulse -= stateSize;
     }
-    int16_t* head = impulse + c->halfNumCoefs*CHANNELS;
+
+    int16_t* head = impulse + c.halfNumCoefs*CHANNELS;
     for (size_t i=0 ; i<CHANNELS ; i++) {
         head[i] = in[inputIndex*CHANNELS + i];
     }
@@ -616,39 +630,178 @@ template<int CHANNELS>
 void AudioResamplerSinc::filterCoefficient(
         int32_t& l, int32_t& r, uint32_t phase, const int16_t *samples, uint32_t vRL)
 {
-    const Constants *c = mConstants;
-
     // compute the index of the coefficient on the positive side and
     // negative side
-    uint32_t indexP = ( phase & c->cMask) >> c->cShift;
-    uint32_t indexN = (-phase & c->cMask) >> c->cShift;
-    uint32_t lerpP  = ( phase & c->pMask) >> c->pShift;
-    uint32_t lerpN  = (-phase & c->pMask) >> c->pShift;
+    const Constants& c(*mConstants);
+    uint32_t indexP = ( phase & c.cMask) >> c.cShift;
+    uint32_t indexN = (-phase & c.cMask) >> c.cShift;
+    uint32_t lerpP  = ( phase & c.pMask) >> c.pShift;
+    uint32_t lerpN  = (-phase & c.pMask) >> c.pShift;
     if ((indexP == 0) && (lerpP == 0)) {
-        indexN = c->cMask >> c->cShift;
-        lerpN  = c->pMask >> c->pShift;
+        indexN = c.cMask >> c.cShift;
+        lerpN  = c.pMask >> c.pShift;
     }
-    const size_t offset = c->halfNumCoefs;
+    const size_t offset = c.halfNumCoefs;
     indexP *= offset;
     indexN *= offset;
 
-    int32_t const* const coefs = mFirCoefs;
-    int32_t const* coefsP = coefs + indexP;
-    int32_t const* coefsN = coefs + indexN;
+    int32_t const* coefsP = mFirCoefs + indexP;
+    int32_t const* coefsN = mFirCoefs + indexN;
     int16_t const* sP = samples;
     int16_t const* sN = samples + CHANNELS;
 
     l = 0;
     r = 0;
     size_t count = offset;
-    for (size_t i=0 ; i<count ; i++) {
-        interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
-        sP -= CHANNELS;
-        interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
-        sN += CHANNELS;
+
+    if (!USE_NEON) {
+        for (size_t i=0 ; i<count ; i++) {
+            interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
+            sP -= CHANNELS;
+            interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
+            sN += CHANNELS;
+        }
+        l = 2 * mulRL(1, l, vRL);
+        r = 2 * mulRL(0, r, vRL);
+    } else if (CHANNELS == 1) {
+        int32_t const* coefsP1 = coefsP + offset;
+        int32_t const* coefsN1 = coefsN + offset;
+        sP -= CHANNELS*3;
+        asm (
+            "vmov.32        d2[0], %[lerpP]          \n"    // load the positive phase
+            "vmov.32        d2[1], %[lerpN]          \n"    // load the negative phase
+            "veor           q0, q0                   \n"    // result, initialize to 0
+
+            "1:                                      \n"
+            "vld1.16        { d4}, [%[sP]]           \n"    // load 4 16-bits stereo samples
+            "vld1.32        { q8}, [%[coefsP0]]!     \n"    // load 4 32-bits coefs
+            "vld1.32        { q9}, [%[coefsP1]]!     \n"    // load 4 32-bits coefs for interpolation
+            "vld1.16        { d6}, [%[sN]]!          \n"    // load 4 16-bits stereo samples
+            "vld1.32        {q10}, [%[coefsN0]]!     \n"    // load 4 32-bits coefs
+            "vld1.32        {q11}, [%[coefsN1]]!     \n"    // load 4 32-bits coefs for interpolation
+
+            "vrev64.16      d4, d4                   \n"    // reverse 2 frames of the positive side
+
+            "vsub.s32        q9,  q9,  q8            \n"    // interpolate (step1) 1st set of coefs
+            "vsub.s32       q11, q11, q10            \n"    // interpolate (step1) 2nd set of coets
+            "vshll.s16      q12,  d4, #15            \n"    // extend samples to 31 bits
+
+            "vqrdmulh.s32    q9,  q9, d2[0]          \n"    // interpolate (step2) 1st set of coefs
+            "vqrdmulh.s32   q11, q11, d2[1]          \n"    // interpolate (step3) 2nd set of coefs
+            "vshll.s16      q14,  d6, #15            \n"    // extend samples to 31 bits
+
+            "vadd.s32        q8,  q8,  q9            \n"    // interpolate (step3) 1st set
+            "vadd.s32       q10, q10, q11            \n"    // interpolate (step4) 2nd set
+            "subs           %[count], %[count], #4   \n"    // update loop counter
+
+            "vqrdmulh.s32   q12, q12, q8             \n"    // multiply samples by interpolated coef
+            "vqrdmulh.s32   q14, q14, q10            \n"    // multiply samples by interpolated coef
+            "sub            %[sP], %[sP], #8         \n"    // move pointer to next set of samples
+
+            "vadd.s32       q0, q0, q12              \n"    // accumulate result
+            "vadd.s32       q0, q0, q14              \n"    // accumulate result
+
+            "bne            1b                       \n"    // loop
+
+            "vpadd.s32      d0, d0, d1               \n"    // add all 4 partial sums
+            "vpadd.s32      d0, d0, d0               \n"    // together
+
+            "vmov.s32       %[l], d0[0]              \n"    // save result in ARM register
+
+            : [l]  "=r" (l),
+              [count]   "+r" (count),
+              [coefsP0] "+r" (coefsP),
+              [coefsP1] "+r" (coefsP1),
+              [coefsN0] "+r" (coefsN),
+              [coefsN1] "+r" (coefsN1),
+              [sP]      "+r" (sP),
+              [sN]      "+r" (sN)
+            : [lerpP]   "r" (lerpP<<16),
+              [lerpN]   "r" (lerpN<<16),
+              [vRL]     "r" (vRL)
+            : "cc", "memory",
+              "q0", "q1", "q2", "q3",
+              "q8", "q9", "q10", "q11",
+              "q12", "q14"
+        );
+        l = 2 * mulRL(1, l, vRL);
+        r = l;
+    } else if (CHANNELS == 2) {
+        int32_t const* coefsP1 = coefsP + offset;
+        int32_t const* coefsN1 = coefsN + offset;
+        sP -= CHANNELS*3;
+        asm (
+            "vmov.32        d2[0], %[lerpP]          \n"    // load the positive phase
+            "vmov.32        d2[1], %[lerpN]          \n"    // load the negative phase
+            "veor           q0, q0                   \n"    // result, initialize to 0
+            "veor           q4, q4                   \n"    // result, initialize to 0
+
+            "1:                                      \n"
+            "vld2.16        {d4,d5}, [%[sP]]         \n"    // load 4 16-bits stereo samples
+            "vld1.32        { q8}, [%[coefsP0]]!     \n"    // load 4 32-bits coefs
+            "vld1.32        { q9}, [%[coefsP1]]!     \n"    // load 4 32-bits coefs for interpolation
+            "vld2.16        {d6,d7}, [%[sN]]!        \n"    // load 4 16-bits stereo samples
+            "vld1.32        {q10}, [%[coefsN0]]!     \n"    // load 4 32-bits coefs
+            "vld1.32        {q11}, [%[coefsN1]]!     \n"    // load 4 32-bits coefs for interpolation
+
+            "vrev64.16      d4, d4                   \n"    // reverse 2 frames of the positive side
+            "vrev64.16      d5, d5                   \n"    // reverse 2 frames of the positive side
+
+            "vsub.s32        q9,  q9,  q8            \n"    // interpolate (step1) 1st set of coefs
+            "vsub.s32       q11, q11, q10            \n"    // interpolate (step1) 2nd set of coets
+            "vshll.s16      q12,  d4, #15            \n"    // extend samples to 31 bits
+            "vshll.s16      q13,  d5, #15            \n"    // extend samples to 31 bits
+
+            "vqrdmulh.s32    q9,  q9, d2[0]          \n"    // interpolate (step2) 1st set of coefs
+            "vqrdmulh.s32   q11, q11, d2[1]          \n"    // interpolate (step3) 2nd set of coefs
+            "vshll.s16      q14,  d6, #15            \n"    // extend samples to 31 bits
+            "vshll.s16      q15,  d7, #15            \n"    // extend samples to 31 bits
+
+            "vadd.s32        q8,  q8,  q9            \n"    // interpolate (step3) 1st set
+            "vadd.s32       q10, q10, q11            \n"    // interpolate (step4) 2nd set
+            "subs           %[count], %[count], #4   \n"    // update loop counter
+
+            "vqrdmulh.s32   q12, q12, q8             \n"    // multiply samples by interpolated coef
+            "vqrdmulh.s32   q13, q13, q8             \n"    // multiply samples by interpolated coef
+            "vqrdmulh.s32   q14, q14, q10            \n"    // multiply samples by interpolated coef
+            "vqrdmulh.s32   q15, q15, q10            \n"    // multiply samples by interpolated coef
+            "sub            %[sP], %[sP], #16        \n"    // move pointer to next set of samples
+
+            "vadd.s32       q0, q0, q12              \n"    // accumulate result
+            "vadd.s32       q4, q4, q13              \n"    // accumulate result
+            "vadd.s32       q0, q0, q14              \n"    // accumulate result
+            "vadd.s32       q4, q4, q15              \n"    // accumulate result
+
+            "bne            1b                       \n"    // loop
+
+            "vpadd.s32      d0, d0, d1               \n"    // add all 4 partial sums
+            "vpadd.s32      d8, d8, d9               \n"    // add all 4 partial sums
+            "vpadd.s32      d0, d0, d0               \n"    // together
+            "vpadd.s32      d8, d8, d8               \n"    // together
+
+            "vmov.s32       %[l], d0[0]              \n"    // save result in ARM register
+            "vmov.s32       %[r], d8[0]              \n"    // save result in ARM register
+
+            : [l]  "=r" (l),
+              [r]  "=r" (r),
+              [count]   "+r" (count),
+              [coefsP0] "+r" (coefsP),
+              [coefsP1] "+r" (coefsP1),
+              [coefsN0] "+r" (coefsN),
+              [coefsN1] "+r" (coefsN1),
+              [sP]      "+r" (sP),
+              [sN]      "+r" (sN)
+            : [lerpP]   "r" (lerpP<<16),
+              [lerpN]   "r" (lerpN<<16),
+              [vRL]     "r" (vRL)
+            : "cc", "memory",
+              "q0", "q1", "q2", "q3", "q4",
+              "q8", "q9", "q10", "q11",
+              "q12", "q13", "q14", "q15"
+        );
+        l = 2 * mulRL(1, l, vRL);
+        r = 2 * mulRL(0, r, vRL);
     }
-    l = 2 * mulRL(1, l, vRL);
-    r = 2 * mulRL(0, r, vRL);
 }
 
 template<int CHANNELS>