refactor code to improve neon code

we want to make sure we don't transfer data from the neon unit to the arm register file, as this can be quite slow. instead we do all the calculation on the neon side and write the result directly to main memory. Change-Id: Ibb56664d3ab03098ae2798b75e2b6927ac900187
author: Mathias Agopian <mathias@google.com> 2012-11-10 03:26:39 -0800
committer: Mathias Agopian <mathias@google.com> 2012-11-10 03:26:39 -0800
commit: 0d585c85524eb5d398fadff5ca8dd43939ed9cb4 (patch)
tree: b3e3b69dc47885811e7549e5dd8a3c340546e902 /services/audioflinger/AudioResamplerSinc.cpp
parent: e35a582d02586cb43ad9e344138b4edad62868b3 (diff)
download: frameworks_av-0d585c85524eb5d398fadff5ca8dd43939ed9cb4.zip
frameworks_av-0d585c85524eb5d398fadff5ca8dd43939ed9cb4.tar.gz
frameworks_av-0d585c85524eb5d398fadff5ca8dd43939ed9cb4.tar.bz2
1 files changed, 42 insertions, 33 deletions
diff --git a/services/audioflinger/AudioResamplerSinc.cpp b/services/audioflinger/AudioResamplerSinc.cpp
index 7d3681c..952abb4 100644
--- a/services/audioflinger/AudioResamplerSinc.cpp
+++ b/services/audioflinger/AudioResamplerSinc.cpp
@@ -470,6 +470,9 @@ AudioResamplerSinc::AudioResamplerSinc(int bitDepth,
      *
      */
 
+    mVolumeSIMD[0] = 0;
+    mVolumeSIMD[1] = 0;
+
     // Load the constants for coefficients
     int ok = pthread_once(&once_control, init_routine);
     if (ok != 0) {
@@ -494,6 +497,12 @@ void AudioResamplerSinc::init() {
     mRingFull = mImpulse + (numCoefs+1)*mChannelCount;
 }
 
+void AudioResamplerSinc::setVolume(int16_t left, int16_t right) {
+    AudioResampler::setVolume(left, right);
+    mVolumeSIMD[0] = int32_t(left)<<16;
+    mVolumeSIMD[1] = int32_t(right)<<16;
+}
+
 void AudioResamplerSinc::resample(int32_t* out, size_t outFrameCount,
             AudioBufferProvider* provider)
 {
@@ -568,11 +577,9 @@ void AudioResamplerSinc::resample(int32_t* out, size_t outFrameCount,
         }
 
         // handle boundary case
-        int32_t l, r;
         while (CC_LIKELY(outputIndex < outputSampleCount)) {
-            filterCoefficient<CHANNELS>(l, r, phaseFraction, impulse, vRL);
-            out[outputIndex++] += l;
-            out[outputIndex++] += r;
+            filterCoefficient<CHANNELS>(&out[outputIndex], phaseFraction, impulse, vRL);
+            outputIndex += 2;
 
             phaseFraction += phaseIncrement;
             const size_t phaseIndex = phaseFraction >> kNumPhaseBits;
@@ -628,7 +635,7 @@ void AudioResamplerSinc::read(
 
 template<int CHANNELS>
 void AudioResamplerSinc::filterCoefficient(
-        int32_t& l, int32_t& r, uint32_t phase, const int16_t *samples, uint32_t vRL)
+        int32_t* out, uint32_t phase, const int16_t *samples, uint32_t vRL)
 {
     // compute the index of the coefficient on the positive side and
     // negative side
@@ -650,19 +657,19 @@ void AudioResamplerSinc::filterCoefficient(
     int16_t const* sP = samples;
     int16_t const* sN = samples + CHANNELS;
 
-    l = 0;
-    r = 0;
     size_t count = offset;
 
     if (!USE_NEON) {
+        int32_t l = 0;
+        int32_t r = 0;
         for (size_t i=0 ; i<count ; i++) {
             interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
             sP -= CHANNELS;
             interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
             sN += CHANNELS;
         }
-        l = 2 * mulRL(1, l, vRL);
-        r = 2 * mulRL(0, r, vRL);
+        out[0] += 2 * mulRL(1, l, vRL);
+        out[1] += 2 * mulRL(0, r, vRL);
     } else if (CHANNELS == 1) {
         int32_t const* coefsP1 = coefsP + offset;
         int32_t const* coefsN1 = coefsN + offset;
@@ -674,11 +681,11 @@ void AudioResamplerSinc::filterCoefficient(
 
             "1:                                      \n"
             "vld1.16        { d4}, [%[sP]]           \n"    // load 4 16-bits stereo samples
-            "vld1.32        { q8}, [%[coefsP0]]!     \n"    // load 4 32-bits coefs
-            "vld1.32        { q9}, [%[coefsP1]]!     \n"    // load 4 32-bits coefs for interpolation
+            "vld1.32        { q8}, [%[coefsP0]:128]! \n"    // load 4 32-bits coefs
+            "vld1.32        { q9}, [%[coefsP1]:128]! \n"    // load 4 32-bits coefs for interpolation
             "vld1.16        { d6}, [%[sN]]!          \n"    // load 4 16-bits stereo samples
-            "vld1.32        {q10}, [%[coefsN0]]!     \n"    // load 4 32-bits coefs
-            "vld1.32        {q11}, [%[coefsN1]]!     \n"    // load 4 32-bits coefs for interpolation
+            "vld1.32        {q10}, [%[coefsN0]:128]! \n"    // load 4 32-bits coefs
+            "vld1.32        {q11}, [%[coefsN1]:128]! \n"    // load 4 32-bits coefs for interpolation
 
             "vrev64.16      d4, d4                   \n"    // reverse 2 frames of the positive side
 
@@ -703,12 +710,16 @@ void AudioResamplerSinc::filterCoefficient(
 
             "bne            1b                       \n"    // loop
 
+            "vld1.s32       {d2}, [%[vLR]]           \n"    // load volumes
+            "vld1.s32       {d3}, %[out]             \n"    // load the output
             "vpadd.s32      d0, d0, d1               \n"    // add all 4 partial sums
             "vpadd.s32      d0, d0, d0               \n"    // together
+            "vdup.i32       d0, d0[0]                \n"    // interleave L,R channels
+            "vqrdmulh.s32   d0, d0, d2               \n"    // apply volume
+            "vadd.s32       d3, d3, d0               \n"    // accumulate result
+            "vst1.s32       {d0}, %[out]             \n"    // store result
 
-            "vmov.s32       %[l], d0[0]              \n"    // save result in ARM register
-
-            : [l]  "=r" (l),
+            : [out]     "=Uv" (out[0]),
               [count]   "+r" (count),
               [coefsP0] "+r" (coefsP),
               [coefsP1] "+r" (coefsP1),
@@ -718,14 +729,12 @@ void AudioResamplerSinc::filterCoefficient(
               [sN]      "+r" (sN)
             : [lerpP]   "r" (lerpP<<16),
               [lerpN]   "r" (lerpN<<16),
-              [vRL]     "r" (vRL)
+              [vLR]     "r" (mVolumeSIMD)
             : "cc", "memory",
               "q0", "q1", "q2", "q3",
               "q8", "q9", "q10", "q11",
               "q12", "q14"
         );
-        l = 2 * mulRL(1, l, vRL);
-        r = l;
     } else if (CHANNELS == 2) {
         int32_t const* coefsP1 = coefsP + offset;
         int32_t const* coefsN1 = coefsN + offset;
@@ -738,11 +747,11 @@ void AudioResamplerSinc::filterCoefficient(
 
             "1:                                      \n"
             "vld2.16        {d4,d5}, [%[sP]]         \n"    // load 4 16-bits stereo samples
-            "vld1.32        { q8}, [%[coefsP0]]!     \n"    // load 4 32-bits coefs
-            "vld1.32        { q9}, [%[coefsP1]]!     \n"    // load 4 32-bits coefs for interpolation
+            "vld1.32        { q8}, [%[coefsP0]:128]! \n"    // load 4 32-bits coefs
+            "vld1.32        { q9}, [%[coefsP1]:128]! \n"    // load 4 32-bits coefs for interpolation
             "vld2.16        {d6,d7}, [%[sN]]!        \n"    // load 4 16-bits stereo samples
-            "vld1.32        {q10}, [%[coefsN0]]!     \n"    // load 4 32-bits coefs
-            "vld1.32        {q11}, [%[coefsN1]]!     \n"    // load 4 32-bits coefs for interpolation
+            "vld1.32        {q10}, [%[coefsN0]:128]! \n"    // load 4 32-bits coefs
+            "vld1.32        {q11}, [%[coefsN1]:128]! \n"    // load 4 32-bits coefs for interpolation
 
             "vrev64.16      d4, d4                   \n"    // reverse 2 frames of the positive side
             "vrev64.16      d5, d5                   \n"    // reverse 2 frames of the positive side
@@ -774,16 +783,18 @@ void AudioResamplerSinc::filterCoefficient(
 
             "bne            1b                       \n"    // loop
 
-            "vpadd.s32      d0, d0, d1               \n"    // add all 4 partial sums
-            "vpadd.s32      d8, d8, d9               \n"    // add all 4 partial sums
+            "vld1.s32       {d2}, [%[vLR]]           \n"    // load volumes
+            "vld1.s32       {d3}, %[out]             \n"    // load the output
+            "vpadd.s32      d0, d0, d1               \n"    // add all 4 partial sums from q0
+            "vpadd.s32      d8, d8, d9               \n"    // add all 4 partial sums from q4
             "vpadd.s32      d0, d0, d0               \n"    // together
             "vpadd.s32      d8, d8, d8               \n"    // together
+            "vtrn.s32       d0, d8                   \n"    // interlace L,R channels
+            "vqrdmulh.s32   d0, d0, d2               \n"    // apply volume
+            "vadd.s32       d3, d3, d0               \n"    // accumulate result
+            "vst1.s32       {d0}, %[out]             \n"    // store result
 
-            "vmov.s32       %[l], d0[0]              \n"    // save result in ARM register
-            "vmov.s32       %[r], d8[0]              \n"    // save result in ARM register
-
-            : [l]  "=r" (l),
-              [r]  "=r" (r),
+            : [out]     "=Uv" (out[0]),
               [count]   "+r" (count),
               [coefsP0] "+r" (coefsP),
               [coefsP1] "+r" (coefsP1),
@@ -793,14 +804,12 @@ void AudioResamplerSinc::filterCoefficient(
               [sN]      "+r" (sN)
             : [lerpP]   "r" (lerpP<<16),
               [lerpN]   "r" (lerpN<<16),
-              [vRL]     "r" (vRL)
+              [vLR]     "r" (mVolumeSIMD)
             : "cc", "memory",
               "q0", "q1", "q2", "q3", "q4",
               "q8", "q9", "q10", "q11",
               "q12", "q13", "q14", "q15"
         );
-        l = 2 * mulRL(1, l, vRL);
-        r = 2 * mulRL(0, r, vRL);
     }
 }
author	Mathias Agopian <mathias@google.com>	2012-11-10 03:26:39 -0800
committer	Mathias Agopian <mathias@google.com>	2012-11-10 03:26:39 -0800
commit	0d585c85524eb5d398fadff5ca8dd43939ed9cb4 (patch)
tree	b3e3b69dc47885811e7549e5dd8a3c340546e902 /services/audioflinger/AudioResamplerSinc.cpp
parent	e35a582d02586cb43ad9e344138b4edad62868b3 (diff)
download	frameworks_av-0d585c85524eb5d398fadff5ca8dd43939ed9cb4.zip frameworks_av-0d585c85524eb5d398fadff5ca8dd43939ed9cb4.tar.gz frameworks_av-0d585c85524eb5d398fadff5ca8dd43939ed9cb4.tar.bz2