summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMathias Agopian <mathias@google.com>2012-11-04 15:16:13 -0800
committerAndroid (Google) Code Review <android-gerrit@google.com>2012-11-08 12:45:59 -0800
commitc6ccbabd2800e749bc86a34d2ae233f99102ba51 (patch)
treef11c89a8d8bbba54a0558e78e2b480a0407bc516
parentaf03dfb73d05dfabdf55640aff8b2256fdd78663 (diff)
downloadframeworks_av-c6ccbabd2800e749bc86a34d2ae233f99102ba51.zip
frameworks_av-c6ccbabd2800e749bc86a34d2ae233f99102ba51.tar.gz
frameworks_av-c6ccbabd2800e749bc86a34d2ae233f99102ba51.tar.bz2
NEON optimized SINC resampler
this currently gives us a 60% to 80% boost depending on the quality level selected. Change-Id: I7db385007e811ed7bffe5fd3403b44e300894f5b
-rw-r--r--services/audioflinger/AudioResamplerSinc.cpp175
-rw-r--r--services/audioflinger/test-resample.cpp2
2 files changed, 166 insertions, 11 deletions
diff --git a/services/audioflinger/AudioResamplerSinc.cpp b/services/audioflinger/AudioResamplerSinc.cpp
index 165bb61..7d3681c 100644
--- a/services/audioflinger/AudioResamplerSinc.cpp
+++ b/services/audioflinger/AudioResamplerSinc.cpp
@@ -30,6 +30,20 @@
#include "AudioResamplerSinc.h"
+
+#if defined(__arm__) && !defined(__thumb__)
+#define USE_INLINE_ASSEMBLY (true)
+#else
+#define USE_INLINE_ASSEMBLY (false)
+#endif
+
+#if USE_INLINE_ASSEMBLY && defined(__ARM_NEON__)
+#define USE_NEON (true)
+#else
+#define USE_NEON (false)
+#endif
+
+
namespace android {
// ----------------------------------------------------------------------------
@@ -375,7 +389,7 @@ void AudioResamplerSinc::init_routine()
static inline
int32_t mulRL(int left, int32_t in, uint32_t vRL)
{
-#if defined(__arm__) && !defined(__thumb__)
+#if USE_INLINE_ASSEMBLY
int32_t out;
if (left) {
asm( "smultb %[out], %[in], %[vRL] \n"
@@ -398,7 +412,7 @@ int32_t mulRL(int left, int32_t in, uint32_t vRL)
static inline
int32_t mulAdd(int16_t in, int32_t v, int32_t a)
{
-#if defined(__arm__) && !defined(__thumb__)
+#if USE_INLINE_ASSEMBLY
int32_t out;
asm( "smlawb %[out], %[v], %[in], %[a] \n"
: [out]"=r"(out)
@@ -413,7 +427,7 @@ int32_t mulAdd(int16_t in, int32_t v, int32_t a)
static inline
int32_t mulAddRL(int left, uint32_t inRL, int32_t v, int32_t a)
{
-#if defined(__arm__) && !defined(__thumb__)
+#if USE_INLINE_ASSEMBLY
int32_t out;
if (left) {
asm( "smlawb %[out], %[v], %[inRL], %[a] \n"
@@ -639,14 +653,155 @@ void AudioResamplerSinc::filterCoefficient(
l = 0;
r = 0;
size_t count = offset;
- for (size_t i=0 ; i<count ; i++) {
- interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
- sP -= CHANNELS;
- interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
- sN += CHANNELS;
+
+ if (!USE_NEON) {
+ for (size_t i=0 ; i<count ; i++) {
+ interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
+ sP -= CHANNELS;
+ interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
+ sN += CHANNELS;
+ }
+ l = 2 * mulRL(1, l, vRL);
+ r = 2 * mulRL(0, r, vRL);
+ } else if (CHANNELS == 1) {
+ int32_t const* coefsP1 = coefsP + offset;
+ int32_t const* coefsN1 = coefsN + offset;
+ sP -= CHANNELS*3;
+ asm (
+ "vmov.32 d2[0], %[lerpP] \n" // load the positive phase
+ "vmov.32 d2[1], %[lerpN] \n" // load the negative phase
+ "veor q0, q0 \n" // result, initialize to 0
+
+ "1: \n"
+ "vld1.16 { d4}, [%[sP]] \n" // load 4 16-bits stereo samples
+ "vld1.32 { q8}, [%[coefsP0]]! \n" // load 4 32-bits coefs
+ "vld1.32 { q9}, [%[coefsP1]]! \n" // load 4 32-bits coefs for interpolation
+ "vld1.16 { d6}, [%[sN]]! \n" // load 4 16-bits stereo samples
+ "vld1.32 {q10}, [%[coefsN0]]! \n" // load 4 32-bits coefs
+ "vld1.32 {q11}, [%[coefsN1]]! \n" // load 4 32-bits coefs for interpolation
+
+ "vrev64.16 d4, d4 \n" // reverse 2 frames of the positive side
+
+ "vsub.s32 q9, q9, q8 \n" // interpolate (step1) 1st set of coefs
+ "vsub.s32 q11, q11, q10 \n" // interpolate (step1) 2nd set of coets
+ "vshll.s16 q12, d4, #15 \n" // extend samples to 31 bits
+
+ "vqrdmulh.s32 q9, q9, d2[0] \n" // interpolate (step2) 1st set of coefs
+ "vqrdmulh.s32 q11, q11, d2[1] \n" // interpolate (step3) 2nd set of coefs
+ "vshll.s16 q14, d6, #15 \n" // extend samples to 31 bits
+
+ "vadd.s32 q8, q8, q9 \n" // interpolate (step3) 1st set
+ "vadd.s32 q10, q10, q11 \n" // interpolate (step4) 2nd set
+ "subs %[count], %[count], #4 \n" // update loop counter
+
+ "vqrdmulh.s32 q12, q12, q8 \n" // multiply samples by interpolated coef
+ "vqrdmulh.s32 q14, q14, q10 \n" // multiply samples by interpolated coef
+ "sub %[sP], %[sP], #8 \n" // move pointer to next set of samples
+
+ "vadd.s32 q0, q0, q12 \n" // accumulate result
+ "vadd.s32 q0, q0, q14 \n" // accumulate result
+
+ "bne 1b \n" // loop
+
+ "vpadd.s32 d0, d0, d1 \n" // add all 4 partial sums
+ "vpadd.s32 d0, d0, d0 \n" // together
+
+ "vmov.s32 %[l], d0[0] \n" // save result in ARM register
+
+ : [l] "=r" (l),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsP1] "+r" (coefsP1),
+ [coefsN0] "+r" (coefsN),
+ [coefsN1] "+r" (coefsN1),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [lerpP] "r" (lerpP<<16),
+ [lerpN] "r" (lerpN<<16),
+ [vRL] "r" (vRL)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11",
+ "q12", "q14"
+ );
+ l = 2 * mulRL(1, l, vRL);
+ r = l;
+ } else if (CHANNELS == 2) {
+ int32_t const* coefsP1 = coefsP + offset;
+ int32_t const* coefsN1 = coefsN + offset;
+ sP -= CHANNELS*3;
+ asm (
+ "vmov.32 d2[0], %[lerpP] \n" // load the positive phase
+ "vmov.32 d2[1], %[lerpN] \n" // load the negative phase
+ "veor q0, q0 \n" // result, initialize to 0
+ "veor q4, q4 \n" // result, initialize to 0
+
+ "1: \n"
+ "vld2.16 {d4,d5}, [%[sP]] \n" // load 4 16-bits stereo samples
+ "vld1.32 { q8}, [%[coefsP0]]! \n" // load 4 32-bits coefs
+ "vld1.32 { q9}, [%[coefsP1]]! \n" // load 4 32-bits coefs for interpolation
+ "vld2.16 {d6,d7}, [%[sN]]! \n" // load 4 16-bits stereo samples
+ "vld1.32 {q10}, [%[coefsN0]]! \n" // load 4 32-bits coefs
+ "vld1.32 {q11}, [%[coefsN1]]! \n" // load 4 32-bits coefs for interpolation
+
+ "vrev64.16 d4, d4 \n" // reverse 2 frames of the positive side
+ "vrev64.16 d5, d5 \n" // reverse 2 frames of the positive side
+
+ "vsub.s32 q9, q9, q8 \n" // interpolate (step1) 1st set of coefs
+ "vsub.s32 q11, q11, q10 \n" // interpolate (step1) 2nd set of coets
+ "vshll.s16 q12, d4, #15 \n" // extend samples to 31 bits
+ "vshll.s16 q13, d5, #15 \n" // extend samples to 31 bits
+
+ "vqrdmulh.s32 q9, q9, d2[0] \n" // interpolate (step2) 1st set of coefs
+ "vqrdmulh.s32 q11, q11, d2[1] \n" // interpolate (step3) 2nd set of coefs
+ "vshll.s16 q14, d6, #15 \n" // extend samples to 31 bits
+ "vshll.s16 q15, d7, #15 \n" // extend samples to 31 bits
+
+ "vadd.s32 q8, q8, q9 \n" // interpolate (step3) 1st set
+ "vadd.s32 q10, q10, q11 \n" // interpolate (step4) 2nd set
+ "subs %[count], %[count], #4 \n" // update loop counter
+
+ "vqrdmulh.s32 q12, q12, q8 \n" // multiply samples by interpolated coef
+ "vqrdmulh.s32 q13, q13, q8 \n" // multiply samples by interpolated coef
+ "vqrdmulh.s32 q14, q14, q10 \n" // multiply samples by interpolated coef
+ "vqrdmulh.s32 q15, q15, q10 \n" // multiply samples by interpolated coef
+ "sub %[sP], %[sP], #16 \n" // move pointer to next set of samples
+
+ "vadd.s32 q0, q0, q12 \n" // accumulate result
+ "vadd.s32 q4, q4, q13 \n" // accumulate result
+ "vadd.s32 q0, q0, q14 \n" // accumulate result
+ "vadd.s32 q4, q4, q15 \n" // accumulate result
+
+ "bne 1b \n" // loop
+
+ "vpadd.s32 d0, d0, d1 \n" // add all 4 partial sums
+ "vpadd.s32 d8, d8, d9 \n" // add all 4 partial sums
+ "vpadd.s32 d0, d0, d0 \n" // together
+ "vpadd.s32 d8, d8, d8 \n" // together
+
+ "vmov.s32 %[l], d0[0] \n" // save result in ARM register
+ "vmov.s32 %[r], d8[0] \n" // save result in ARM register
+
+ : [l] "=r" (l),
+ [r] "=r" (r),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsP1] "+r" (coefsP1),
+ [coefsN0] "+r" (coefsN),
+ [coefsN1] "+r" (coefsN1),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [lerpP] "r" (lerpP<<16),
+ [lerpN] "r" (lerpN<<16),
+ [vRL] "r" (vRL)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15"
+ );
+ l = 2 * mulRL(1, l, vRL);
+ r = 2 * mulRL(0, r, vRL);
}
- l = 2 * mulRL(1, l, vRL);
- r = 2 * mulRL(0, r, vRL);
}
template<int CHANNELS>
diff --git a/services/audioflinger/test-resample.cpp b/services/audioflinger/test-resample.cpp
index e6d5cbe..3b66530 100644
--- a/services/audioflinger/test-resample.cpp
+++ b/services/audioflinger/test-resample.cpp
@@ -178,7 +178,7 @@ int main(int argc, char* argv[]) {
double y = sin(M_PI * k * t * t);
int16_t yi = floor(y * 32767.0 + 0.5);
for (size_t j=0 ; j<channels ; j++) {
- in[i*channels + j] = yi;
+ in[i*channels + j] = yi / (1+j);
}
}
}