summaryrefslogtreecommitdiffstats
path: root/services/audioflinger
diff options
context:
space:
mode:
authorZhongwei Yao <zhongwei.yao@arm.com>2014-04-10 17:23:42 +0100
committerZheng Xu <zheng.xu@arm.com>2014-04-10 18:28:46 +0100
commit12b44bd5fe3069cd3450d05b6c446b600e0553d3 (patch)
treed7c3ff6a66a682520248b3b39e68a470968e3663 /services/audioflinger
parent51b3ea2b8f7298bffbd246c65a606cd053357b66 (diff)
downloadframeworks_av-12b44bd5fe3069cd3450d05b6c446b600e0553d3.zip
frameworks_av-12b44bd5fe3069cd3450d05b6c446b600e0553d3.tar.gz
frameworks_av-12b44bd5fe3069cd3450d05b6c446b600e0553d3.tar.bz2
AArch64: rewrite audioflinger's sinc resampler using intrinsics.
Passes the conformance test on armv7 and aarch64; the performance test was done on armv7. Compared with the original armv7 assembly version, this version gives similar results. Performance data on Pandaboard, Android 4.4 (input: random wave file; unit: Mspl/s; toolchain: gcc 4.8):

|                | origin (assembly) | current (intrinsics) | C version |
|----------------|-------------------|----------------------|-----------|
| single channel |              6.17 |                 7.14 |      3.43 |
| double channel |              5.24 |                 5.63 |      3.50 |

Change-Id: If5670218e1586e9dfd2b8d9c66a6880f3e4808ca
Diffstat (limited to 'services/audioflinger')
-rw-r--r--services/audioflinger/AudioResamplerSinc.cpp316
1 files changed, 165 insertions, 151 deletions
diff --git a/services/audioflinger/AudioResamplerSinc.cpp b/services/audioflinger/AudioResamplerSinc.cpp
index 207f26b..e50b192 100644
--- a/services/audioflinger/AudioResamplerSinc.cpp
+++ b/services/audioflinger/AudioResamplerSinc.cpp
@@ -17,6 +17,7 @@
#define LOG_TAG "AudioResamplerSinc"
//#define LOG_NDEBUG 0
+#define __STDC_CONSTANT_MACROS
#include <malloc.h>
#include <string.h>
#include <stdlib.h>
@@ -37,12 +38,14 @@
#define USE_INLINE_ASSEMBLY (false)
#endif
-#if USE_INLINE_ASSEMBLY && defined(__ARM_NEON__)
-#define USE_NEON (true)
+#if defined(__aarch64__) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define USE_NEON
#else
-#define USE_NEON (false)
+#undef USE_NEON
#endif
+#define UNUSED(x) ((void)(x))
namespace android {
// ----------------------------------------------------------------------------
@@ -634,8 +637,8 @@ void AudioResamplerSinc::read(
}
template<int CHANNELS>
-void AudioResamplerSinc::filterCoefficient(
- int32_t* out, uint32_t phase, const int16_t *samples, uint32_t vRL)
+void AudioResamplerSinc::filterCoefficient(int32_t* out, uint32_t phase,
+ const int16_t *samples, uint32_t vRL)
{
// NOTE: be very careful when modifying the code here. register
// pressure is very high and a small change might cause the compiler
@@ -662,160 +665,171 @@ void AudioResamplerSinc::filterCoefficient(
size_t count = offset;
- if (!USE_NEON) {
- int32_t l = 0;
- int32_t r = 0;
- for (size_t i=0 ; i<count ; i++) {
- interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
- sP -= CHANNELS;
- interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
- sN += CHANNELS;
- }
- out[0] += 2 * mulRL(1, l, vRL);
- out[1] += 2 * mulRL(0, r, vRL);
- } else if (CHANNELS == 1) {
+#ifndef USE_NEON
+ int32_t l = 0;
+ int32_t r = 0;
+ for (size_t i=0 ; i<count ; i++) {
+ interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
+ sP -= CHANNELS;
+ interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
+ sN += CHANNELS;
+ }
+ out[0] += 2 * mulRL(1, l, vRL);
+ out[1] += 2 * mulRL(0, r, vRL);
+#else
+ UNUSED(vRL);
+ if (CHANNELS == 1) {
int32_t const* coefsP1 = coefsP + offset;
int32_t const* coefsN1 = coefsN + offset;
sP -= CHANNELS*3;
- asm (
- "vmov.32 d2[0], %[lerpP] \n" // load the positive phase
- "vmov.32 d2[1], %[lerpN] \n" // load the negative phase
- "veor q0, q0, q0 \n" // result, initialize to 0
- "vshl.s32 d2, d2, #16 \n" // convert to 32 bits
-
- "1: \n"
- "vld1.16 { d4}, [%[sP]] \n" // load 4 16-bits stereo samples
- "vld1.32 { q8}, [%[coefsP0]:128]! \n" // load 4 32-bits coefs
- "vld1.32 { q9}, [%[coefsP1]:128]! \n" // load 4 32-bits coefs for interpolation
- "vld1.16 { d6}, [%[sN]]! \n" // load 4 16-bits stereo samples
- "vld1.32 {q10}, [%[coefsN0]:128]! \n" // load 4 32-bits coefs
- "vld1.32 {q11}, [%[coefsN1]:128]! \n" // load 4 32-bits coefs for interpolation
-
- "vrev64.16 d4, d4 \n" // reverse 2 frames of the positive side
-
- "vsub.s32 q9, q9, q8 \n" // interpolate (step1) 1st set of coefs
- "vsub.s32 q11, q11, q10 \n" // interpolate (step1) 2nd set of coets
- "vshll.s16 q12, d4, #15 \n" // extend samples to 31 bits
-
- "vqrdmulh.s32 q9, q9, d2[0] \n" // interpolate (step2) 1st set of coefs
- "vqrdmulh.s32 q11, q11, d2[1] \n" // interpolate (step3) 2nd set of coefs
- "vshll.s16 q14, d6, #15 \n" // extend samples to 31 bits
-
- "vadd.s32 q8, q8, q9 \n" // interpolate (step3) 1st set
- "vadd.s32 q10, q10, q11 \n" // interpolate (step4) 2nd set
- "subs %[count], %[count], #4 \n" // update loop counter
-
- "vqrdmulh.s32 q12, q12, q8 \n" // multiply samples by interpolated coef
- "vqrdmulh.s32 q14, q14, q10 \n" // multiply samples by interpolated coef
- "sub %[sP], %[sP], #8 \n" // move pointer to next set of samples
-
- "vadd.s32 q0, q0, q12 \n" // accumulate result
- "vadd.s32 q0, q0, q14 \n" // accumulate result
-
- "bne 1b \n" // loop
-
- "vld1.s32 {d2}, [%[vLR]] \n" // load volumes
- "vld1.s32 {d3}, %[out] \n" // load the output
- "vpadd.s32 d0, d0, d1 \n" // add all 4 partial sums
- "vpadd.s32 d0, d0, d0 \n" // together
- "vdup.i32 d0, d0[0] \n" // interleave L,R channels
- "vqrdmulh.s32 d0, d0, d2 \n" // apply volume
- "vadd.s32 d3, d3, d0 \n" // accumulate result
- "vst1.s32 {d3}, %[out] \n" // store result
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsP1] "+r" (coefsP1),
- [coefsN0] "+r" (coefsN),
- [coefsN1] "+r" (coefsN1),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [lerpP] "r" (lerpP),
- [lerpN] "r" (lerpN),
- [vLR] "r" (mVolumeSIMD)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11",
- "q12", "q14"
- );
+
+ int32x4_t sum;
+ int32x2_t lerpPN;
+ lerpPN = vdup_n_s32(0);
+ lerpPN = vld1_lane_s32((int32_t *)&lerpP, lerpPN, 0);
+ lerpPN = vld1_lane_s32((int32_t *)&lerpN, lerpPN, 1);
+ lerpPN = vshl_n_s32(lerpPN, 16);
+ sum = vdupq_n_s32(0);
+
+ int16x4_t sampleP, sampleN;
+ int32x4_t samplePExt, sampleNExt;
+ int32x4_t coefsPV0, coefsPV1, coefsNV0, coefsNV1;
+
+ coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
+ coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
+ coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
+ coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
+ for (; count > 0; count -= 4) {
+ sampleP = vld1_s16(sP);
+ sampleN = vld1_s16(sN);
+ coefsPV0 = vld1q_s32(coefsP);
+ coefsNV0 = vld1q_s32(coefsN);
+ coefsPV1 = vld1q_s32(coefsP1);
+ coefsNV1 = vld1q_s32(coefsN1);
+ sP -= 4;
+ sN += 4;
+ coefsP += 4;
+ coefsN += 4;
+ coefsP1 += 4;
+ coefsN1 += 4;
+
+ sampleP = vrev64_s16(sampleP);
+
+ // interpolate (step1)
+ coefsPV1 = vsubq_s32(coefsPV1, coefsPV0);
+ coefsNV1 = vsubq_s32(coefsNV1, coefsNV0);
+ samplePExt = vshll_n_s16(sampleP, 15);
+ // interpolate (step2)
+ coefsPV1 = vqrdmulhq_lane_s32(coefsPV1, lerpPN, 0);
+ coefsNV1 = vqrdmulhq_lane_s32(coefsNV1, lerpPN, 1);
+ sampleNExt = vshll_n_s16(sampleN, 15);
+ // interpolate (step3)
+ coefsPV0 = vaddq_s32(coefsPV0, coefsPV1);
+ coefsNV0 = vaddq_s32(coefsNV0, coefsNV1);
+
+ samplePExt = vqrdmulhq_s32(samplePExt, coefsPV0);
+ sampleNExt = vqrdmulhq_s32(sampleNExt, coefsNV0);
+ sum = vaddq_s32(sum, samplePExt);
+ sum = vaddq_s32(sum, sampleNExt);
+ }
+ int32x2_t volumesV, outV;
+ volumesV = vld1_s32(mVolumeSIMD);
+ outV = vld1_s32(out);
+
+ //add all 4 partial sums
+ int32x2_t sumLow, sumHigh;
+ sumLow = vget_low_s32(sum);
+ sumHigh = vget_high_s32(sum);
+ sumLow = vpadd_s32(sumLow, sumHigh);
+ sumLow = vpadd_s32(sumLow, sumLow);
+
+ sumLow = vqrdmulh_s32(sumLow, volumesV);
+ outV = vadd_s32(outV, sumLow);
+ vst1_s32(out, outV);
} else if (CHANNELS == 2) {
int32_t const* coefsP1 = coefsP + offset;
int32_t const* coefsN1 = coefsN + offset;
sP -= CHANNELS*3;
- asm (
- "vmov.32 d2[0], %[lerpP] \n" // load the positive phase
- "vmov.32 d2[1], %[lerpN] \n" // load the negative phase
- "veor q0, q0, q0 \n" // result, initialize to 0
- "veor q4, q4, q4 \n" // result, initialize to 0
- "vshl.s32 d2, d2, #16 \n" // convert to 32 bits
-
- "1: \n"
- "vld2.16 {d4,d5}, [%[sP]] \n" // load 4 16-bits stereo samples
- "vld1.32 { q8}, [%[coefsP0]:128]! \n" // load 4 32-bits coefs
- "vld1.32 { q9}, [%[coefsP1]:128]! \n" // load 4 32-bits coefs for interpolation
- "vld2.16 {d6,d7}, [%[sN]]! \n" // load 4 16-bits stereo samples
- "vld1.32 {q10}, [%[coefsN0]:128]! \n" // load 4 32-bits coefs
- "vld1.32 {q11}, [%[coefsN1]:128]! \n" // load 4 32-bits coefs for interpolation
-
- "vrev64.16 d4, d4 \n" // reverse 2 frames of the positive side
- "vrev64.16 d5, d5 \n" // reverse 2 frames of the positive side
-
- "vsub.s32 q9, q9, q8 \n" // interpolate (step1) 1st set of coefs
- "vsub.s32 q11, q11, q10 \n" // interpolate (step1) 2nd set of coets
- "vshll.s16 q12, d4, #15 \n" // extend samples to 31 bits
- "vshll.s16 q13, d5, #15 \n" // extend samples to 31 bits
-
- "vqrdmulh.s32 q9, q9, d2[0] \n" // interpolate (step2) 1st set of coefs
- "vqrdmulh.s32 q11, q11, d2[1] \n" // interpolate (step3) 2nd set of coefs
- "vshll.s16 q14, d6, #15 \n" // extend samples to 31 bits
- "vshll.s16 q15, d7, #15 \n" // extend samples to 31 bits
-
- "vadd.s32 q8, q8, q9 \n" // interpolate (step3) 1st set
- "vadd.s32 q10, q10, q11 \n" // interpolate (step4) 2nd set
- "subs %[count], %[count], #4 \n" // update loop counter
-
- "vqrdmulh.s32 q12, q12, q8 \n" // multiply samples by interpolated coef
- "vqrdmulh.s32 q13, q13, q8 \n" // multiply samples by interpolated coef
- "vqrdmulh.s32 q14, q14, q10 \n" // multiply samples by interpolated coef
- "vqrdmulh.s32 q15, q15, q10 \n" // multiply samples by interpolated coef
- "sub %[sP], %[sP], #16 \n" // move pointer to next set of samples
-
- "vadd.s32 q0, q0, q12 \n" // accumulate result
- "vadd.s32 q4, q4, q13 \n" // accumulate result
- "vadd.s32 q0, q0, q14 \n" // accumulate result
- "vadd.s32 q4, q4, q15 \n" // accumulate result
-
- "bne 1b \n" // loop
-
- "vld1.s32 {d2}, [%[vLR]] \n" // load volumes
- "vld1.s32 {d3}, %[out] \n" // load the output
- "vpadd.s32 d0, d0, d1 \n" // add all 4 partial sums from q0
- "vpadd.s32 d8, d8, d9 \n" // add all 4 partial sums from q4
- "vpadd.s32 d0, d0, d0 \n" // together
- "vpadd.s32 d8, d8, d8 \n" // together
- "vtrn.s32 d0, d8 \n" // interlace L,R channels
- "vqrdmulh.s32 d0, d0, d2 \n" // apply volume
- "vadd.s32 d3, d3, d0 \n" // accumulate result
- "vst1.s32 {d3}, %[out] \n" // store result
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsP1] "+r" (coefsP1),
- [coefsN0] "+r" (coefsN),
- [coefsN1] "+r" (coefsN1),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [lerpP] "r" (lerpP),
- [lerpN] "r" (lerpN),
- [vLR] "r" (mVolumeSIMD)
- : "cc", "memory",
- "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15"
- );
+
+ int32x4_t sum0, sum1;
+ int32x2_t lerpPN;
+
+ lerpPN = vdup_n_s32(0);
+ lerpPN = vld1_lane_s32((int32_t *)&lerpP, lerpPN, 0);
+ lerpPN = vld1_lane_s32((int32_t *)&lerpN, lerpPN, 1);
+ lerpPN = vshl_n_s32(lerpPN, 16);
+ sum0 = vdupq_n_s32(0);
+ sum1 = vdupq_n_s32(0);
+
+ int16x4x2_t sampleP, sampleN;
+ int32x4x2_t samplePExt, sampleNExt;
+ int32x4_t coefsPV0, coefsPV1, coefsNV0, coefsNV1;
+
+ coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
+ coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
+ coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
+ coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
+ for (; count > 0; count -= 4) {
+ sampleP = vld2_s16(sP);
+ sampleN = vld2_s16(sN);
+ coefsPV0 = vld1q_s32(coefsP);
+ coefsNV0 = vld1q_s32(coefsN);
+ coefsPV1 = vld1q_s32(coefsP1);
+ coefsNV1 = vld1q_s32(coefsN1);
+ sP -= 8;
+ sN += 8;
+ coefsP += 4;
+ coefsN += 4;
+ coefsP1 += 4;
+ coefsN1 += 4;
+
+ sampleP.val[0] = vrev64_s16(sampleP.val[0]);
+ sampleP.val[1] = vrev64_s16(sampleP.val[1]);
+
+ // interpolate (step1)
+ coefsPV1 = vsubq_s32(coefsPV1, coefsPV0);
+ coefsNV1 = vsubq_s32(coefsNV1, coefsNV0);
+ samplePExt.val[0] = vshll_n_s16(sampleP.val[0], 15);
+ samplePExt.val[1] = vshll_n_s16(sampleP.val[1], 15);
+ // interpolate (step2)
+ coefsPV1 = vqrdmulhq_lane_s32(coefsPV1, lerpPN, 0);
+ coefsNV1 = vqrdmulhq_lane_s32(coefsNV1, lerpPN, 1);
+ sampleNExt.val[0] = vshll_n_s16(sampleN.val[0], 15);
+ sampleNExt.val[1] = vshll_n_s16(sampleN.val[1], 15);
+ // interpolate (step3)
+ coefsPV0 = vaddq_s32(coefsPV0, coefsPV1);
+ coefsNV0 = vaddq_s32(coefsNV0, coefsNV1);
+
+ samplePExt.val[0] = vqrdmulhq_s32(samplePExt.val[0], coefsPV0);
+ samplePExt.val[1] = vqrdmulhq_s32(samplePExt.val[1], coefsPV0);
+ sampleNExt.val[0] = vqrdmulhq_s32(sampleNExt.val[0], coefsNV0);
+ sampleNExt.val[1] = vqrdmulhq_s32(sampleNExt.val[1], coefsNV0);
+ sum0 = vaddq_s32(sum0, samplePExt.val[0]);
+ sum1 = vaddq_s32(sum1, samplePExt.val[1]);
+ sum0 = vaddq_s32(sum0, sampleNExt.val[0]);
+ sum1 = vaddq_s32(sum1, sampleNExt.val[1]);
+ }
+ int32x2_t volumesV, outV;
+ volumesV = vld1_s32(mVolumeSIMD);
+ outV = vld1_s32(out);
+
+ //add all 4 partial sums
+ int32x2_t sumLow0, sumHigh0, sumLow1, sumHigh1;
+ sumLow0 = vget_low_s32(sum0);
+ sumHigh0 = vget_high_s32(sum0);
+ sumLow1 = vget_low_s32(sum1);
+ sumHigh1 = vget_high_s32(sum1);
+ sumLow0 = vpadd_s32(sumLow0, sumHigh0);
+ sumLow0 = vpadd_s32(sumLow0, sumLow0);
+ sumLow1 = vpadd_s32(sumLow1, sumHigh1);
+ sumLow1 = vpadd_s32(sumLow1, sumLow1);
+
+ sumLow0 = vtrn_s32(sumLow0, sumLow1).val[0];
+ sumLow0 = vqrdmulh_s32(sumLow0, volumesV);
+ outV = vadd_s32(outV, sumLow0);
+ vst1_s32(out, outV);
}
+#endif
}
template<int CHANNELS>