Diffstat (limited to 'services/audioflinger/AudioResamplerSinc.cpp')
-rw-r--r--  services/audioflinger/AudioResamplerSinc.cpp | 316
1 file changed, 165 insertions(+), 151 deletions(-)
diff --git a/services/audioflinger/AudioResamplerSinc.cpp b/services/audioflinger/AudioResamplerSinc.cpp
index 207f26b..e50b192 100644
--- a/services/audioflinger/AudioResamplerSinc.cpp
+++ b/services/audioflinger/AudioResamplerSinc.cpp
@@ -17,6 +17,7 @@
#define LOG_TAG "AudioResamplerSinc"
//#define LOG_NDEBUG 0
+#define __STDC_CONSTANT_MACROS
#include <malloc.h>
#include <string.h>
#include <stdlib.h>
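
Note on the new define above: before C++11, <stdint.h> only exposes the
INT64_C/UINT64_C constant macros to C++ translation units when
__STDC_CONSTANT_MACROS is defined ahead of the first include, so it has to
come before any system header. A minimal sketch of the pattern (the constant
shown is hypothetical, for illustration only):

#define __STDC_CONSTANT_MACROS   // must precede the first <stdint.h> include
#include <stdint.h>

// Without the define above, older C++ toolchains would not provide
// INT64_C here.
static const int64_t kHalfQ32 = INT64_C(0x80000000);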
@@ -37,12 +38,14 @@
#define USE_INLINE_ASSEMBLY (false)
#endif
-#if USE_INLINE_ASSEMBLY && defined(__ARM_NEON__)
-#define USE_NEON (true)
+#if defined(__aarch64__) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define USE_NEON
#else
-#define USE_NEON (false)
+#undef USE_NEON
#endif
+#define UNUSED(x) ((void)(x))
namespace android {
// ----------------------------------------------------------------------------
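
With USE_NEON now a plain define/undef instead of a (true)/(false) constant,
the scalar and vector paths further down are selected by the preprocessor
(#ifndef USE_NEON / #else) rather than by an if (!USE_NEON) that relied on
dead-code elimination, and the same <arm_neon.h> intrinsics compile on both
ARMv7-with-NEON and aarch64. A minimal sketch of the resulting pattern (the
sum4 helper is hypothetical, not part of the patch):

#include <stdint.h>
#if defined(__aarch64__) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define USE_NEON
#else
#undef USE_NEON
#endif

// Hypothetical helper: one function, two bodies, chosen at preprocess time.
static inline int32_t sum4(const int32_t* v) {
#ifdef USE_NEON
    int32x4_t q = vld1q_s32(v);                  // load 4 lanes
    int32x2_t d = vadd_s32(vget_low_s32(q), vget_high_s32(q));
    d = vpadd_s32(d, d);                         // pairwise add: total in both lanes
    return vget_lane_s32(d, 0);
#else
    return v[0] + v[1] + v[2] + v[3];            // portable fallback
#endif
}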
@@ -634,8 +637,8 @@ void AudioResamplerSinc::read(
}
template<int CHANNELS>
-void AudioResamplerSinc::filterCoefficient(
- int32_t* out, uint32_t phase, const int16_t *samples, uint32_t vRL)
+void AudioResamplerSinc::filterCoefficient(int32_t* out, uint32_t phase,
+ const int16_t *samples, uint32_t vRL)
{
// NOTE: be very careful when modifying the code here. register
// pressure is very high and a small change might cause the compiler
@@ -662,160 +665,171 @@ void AudioResamplerSinc::filterCoefficient(
size_t count = offset;
- if (!USE_NEON) {
- int32_t l = 0;
- int32_t r = 0;
- for (size_t i=0 ; i<count ; i++) {
- interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
- sP -= CHANNELS;
- interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
- sN += CHANNELS;
- }
- out[0] += 2 * mulRL(1, l, vRL);
- out[1] += 2 * mulRL(0, r, vRL);
- } else if (CHANNELS == 1) {
+#ifndef USE_NEON
+ int32_t l = 0;
+ int32_t r = 0;
+ for (size_t i=0 ; i<count ; i++) {
+ interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
+ sP -= CHANNELS;
+ interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
+ sN += CHANNELS;
+ }
+ out[0] += 2 * mulRL(1, l, vRL);
+ out[1] += 2 * mulRL(0, r, vRL);
+#else
+ UNUSED(vRL);
+ if (CHANNELS == 1) {
int32_t const* coefsP1 = coefsP + offset;
int32_t const* coefsN1 = coefsN + offset;
sP -= CHANNELS*3;
- asm (
- "vmov.32 d2[0], %[lerpP] \n" // load the positive phase
- "vmov.32 d2[1], %[lerpN] \n" // load the negative phase
- "veor q0, q0, q0 \n" // result, initialize to 0
- "vshl.s32 d2, d2, #16 \n" // convert to 32 bits
-
- "1: \n"
- "vld1.16 { d4}, [%[sP]] \n" // load 4 16-bits stereo samples
- "vld1.32 { q8}, [%[coefsP0]:128]! \n" // load 4 32-bits coefs
- "vld1.32 { q9}, [%[coefsP1]:128]! \n" // load 4 32-bits coefs for interpolation
- "vld1.16 { d6}, [%[sN]]! \n" // load 4 16-bits stereo samples
- "vld1.32 {q10}, [%[coefsN0]:128]! \n" // load 4 32-bits coefs
- "vld1.32 {q11}, [%[coefsN1]:128]! \n" // load 4 32-bits coefs for interpolation
-
- "vrev64.16 d4, d4 \n" // reverse 2 frames of the positive side
-
- "vsub.s32 q9, q9, q8 \n" // interpolate (step1) 1st set of coefs
- "vsub.s32 q11, q11, q10 \n" // interpolate (step1) 2nd set of coets
- "vshll.s16 q12, d4, #15 \n" // extend samples to 31 bits
-
- "vqrdmulh.s32 q9, q9, d2[0] \n" // interpolate (step2) 1st set of coefs
- "vqrdmulh.s32 q11, q11, d2[1] \n" // interpolate (step3) 2nd set of coefs
- "vshll.s16 q14, d6, #15 \n" // extend samples to 31 bits
-
- "vadd.s32 q8, q8, q9 \n" // interpolate (step3) 1st set
- "vadd.s32 q10, q10, q11 \n" // interpolate (step4) 2nd set
- "subs %[count], %[count], #4 \n" // update loop counter
-
- "vqrdmulh.s32 q12, q12, q8 \n" // multiply samples by interpolated coef
- "vqrdmulh.s32 q14, q14, q10 \n" // multiply samples by interpolated coef
- "sub %[sP], %[sP], #8 \n" // move pointer to next set of samples
-
- "vadd.s32 q0, q0, q12 \n" // accumulate result
- "vadd.s32 q0, q0, q14 \n" // accumulate result
-
- "bne 1b \n" // loop
-
- "vld1.s32 {d2}, [%[vLR]] \n" // load volumes
- "vld1.s32 {d3}, %[out] \n" // load the output
- "vpadd.s32 d0, d0, d1 \n" // add all 4 partial sums
- "vpadd.s32 d0, d0, d0 \n" // together
- "vdup.i32 d0, d0[0] \n" // interleave L,R channels
- "vqrdmulh.s32 d0, d0, d2 \n" // apply volume
- "vadd.s32 d3, d3, d0 \n" // accumulate result
- "vst1.s32 {d3}, %[out] \n" // store result
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsP1] "+r" (coefsP1),
- [coefsN0] "+r" (coefsN),
- [coefsN1] "+r" (coefsN1),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [lerpP] "r" (lerpP),
- [lerpN] "r" (lerpN),
- [vLR] "r" (mVolumeSIMD)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11",
- "q12", "q14"
- );
+
+ int32x4_t sum;
+ int32x2_t lerpPN;
+ lerpPN = vdup_n_s32(0);
+ lerpPN = vld1_lane_s32((int32_t *)&lerpP, lerpPN, 0);
+ lerpPN = vld1_lane_s32((int32_t *)&lerpN, lerpPN, 1);
+ lerpPN = vshl_n_s32(lerpPN, 16);
+ sum = vdupq_n_s32(0);
+
+ int16x4_t sampleP, sampleN;
+ int32x4_t samplePExt, sampleNExt;
+ int32x4_t coefsPV0, coefsPV1, coefsNV0, coefsNV1;
+
+ coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
+ coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
+ coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
+ coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
+ for (; count > 0; count -= 4) {
+ sampleP = vld1_s16(sP);
+ sampleN = vld1_s16(sN);
+ coefsPV0 = vld1q_s32(coefsP);
+ coefsNV0 = vld1q_s32(coefsN);
+ coefsPV1 = vld1q_s32(coefsP1);
+ coefsNV1 = vld1q_s32(coefsN1);
+ sP -= 4;
+ sN += 4;
+ coefsP += 4;
+ coefsN += 4;
+ coefsP1 += 4;
+ coefsN1 += 4;
+
+ sampleP = vrev64_s16(sampleP);
+
+ // interpolate (step1)
+ coefsPV1 = vsubq_s32(coefsPV1, coefsPV0);
+ coefsNV1 = vsubq_s32(coefsNV1, coefsNV0);
+ samplePExt = vshll_n_s16(sampleP, 15);
+ // interpolate (step2)
+ coefsPV1 = vqrdmulhq_lane_s32(coefsPV1, lerpPN, 0);
+ coefsNV1 = vqrdmulhq_lane_s32(coefsNV1, lerpPN, 1);
+ sampleNExt = vshll_n_s16(sampleN, 15);
+ // interpolate (step3)
+ coefsPV0 = vaddq_s32(coefsPV0, coefsPV1);
+ coefsNV0 = vaddq_s32(coefsNV0, coefsNV1);
+
+ samplePExt = vqrdmulhq_s32(samplePExt, coefsPV0);
+ sampleNExt = vqrdmulhq_s32(sampleNExt, coefsNV0);
+ sum = vaddq_s32(sum, samplePExt);
+ sum = vaddq_s32(sum, sampleNExt);
+ }
+ int32x2_t volumesV, outV;
+ volumesV = vld1_s32(mVolumeSIMD);
+ outV = vld1_s32(out);
+
+    // add all 4 partial sums
+ int32x2_t sumLow, sumHigh;
+ sumLow = vget_low_s32(sum);
+ sumHigh = vget_high_s32(sum);
+ sumLow = vpadd_s32(sumLow, sumHigh);
+ sumLow = vpadd_s32(sumLow, sumLow);
+
+ sumLow = vqrdmulh_s32(sumLow, volumesV);
+ outV = vadd_s32(outV, sumLow);
+ vst1_s32(out, outV);
} else if (CHANNELS == 2) {
int32_t const* coefsP1 = coefsP + offset;
int32_t const* coefsN1 = coefsN + offset;
sP -= CHANNELS*3;
- asm (
- "vmov.32 d2[0], %[lerpP] \n" // load the positive phase
- "vmov.32 d2[1], %[lerpN] \n" // load the negative phase
- "veor q0, q0, q0 \n" // result, initialize to 0
- "veor q4, q4, q4 \n" // result, initialize to 0
- "vshl.s32 d2, d2, #16 \n" // convert to 32 bits
-
- "1: \n"
- "vld2.16 {d4,d5}, [%[sP]] \n" // load 4 16-bits stereo samples
- "vld1.32 { q8}, [%[coefsP0]:128]! \n" // load 4 32-bits coefs
- "vld1.32 { q9}, [%[coefsP1]:128]! \n" // load 4 32-bits coefs for interpolation
- "vld2.16 {d6,d7}, [%[sN]]! \n" // load 4 16-bits stereo samples
- "vld1.32 {q10}, [%[coefsN0]:128]! \n" // load 4 32-bits coefs
- "vld1.32 {q11}, [%[coefsN1]:128]! \n" // load 4 32-bits coefs for interpolation
-
- "vrev64.16 d4, d4 \n" // reverse 2 frames of the positive side
- "vrev64.16 d5, d5 \n" // reverse 2 frames of the positive side
-
- "vsub.s32 q9, q9, q8 \n" // interpolate (step1) 1st set of coefs
- "vsub.s32 q11, q11, q10 \n" // interpolate (step1) 2nd set of coets
- "vshll.s16 q12, d4, #15 \n" // extend samples to 31 bits
- "vshll.s16 q13, d5, #15 \n" // extend samples to 31 bits
-
- "vqrdmulh.s32 q9, q9, d2[0] \n" // interpolate (step2) 1st set of coefs
- "vqrdmulh.s32 q11, q11, d2[1] \n" // interpolate (step3) 2nd set of coefs
- "vshll.s16 q14, d6, #15 \n" // extend samples to 31 bits
- "vshll.s16 q15, d7, #15 \n" // extend samples to 31 bits
-
- "vadd.s32 q8, q8, q9 \n" // interpolate (step3) 1st set
- "vadd.s32 q10, q10, q11 \n" // interpolate (step4) 2nd set
- "subs %[count], %[count], #4 \n" // update loop counter
-
- "vqrdmulh.s32 q12, q12, q8 \n" // multiply samples by interpolated coef
- "vqrdmulh.s32 q13, q13, q8 \n" // multiply samples by interpolated coef
- "vqrdmulh.s32 q14, q14, q10 \n" // multiply samples by interpolated coef
- "vqrdmulh.s32 q15, q15, q10 \n" // multiply samples by interpolated coef
- "sub %[sP], %[sP], #16 \n" // move pointer to next set of samples
-
- "vadd.s32 q0, q0, q12 \n" // accumulate result
- "vadd.s32 q4, q4, q13 \n" // accumulate result
- "vadd.s32 q0, q0, q14 \n" // accumulate result
- "vadd.s32 q4, q4, q15 \n" // accumulate result
-
- "bne 1b \n" // loop
-
- "vld1.s32 {d2}, [%[vLR]] \n" // load volumes
- "vld1.s32 {d3}, %[out] \n" // load the output
- "vpadd.s32 d0, d0, d1 \n" // add all 4 partial sums from q0
- "vpadd.s32 d8, d8, d9 \n" // add all 4 partial sums from q4
- "vpadd.s32 d0, d0, d0 \n" // together
- "vpadd.s32 d8, d8, d8 \n" // together
- "vtrn.s32 d0, d8 \n" // interlace L,R channels
- "vqrdmulh.s32 d0, d0, d2 \n" // apply volume
- "vadd.s32 d3, d3, d0 \n" // accumulate result
- "vst1.s32 {d3}, %[out] \n" // store result
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsP1] "+r" (coefsP1),
- [coefsN0] "+r" (coefsN),
- [coefsN1] "+r" (coefsN1),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [lerpP] "r" (lerpP),
- [lerpN] "r" (lerpN),
- [vLR] "r" (mVolumeSIMD)
- : "cc", "memory",
- "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15"
- );
+
+ int32x4_t sum0, sum1;
+ int32x2_t lerpPN;
+
+ lerpPN = vdup_n_s32(0);
+ lerpPN = vld1_lane_s32((int32_t *)&lerpP, lerpPN, 0);
+ lerpPN = vld1_lane_s32((int32_t *)&lerpN, lerpPN, 1);
+ lerpPN = vshl_n_s32(lerpPN, 16);
+ sum0 = vdupq_n_s32(0);
+ sum1 = vdupq_n_s32(0);
+
+ int16x4x2_t sampleP, sampleN;
+ int32x4x2_t samplePExt, sampleNExt;
+ int32x4_t coefsPV0, coefsPV1, coefsNV0, coefsNV1;
+
+ coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
+ coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
+ coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
+ coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
+ for (; count > 0; count -= 4) {
+ sampleP = vld2_s16(sP);
+ sampleN = vld2_s16(sN);
+ coefsPV0 = vld1q_s32(coefsP);
+ coefsNV0 = vld1q_s32(coefsN);
+ coefsPV1 = vld1q_s32(coefsP1);
+ coefsNV1 = vld1q_s32(coefsN1);
+ sP -= 8;
+ sN += 8;
+ coefsP += 4;
+ coefsN += 4;
+ coefsP1 += 4;
+ coefsN1 += 4;
+
+ sampleP.val[0] = vrev64_s16(sampleP.val[0]);
+ sampleP.val[1] = vrev64_s16(sampleP.val[1]);
+
+ // interpolate (step1)
+ coefsPV1 = vsubq_s32(coefsPV1, coefsPV0);
+ coefsNV1 = vsubq_s32(coefsNV1, coefsNV0);
+ samplePExt.val[0] = vshll_n_s16(sampleP.val[0], 15);
+ samplePExt.val[1] = vshll_n_s16(sampleP.val[1], 15);
+ // interpolate (step2)
+ coefsPV1 = vqrdmulhq_lane_s32(coefsPV1, lerpPN, 0);
+ coefsNV1 = vqrdmulhq_lane_s32(coefsNV1, lerpPN, 1);
+ sampleNExt.val[0] = vshll_n_s16(sampleN.val[0], 15);
+ sampleNExt.val[1] = vshll_n_s16(sampleN.val[1], 15);
+ // interpolate (step3)
+ coefsPV0 = vaddq_s32(coefsPV0, coefsPV1);
+ coefsNV0 = vaddq_s32(coefsNV0, coefsNV1);
+
+ samplePExt.val[0] = vqrdmulhq_s32(samplePExt.val[0], coefsPV0);
+ samplePExt.val[1] = vqrdmulhq_s32(samplePExt.val[1], coefsPV0);
+ sampleNExt.val[0] = vqrdmulhq_s32(sampleNExt.val[0], coefsNV0);
+ sampleNExt.val[1] = vqrdmulhq_s32(sampleNExt.val[1], coefsNV0);
+ sum0 = vaddq_s32(sum0, samplePExt.val[0]);
+ sum1 = vaddq_s32(sum1, samplePExt.val[1]);
+ sum0 = vaddq_s32(sum0, sampleNExt.val[0]);
+ sum1 = vaddq_s32(sum1, sampleNExt.val[1]);
+ }
+ int32x2_t volumesV, outV;
+ volumesV = vld1_s32(mVolumeSIMD);
+ outV = vld1_s32(out);
+
+    // add all 4 partial sums
+ int32x2_t sumLow0, sumHigh0, sumLow1, sumHigh1;
+ sumLow0 = vget_low_s32(sum0);
+ sumHigh0 = vget_high_s32(sum0);
+ sumLow1 = vget_low_s32(sum1);
+ sumHigh1 = vget_high_s32(sum1);
+ sumLow0 = vpadd_s32(sumLow0, sumHigh0);
+ sumLow0 = vpadd_s32(sumLow0, sumLow0);
+ sumLow1 = vpadd_s32(sumLow1, sumHigh1);
+ sumLow1 = vpadd_s32(sumLow1, sumLow1);
+
+ sumLow0 = vtrn_s32(sumLow0, sumLow1).val[0];
+ sumLow0 = vqrdmulh_s32(sumLow0, volumesV);
+ outV = vadd_s32(outV, sumLow0);
+ vst1_s32(out, outV);
}
+#endif
}
template<int CHANNELS>
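
For reference, the intrinsics above reproduce the fixed-point scheme of the
old assembly: coefficients are linearly interpolated with the vqrdmulh
rounding-doubling high multiply, using a phase fraction pre-shifted toward
the high bits (the vshl_n_s32(lerpPN, 16) step), and then applied to samples
widened by vshll_n_s16(x, 15). A scalar sketch of those two operations,
assuming 64-bit intermediates (helper names are hypothetical):

#include <stdint.h>

// What vqrdmulh_s32 computes per lane: saturating, rounding, doubling
// multiply returning the high 32 bits of the product.
static inline int32_t qrdmulh_s32(int32_t a, int32_t b) {
    if (a == INT32_MIN && b == INT32_MIN) {
        return INT32_MAX;                    // the one saturating case
    }
    int64_t p = ((int64_t)a * b) << 1;       // doubled 64-bit product
    p += 1LL << 31;                          // round to nearest
    return (int32_t)(p >> 32);               // keep the high word
}

// Steps 1-3 of the loop body: c = c0 + (c1 - c0) * lerp. The subtraction
// wraps on overflow, exactly as vsubq_s32 does.
static inline int32_t interp_coef(int32_t c0, int32_t c1, int32_t lerp) {
    return c0 + qrdmulh_s32(c1 - c0, lerp);
}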