summaryrefslogtreecommitdiffstats
path: root/services/audioflinger/AudioResamplerFirProcessNeon.h
diff options
context:
space:
mode:
Diffstat (limited to 'services/audioflinger/AudioResamplerFirProcessNeon.h')
-rw-r--r--services/audioflinger/AudioResamplerFirProcessNeon.h1137
1 files changed, 601 insertions, 536 deletions
diff --git a/services/audioflinger/AudioResamplerFirProcessNeon.h b/services/audioflinger/AudioResamplerFirProcessNeon.h
index f311cef..3de9edd 100644
--- a/services/audioflinger/AudioResamplerFirProcessNeon.h
+++ b/services/audioflinger/AudioResamplerFirProcessNeon.h
@@ -22,14 +22,35 @@ namespace android {
// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
#if USE_NEON
+
+// use intrinsics if inline arm32 assembly is not possible
+#if !USE_INLINE_ASSEMBLY
+#define USE_INTRINSIC
+#endif
+
+// following intrinsics available only on ARM 64 bit ACLE
+#ifndef __aarch64__
+#undef vld1q_f32_x2
+#undef vld1q_s32_x2
+#endif
+
+#define TO_STRING2(x) #x
+#define TO_STRING(x) TO_STRING2(x)
+// uncomment to print GCC version, may be relevant for intrinsic optimizations
+/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
+ "." TO_STRING(__GNUC_MINOR__) \
+ "." TO_STRING(__GNUC_PATCHLEVEL__)) */
+
//
-// NEON specializations are enabled for Process() and ProcessL()
+// NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
+//
+// Two variants are presented here:
+// ARM NEON inline assembly which appears up to 10-15% faster than intrinsics (gcc 4.9) for arm32.
+// ARM NEON intrinsics which can also be used by arm64 and x86/64 with NEON header.
//
-// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
-// and looping stride 16 (or vice versa). This has some polyphase coef data alignment
-// issues with S16 coefs. Consider this later.
// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
+// These are only used for inline assembly.
#define ASSEMBLY_ACCUMULATE_MONO \
"vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes */\
"vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output */\
@@ -49,6 +70,458 @@ namespace android {
"vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating)*/\
"vst1.s32 {d3}, %[out] \n"/* (2+2d)store result*/
+template <int CHANNELS, int STRIDE, bool FIXED>
+static inline void ProcessNeonIntrinsic(int32_t* out,
+ int count,
+ const int16_t* coefsP,
+ const int16_t* coefsN,
+ const int16_t* sP,
+ const int16_t* sN,
+ const int32_t* volumeLR,
+ uint32_t lerpP,
+ const int16_t* coefsP1,
+ const int16_t* coefsN1)
+{
+ ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
+ COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2);
+
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16);
+ coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16);
+
+ int16x4_t interp;
+ if (!FIXED) {
+ interp = vdup_n_s16(lerpP);
+ //interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0);
+ coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16);
+ coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16);
+ }
+ int32x4_t accum, accum2;
+ // warning uninitialized if we use veorq_s32
+ // (alternative to below) accum = veorq_s32(accum, accum);
+ accum = vdupq_n_s32(0);
+ if (CHANNELS == 2) {
+ // (alternative to below) accum2 = veorq_s32(accum2, accum2);
+ accum2 = vdupq_n_s32(0);
+ }
+ do {
+ int16x8_t posCoef = vld1q_s16(coefsP);
+ coefsP += 8;
+ int16x8_t negCoef = vld1q_s16(coefsN);
+ coefsN += 8;
+ if (!FIXED) { // interpolate
+ int16x8_t posCoef1 = vld1q_s16(coefsP1);
+ coefsP1 += 8;
+ int16x8_t negCoef1 = vld1q_s16(coefsN1);
+ coefsN1 += 8;
+
+ posCoef1 = vsubq_s16(posCoef1, posCoef);
+ negCoef = vsubq_s16(negCoef, negCoef1);
+
+ posCoef1 = vqrdmulhq_lane_s16(posCoef1, interp, 0);
+ negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0);
+
+ posCoef = vaddq_s16(posCoef, posCoef1);
+ negCoef = vaddq_s16(negCoef, negCoef1);
+ }
+ switch (CHANNELS) {
+ case 1: {
+ int16x8_t posSamp = vld1q_s16(sP);
+ int16x8_t negSamp = vld1q_s16(sN);
+ sN += 8;
+ posSamp = vrev64q_s16(posSamp);
+
+ // dot product
+ accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed
+ accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed
+ accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef));
+ accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef));
+ sP -= 8;
+ } break;
+ case 2: {
+ int16x8x2_t posSamp = vld2q_s16(sP);
+ int16x8x2_t negSamp = vld2q_s16(sN);
+ sN += 16;
+ posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
+ posSamp.val[1] = vrev64q_s16(posSamp.val[1]);
+
+ // dot product
+ accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r
+ accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r
+ accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r
+ accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r
+ accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef));
+ accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef));
+ accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef));
+ accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef));
+ sP -= 16;
+ }
+ } break;
+ } while (count -= 8);
+
+ // multiply by volume and save
+ volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
+ int32x2_t vLR = vld1_s32(volumeLR);
+ int32x2_t outSamp = vld1_s32(out);
+ // combine and funnel down accumulator
+ int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ if (CHANNELS == 1) {
+ // duplicate accum to both L and R
+ outAccum = vpadd_s32(outAccum, outAccum);
+ } else if (CHANNELS == 2) {
+ // accum2 contains R, fold in
+ int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
+ outAccum = vpadd_s32(outAccum, outAccum2);
+ }
+ outAccum = vqrdmulh_s32(outAccum, vLR);
+ outSamp = vqadd_s32(outSamp, outAccum);
+ vst1_s32(out, outSamp);
+}
+
+template <int CHANNELS, int STRIDE, bool FIXED>
+static inline void ProcessNeonIntrinsic(int32_t* out,
+ int count,
+ const int32_t* coefsP,
+ const int32_t* coefsN,
+ const int16_t* sP,
+ const int16_t* sN,
+ const int32_t* volumeLR,
+ uint32_t lerpP,
+ const int32_t* coefsP1,
+ const int32_t* coefsN1)
+{
+ ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
+ COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2);
+
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
+ coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
+
+ int32x2_t interp;
+ if (!FIXED) {
+ interp = vdup_n_s32(lerpP);
+ coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
+ coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
+ }
+ int32x4_t accum, accum2;
+ // warning uninitialized if we use veorq_s32
+ // (alternative to below) accum = veorq_s32(accum, accum);
+ accum = vdupq_n_s32(0);
+ if (CHANNELS == 2) {
+ // (alternative to below) accum2 = veorq_s32(accum2, accum2);
+ accum2 = vdupq_n_s32(0);
+ }
+ do {
+#ifdef vld1q_s32_x2
+ int32x4x2_t posCoef = vld1q_s32_x2(coefsP);
+ coefsP += 8;
+ int32x4x2_t negCoef = vld1q_s32_x2(coefsN);
+ coefsN += 8;
+#else
+ int32x4x2_t posCoef;
+ posCoef.val[0] = vld1q_s32(coefsP);
+ coefsP += 4;
+ posCoef.val[1] = vld1q_s32(coefsP);
+ coefsP += 4;
+ int32x4x2_t negCoef;
+ negCoef.val[0] = vld1q_s32(coefsN);
+ coefsN += 4;
+ negCoef.val[1] = vld1q_s32(coefsN);
+ coefsN += 4;
+#endif
+ if (!FIXED) { // interpolate
+#ifdef vld1q_s32_x2
+ int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1);
+ coefsP1 += 8;
+ int32x4x2_t negCoef1 = vld1q_s32_x2(coefsN1);
+ coefsN1 += 8;
+#else
+ int32x4x2_t posCoef1;
+ posCoef1.val[0] = vld1q_s32(coefsP1);
+ coefsP1 += 4;
+ posCoef1.val[1] = vld1q_s32(coefsP1);
+ coefsP1 += 4;
+ int32x4x2_t negCoef1;
+ negCoef1.val[0] = vld1q_s32(coefsN1);
+ coefsN1 += 4;
+ negCoef1.val[1] = vld1q_s32(coefsN1);
+ coefsN1 += 4;
+#endif
+
+ posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]);
+ posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]);
+ negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]);
+ negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]);
+
+ posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0);
+ posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0);
+ negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0);
+ negCoef.val[1] = vqrdmulhq_lane_s32(negCoef.val[1], interp, 0);
+
+ posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]);
+ posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]);
+ negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]);
+ negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]);
+ }
+ switch (CHANNELS) {
+ case 1: {
+ int16x8_t posSamp = vld1q_s16(sP);
+ int16x8_t negSamp = vld1q_s16(sN);
+ sN += 8;
+ posSamp = vrev64q_s16(posSamp);
+
+ int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15);
+ int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15);
+ int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15);
+ int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15);
+
+ // dot product
+ posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
+ posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
+ negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
+ negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
+
+ accum = vaddq_s32(accum, posSamp0);
+ negSamp0 = vaddq_s32(negSamp0, negSamp1);
+ accum = vaddq_s32(accum, posSamp1);
+ accum = vaddq_s32(accum, negSamp0);
+
+ sP -= 8;
+ } break;
+ case 2: {
+ int16x8x2_t posSamp = vld2q_s16(sP);
+ int16x8x2_t negSamp = vld2q_s16(sN);
+ sN += 16;
+ posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
+ posSamp.val[1] = vrev64q_s16(posSamp.val[1]);
+
+ // left
+ int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15);
+ int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15);
+ int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15);
+ int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15);
+
+ // dot product
+ posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
+ posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
+ negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
+ negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
+
+ accum = vaddq_s32(accum, posSamp0);
+ negSamp0 = vaddq_s32(negSamp0, negSamp1);
+ accum = vaddq_s32(accum, posSamp1);
+ accum = vaddq_s32(accum, negSamp0);
+
+ // right
+ posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15);
+ posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15);
+ negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15);
+ negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15);
+
+ // dot product
+ posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
+ posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
+ negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
+ negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
+
+ accum2 = vaddq_s32(accum2, posSamp0);
+ negSamp0 = vaddq_s32(negSamp0, negSamp1);
+ accum2 = vaddq_s32(accum2, posSamp1);
+ accum2 = vaddq_s32(accum2, negSamp0);
+
+ sP -= 16;
+ } break;
+ }
+ } while (count -= 8);
+
+ // multiply by volume and save
+ volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
+ int32x2_t vLR = vld1_s32(volumeLR);
+ int32x2_t outSamp = vld1_s32(out);
+ // combine and funnel down accumulator
+ int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
+ if (CHANNELS == 1) {
+ // duplicate accum to both L and R
+ outAccum = vpadd_s32(outAccum, outAccum);
+ } else if (CHANNELS == 2) {
+ // accum2 contains R, fold in
+ int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
+ outAccum = vpadd_s32(outAccum, outAccum2);
+ }
+ outAccum = vqrdmulh_s32(outAccum, vLR);
+ outSamp = vqadd_s32(outSamp, outAccum);
+ vst1_s32(out, outSamp);
+}
+
+template <int CHANNELS, int STRIDE, bool FIXED>
+static inline void ProcessNeonIntrinsic(float* out,
+ int count,
+ const float* coefsP,
+ const float* coefsN,
+ const float* sP,
+ const float* sN,
+ const float* volumeLR,
+ float lerpP,
+ const float* coefsP1,
+ const float* coefsN1)
+{
+ ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
+ COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2);
+
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ coefsP = (const float*)__builtin_assume_aligned(coefsP, 16);
+ coefsN = (const float*)__builtin_assume_aligned(coefsN, 16);
+
+ float32x2_t interp;
+ if (!FIXED) {
+ interp = vdup_n_f32(lerpP);
+ coefsP1 = (const float*)__builtin_assume_aligned(coefsP1, 16);
+ coefsN1 = (const float*)__builtin_assume_aligned(coefsN1, 16);
+ }
+ float32x4_t accum, accum2;
+ // warning uninitialized if we use veorq_s32
+ // (alternative to below) accum = veorq_s32(accum, accum);
+ accum = vdupq_n_f32(0);
+ if (CHANNELS == 2) {
+ // (alternative to below) accum2 = veorq_s32(accum2, accum2);
+ accum2 = vdupq_n_f32(0);
+ }
+ do {
+#ifdef vld1q_f32_x2
+ float32x4x2_t posCoef = vld1q_f32_x2(coefsP);
+ coefsP += 8;
+ float32x4x2_t negCoef = vld1q_f32_x2(coefsN);
+ coefsN += 8;
+#else
+ float32x4x2_t posCoef;
+ posCoef.val[0] = vld1q_f32(coefsP);
+ coefsP += 4;
+ posCoef.val[1] = vld1q_f32(coefsP);
+ coefsP += 4;
+ float32x4x2_t negCoef;
+ negCoef.val[0] = vld1q_f32(coefsN);
+ coefsN += 4;
+ negCoef.val[1] = vld1q_f32(coefsN);
+ coefsN += 4;
+#endif
+ if (!FIXED) { // interpolate
+#ifdef vld1q_f32_x2
+ float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1);
+ coefsP1 += 8;
+ float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1);
+ coefsN1 += 8;
+#else
+ float32x4x2_t posCoef1;
+ posCoef1.val[0] = vld1q_f32(coefsP1);
+ coefsP1 += 4;
+ posCoef1.val[1] = vld1q_f32(coefsP1);
+ coefsP1 += 4;
+ float32x4x2_t negCoef1;
+ negCoef1.val[0] = vld1q_f32(coefsN1);
+ coefsN1 += 4;
+ negCoef1.val[1] = vld1q_f32(coefsN1);
+ coefsN1 += 4;
+#endif
+ posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]);
+ posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]);
+ negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]);
+ negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]);
+
+ posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0);
+ posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0);
+ negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev
+ negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev
+ }
+ switch (CHANNELS) {
+ case 1: {
+#ifdef vld1q_f32_x2
+ float32x4x2_t posSamp = vld1q_f32_x2(sP);
+ float32x4x2_t negSamp = vld1q_f32_x2(sN);
+ sN += 8;
+ sP -= 8;
+#else
+ float32x4x2_t posSamp;
+ posSamp.val[0] = vld1q_f32(sP);
+ sP += 4;
+ posSamp.val[1] = vld1q_f32(sP);
+ sP -= 12;
+ float32x4x2_t negSamp;
+ negSamp.val[0] = vld1q_f32(sN);
+ sN += 4;
+ negSamp.val[1] = vld1q_f32(sN);
+ sN += 4;
+#endif
+ // effectively we want a vrev128q_f32()
+ posSamp.val[0] = vrev64q_f32(posSamp.val[0]);
+ posSamp.val[1] = vrev64q_f32(posSamp.val[1]);
+ posSamp.val[0] = vcombine_f32(
+ vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0]));
+ posSamp.val[1] = vcombine_f32(
+ vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1]));
+
+ accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]);
+ accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]);
+ accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]);
+ accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]);
+ } break;
+ case 2: {
+ float32x4x2_t posSamp0 = vld2q_f32(sP);
+ sP += 8;
+ float32x4x2_t negSamp0 = vld2q_f32(sN);
+ sN += 8;
+ posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]);
+ posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]);
+ posSamp0.val[0] = vcombine_f32(
+ vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0]));
+ posSamp0.val[1] = vcombine_f32(
+ vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1]));
+
+ float32x4x2_t posSamp1 = vld2q_f32(sP);
+ sP -= 24;
+ float32x4x2_t negSamp1 = vld2q_f32(sN);
+ sN += 8;
+ posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]);
+ posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]);
+ posSamp1.val[0] = vcombine_f32(
+ vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0]));
+ posSamp1.val[1] = vcombine_f32(
+ vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1]));
+
+ // Note: speed is affected by accumulation order.
+ // Also, speed appears slower using vmul/vadd instead of vmla for
+ // stereo case, comparable for mono.
+
+ accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]);
+ accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]);
+ accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]);
+ accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]);
+
+ accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed
+ accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed
+ accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed
+ accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed
+ } break;
+ }
+ } while (count -= 8);
+
+ // multiply by volume and save
+ volumeLR = (const float*)__builtin_assume_aligned(volumeLR, 8);
+ float32x2_t vLR = vld1_f32(volumeLR);
+ float32x2_t outSamp = vld1_f32(out);
+ // combine and funnel down accumulator
+ float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
+ if (CHANNELS == 1) {
+ // duplicate accum to both L and R
+ outAccum = vpadd_f32(outAccum, outAccum);
+ } else if (CHANNELS == 2) {
+ // accum2 contains R, fold in
+ float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
+ outAccum = vpadd_f32(outAccum, outAccum2);
+ }
+ outSamp = vmla_f32(outSamp, outAccum, vLR);
+ vst1_f32(out, outSamp);
+}
+
template <>
inline void ProcessL<1, 16>(int32_t* const out,
int count,
@@ -58,6 +531,10 @@ inline void ProcessL<1, 16>(int32_t* const out,
const int16_t* sN,
const int32_t* const volumeLR)
{
+#ifdef USE_INTRINSIC
+ ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+ 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
+#else
const int CHANNELS = 1; // template specialization does not preserve params
const int STRIDE = 16;
sP -= CHANNELS*((STRIDE>>1)-1);
@@ -99,6 +576,7 @@ inline void ProcessL<1, 16>(int32_t* const out,
"q0", "q1", "q2", "q3",
"q8", "q10"
);
+#endif
}
template <>
@@ -110,6 +588,10 @@ inline void ProcessL<2, 16>(int32_t* const out,
const int16_t* sN,
const int32_t* const volumeLR)
{
+#ifdef USE_INTRINSIC
+ ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+ 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
+#else
const int CHANNELS = 2; // template specialization does not preserve params
const int STRIDE = 16;
sP -= CHANNELS*((STRIDE>>1)-1);
@@ -119,13 +601,13 @@ inline void ProcessL<2, 16>(int32_t* const out,
"1: \n"
- "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples
- "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples
+ "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames
+ "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames
"vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
"vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
- "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
- "vrev64.16 q3, q3 \n"// (0 combines+) reverse right positive
+ "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left
+ "vrev64.16 q3, q3 \n"// (0 combines+) reverse positive right
"vmlal.s16 q0, d4, d17 \n"// (1) multiply (reversed) samples left
"vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples left
@@ -157,6 +639,7 @@ inline void ProcessL<2, 16>(int32_t* const out,
"q4", "q5", "q6",
"q8", "q10"
);
+#endif
}
template <>
@@ -171,6 +654,11 @@ inline void Process<1, 16>(int32_t* const out,
uint32_t lerpP,
const int32_t* const volumeLR)
{
+#ifdef USE_INTRINSIC
+ ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+ lerpP, coefsP1, coefsN1);
+#else
+
const int CHANNELS = 1; // template specialization does not preserve params
const int STRIDE = 16;
sP -= CHANNELS*((STRIDE>>1)-1);
@@ -227,6 +715,7 @@ inline void Process<1, 16>(int32_t* const out,
"q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11"
);
+#endif
}
template <>
@@ -241,6 +730,10 @@ inline void Process<2, 16>(int32_t* const out,
uint32_t lerpP,
const int32_t* const volumeLR)
{
+#ifdef USE_INTRINSIC
+ ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+ lerpP, coefsP1, coefsN1);
+#else
const int CHANNELS = 2; // template specialization does not preserve params
const int STRIDE = 16;
sP -= CHANNELS*((STRIDE>>1)-1);
@@ -251,8 +744,8 @@ inline void Process<2, 16>(int32_t* const out,
"1: \n"
- "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples
- "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples
+ "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames
+ "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames
"vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
"vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
"vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
@@ -264,8 +757,8 @@ inline void Process<2, 16>(int32_t* const out,
"vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
"vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
- "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
- "vrev64.16 q3, q3 \n"// (1) reverse 8 frames of the right positive
+ "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left
+ "vrev64.16 q3, q3 \n"// (1) reverse 8 samples of positive right
"vadd.s16 q8, q8, q9 \n"// (1+1d) interpolate (step3) 1st set
"vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set
@@ -303,6 +796,7 @@ inline void Process<2, 16>(int32_t* const out,
"q4", "q5", "q6",
"q8", "q9", "q10", "q11"
);
+#endif
}
template <>
@@ -314,6 +808,10 @@ inline void ProcessL<1, 16>(int32_t* const out,
const int16_t* sN,
const int32_t* const volumeLR)
{
+#ifdef USE_INTRINSIC
+ ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+ 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
+#else
const int CHANNELS = 1; // template specialization does not preserve params
const int STRIDE = 16;
sP -= CHANNELS*((STRIDE>>1)-1);
@@ -327,7 +825,7 @@ inline void ProcessL<1, 16>(int32_t* const out,
"vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
"vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
- "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
+ "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side
"vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
"vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
@@ -335,10 +833,10 @@ inline void ProcessL<1, 16>(int32_t* const out,
"vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
"vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
- "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples
+ "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples
+ "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples
+ "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples
"vadd.s32 q0, q0, q12 \n"// accumulate result
"vadd.s32 q13, q13, q14 \n"// accumulate result
@@ -364,6 +862,7 @@ inline void ProcessL<1, 16>(int32_t* const out,
"q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
);
+#endif
}
template <>
@@ -375,6 +874,10 @@ inline void ProcessL<2, 16>(int32_t* const out,
const int16_t* sN,
const int32_t* const volumeLR)
{
+#ifdef USE_INTRINSIC
+ ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+ 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
+#else
const int CHANNELS = 2; // template specialization does not preserve params
const int STRIDE = 16;
sP -= CHANNELS*((STRIDE>>1)-1);
@@ -384,13 +887,13 @@ inline void ProcessL<2, 16>(int32_t* const out,
"1: \n"
- "vld2.16 {q2, q3}, [%[sP]] \n"// load 4 16-bits stereo samples
- "vld2.16 {q5, q6}, [%[sN]]! \n"// load 4 16-bits stereo samples
- "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
- "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
+ "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames
+ "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames
+ "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
+ "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
- "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
- "vrev64.16 q3, q3 \n"// reverse 8 frames of the positive side
+ "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left
+ "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right
"vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
"vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
@@ -398,15 +901,15 @@ inline void ProcessL<2, 16>(int32_t* const out,
"vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
"vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits
- "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
+ "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
+ "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
+ "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef
"vadd.s32 q0, q0, q12 \n"// accumulate result
"vadd.s32 q13, q13, q14 \n"// accumulate result
- "vadd.s32 q0, q0, q15 \n"// (+1) accumulate result
- "vadd.s32 q0, q0, q13 \n"// (+1) accumulate result
+ "vadd.s32 q0, q0, q15 \n"// accumulate result
+ "vadd.s32 q0, q0, q13 \n"// accumulate result
"vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
"vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits
@@ -414,15 +917,15 @@ inline void ProcessL<2, 16>(int32_t* const out,
"vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
"vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits
- "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
+ "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
+ "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
+ "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef
"vadd.s32 q4, q4, q12 \n"// accumulate result
"vadd.s32 q13, q13, q14 \n"// accumulate result
- "vadd.s32 q4, q4, q15 \n"// (+1) accumulate result
- "vadd.s32 q4, q4, q13 \n"// (+1) accumulate result
+ "vadd.s32 q4, q4, q15 \n"// accumulate result
+ "vadd.s32 q4, q4, q13 \n"// accumulate result
"subs %[count], %[count], #8 \n"// update loop counter
"sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
@@ -444,6 +947,7 @@ inline void ProcessL<2, 16>(int32_t* const out,
"q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
);
+#endif
}
template <>
@@ -458,6 +962,10 @@ inline void Process<1, 16>(int32_t* const out,
uint32_t lerpP,
const int32_t* const volumeLR)
{
+#ifdef USE_INTRINSIC
+ ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+ lerpP, coefsP1, coefsN1);
+#else
const int CHANNELS = 1; // template specialization does not preserve params
const int STRIDE = 16;
sP -= CHANNELS*((STRIDE>>1)-1);
@@ -489,7 +997,7 @@ inline void Process<1, 16>(int32_t* const out,
"vadd.s32 q10, q10, q14 \n"// interpolate (step3)
"vadd.s32 q11, q11, q15 \n"// interpolate (step3)
- "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
+ "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side
"vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
"vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
@@ -529,6 +1037,7 @@ inline void Process<1, 16>(int32_t* const out,
"q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
);
+#endif
}
template <>
@@ -543,6 +1052,10 @@ inline void Process<2, 16>(int32_t* const out,
uint32_t lerpP,
const int32_t* const volumeLR)
{
+#ifdef USE_INTRINSIC
+ ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+ lerpP, coefsP1, coefsN1);
+#else
const int CHANNELS = 2; // template specialization does not preserve params
const int STRIDE = 16;
sP -= CHANNELS*((STRIDE>>1)-1);
@@ -553,8 +1066,8 @@ inline void Process<2, 16>(int32_t* const out,
"1: \n"
- "vld2.16 {q2, q3}, [%[sP]] \n"// load 4 16-bits stereo samples
- "vld2.16 {q5, q6}, [%[sN]]! \n"// load 4 16-bits stereo samples
+ "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames
+ "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames
"vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
"vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
"vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
@@ -575,8 +1088,8 @@ inline void Process<2, 16>(int32_t* const out,
"vadd.s32 q10, q10, q14 \n"// interpolate (step3)
"vadd.s32 q11, q11, q15 \n"// interpolate (step3)
- "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
- "vrev64.16 q3, q3 \n"// reverse 8 frames of the positive side
+ "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left
+ "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right
"vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
"vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
@@ -591,8 +1104,8 @@ inline void Process<2, 16>(int32_t* const out,
"vadd.s32 q0, q0, q12 \n"// accumulate result
"vadd.s32 q13, q13, q14 \n"// accumulate result
- "vadd.s32 q0, q0, q15 \n"// (+1) accumulate result
- "vadd.s32 q0, q0, q13 \n"// (+1) accumulate result
+ "vadd.s32 q0, q0, q15 \n"// accumulate result
+ "vadd.s32 q0, q0, q13 \n"// accumulate result
"vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
"vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits
@@ -607,8 +1120,8 @@ inline void Process<2, 16>(int32_t* const out,
"vadd.s32 q4, q4, q12 \n"// accumulate result
"vadd.s32 q13, q13, q14 \n"// accumulate result
- "vadd.s32 q4, q4, q15 \n"// (+1) accumulate result
- "vadd.s32 q4, q4, q13 \n"// (+1) accumulate result
+ "vadd.s32 q4, q4, q15 \n"// accumulate result
+ "vadd.s32 q4, q4, q13 \n"// accumulate result
"subs %[count], %[count], #8 \n"// update loop counter
"sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
@@ -633,517 +1146,69 @@ inline void Process<2, 16>(int32_t* const out,
"q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
);
+#endif
}
-template <>
-inline void ProcessL<1, 8>(int32_t* const out,
+template<>
+inline void ProcessL<1, 16>(float* const out,
int count,
- const int16_t* coefsP,
- const int16_t* coefsN,
- const int16_t* sP,
- const int16_t* sN,
- const int32_t* const volumeLR)
+ const float* coefsP,
+ const float* coefsN,
+ const float* sP,
+ const float* sN,
+ const float* const volumeLR)
{
- const int CHANNELS = 1; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
-
- "1: \n"
-
- "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples
- "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples
- "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs
- "vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs
-
- "vrev64.16 d4, d4 \n"// (1) reversed s3, s2, s1, s0, s7, s6, s5, s4
-
- // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
- "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed)samples by coef
- "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
-
- // moving these ARM instructions before neon above seems to be slower
- "subs %[count], %[count], #4 \n"// (1) update loop counter
- "sub %[sP], %[sP], #8 \n"// (0) move pointer to next set of samples
-
- // sP used after branch (warning)
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_MONO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsN0] "+r" (coefsN),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q8", "q10"
- );
+ ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+ 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
}
-template <>
-inline void ProcessL<2, 8>(int32_t* const out,
+template<>
+inline void ProcessL<2, 16>(float* const out,
int count,
- const int16_t* coefsP,
- const int16_t* coefsN,
- const int16_t* sP,
- const int16_t* sN,
- const int32_t* const volumeLR)
+ const float* coefsP,
+ const float* coefsN,
+ const float* sP,
+ const float* sN,
+ const float* const volumeLR)
{
- const int CHANNELS = 2; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "veor q0, q0, q0 \n"// (1) acc_L = 0
- "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
-
- "1: \n"
-
- "vld2.16 {d4, d5}, [%[sP]] \n"// (2+0d) load 8 16-bits stereo samples
- "vld2.16 {d6, d7}, [%[sN]]! \n"// (2) load 8 16-bits stereo samples
- "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs
- "vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 8 16-bits coefs
-
- "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
-
- "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left
- "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right
- "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left
- "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right
-
- // moving these ARM before neon seems to be slower
- "subs %[count], %[count], #4 \n"// (1) update loop counter
- "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
-
- // sP used after branch (warning)
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_STEREO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsN0] "+r" (coefsN),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q4", "q5", "q6",
- "q8", "q10"
- );
+ ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+ 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
}
-template <>
-inline void Process<1, 8>(int32_t* const out,
+template<>
+inline void Process<1, 16>(float* const out,
int count,
- const int16_t* coefsP,
- const int16_t* coefsN,
- const int16_t* coefsP1,
- const int16_t* coefsN1,
- const int16_t* sP,
- const int16_t* sN,
- uint32_t lerpP,
- const int32_t* const volumeLR)
+ const float* coefsP,
+ const float* coefsN,
+ const float* coefsP1,
+ const float* coefsN1,
+ const float* sP,
+ const float* sN,
+ float lerpP,
+ const float* const volumeLR)
{
- const int CHANNELS = 1; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15
- "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
-
- "1: \n"
-
- "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples
- "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples
- "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs
- "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 4 16-bits coefs for interpolation
- "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 4 16-bits coefs
- "vld1.16 {d21}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs for interpolation
-
- "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs
- "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets
-
- "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
- "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
-
- "vrev64.16 d4, d4 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
-
- "vadd.s16 d16, d16, d17 \n"// (1+2d) interpolate (step3) 1st set
- "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set
-
- // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
- "vmlal.s16 q0, d4, d16 \n"// (1+0d) multiply (reversed)by coef
- "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
-
- // moving these ARM instructions before neon above seems to be slower
- "subs %[count], %[count], #4 \n"// (1) update loop counter
- "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples
-
- // sP used after branch (warning)
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_MONO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsN0] "+r" (coefsN),
- [coefsP1] "+r" (coefsP1),
- [coefsN1] "+r" (coefsN1),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [lerpP] "r" (lerpP),
- [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11"
- );
+ ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+ lerpP, coefsP1, coefsN1);
}
-template <>
-inline void Process<2, 8>(int32_t* const out,
+template<>
+inline void Process<2, 16>(float* const out,
int count,
- const int16_t* coefsP,
- const int16_t* coefsN,
- const int16_t* coefsP1,
- const int16_t* coefsN1,
- const int16_t* sP,
- const int16_t* sN,
- uint32_t lerpP,
- const int32_t* const volumeLR)
+ const float* coefsP,
+ const float* coefsN,
+ const float* coefsP1,
+ const float* coefsN1,
+ const float* sP,
+ const float* sN,
+ float lerpP,
+ const float* const volumeLR)
{
- const int CHANNELS = 2; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
- "veor q0, q0, q0 \n"// (1) acc_L = 0
- "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
-
- "1: \n"
-
- "vld2.16 {d4, d5}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples
- "vld2.16 {d6, d7}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples
- "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs
- "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 8 16-bits coefs for interpolation
- "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 8 16-bits coefs
- "vld1.16 {d21}, [%[coefsN0]:64]! \n"// (1) load 8 16-bits coefs for interpolation
-
- "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs
- "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets
-
- "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
- "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
-
- "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
-
- "vadd.s16 d16, d16, d17 \n"// (1+1d) interpolate (step3) 1st set
- "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set
-
- "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left
- "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right
- "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left
- "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right
-
- // moving these ARM before neon seems to be slower
- "subs %[count], %[count], #4 \n"// (1) update loop counter
- "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
-
- // sP used after branch (warning)
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_STEREO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsN0] "+r" (coefsN),
- [coefsP1] "+r" (coefsP1),
- [coefsN1] "+r" (coefsN1),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [lerpP] "r" (lerpP),
- [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q4", "q5", "q6",
- "q8", "q9", "q10", "q11"
- );
-}
-
-template <>
-inline void ProcessL<1, 8>(int32_t* const out,
- int count,
- const int32_t* coefsP,
- const int32_t* coefsN,
- const int16_t* sP,
- const int16_t* sN,
- const int32_t* const volumeLR)
-{
- const int CHANNELS = 1; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "veor q0, q0, q0 \n"// result, initialize to 0
-
- "1: \n"
-
- "vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples
- "vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples
- "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
- "vld1.32 {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
-
- "vrev64.16 d4, d4 \n"// reverse 2 frames of the positive side
-
- "vshll.s16 q12, d4, #15 \n"// (stall) extend samples to 31 bits
- "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
-
- "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
-
- "vadd.s32 q0, q0, q12 \n"// accumulate result
- "vadd.s32 q0, q0, q14 \n"// (stall) accumulate result
-
- "subs %[count], %[count], #4 \n"// update loop counter
- "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples
-
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_MONO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsN0] "+r" (coefsN),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11",
- "q12", "q14"
- );
-}
-
-template <>
-inline void ProcessL<2, 8>(int32_t* const out,
- int count,
- const int32_t* coefsP,
- const int32_t* coefsN,
- const int16_t* sP,
- const int16_t* sN,
- const int32_t* const volumeLR)
-{
- const int CHANNELS = 2; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "veor q0, q0, q0 \n"// result, initialize to 0
- "veor q4, q4, q4 \n"// result, initialize to 0
-
- "1: \n"
-
- "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples
- "vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 16-bits stereo samples
- "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
- "vld1.32 {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
-
- "vrev64.16 q2, q2 \n"// reverse 2 frames of the positive side
-
- "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
- "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
-
- "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
- "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
-
- "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by coef
- "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
- "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
- "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by coef
-
- "vadd.s32 q0, q0, q12 \n"// accumulate result
- "vadd.s32 q4, q4, q13 \n"// accumulate result
- "vadd.s32 q0, q0, q14 \n"// accumulate result
- "vadd.s32 q4, q4, q15 \n"// accumulate result
-
- "subs %[count], %[count], #4 \n"// update loop counter
- "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
-
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_STEREO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsN0] "+r" (coefsN),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15"
- );
-}
-
-template <>
-inline void Process<1, 8>(int32_t* const out,
- int count,
- const int32_t* coefsP,
- const int32_t* coefsN,
- const int32_t* coefsP1,
- const int32_t* coefsN1,
- const int16_t* sP,
- const int16_t* sN,
- uint32_t lerpP,
- const int32_t* const volumeLR)
-{
- const int CHANNELS = 1; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
- "veor q0, q0, q0 \n"// result, initialize to 0
-
- "1: \n"
-
- "vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples
- "vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples
- "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
- "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation
- "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
- "vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
-
- "vrev64.16 d4, d4 \n"// reverse 2 frames of the positive side
-
- "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs
- "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets
- "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
-
- "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs
- "vqrdmulh.s32 q11, q11, d2[0] \n"// interpolate (step2) 2nd set of coefs
- "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
-
- "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set
- "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set
-
- "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
-
- "vadd.s32 q0, q0, q12 \n"// accumulate result
- "vadd.s32 q0, q0, q14 \n"// accumulate result
-
- "subs %[count], %[count], #4 \n"// update loop counter
- "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples
-
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_MONO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsP1] "+r" (coefsP1),
- [coefsN0] "+r" (coefsN),
- [coefsN1] "+r" (coefsN1),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [lerpP] "r" (lerpP),
- [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11",
- "q12", "q14"
- );
-}
-
-template <>
-inline
-void Process<2, 8>(int32_t* const out,
- int count,
- const int32_t* coefsP,
- const int32_t* coefsN,
- const int32_t* coefsP1,
- const int32_t* coefsN1,
- const int16_t* sP,
- const int16_t* sN,
- uint32_t lerpP,
- const int32_t* const volumeLR)
-{
- const int CHANNELS = 2; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
- "veor q0, q0, q0 \n"// result, initialize to 0
- "veor q4, q4, q4 \n"// result, initialize to 0
-
- "1: \n"
- "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples
- "vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 16-bits stereo samples
- "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
- "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation
- "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
- "vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
-
- "vrev64.16 q2, q2 \n"// (reversed) 2 frames of the positive side
-
- "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs
- "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets
- "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
- "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
-
- "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs
- "vqrdmulh.s32 q11, q11, d2[1] \n"// interpolate (step3) 2nd set of coefs
- "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
- "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
-
- "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set
- "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set
-
- "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by interpolated coef
-
- "vadd.s32 q0, q0, q12 \n"// accumulate result
- "vadd.s32 q4, q4, q13 \n"// accumulate result
- "vadd.s32 q0, q0, q14 \n"// accumulate result
- "vadd.s32 q4, q4, q15 \n"// accumulate result
-
- "subs %[count], %[count], #4 \n"// update loop counter
- "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
-
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_STEREO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsP1] "+r" (coefsP1),
- [coefsN0] "+r" (coefsN),
- [coefsN1] "+r" (coefsN1),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [lerpP] "r" (lerpP),
- [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15"
- );
+ ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+ lerpP, coefsP1, coefsN1);
}
#endif //USE_NEON
-}; // namespace android
+} // namespace android
#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/