/* * Copyright (C) 2013 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H #define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H namespace android { // depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h #if USE_NEON // use intrinsics if inline arm32 assembly is not possible #if !USE_INLINE_ASSEMBLY #define USE_INTRINSIC #endif // following intrinsics available only on ARM 64 bit ACLE #ifndef __aarch64__ #undef vld1q_f32_x2 #undef vld1q_s32_x2 #endif #define TO_STRING2(x) #x #define TO_STRING(x) TO_STRING2(x) // uncomment to print GCC version, may be relevant for intrinsic optimizations /* #pragma message ("GCC version: " TO_STRING(__GNUC__) \ "." TO_STRING(__GNUC_MINOR__) \ "." TO_STRING(__GNUC_PATCHLEVEL__)) */ // // NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h // // Two variants are presented here: // ARM NEON inline assembly which appears up to 10-15% faster than intrinsics (gcc 4.9) for arm32. // ARM NEON intrinsics which can also be used by arm64 and x86/64 with NEON header. // // Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out. // These are only used for inline assembly. #define ASSEMBLY_ACCUMULATE_MONO \ "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes */\ "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output */\ "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums */\ "vpadd.s32 d0, d0, d0 \n"/* (1+4d) and replicate L/R */\ "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume */\ "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating) */\ "vst1.s32 {d3}, %[out] \n"/* (2+2d) store result */ #define ASSEMBLY_ACCUMULATE_STEREO \ "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes*/\ "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output*/\ "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums from q0*/\ "vpadd.s32 d8, d8, d9 \n"/* (1) add all 4 partial sums from q4*/\ "vpadd.s32 d0, d0, d8 \n"/* (1+4d) combine into L/R*/\ "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume*/\ "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating)*/\ "vst1.s32 {d3}, %[out] \n"/* (2+2d)store result*/ template static inline void ProcessNeonIntrinsic(int32_t* out, int count, const int16_t* coefsP, const int16_t* coefsN, const int16_t* sP, const int16_t* sN, const int32_t* volumeLR, uint32_t lerpP, const int16_t* coefsP1, const int16_t* coefsN1) { ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8 COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2); sP -= CHANNELS*((STRIDE>>1)-1); coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16); coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16); int16x4_t interp; if (!FIXED) { interp = vdup_n_s16(lerpP); //interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0); coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16); coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16); } int32x4_t accum, accum2; // warning uninitialized if we use veorq_s32 // (alternative to below) accum = veorq_s32(accum, accum); accum = vdupq_n_s32(0); if (CHANNELS == 2) { // (alternative to below) accum2 = veorq_s32(accum2, accum2); accum2 = vdupq_n_s32(0); } do { int16x8_t posCoef = vld1q_s16(coefsP); coefsP += 8; int16x8_t negCoef = vld1q_s16(coefsN); coefsN += 8; if (!FIXED) { // interpolate int16x8_t posCoef1 = vld1q_s16(coefsP1); coefsP1 += 8; int16x8_t negCoef1 = vld1q_s16(coefsN1); coefsN1 += 8; posCoef1 = vsubq_s16(posCoef1, posCoef); negCoef = vsubq_s16(negCoef, negCoef1); posCoef1 = vqrdmulhq_lane_s16(posCoef1, interp, 0); negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0); posCoef = vaddq_s16(posCoef, posCoef1); negCoef = vaddq_s16(negCoef, negCoef1); } switch (CHANNELS) { case 1: { int16x8_t posSamp = vld1q_s16(sP); int16x8_t negSamp = vld1q_s16(sN); sN += 8; posSamp = vrev64q_s16(posSamp); // dot product accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef)); accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef)); sP -= 8; } break; case 2: { int16x8x2_t posSamp = vld2q_s16(sP); int16x8x2_t negSamp = vld2q_s16(sN); sN += 16; posSamp.val[0] = vrev64q_s16(posSamp.val[0]); posSamp.val[1] = vrev64q_s16(posSamp.val[1]); // dot product accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef)); accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef)); accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef)); accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef)); sP -= 16; } } break; } while (count -= 8); // multiply by volume and save volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8); int32x2_t vLR = vld1_s32(volumeLR); int32x2_t outSamp = vld1_s32(out); // combine and funnel down accumulator int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum)); if (CHANNELS == 1) { // duplicate accum to both L and R outAccum = vpadd_s32(outAccum, outAccum); } else if (CHANNELS == 2) { // accum2 contains R, fold in int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2)); outAccum = vpadd_s32(outAccum, outAccum2); } outAccum = vqrdmulh_s32(outAccum, vLR); outSamp = vqadd_s32(outSamp, outAccum); vst1_s32(out, outSamp); } template static inline void ProcessNeonIntrinsic(int32_t* out, int count, const int32_t* coefsP, const int32_t* coefsN, const int16_t* sP, const int16_t* sN, const int32_t* volumeLR, uint32_t lerpP, const int32_t* coefsP1, const int32_t* coefsN1) { ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8 COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2); sP -= CHANNELS*((STRIDE>>1)-1); coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16); coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16); int32x2_t interp; if (!FIXED) { interp = vdup_n_s32(lerpP); coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16); coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16); } int32x4_t accum, accum2; // warning uninitialized if we use veorq_s32 // (alternative to below) accum = veorq_s32(accum, accum); accum = vdupq_n_s32(0); if (CHANNELS == 2) { // (alternative to below) accum2 = veorq_s32(accum2, accum2); accum2 = vdupq_n_s32(0); } do { #ifdef vld1q_s32_x2 int32x4x2_t posCoef = vld1q_s32_x2(coefsP); coefsP += 8; int32x4x2_t negCoef = vld1q_s32_x2(coefsN); coefsN += 8; #else int32x4x2_t posCoef; posCoef.val[0] = vld1q_s32(coefsP); coefsP += 4; posCoef.val[1] = vld1q_s32(coefsP); coefsP += 4; int32x4x2_t negCoef; negCoef.val[0] = vld1q_s32(coefsN); coefsN += 4; negCoef.val[1] = vld1q_s32(coefsN); coefsN += 4; #endif if (!FIXED) { // interpolate #ifdef vld1q_s32_x2 int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1); coefsP1 += 8; int32x4x2_t negCoef1 = vld1q_s32_x2(coefsN1); coefsN1 += 8; #else int32x4x2_t posCoef1; posCoef1.val[0] = vld1q_s32(coefsP1); coefsP1 += 4; posCoef1.val[1] = vld1q_s32(coefsP1); coefsP1 += 4; int32x4x2_t negCoef1; negCoef1.val[0] = vld1q_s32(coefsN1); coefsN1 += 4; negCoef1.val[1] = vld1q_s32(coefsN1); coefsN1 += 4; #endif posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]); posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]); negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]); negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]); posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0); posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0); negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0); negCoef.val[1] = vqrdmulhq_lane_s32(negCoef.val[1], interp, 0); posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]); posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]); negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]); negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]); } switch (CHANNELS) { case 1: { int16x8_t posSamp = vld1q_s16(sP); int16x8_t negSamp = vld1q_s16(sN); sN += 8; posSamp = vrev64q_s16(posSamp); int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15); int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15); int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15); int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15); // dot product posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]); negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]); accum = vaddq_s32(accum, posSamp0); negSamp0 = vaddq_s32(negSamp0, negSamp1); accum = vaddq_s32(accum, posSamp1); accum = vaddq_s32(accum, negSamp0); sP -= 8; } break; case 2: { int16x8x2_t posSamp = vld2q_s16(sP); int16x8x2_t negSamp = vld2q_s16(sN); sN += 16; posSamp.val[0] = vrev64q_s16(posSamp.val[0]); posSamp.val[1] = vrev64q_s16(posSamp.val[1]); // left int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15); int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15); int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15); int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15); // dot product posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]); negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]); accum = vaddq_s32(accum, posSamp0); negSamp0 = vaddq_s32(negSamp0, negSamp1); accum = vaddq_s32(accum, posSamp1); accum = vaddq_s32(accum, negSamp0); // right posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15); posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15); negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15); negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15); // dot product posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]); negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]); accum2 = vaddq_s32(accum2, posSamp0); negSamp0 = vaddq_s32(negSamp0, negSamp1); accum2 = vaddq_s32(accum2, posSamp1); accum2 = vaddq_s32(accum2, negSamp0); sP -= 16; } break; } } while (count -= 8); // multiply by volume and save volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8); int32x2_t vLR = vld1_s32(volumeLR); int32x2_t outSamp = vld1_s32(out); // combine and funnel down accumulator int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum)); if (CHANNELS == 1) { // duplicate accum to both L and R outAccum = vpadd_s32(outAccum, outAccum); } else if (CHANNELS == 2) { // accum2 contains R, fold in int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2)); outAccum = vpadd_s32(outAccum, outAccum2); } outAccum = vqrdmulh_s32(outAccum, vLR); outSamp = vqadd_s32(outSamp, outAccum); vst1_s32(out, outSamp); } template static inline void ProcessNeonIntrinsic(float* out, int count, const float* coefsP, const float* coefsN, const float* sP, const float* sN, const float* volumeLR, float lerpP, const float* coefsP1, const float* coefsN1) { ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8 COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2); sP -= CHANNELS*((STRIDE>>1)-1); coefsP = (const float*)__builtin_assume_aligned(coefsP, 16); coefsN = (const float*)__builtin_assume_aligned(coefsN, 16); float32x2_t interp; if (!FIXED) { interp = vdup_n_f32(lerpP); coefsP1 = (const float*)__builtin_assume_aligned(coefsP1, 16); coefsN1 = (const float*)__builtin_assume_aligned(coefsN1, 16); } float32x4_t accum, accum2; // warning uninitialized if we use veorq_s32 // (alternative to below) accum = veorq_s32(accum, accum); accum = vdupq_n_f32(0); if (CHANNELS == 2) { // (alternative to below) accum2 = veorq_s32(accum2, accum2); accum2 = vdupq_n_f32(0); } do { #ifdef vld1q_f32_x2 float32x4x2_t posCoef = vld1q_f32_x2(coefsP); coefsP += 8; float32x4x2_t negCoef = vld1q_f32_x2(coefsN); coefsN += 8; #else float32x4x2_t posCoef; posCoef.val[0] = vld1q_f32(coefsP); coefsP += 4; posCoef.val[1] = vld1q_f32(coefsP); coefsP += 4; float32x4x2_t negCoef; negCoef.val[0] = vld1q_f32(coefsN); coefsN += 4; negCoef.val[1] = vld1q_f32(coefsN); coefsN += 4; #endif if (!FIXED) { // interpolate #ifdef vld1q_f32_x2 float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1); coefsP1 += 8; float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1); coefsN1 += 8; #else float32x4x2_t posCoef1; posCoef1.val[0] = vld1q_f32(coefsP1); coefsP1 += 4; posCoef1.val[1] = vld1q_f32(coefsP1); coefsP1 += 4; float32x4x2_t negCoef1; negCoef1.val[0] = vld1q_f32(coefsN1); coefsN1 += 4; negCoef1.val[1] = vld1q_f32(coefsN1); coefsN1 += 4; #endif posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]); posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]); negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]); negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]); posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0); posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0); negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev } switch (CHANNELS) { case 1: { #ifdef vld1q_f32_x2 float32x4x2_t posSamp = vld1q_f32_x2(sP); float32x4x2_t negSamp = vld1q_f32_x2(sN); sN += 8; sP -= 8; #else float32x4x2_t posSamp; posSamp.val[0] = vld1q_f32(sP); sP += 4; posSamp.val[1] = vld1q_f32(sP); sP -= 12; float32x4x2_t negSamp; negSamp.val[0] = vld1q_f32(sN); sN += 4; negSamp.val[1] = vld1q_f32(sN); sN += 4; #endif // effectively we want a vrev128q_f32() posSamp.val[0] = vrev64q_f32(posSamp.val[0]); posSamp.val[1] = vrev64q_f32(posSamp.val[1]); posSamp.val[0] = vcombine_f32( vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0])); posSamp.val[1] = vcombine_f32( vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1])); accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]); accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]); accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]); accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]); } break; case 2: { float32x4x2_t posSamp0 = vld2q_f32(sP); sP += 8; float32x4x2_t negSamp0 = vld2q_f32(sN); sN += 8; posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]); posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]); posSamp0.val[0] = vcombine_f32( vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0])); posSamp0.val[1] = vcombine_f32( vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1])); float32x4x2_t posSamp1 = vld2q_f32(sP); sP -= 24; float32x4x2_t negSamp1 = vld2q_f32(sN); sN += 8; posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]); posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]); posSamp1.val[0] = vcombine_f32( vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0])); posSamp1.val[1] = vcombine_f32( vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1])); // Note: speed is affected by accumulation order. // Also, speed appears slower using vmul/vadd instead of vmla for // stereo case, comparable for mono. accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]); accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]); accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]); accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]); accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed } break; } } while (count -= 8); // multiply by volume and save volumeLR = (const float*)__builtin_assume_aligned(volumeLR, 8); float32x2_t vLR = vld1_f32(volumeLR); float32x2_t outSamp = vld1_f32(out); // combine and funnel down accumulator float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)); if (CHANNELS == 1) { // duplicate accum to both L and R outAccum = vpadd_f32(outAccum, outAccum); } else if (CHANNELS == 2) { // accum2 contains R, fold in float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2)); outAccum = vpadd_f32(outAccum, outAccum2); } outSamp = vmla_f32(outSamp, outAccum, vLR); vst1_f32(out, outSamp); } template <> inline void ProcessL<1, 16>(int32_t* const out, int count, const int16_t* coefsP, const int16_t* coefsN, const int16_t* sP, const int16_t* sN, const int32_t* const volumeLR) { #ifdef USE_INTRINSIC ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR, 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/); #else const int CHANNELS = 1; // template specialization does not preserve params const int STRIDE = 16; sP -= CHANNELS*((STRIDE>>1)-1); asm ( "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 "1: \n" "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 // reordering the vmal to do d6, d7 before d4, d5 is slower(?) "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply (reversed)samples by coef "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed)samples by coef "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples // moving these ARM instructions before neon above seems to be slower "subs %[count], %[count], #8 \n"// (1) update loop counter "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples // sP used after branch (warning) "bne 1b \n"// loop ASSEMBLY_ACCUMULATE_MONO : [out] "=Uv" (out[0]), [count] "+r" (count), [coefsP0] "+r" (coefsP), [coefsN0] "+r" (coefsN), [sP] "+r" (sP), [sN] "+r" (sN) : [vLR] "r" (volumeLR) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q10" ); #endif } template <> inline void ProcessL<2, 16>(int32_t* const out, int count, const int16_t* coefsP, const int16_t* coefsN, const int16_t* sP, const int16_t* sN, const int32_t* const volumeLR) { #ifdef USE_INTRINSIC ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR, 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/); #else const int CHANNELS = 2; // template specialization does not preserve params const int STRIDE = 16; sP -= CHANNELS*((STRIDE>>1)-1); asm ( "veor q0, q0, q0 \n"// (1) acc_L = 0 "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0 "1: \n" "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left "vrev64.16 q3, q3 \n"// (0 combines+) reverse positive right "vmlal.s16 q0, d4, d17 \n"// (1) multiply (reversed) samples left "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples left "vmlal.s16 q4, d6, d17 \n"// (1) multiply (reversed) samples right "vmlal.s16 q4, d7, d16 \n"// (1) multiply (reversed) samples right "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right // moving these ARM before neon seems to be slower "subs %[count], %[count], #8 \n"// (1) update loop counter "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples // sP used after branch (warning) "bne 1b \n"// loop ASSEMBLY_ACCUMULATE_STEREO : [out] "=Uv" (out[0]), [count] "+r" (count), [coefsP0] "+r" (coefsP), [coefsN0] "+r" (coefsN), [sP] "+r" (sP), [sN] "+r" (sN) : [vLR] "r" (volumeLR) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q8", "q10" ); #endif } template <> inline void Process<1, 16>(int32_t* const out, int count, const int16_t* coefsP, const int16_t* coefsN, const int16_t* coefsP1, const int16_t* coefsN1, const int16_t* sP, const int16_t* sN, uint32_t lerpP, const int32_t* const volumeLR) { #ifdef USE_INTRINSIC ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR, lerpP, coefsP1, coefsN1); #else const int CHANNELS = 1; // template specialization does not preserve params const int STRIDE = 16; sP -= CHANNELS*((STRIDE>>1)-1); asm ( "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15 "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 "1: \n" "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 "vadd.s16 q8, q8, q9 \n"// (1+2d) interpolate (step3) 1st set "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set // reordering the vmal to do d6, d7 before d4, d5 is slower(?) "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply reversed samples by coef "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples by coef "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples // moving these ARM instructions before neon above seems to be slower "subs %[count], %[count], #8 \n"// (1) update loop counter "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples // sP used after branch (warning) "bne 1b \n"// loop ASSEMBLY_ACCUMULATE_MONO : [out] "=Uv" (out[0]), [count] "+r" (count), [coefsP0] "+r" (coefsP), [coefsN0] "+r" (coefsN), [coefsP1] "+r" (coefsP1), [coefsN1] "+r" (coefsN1), [sP] "+r" (sP), [sN] "+r" (sN) : [lerpP] "r" (lerpP), [vLR] "r" (volumeLR) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" ); #endif } template <> inline void Process<2, 16>(int32_t* const out, int count, const int16_t* coefsP, const int16_t* coefsN, const int16_t* coefsP1, const int16_t* coefsN1, const int16_t* sP, const int16_t* sN, uint32_t lerpP, const int32_t* const volumeLR) { #ifdef USE_INTRINSIC ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR, lerpP, coefsP1, coefsN1); #else const int CHANNELS = 2; // template specialization does not preserve params const int STRIDE = 16; sP -= CHANNELS*((STRIDE>>1)-1); asm ( "vmov.32 d2[0], %[lerpP] \n"// load the positive phase "veor q0, q0, q0 \n"// (1) acc_L = 0 "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0 "1: \n" "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left "vrev64.16 q3, q3 \n"// (1) reverse 8 samples of positive right "vadd.s16 q8, q8, q9 \n"// (1+1d) interpolate (step3) 1st set "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set "vmlal.s16 q0, d4, d17 \n"// (1) multiply reversed samples left "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples left "vmlal.s16 q4, d6, d17 \n"// (1) multiply reversed samples right "vmlal.s16 q4, d7, d16 \n"// (1) multiply reversed samples right "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right // moving these ARM before neon seems to be slower "subs %[count], %[count], #8 \n"// (1) update loop counter "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples // sP used after branch (warning) "bne 1b \n"// loop ASSEMBLY_ACCUMULATE_STEREO : [out] "=Uv" (out[0]), [count] "+r" (count), [coefsP0] "+r" (coefsP), [coefsN0] "+r" (coefsN), [coefsP1] "+r" (coefsP1), [coefsN1] "+r" (coefsN1), [sP] "+r" (sP), [sN] "+r" (sN) : [lerpP] "r" (lerpP), [vLR] "r" (volumeLR) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q8", "q9", "q10", "q11" ); #endif } template <> inline void ProcessL<1, 16>(int32_t* const out, int count, const int32_t* coefsP, const int32_t* coefsN, const int16_t* sP, const int16_t* sN, const int32_t* const volumeLR) { #ifdef USE_INTRINSIC ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR, 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/); #else const int CHANNELS = 1; // template specialization does not preserve params const int STRIDE = 16; sP -= CHANNELS*((STRIDE>>1)-1); asm ( "veor q0, q0, q0 \n"// result, initialize to 0 "1: \n" "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples "vadd.s32 q0, q0, q12 \n"// accumulate result "vadd.s32 q13, q13, q14 \n"// accumulate result "vadd.s32 q0, q0, q15 \n"// accumulate result "vadd.s32 q0, q0, q13 \n"// accumulate result "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples "subs %[count], %[count], #8 \n"// update loop counter "bne 1b \n"// loop ASSEMBLY_ACCUMULATE_MONO : [out] "=Uv" (out[0]), [count] "+r" (count), [coefsP0] "+r" (coefsP), [coefsN0] "+r" (coefsN), [sP] "+r" (sP), [sN] "+r" (sN) : [vLR] "r" (volumeLR) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); #endif } template <> inline void ProcessL<2, 16>(int32_t* const out, int count, const int32_t* coefsP, const int32_t* coefsN, const int16_t* sP, const int16_t* sN, const int32_t* const volumeLR) { #ifdef USE_INTRINSIC ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR, 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/); #else const int CHANNELS = 2; // template specialization does not preserve params const int STRIDE = 16; sP -= CHANNELS*((STRIDE>>1)-1); asm ( "veor q0, q0, q0 \n"// result, initialize to 0 "veor q4, q4, q4 \n"// result, initialize to 0 "1: \n" "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef "vadd.s32 q0, q0, q12 \n"// accumulate result "vadd.s32 q13, q13, q14 \n"// accumulate result "vadd.s32 q0, q0, q15 \n"// accumulate result "vadd.s32 q0, q0, q13 \n"// accumulate result "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef "vadd.s32 q4, q4, q12 \n"// accumulate result "vadd.s32 q13, q13, q14 \n"// accumulate result "vadd.s32 q4, q4, q15 \n"// accumulate result "vadd.s32 q4, q4, q13 \n"// accumulate result "subs %[count], %[count], #8 \n"// update loop counter "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples "bne 1b \n"// loop ASSEMBLY_ACCUMULATE_STEREO : [out] "=Uv" (out[0]), [count] "+r" (count), [coefsP0] "+r" (coefsP), [coefsN0] "+r" (coefsN), [sP] "+r" (sP), [sN] "+r" (sN) : [vLR] "r" (volumeLR) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); #endif } template <> inline void Process<1, 16>(int32_t* const out, int count, const int32_t* coefsP, const int32_t* coefsN, const int32_t* coefsP1, const int32_t* coefsN1, const int16_t* sP, const int16_t* sN, uint32_t lerpP, const int32_t* const volumeLR) { #ifdef USE_INTRINSIC ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR, lerpP, coefsP1, coefsN1); #else const int CHANNELS = 1; // template specialization does not preserve params const int STRIDE = 16; sP -= CHANNELS*((STRIDE>>1)-1); asm ( "vmov.32 d2[0], %[lerpP] \n"// load the positive phase "veor q0, q0, q0 \n"// result, initialize to 0 "1: \n" "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs "vsub.s32 q12, q12, q8 \n"// interpolate (step1) "vsub.s32 q13, q13, q9 \n"// interpolate (step1) "vsub.s32 q14, q14, q10 \n"// interpolate (step1) "vsub.s32 q15, q15, q11 \n"// interpolate (step1) "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2) "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2) "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2) "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2) "vadd.s32 q8, q8, q12 \n"// interpolate (step3) "vadd.s32 q9, q9, q13 \n"// interpolate (step3) "vadd.s32 q10, q10, q14 \n"// interpolate (step3) "vadd.s32 q11, q11, q15 \n"// interpolate (step3) "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef "vadd.s32 q0, q0, q12 \n"// accumulate result "vadd.s32 q13, q13, q14 \n"// accumulate result "vadd.s32 q0, q0, q15 \n"// accumulate result "vadd.s32 q0, q0, q13 \n"// accumulate result "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples "subs %[count], %[count], #8 \n"// update loop counter "bne 1b \n"// loop ASSEMBLY_ACCUMULATE_MONO : [out] "=Uv" (out[0]), [count] "+r" (count), [coefsP0] "+r" (coefsP), [coefsN0] "+r" (coefsN), [coefsP1] "+r" (coefsP1), [coefsN1] "+r" (coefsN1), [sP] "+r" (sP), [sN] "+r" (sN) : [lerpP] "r" (lerpP), [vLR] "r" (volumeLR) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); #endif } template <> inline void Process<2, 16>(int32_t* const out, int count, const int32_t* coefsP, const int32_t* coefsN, const int32_t* coefsP1, const int32_t* coefsN1, const int16_t* sP, const int16_t* sN, uint32_t lerpP, const int32_t* const volumeLR) { #ifdef USE_INTRINSIC ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR, lerpP, coefsP1, coefsN1); #else const int CHANNELS = 2; // template specialization does not preserve params const int STRIDE = 16; sP -= CHANNELS*((STRIDE>>1)-1); asm ( "vmov.32 d2[0], %[lerpP] \n"// load the positive phase "veor q0, q0, q0 \n"// result, initialize to 0 "veor q4, q4, q4 \n"// result, initialize to 0 "1: \n" "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs "vsub.s32 q12, q12, q8 \n"// interpolate (step1) "vsub.s32 q13, q13, q9 \n"// interpolate (step1) "vsub.s32 q14, q14, q10 \n"// interpolate (step1) "vsub.s32 q15, q15, q11 \n"// interpolate (step1) "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2) "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2) "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2) "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2) "vadd.s32 q8, q8, q12 \n"// interpolate (step3) "vadd.s32 q9, q9, q13 \n"// interpolate (step3) "vadd.s32 q10, q10, q14 \n"// interpolate (step3) "vadd.s32 q11, q11, q15 \n"// interpolate (step3) "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef "vadd.s32 q0, q0, q12 \n"// accumulate result "vadd.s32 q13, q13, q14 \n"// accumulate result "vadd.s32 q0, q0, q15 \n"// accumulate result "vadd.s32 q0, q0, q13 \n"// accumulate result "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef "vadd.s32 q4, q4, q12 \n"// accumulate result "vadd.s32 q13, q13, q14 \n"// accumulate result "vadd.s32 q4, q4, q15 \n"// accumulate result "vadd.s32 q4, q4, q13 \n"// accumulate result "subs %[count], %[count], #8 \n"// update loop counter "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples "bne 1b \n"// loop ASSEMBLY_ACCUMULATE_STEREO : [out] "=Uv" (out[0]), [count] "+r" (count), [coefsP0] "+r" (coefsP), [coefsN0] "+r" (coefsN), [coefsP1] "+r" (coefsP1), [coefsN1] "+r" (coefsN1), [sP] "+r" (sP), [sN] "+r" (sN) : [lerpP] "r" (lerpP), [vLR] "r" (volumeLR) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); #endif } template<> inline void ProcessL<1, 16>(float* const out, int count, const float* coefsP, const float* coefsN, const float* sP, const float* sN, const float* const volumeLR) { ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR, 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/); } template<> inline void ProcessL<2, 16>(float* const out, int count, const float* coefsP, const float* coefsN, const float* sP, const float* sN, const float* const volumeLR) { ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR, 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/); } template<> inline void Process<1, 16>(float* const out, int count, const float* coefsP, const float* coefsN, const float* coefsP1, const float* coefsN1, const float* sP, const float* sN, float lerpP, const float* const volumeLR) { ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR, lerpP, coefsP1, coefsN1); } template<> inline void Process<2, 16>(float* const out, int count, const float* coefsP, const float* coefsN, const float* coefsP1, const float* coefsN1, const float* sP, const float* sN, float lerpP, const float* const volumeLR) { ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR, lerpP, coefsP1, coefsN1); } #endif //USE_NEON } // namespace android #endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/