7 files changed, 41 insertions, 562 deletions
diff --git a/services/audioflinger/AudioResamplerFirProcessNeon.h b/services/audioflinger/AudioResamplerFirProcessNeon.h
index f311cef..d4fa7ad 100644
--- a/services/audioflinger/AudioResamplerFirProcessNeon.h
+++ b/services/audioflinger/AudioResamplerFirProcessNeon.h
@@ -24,10 +24,6 @@ namespace android {
 #if USE_NEON
 //
 // NEON specializations are enabled for Process() and ProcessL()
-//
-// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
-// and looping stride 16 (or vice versa). This has some polyphase coef data alignment
-// issues with S16 coefs. Consider this later.
 
 // Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
 #define ASSEMBLY_ACCUMULATE_MONO \
@@ -635,513 +631,6 @@ inline void Process<2, 16>(int32_t* const out,
     );
 }
 
-template <>
-inline void ProcessL<1, 8>(int32_t* const out,
-        int count,
-        const int16_t* coefsP,
-        const int16_t* coefsN,
-        const int16_t* sP,
-        const int16_t* sN,
-        const int32_t* const volumeLR)
-{
-    const int CHANNELS = 1; // template specialization does not preserve params
-    const int STRIDE = 8;
-    sP -= CHANNELS*((STRIDE>>1)-1);
-    asm (
-        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
-
-        "1:                                      \n"
-
-        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
-        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
-        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
-        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs
-
-        "vrev64.16      d4, d4                   \n"// (1) reversed s3, s2, s1, s0, s7, s6, s5, s4
-
-        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
-        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed)samples by coef
-        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
-
-        // moving these ARM instructions before neon above seems to be slower
-        "subs           %[count], %[count], #4   \n"// (1) update loop counter
-        "sub            %[sP], %[sP], #8         \n"// (0) move pointer to next set of samples
-
-        // sP used after branch (warning)
-        "bne            1b                       \n"// loop
-
-        ASSEMBLY_ACCUMULATE_MONO
-
-        : [out]     "=Uv" (out[0]),
-          [count]   "+r" (count),
-          [coefsP0] "+r" (coefsP),
-          [coefsN0] "+r" (coefsN),
-          [sP]      "+r" (sP),
-          [sN]      "+r" (sN)
-        : [vLR]     "r" (volumeLR)
-        : "cc", "memory",
-          "q0", "q1", "q2", "q3",
-          "q8", "q10"
-    );
-}
-
-template <>
-inline void ProcessL<2, 8>(int32_t* const out,
-        int count,
-        const int16_t* coefsP,
-        const int16_t* coefsN,
-        const int16_t* sP,
-        const int16_t* sN,
-        const int32_t* const volumeLR)
-{
-    const int CHANNELS = 2; // template specialization does not preserve params
-    const int STRIDE = 8;
-    sP -= CHANNELS*((STRIDE>>1)-1);
-    asm (
-        "veor           q0, q0, q0               \n"// (1) acc_L = 0
-        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
-
-        "1:                                      \n"
-
-        "vld2.16        {d4, d5}, [%[sP]]        \n"// (2+0d) load 8 16-bits stereo samples
-        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (2) load 8 16-bits stereo samples
-        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 8 16-bits coefs
-        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 8 16-bits coefs
-
-        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
-
-        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
-        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
-        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
-        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right
-
-        // moving these ARM before neon seems to be slower
-        "subs           %[count], %[count], #4   \n"// (1) update loop counter
-        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
-
-        // sP used after branch (warning)
-        "bne            1b                       \n"// loop
-
-        ASSEMBLY_ACCUMULATE_STEREO
-
-        : [out] "=Uv" (out[0]),
-          [count] "+r" (count),
-          [coefsP0] "+r" (coefsP),
-          [coefsN0] "+r" (coefsN),
-          [sP] "+r" (sP),
-          [sN] "+r" (sN)
-        : [vLR] "r" (volumeLR)
-        : "cc", "memory",
-          "q0", "q1", "q2", "q3",
-          "q4", "q5", "q6",
-          "q8", "q10"
-     );
-}
-
-template <>
-inline void Process<1, 8>(int32_t* const out,
-        int count,
-        const int16_t* coefsP,
-        const int16_t* coefsN,
-        const int16_t* coefsP1,
-        const int16_t* coefsN1,
-        const int16_t* sP,
-        const int16_t* sN,
-        uint32_t lerpP,
-        const int32_t* const volumeLR)
-{
-    const int CHANNELS = 1; // template specialization does not preserve params
-    const int STRIDE = 8;
-    sP -= CHANNELS*((STRIDE>>1)-1);
-    asm (
-        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
-        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
-
-        "1:                                      \n"
-
-        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
-        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
-        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
-        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
-        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs
-        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
-
-        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
-        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coets
-
-        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
-        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
-
-        "vrev64.16      d4, d4                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
-
-        "vadd.s16       d16, d16, d17            \n"// (1+2d) interpolate (step3) 1st set
-        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set
-
-        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
-        "vmlal.s16      q0, d4, d16              \n"// (1+0d) multiply (reversed)by coef
-        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
-
-        // moving these ARM instructions before neon above seems to be slower
-        "subs           %[count], %[count], #4   \n"// (1) update loop counter
-        "sub            %[sP], %[sP], #8        \n"// move pointer to next set of samples
-
-        // sP used after branch (warning)
-        "bne            1b                       \n"// loop
-
-        ASSEMBLY_ACCUMULATE_MONO
-
-        : [out]     "=Uv" (out[0]),
-          [count]   "+r" (count),
-          [coefsP0] "+r" (coefsP),
-          [coefsN0] "+r" (coefsN),
-          [coefsP1] "+r" (coefsP1),
-          [coefsN1] "+r" (coefsN1),
-          [sP]      "+r" (sP),
-          [sN]      "+r" (sN)
-        : [lerpP]   "r" (lerpP),
-          [vLR]     "r" (volumeLR)
-        : "cc", "memory",
-          "q0", "q1", "q2", "q3",
-          "q8", "q9", "q10", "q11"
-    );
-}
-
-template <>
-inline void Process<2, 8>(int32_t* const out,
-        int count,
-        const int16_t* coefsP,
-        const int16_t* coefsN,
-        const int16_t* coefsP1,
-        const int16_t* coefsN1,
-        const int16_t* sP,
-        const int16_t* sN,
-        uint32_t lerpP,
-        const int32_t* const volumeLR)
-{
-    const int CHANNELS = 2; // template specialization does not preserve params
-    const int STRIDE = 8;
-    sP -= CHANNELS*((STRIDE>>1)-1);
-    asm (
-        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
-        "veor           q0, q0, q0               \n"// (1) acc_L = 0
-        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
-
-        "1:                                      \n"
-
-        "vld2.16        {d4, d5}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
-        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
-        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 8 16-bits coefs
-        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 8 16-bits coefs for interpolation
-        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 8 16-bits coefs
-        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 8 16-bits coefs for interpolation
-
-        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
-        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coets
-
-        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
-        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
-
-        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
-
-        "vadd.s16       d16, d16, d17            \n"// (1+1d) interpolate (step3) 1st set
-        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set
-
-        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
-        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
-        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
-        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right
-
-        // moving these ARM before neon seems to be slower
-        "subs           %[count], %[count], #4   \n"// (1) update loop counter
-        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples
-
-        // sP used after branch (warning)
-        "bne            1b                       \n"// loop
-
-        ASSEMBLY_ACCUMULATE_STEREO
-
-        : [out] "=Uv" (out[0]),
-          [count] "+r" (count),
-          [coefsP0] "+r" (coefsP),
-          [coefsN0] "+r" (coefsN),
-          [coefsP1] "+r" (coefsP1),
-          [coefsN1] "+r" (coefsN1),
-          [sP] "+r" (sP),
-          [sN] "+r" (sN)
-        : [lerpP]   "r" (lerpP),
-          [vLR] "r" (volumeLR)
-        : "cc", "memory",
-          "q0", "q1", "q2", "q3",
-          "q4", "q5", "q6",
-          "q8", "q9", "q10", "q11"
-    );
-}
-
-template <>
-inline void ProcessL<1, 8>(int32_t* const out,
-        int count,
-        const int32_t* coefsP,
-        const int32_t* coefsN,
-        const int16_t* sP,
-        const int16_t* sN,
-        const int32_t* const volumeLR)
-{
-    const int CHANNELS = 1; // template specialization does not preserve params
-    const int STRIDE = 8;
-    sP -= CHANNELS*((STRIDE>>1)-1);
-    asm (
-        "veor           q0, q0, q0               \n"// result, initialize to 0
-
-        "1:                                      \n"
-
-        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
-        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
-        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
-        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
-
-        "vrev64.16      d4, d4                   \n"// reverse 2 frames of the positive side
-
-        "vshll.s16      q12, d4, #15             \n"// (stall) extend samples to 31 bits
-        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
-
-        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
-        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
-
-        "vadd.s32       q0, q0, q12              \n"// accumulate result
-        "vadd.s32       q0, q0, q14              \n"// (stall) accumulate result
-
-        "subs           %[count], %[count], #4   \n"// update loop counter
-        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples
-
-        "bne            1b                       \n"// loop
-
-        ASSEMBLY_ACCUMULATE_MONO
-
-        : [out] "=Uv" (out[0]),
-          [count] "+r" (count),
-          [coefsP0] "+r" (coefsP),
-          [coefsN0] "+r" (coefsN),
-          [sP] "+r" (sP),
-          [sN] "+r" (sN)
-        : [vLR] "r" (volumeLR)
-        : "cc", "memory",
-          "q0", "q1", "q2", "q3",
-          "q8", "q9", "q10", "q11",
-          "q12", "q14"
-    );
-}
-
-template <>
-inline void ProcessL<2, 8>(int32_t* const out,
-        int count,
-        const int32_t* coefsP,
-        const int32_t* coefsN,
-        const int16_t* sP,
-        const int16_t* sN,
-        const int32_t* const volumeLR)
-{
-    const int CHANNELS = 2; // template specialization does not preserve params
-    const int STRIDE = 8;
-    sP -= CHANNELS*((STRIDE>>1)-1);
-    asm (
-        "veor           q0, q0, q0               \n"// result, initialize to 0
-        "veor           q4, q4, q4               \n"// result, initialize to 0
-
-        "1:                                      \n"
-
-        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples
-        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples
-        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
-        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
-
-        "vrev64.16      q2, q2                   \n"// reverse 2 frames of the positive side
-
-        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
-        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits
-
-        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
-        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits
-
-        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
-        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by coef
-        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
-        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by coef
-
-        "vadd.s32       q0, q0, q12              \n"// accumulate result
-        "vadd.s32       q4, q4, q13              \n"// accumulate result
-        "vadd.s32       q0, q0, q14              \n"// accumulate result
-        "vadd.s32       q4, q4, q15              \n"// accumulate result
-
-        "subs           %[count], %[count], #4   \n"// update loop counter
-        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples
-
-        "bne            1b                       \n"// loop
-
-        ASSEMBLY_ACCUMULATE_STEREO
-
-        : [out]     "=Uv" (out[0]),
-          [count]   "+r" (count),
-          [coefsP0] "+r" (coefsP),
-          [coefsN0] "+r" (coefsN),
-          [sP]      "+r" (sP),
-          [sN]      "+r" (sN)
-        : [vLR]     "r" (volumeLR)
-        : "cc", "memory",
-          "q0", "q1", "q2", "q3", "q4",
-          "q8", "q9", "q10", "q11",
-          "q12", "q13", "q14", "q15"
-    );
-}
-
-template <>
-inline void Process<1, 8>(int32_t* const out,
-        int count,
-        const int32_t* coefsP,
-        const int32_t* coefsN,
-        const int32_t* coefsP1,
-        const int32_t* coefsN1,
-        const int16_t* sP,
-        const int16_t* sN,
-        uint32_t lerpP,
-        const int32_t* const volumeLR)
-{
-    const int CHANNELS = 1; // template specialization does not preserve params
-    const int STRIDE = 8;
-    sP -= CHANNELS*((STRIDE>>1)-1);
-    asm (
-        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
-        "veor           q0, q0, q0               \n"// result, initialize to 0
-
-        "1:                                      \n"
-
-        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
-        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
-        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
-        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
-        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
-        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
-
-        "vrev64.16      d4, d4                   \n"// reverse 2 frames of the positive side
-
-        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
-        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coets
-        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
-
-        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
-        "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set of coefs
-        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
-
-        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
-        "vadd.s32       q10, q10, q11            \n"// interpolate (step4) 2nd set
-
-        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
-        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
-
-        "vadd.s32       q0, q0, q12              \n"// accumulate result
-        "vadd.s32       q0, q0, q14              \n"// accumulate result
-
-        "subs           %[count], %[count], #4   \n"// update loop counter
-        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples
-
-        "bne            1b                       \n"// loop
-
-        ASSEMBLY_ACCUMULATE_MONO
-
-        : [out]     "=Uv" (out[0]),
-          [count]   "+r" (count),
-          [coefsP0] "+r" (coefsP),
-          [coefsP1] "+r" (coefsP1),
-          [coefsN0] "+r" (coefsN),
-          [coefsN1] "+r" (coefsN1),
-          [sP]      "+r" (sP),
-          [sN]      "+r" (sN)
-        : [lerpP]   "r" (lerpP),
-          [vLR]     "r" (volumeLR)
-        : "cc", "memory",
-          "q0", "q1", "q2", "q3",
-          "q8", "q9", "q10", "q11",
-          "q12", "q14"
-    );
-}
-
-template <>
-inline
-void Process<2, 8>(int32_t* const out,
-        int count,
-        const int32_t* coefsP,
-        const int32_t* coefsN,
-        const int32_t* coefsP1,
-        const int32_t* coefsN1,
-        const int16_t* sP,
-        const int16_t* sN,
-        uint32_t lerpP,
-        const int32_t* const volumeLR)
-{
-    const int CHANNELS = 2; // template specialization does not preserve params
-    const int STRIDE = 8;
-    sP -= CHANNELS*((STRIDE>>1)-1);
-    asm (
-        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
-        "veor           q0, q0, q0               \n"// result, initialize to 0
-        "veor           q4, q4, q4               \n"// result, initialize to 0
-
-        "1:                                      \n"
-        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples
-        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples
-        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
-        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
-        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
-        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
-
-        "vrev64.16      q2, q2                   \n"// (reversed) 2 frames of the positive side
-
-        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
-        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coets
-        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
-        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits
-
-        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
-        "vqrdmulh.s32   q11, q11, d2[1]          \n"// interpolate (step3) 2nd set of coefs
-        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
-        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits
-
-        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
-        "vadd.s32       q10, q10, q11            \n"// interpolate (step4) 2nd set
-
-        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
-        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by interpolated coef
-        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
-        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by interpolated coef
-
-        "vadd.s32       q0, q0, q12              \n"// accumulate result
-        "vadd.s32       q4, q4, q13              \n"// accumulate result
-        "vadd.s32       q0, q0, q14              \n"// accumulate result
-        "vadd.s32       q4, q4, q15              \n"// accumulate result
-
-        "subs           %[count], %[count], #4   \n"// update loop counter
-        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples
-
-        "bne            1b                       \n"// loop
-
-        ASSEMBLY_ACCUMULATE_STEREO
-
-        : [out]     "=Uv" (out[0]),
-          [count]   "+r" (count),
-          [coefsP0] "+r" (coefsP),
-          [coefsP1] "+r" (coefsP1),
-          [coefsN0] "+r" (coefsN),
-          [coefsN1] "+r" (coefsN1),
-          [sP]      "+r" (sP),
-          [sN]      "+r" (sN)
-        : [lerpP]   "r" (lerpP),
-          [vLR]     "r" (volumeLR)
-        : "cc", "memory",
-          "q0", "q1", "q2", "q3", "q4",
-          "q8", "q9", "q10", "q11",
-          "q12", "q13", "q14", "q15"
-    );
-}
-
 #endif //USE_NEON
 
 }; // namespace android
diff --git a/services/audioflinger/FastCapture.cpp b/services/audioflinger/FastCapture.cpp
index 1c4f670..255496e 100644
--- a/services/audioflinger/FastCapture.cpp
+++ b/services/audioflinger/FastCapture.cpp
@@ -138,13 +138,15 @@ void FastCapture::onStateChange()
             underrunNs = (frameCount * 1750000000LL) / sampleRate;  // 1.75
             overrunNs = (frameCount * 500000000LL) / sampleRate;    // 0.50
             forceNs = (frameCount * 950000000LL) / sampleRate;      // 0.95
-            warmupNs = (frameCount * 500000000LL) / sampleRate;     // 0.50
+            warmupNsMin = (frameCount * 750000000LL) / sampleRate;  // 0.75
+            warmupNsMax = (frameCount * 1250000000LL) / sampleRate; // 1.25
         } else {
             periodNs = 0;
             underrunNs = 0;
             overrunNs = 0;
             forceNs = 0;
-            warmupNs = 0;
+            warmupNsMin = 0;
+            warmupNsMax = LONG_MAX;
         }
         readBufferState = -1;
         dumpState->mFrameCount = frameCount;
diff --git a/services/audioflinger/FastMixer.cpp b/services/audioflinger/FastMixer.cpp
index 67e2e6e..8b12f28 100644
--- a/services/audioflinger/FastMixer.cpp
+++ b/services/audioflinger/FastMixer.cpp
@@ -195,13 +195,15 @@ void FastMixer::onStateChange()
             underrunNs = (frameCount * 1750000000LL) / sampleRate;  // 1.75
             overrunNs = (frameCount * 500000000LL) / sampleRate;    // 0.50
             forceNs = (frameCount * 950000000LL) / sampleRate;      // 0.95
-            warmupNs = (frameCount * 500000000LL) / sampleRate;     // 0.50
+            warmupNsMin = (frameCount * 750000000LL) / sampleRate;  // 0.75
+            warmupNsMax = (frameCount * 1250000000LL) / sampleRate; // 1.25
         } else {
             periodNs = 0;
             underrunNs = 0;
             overrunNs = 0;
             forceNs = 0;
-            warmupNs = 0;
+            warmupNsMin = 0;
+            warmupNsMax = LONG_MAX;
         }
         mMixerBufferState = UNDEFINED;
 #if !LOG_NDEBUG
diff --git a/services/audioflinger/FastThread.cpp b/services/audioflinger/FastThread.cpp
index 3e12cca..b69cc85 100644
--- a/services/audioflinger/FastThread.cpp
+++ b/services/audioflinger/FastThread.cpp
@@ -29,7 +29,8 @@
 
 #define FAST_DEFAULT_NS    999999999L   // ~1 sec: default time to sleep
 #define FAST_HOT_IDLE_NS     1000000L   // 1 ms: time to sleep while hot idling
-#define MIN_WARMUP_CYCLES          2    // minimum number of loop cycles to wait for warmup
+#define MIN_WARMUP_CYCLES          2    // minimum number of consecutive in-range loop cycles
+                                        // to wait for warmup
 #define MAX_WARMUP_CYCLES         10    // maximum number of loop cycles to wait for warmup
 
 namespace android {
@@ -44,7 +45,8 @@ FastThread::FastThread() : Thread(false /*canCallJava*/),
     underrunNs(0),
     overrunNs(0),
     forceNs(0),
-    warmupNs(0),
+    warmupNsMin(0),
+    warmupNsMax(LONG_MAX),
     // re-initialized to &dummyDumpState by subclass constructor
     mDummyDumpState(NULL),
     dumpState(NULL),
@@ -60,6 +62,7 @@ FastThread::FastThread() : Thread(false /*canCallJava*/),
     isWarm(false),
     /* measuredWarmupTs({0, 0}), */
     warmupCycles(0),
+    warmupConsecutiveInRangeCycles(0),
     // dummyLogWriter
     logWriter(&dummyLogWriter),
     timestampStatus(INVALID_OPERATION),
@@ -169,6 +172,7 @@ bool FastThread::threadLoop()
                 measuredWarmupTs.tv_sec = 0;
                 measuredWarmupTs.tv_nsec = 0;
                 warmupCycles = 0;
+                warmupConsecutiveInRangeCycles = 0;
                 sleepNs = -1;
                 coldGen = current->mColdGen;
 #ifdef FAST_MIXER_STATISTICS
@@ -222,7 +226,8 @@ bool FastThread::threadLoop()
                 // To avoid an initial underrun on fast tracks after exiting standby,
                 // do not start pulling data from tracks and mixing until warmup is complete.
                 // Warmup is considered complete after the earlier of:
-                //      MIN_WARMUP_CYCLES write() attempts and last one blocks for at least warmupNs
+                //      MIN_WARMUP_CYCLES consecutive in-range write() attempts,
+                //          where "in-range" means warmupNsMin <= cycle time <= warmupNsMax
                 //      MAX_WARMUP_CYCLES write() attempts.
                 // This is overly conservative, but to get better accuracy requires a new HAL API.
                 if (!isWarm && attemptedWrite) {
@@ -233,7 +238,14 @@ bool FastThread::threadLoop()
                         measuredWarmupTs.tv_nsec -= 1000000000;
                     }
                     ++warmupCycles;
-                    if ((nsec > warmupNs && warmupCycles >= MIN_WARMUP_CYCLES) ||
+                    if (warmupNsMin <= nsec && nsec <= warmupNsMax) {
+                        ALOGV("warmup cycle %d in range: %.03f ms", warmupCycles, nsec * 1e-9);
+                        ++warmupConsecutiveInRangeCycles;
+                    } else {
+                        ALOGV("warmup cycle %d out of range: %.03f ms", warmupCycles, nsec * 1e-9);
+                        warmupConsecutiveInRangeCycles = 0;
+                    }
+                    if ((warmupConsecutiveInRangeCycles >= MIN_WARMUP_CYCLES) ||
                             (warmupCycles >= MAX_WARMUP_CYCLES)) {
                         isWarm = true;
                         dumpState->mMeasuredWarmupTs = measuredWarmupTs;
diff --git a/services/audioflinger/FastThread.h b/services/audioflinger/FastThread.h
index 1330334..cb32e9d 100644
--- a/services/audioflinger/FastThread.h
+++ b/services/audioflinger/FastThread.h
@@ -58,7 +58,8 @@ protected:
     long underrunNs;    // underrun likely when write cycle is greater than this value
     long overrunNs;     // overrun likely when write cycle is less than this value
     long forceNs;       // if overrun detected, force the write cycle to take this much time
-    long warmupNs;      // warmup complete when write cycle is greater than to this value
+    long warmupNsMin;   // warmup complete when write cycle is greater than or equal to this value
+    long warmupNsMax;   //                                 and less than or equal to this value
     FastThreadDumpState *mDummyDumpState;
     FastThreadDumpState *dumpState;
     bool ignoreNextOverrun;  // used to ignore initial overrun and first after an underrun
@@ -74,7 +75,8 @@ protected:
     unsigned coldGen;   // last observed mColdGen
     bool isWarm;        // true means ready to mix, false means wait for warmup before mixing
     struct timespec measuredWarmupTs;  // how long did it take for warmup to complete
-    uint32_t warmupCycles;  // counter of number of loop cycles required to warmup
+    uint32_t warmupCycles;  // counter of number of loop cycles during warmup phase
+    uint32_t warmupConsecutiveInRangeCycles;    // number of consecutive cycles in range
     NBLog::Writer dummyLogWriter;
     NBLog::Writer *logWriter;
     status_t timestampStatus;
diff --git a/services/audioflinger/Threads.cpp b/services/audioflinger/Threads.cpp
index 384bd25..40ab0af 100644
--- a/services/audioflinger/Threads.cpp
+++ b/services/audioflinger/Threads.cpp
@@ -174,18 +174,6 @@ static int sFastTrackMultiplier = kFastTrackMultiplier;
 // and that all "fast" AudioRecord clients read from.  In either case, the size can be small.
 static const size_t kRecordThreadReadOnlyHeapSize = 0x2000;
 
-// Returns the source frames needed to resample to destination frames.  This is not a precise
-// value and depends on the resampler (and possibly how it handles rounding internally).
-// If srcSampleRate and dstSampleRate are equal, then it returns destination frames, which
-// may not be a true if the resampler is asynchronous.
-static inline size_t sourceFramesNeeded(
-        uint32_t srcSampleRate, size_t dstFramesRequired, uint32_t dstSampleRate) {
-    // +1 for rounding - always do this even if matched ratio
-    // +1 for additional sample needed for interpolation
-    return srcSampleRate == dstSampleRate ? dstFramesRequired :
-            size_t((uint64_t)dstFramesRequired * srcSampleRate / dstSampleRate + 1 + 1);
-}
-
 // ----------------------------------------------------------------------------
 
 static pthread_once_t sFastTrackMultiplierOnce = PTHREAD_ONCE_INIT;
@@ -1497,20 +1485,25 @@ sp<AudioFlinger::PlaybackThread::Track> AudioFlinger::PlaybackThread::createTrac
                 audio_is_linear_pcm(format),
                 channelMask, sampleRate, mSampleRate, hasFastMixer(), tid, mFastTrackAvailMask);
         *flags &= ~IAudioFlinger::TRACK_FAST;
-        // For compatibility with AudioTrack calculation, buffer depth is forced
-        // to be at least 2 x the normal mixer frame count and cover audio hardware latency.
-        // This is probably too conservative, but legacy application code may depend on it.
-        // If you change this calculation, also review the start threshold which is related.
+      }
+    }
+    // For normal PCM streaming tracks, update minimum frame count.
+    // For compatibility with AudioTrack calculation, buffer depth is forced
+    // to be at least 2 x the normal mixer frame count and cover audio hardware latency.
+    // This is probably too conservative, but legacy application code may depend on it.
+    // If you change this calculation, also review the start threshold which is related.
+    if (!(*flags & IAudioFlinger::TRACK_FAST)
+            && audio_is_linear_pcm(format) && sharedBuffer == 0) {
         uint32_t latencyMs = mOutput->stream->get_latency(mOutput->stream);
         uint32_t minBufCount = latencyMs / ((1000 * mNormalFrameCount) / mSampleRate);
         if (minBufCount < 2) {
             minBufCount = 2;
         }
-        size_t minFrameCount = mNormalFrameCount * minBufCount;
-        if (frameCount < minFrameCount) {
+        size_t minFrameCount =
+                minBufCount * sourceFramesNeeded(sampleRate, mNormalFrameCount, mSampleRate);
+        if (frameCount < minFrameCount) { // including frameCount == 0
             frameCount = minFrameCount;
         }
-      }
     }
     *pFrameCount = frameCount;
 
diff --git a/services/audioflinger/Tracks.cpp b/services/audioflinger/Tracks.cpp
index 78cec31..fa0beaa 100644
--- a/services/audioflinger/Tracks.cpp
+++ b/services/audioflinger/Tracks.cpp
@@ -1722,28 +1722,7 @@ bool AudioFlinger::PlaybackThread::OutputTrack::write(void* data, uint32_t frame
     uint32_t waitTimeLeftMs = mSourceThread->waitTimeMs();
 
     if (!mActive && frames != 0) {
-        start();
-        sp<ThreadBase> thread = mThread.promote();
-        if (thread != 0) {
-            MixerThread *mixerThread = (MixerThread *)thread.get();
-            if (mFrameCount > frames) {
-                // For the first write after being inactive, ensure that we have
-                // enough frames to fill mFrameCount (which should be multiples of
-                // the minimum buffer requirements of the downstream MixerThread).
-                // This provides enough frames for the downstream mixer to begin
-                // (see AudioFlinger::PlaybackThread::Track::isReady()).
-                if (mBufferQueue.size() < kMaxOverFlowBuffers) {
-                    uint32_t startFrames = (mFrameCount - frames);
-                    pInBuffer = new Buffer;
-                    pInBuffer->mBuffer = calloc(1, startFrames * mFrameSize);
-                    pInBuffer->frameCount = startFrames;
-                    pInBuffer->raw = pInBuffer->mBuffer;
-                    mBufferQueue.add(pInBuffer);
-                } else {
-                    ALOGW("OutputTrack::write() %p no more buffers in queue", this);
-                }
-            }
-        }
+        (void) start();
     }
 
     while (waitTimeLeftMs) {