diff options
Diffstat (limited to 'services')
-rw-r--r-- | services/audioflinger/AudioResamplerFirProcessNeon.h | 511 | ||||
-rw-r--r-- | services/audioflinger/FastCapture.cpp | 6 | ||||
-rw-r--r-- | services/audioflinger/FastMixer.cpp | 6 | ||||
-rw-r--r-- | services/audioflinger/FastThread.cpp | 20 | ||||
-rw-r--r-- | services/audioflinger/FastThread.h | 6 | ||||
-rw-r--r-- | services/audioflinger/Threads.cpp | 31 | ||||
-rw-r--r-- | services/audioflinger/Tracks.cpp | 23 |
7 files changed, 41 insertions, 562 deletions
diff --git a/services/audioflinger/AudioResamplerFirProcessNeon.h b/services/audioflinger/AudioResamplerFirProcessNeon.h index f311cef..d4fa7ad 100644 --- a/services/audioflinger/AudioResamplerFirProcessNeon.h +++ b/services/audioflinger/AudioResamplerFirProcessNeon.h @@ -24,10 +24,6 @@ namespace android { #if USE_NEON // // NEON specializations are enabled for Process() and ProcessL() -// -// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary) -// and looping stride 16 (or vice versa). This has some polyphase coef data alignment -// issues with S16 coefs. Consider this later. // Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out. #define ASSEMBLY_ACCUMULATE_MONO \ @@ -635,513 +631,6 @@ inline void Process<2, 16>(int32_t* const out, ); } -template <> -inline void ProcessL<1, 8>(int32_t* const out, - int count, - const int16_t* coefsP, - const int16_t* coefsN, - const int16_t* sP, - const int16_t* sN, - const int32_t* const volumeLR) -{ - const int CHANNELS = 1; // template specialization does not preserve params - const int STRIDE = 8; - sP -= CHANNELS*((STRIDE>>1)-1); - asm ( - "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 - - "1: \n" - - "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples - "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples - "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs - "vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs - - "vrev64.16 d4, d4 \n"// (1) reversed s3, s2, s1, s0, s7, s6, s5, s4 - - // reordering the vmal to do d6, d7 before d4, d5 is slower(?) - "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed)samples by coef - "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples - - // moving these ARM instructions before neon above seems to be slower - "subs %[count], %[count], #4 \n"// (1) update loop counter - "sub %[sP], %[sP], #8 \n"// (0) move pointer to next set of samples - - // sP used after branch (warning) - "bne 1b \n"// loop - - ASSEMBLY_ACCUMULATE_MONO - - : [out] "=Uv" (out[0]), - [count] "+r" (count), - [coefsP0] "+r" (coefsP), - [coefsN0] "+r" (coefsN), - [sP] "+r" (sP), - [sN] "+r" (sN) - : [vLR] "r" (volumeLR) - : "cc", "memory", - "q0", "q1", "q2", "q3", - "q8", "q10" - ); -} - -template <> -inline void ProcessL<2, 8>(int32_t* const out, - int count, - const int16_t* coefsP, - const int16_t* coefsN, - const int16_t* sP, - const int16_t* sN, - const int32_t* const volumeLR) -{ - const int CHANNELS = 2; // template specialization does not preserve params - const int STRIDE = 8; - sP -= CHANNELS*((STRIDE>>1)-1); - asm ( - "veor q0, q0, q0 \n"// (1) acc_L = 0 - "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0 - - "1: \n" - - "vld2.16 {d4, d5}, [%[sP]] \n"// (2+0d) load 8 16-bits stereo samples - "vld2.16 {d6, d7}, [%[sN]]! \n"// (2) load 8 16-bits stereo samples - "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs - "vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 8 16-bits coefs - - "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive - - "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left - "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right - "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left - "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right - - // moving these ARM before neon seems to be slower - "subs %[count], %[count], #4 \n"// (1) update loop counter - "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples - - // sP used after branch (warning) - "bne 1b \n"// loop - - ASSEMBLY_ACCUMULATE_STEREO - - : [out] "=Uv" (out[0]), - [count] "+r" (count), - [coefsP0] "+r" (coefsP), - [coefsN0] "+r" (coefsN), - [sP] "+r" (sP), - [sN] "+r" (sN) - : [vLR] "r" (volumeLR) - : "cc", "memory", - "q0", "q1", "q2", "q3", - "q4", "q5", "q6", - "q8", "q10" - ); -} - -template <> -inline void Process<1, 8>(int32_t* const out, - int count, - const int16_t* coefsP, - const int16_t* coefsN, - const int16_t* coefsP1, - const int16_t* coefsN1, - const int16_t* sP, - const int16_t* sN, - uint32_t lerpP, - const int32_t* const volumeLR) -{ - const int CHANNELS = 1; // template specialization does not preserve params - const int STRIDE = 8; - sP -= CHANNELS*((STRIDE>>1)-1); - asm ( - "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15 - "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 - - "1: \n" - - "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples - "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples - "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs - "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 4 16-bits coefs for interpolation - "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 4 16-bits coefs - "vld1.16 {d21}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs for interpolation - - "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs - "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets - - "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs - "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs - - "vrev64.16 d4, d4 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 - - "vadd.s16 d16, d16, d17 \n"// (1+2d) interpolate (step3) 1st set - "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set - - // reordering the vmal to do d6, d7 before d4, d5 is slower(?) - "vmlal.s16 q0, d4, d16 \n"// (1+0d) multiply (reversed)by coef - "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples - - // moving these ARM instructions before neon above seems to be slower - "subs %[count], %[count], #4 \n"// (1) update loop counter - "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples - - // sP used after branch (warning) - "bne 1b \n"// loop - - ASSEMBLY_ACCUMULATE_MONO - - : [out] "=Uv" (out[0]), - [count] "+r" (count), - [coefsP0] "+r" (coefsP), - [coefsN0] "+r" (coefsN), - [coefsP1] "+r" (coefsP1), - [coefsN1] "+r" (coefsN1), - [sP] "+r" (sP), - [sN] "+r" (sN) - : [lerpP] "r" (lerpP), - [vLR] "r" (volumeLR) - : "cc", "memory", - "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11" - ); -} - -template <> -inline void Process<2, 8>(int32_t* const out, - int count, - const int16_t* coefsP, - const int16_t* coefsN, - const int16_t* coefsP1, - const int16_t* coefsN1, - const int16_t* sP, - const int16_t* sN, - uint32_t lerpP, - const int32_t* const volumeLR) -{ - const int CHANNELS = 2; // template specialization does not preserve params - const int STRIDE = 8; - sP -= CHANNELS*((STRIDE>>1)-1); - asm ( - "vmov.32 d2[0], %[lerpP] \n"// load the positive phase - "veor q0, q0, q0 \n"// (1) acc_L = 0 - "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0 - - "1: \n" - - "vld2.16 {d4, d5}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples - "vld2.16 {d6, d7}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples - "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs - "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 8 16-bits coefs for interpolation - "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 8 16-bits coefs - "vld1.16 {d21}, [%[coefsN0]:64]! \n"// (1) load 8 16-bits coefs for interpolation - - "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs - "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets - - "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs - "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs - - "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive - - "vadd.s16 d16, d16, d17 \n"// (1+1d) interpolate (step3) 1st set - "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set - - "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left - "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right - "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left - "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right - - // moving these ARM before neon seems to be slower - "subs %[count], %[count], #4 \n"// (1) update loop counter - "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples - - // sP used after branch (warning) - "bne 1b \n"// loop - - ASSEMBLY_ACCUMULATE_STEREO - - : [out] "=Uv" (out[0]), - [count] "+r" (count), - [coefsP0] "+r" (coefsP), - [coefsN0] "+r" (coefsN), - [coefsP1] "+r" (coefsP1), - [coefsN1] "+r" (coefsN1), - [sP] "+r" (sP), - [sN] "+r" (sN) - : [lerpP] "r" (lerpP), - [vLR] "r" (volumeLR) - : "cc", "memory", - "q0", "q1", "q2", "q3", - "q4", "q5", "q6", - "q8", "q9", "q10", "q11" - ); -} - -template <> -inline void ProcessL<1, 8>(int32_t* const out, - int count, - const int32_t* coefsP, - const int32_t* coefsN, - const int16_t* sP, - const int16_t* sN, - const int32_t* const volumeLR) -{ - const int CHANNELS = 1; // template specialization does not preserve params - const int STRIDE = 8; - sP -= CHANNELS*((STRIDE>>1)-1); - asm ( - "veor q0, q0, q0 \n"// result, initialize to 0 - - "1: \n" - - "vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples - "vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples - "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs - "vld1.32 {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs - - "vrev64.16 d4, d4 \n"// reverse 2 frames of the positive side - - "vshll.s16 q12, d4, #15 \n"// (stall) extend samples to 31 bits - "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits - - "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef - "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef - - "vadd.s32 q0, q0, q12 \n"// accumulate result - "vadd.s32 q0, q0, q14 \n"// (stall) accumulate result - - "subs %[count], %[count], #4 \n"// update loop counter - "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples - - "bne 1b \n"// loop - - ASSEMBLY_ACCUMULATE_MONO - - : [out] "=Uv" (out[0]), - [count] "+r" (count), - [coefsP0] "+r" (coefsP), - [coefsN0] "+r" (coefsN), - [sP] "+r" (sP), - [sN] "+r" (sN) - : [vLR] "r" (volumeLR) - : "cc", "memory", - "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", - "q12", "q14" - ); -} - -template <> -inline void ProcessL<2, 8>(int32_t* const out, - int count, - const int32_t* coefsP, - const int32_t* coefsN, - const int16_t* sP, - const int16_t* sN, - const int32_t* const volumeLR) -{ - const int CHANNELS = 2; // template specialization does not preserve params - const int STRIDE = 8; - sP -= CHANNELS*((STRIDE>>1)-1); - asm ( - "veor q0, q0, q0 \n"// result, initialize to 0 - "veor q4, q4, q4 \n"// result, initialize to 0 - - "1: \n" - - "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples - "vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 16-bits stereo samples - "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs - "vld1.32 {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs - - "vrev64.16 q2, q2 \n"// reverse 2 frames of the positive side - - "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits - "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits - - "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits - "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits - - "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by coef - "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef - "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef - "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by coef - - "vadd.s32 q0, q0, q12 \n"// accumulate result - "vadd.s32 q4, q4, q13 \n"// accumulate result - "vadd.s32 q0, q0, q14 \n"// accumulate result - "vadd.s32 q4, q4, q15 \n"// accumulate result - - "subs %[count], %[count], #4 \n"// update loop counter - "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples - - "bne 1b \n"// loop - - ASSEMBLY_ACCUMULATE_STEREO - - : [out] "=Uv" (out[0]), - [count] "+r" (count), - [coefsP0] "+r" (coefsP), - [coefsN0] "+r" (coefsN), - [sP] "+r" (sP), - [sN] "+r" (sN) - : [vLR] "r" (volumeLR) - : "cc", "memory", - "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15" - ); -} - -template <> -inline void Process<1, 8>(int32_t* const out, - int count, - const int32_t* coefsP, - const int32_t* coefsN, - const int32_t* coefsP1, - const int32_t* coefsN1, - const int16_t* sP, - const int16_t* sN, - uint32_t lerpP, - const int32_t* const volumeLR) -{ - const int CHANNELS = 1; // template specialization does not preserve params - const int STRIDE = 8; - sP -= CHANNELS*((STRIDE>>1)-1); - asm ( - "vmov.32 d2[0], %[lerpP] \n"// load the positive phase - "veor q0, q0, q0 \n"// result, initialize to 0 - - "1: \n" - - "vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples - "vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples - "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs - "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation - "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs - "vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation - - "vrev64.16 d4, d4 \n"// reverse 2 frames of the positive side - - "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs - "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets - "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits - - "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs - "vqrdmulh.s32 q11, q11, d2[0] \n"// interpolate (step2) 2nd set of coefs - "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits - - "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set - "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set - - "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef - "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef - - "vadd.s32 q0, q0, q12 \n"// accumulate result - "vadd.s32 q0, q0, q14 \n"// accumulate result - - "subs %[count], %[count], #4 \n"// update loop counter - "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples - - "bne 1b \n"// loop - - ASSEMBLY_ACCUMULATE_MONO - - : [out] "=Uv" (out[0]), - [count] "+r" (count), - [coefsP0] "+r" (coefsP), - [coefsP1] "+r" (coefsP1), - [coefsN0] "+r" (coefsN), - [coefsN1] "+r" (coefsN1), - [sP] "+r" (sP), - [sN] "+r" (sN) - : [lerpP] "r" (lerpP), - [vLR] "r" (volumeLR) - : "cc", "memory", - "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", - "q12", "q14" - ); -} - -template <> -inline -void Process<2, 8>(int32_t* const out, - int count, - const int32_t* coefsP, - const int32_t* coefsN, - const int32_t* coefsP1, - const int32_t* coefsN1, - const int16_t* sP, - const int16_t* sN, - uint32_t lerpP, - const int32_t* const volumeLR) -{ - const int CHANNELS = 2; // template specialization does not preserve params - const int STRIDE = 8; - sP -= CHANNELS*((STRIDE>>1)-1); - asm ( - "vmov.32 d2[0], %[lerpP] \n"// load the positive phase - "veor q0, q0, q0 \n"// result, initialize to 0 - "veor q4, q4, q4 \n"// result, initialize to 0 - - "1: \n" - "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples - "vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 16-bits stereo samples - "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs - "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation - "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs - "vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation - - "vrev64.16 q2, q2 \n"// (reversed) 2 frames of the positive side - - "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs - "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets - "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits - "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits - - "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs - "vqrdmulh.s32 q11, q11, d2[1] \n"// interpolate (step3) 2nd set of coefs - "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits - "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits - - "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set - "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set - - "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef - "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef - "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef - "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by interpolated coef - - "vadd.s32 q0, q0, q12 \n"// accumulate result - "vadd.s32 q4, q4, q13 \n"// accumulate result - "vadd.s32 q0, q0, q14 \n"// accumulate result - "vadd.s32 q4, q4, q15 \n"// accumulate result - - "subs %[count], %[count], #4 \n"// update loop counter - "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples - - "bne 1b \n"// loop - - ASSEMBLY_ACCUMULATE_STEREO - - : [out] "=Uv" (out[0]), - [count] "+r" (count), - [coefsP0] "+r" (coefsP), - [coefsP1] "+r" (coefsP1), - [coefsN0] "+r" (coefsN), - [coefsN1] "+r" (coefsN1), - [sP] "+r" (sP), - [sN] "+r" (sN) - : [lerpP] "r" (lerpP), - [vLR] "r" (volumeLR) - : "cc", "memory", - "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15" - ); -} - #endif //USE_NEON }; // namespace android diff --git a/services/audioflinger/FastCapture.cpp b/services/audioflinger/FastCapture.cpp index 1c4f670..255496e 100644 --- a/services/audioflinger/FastCapture.cpp +++ b/services/audioflinger/FastCapture.cpp @@ -138,13 +138,15 @@ void FastCapture::onStateChange() underrunNs = (frameCount * 1750000000LL) / sampleRate; // 1.75 overrunNs = (frameCount * 500000000LL) / sampleRate; // 0.50 forceNs = (frameCount * 950000000LL) / sampleRate; // 0.95 - warmupNs = (frameCount * 500000000LL) / sampleRate; // 0.50 + warmupNsMin = (frameCount * 750000000LL) / sampleRate; // 0.75 + warmupNsMax = (frameCount * 1250000000LL) / sampleRate; // 1.25 } else { periodNs = 0; underrunNs = 0; overrunNs = 0; forceNs = 0; - warmupNs = 0; + warmupNsMin = 0; + warmupNsMax = LONG_MAX; } readBufferState = -1; dumpState->mFrameCount = frameCount; diff --git a/services/audioflinger/FastMixer.cpp b/services/audioflinger/FastMixer.cpp index 67e2e6e..8b12f28 100644 --- a/services/audioflinger/FastMixer.cpp +++ b/services/audioflinger/FastMixer.cpp @@ -195,13 +195,15 @@ void FastMixer::onStateChange() underrunNs = (frameCount * 1750000000LL) / sampleRate; // 1.75 overrunNs = (frameCount * 500000000LL) / sampleRate; // 0.50 forceNs = (frameCount * 950000000LL) / sampleRate; // 0.95 - warmupNs = (frameCount * 500000000LL) / sampleRate; // 0.50 + warmupNsMin = (frameCount * 750000000LL) / sampleRate; // 0.75 + warmupNsMax = (frameCount * 1250000000LL) / sampleRate; // 1.25 } else { periodNs = 0; underrunNs = 0; overrunNs = 0; forceNs = 0; - warmupNs = 0; + warmupNsMin = 0; + warmupNsMax = LONG_MAX; } mMixerBufferState = UNDEFINED; #if !LOG_NDEBUG diff --git a/services/audioflinger/FastThread.cpp b/services/audioflinger/FastThread.cpp index 3e12cca..b69cc85 100644 --- a/services/audioflinger/FastThread.cpp +++ b/services/audioflinger/FastThread.cpp @@ -29,7 +29,8 @@ #define FAST_DEFAULT_NS 999999999L // ~1 sec: default time to sleep #define FAST_HOT_IDLE_NS 1000000L // 1 ms: time to sleep while hot idling -#define MIN_WARMUP_CYCLES 2 // minimum number of loop cycles to wait for warmup +#define MIN_WARMUP_CYCLES 2 // minimum number of consecutive in-range loop cycles + // to wait for warmup #define MAX_WARMUP_CYCLES 10 // maximum number of loop cycles to wait for warmup namespace android { @@ -44,7 +45,8 @@ FastThread::FastThread() : Thread(false /*canCallJava*/), underrunNs(0), overrunNs(0), forceNs(0), - warmupNs(0), + warmupNsMin(0), + warmupNsMax(LONG_MAX), // re-initialized to &dummyDumpState by subclass constructor mDummyDumpState(NULL), dumpState(NULL), @@ -60,6 +62,7 @@ FastThread::FastThread() : Thread(false /*canCallJava*/), isWarm(false), /* measuredWarmupTs({0, 0}), */ warmupCycles(0), + warmupConsecutiveInRangeCycles(0), // dummyLogWriter logWriter(&dummyLogWriter), timestampStatus(INVALID_OPERATION), @@ -169,6 +172,7 @@ bool FastThread::threadLoop() measuredWarmupTs.tv_sec = 0; measuredWarmupTs.tv_nsec = 0; warmupCycles = 0; + warmupConsecutiveInRangeCycles = 0; sleepNs = -1; coldGen = current->mColdGen; #ifdef FAST_MIXER_STATISTICS @@ -222,7 +226,8 @@ bool FastThread::threadLoop() // To avoid an initial underrun on fast tracks after exiting standby, // do not start pulling data from tracks and mixing until warmup is complete. // Warmup is considered complete after the earlier of: - // MIN_WARMUP_CYCLES write() attempts and last one blocks for at least warmupNs + // MIN_WARMUP_CYCLES consecutive in-range write() attempts, + // where "in-range" means warmupNsMin <= cycle time <= warmupNsMax // MAX_WARMUP_CYCLES write() attempts. // This is overly conservative, but to get better accuracy requires a new HAL API. if (!isWarm && attemptedWrite) { @@ -233,7 +238,14 @@ bool FastThread::threadLoop() measuredWarmupTs.tv_nsec -= 1000000000; } ++warmupCycles; - if ((nsec > warmupNs && warmupCycles >= MIN_WARMUP_CYCLES) || + if (warmupNsMin <= nsec && nsec <= warmupNsMax) { + ALOGV("warmup cycle %d in range: %.03f ms", warmupCycles, nsec * 1e-9); + ++warmupConsecutiveInRangeCycles; + } else { + ALOGV("warmup cycle %d out of range: %.03f ms", warmupCycles, nsec * 1e-9); + warmupConsecutiveInRangeCycles = 0; + } + if ((warmupConsecutiveInRangeCycles >= MIN_WARMUP_CYCLES) || (warmupCycles >= MAX_WARMUP_CYCLES)) { isWarm = true; dumpState->mMeasuredWarmupTs = measuredWarmupTs; diff --git a/services/audioflinger/FastThread.h b/services/audioflinger/FastThread.h index 1330334..cb32e9d 100644 --- a/services/audioflinger/FastThread.h +++ b/services/audioflinger/FastThread.h @@ -58,7 +58,8 @@ protected: long underrunNs; // underrun likely when write cycle is greater than this value long overrunNs; // overrun likely when write cycle is less than this value long forceNs; // if overrun detected, force the write cycle to take this much time - long warmupNs; // warmup complete when write cycle is greater than to this value + long warmupNsMin; // warmup complete when write cycle is greater than or equal to this value + long warmupNsMax; // and less than or equal to this value FastThreadDumpState *mDummyDumpState; FastThreadDumpState *dumpState; bool ignoreNextOverrun; // used to ignore initial overrun and first after an underrun @@ -74,7 +75,8 @@ protected: unsigned coldGen; // last observed mColdGen bool isWarm; // true means ready to mix, false means wait for warmup before mixing struct timespec measuredWarmupTs; // how long did it take for warmup to complete - uint32_t warmupCycles; // counter of number of loop cycles required to warmup + uint32_t warmupCycles; // counter of number of loop cycles during warmup phase + uint32_t warmupConsecutiveInRangeCycles; // number of consecutive cycles in range NBLog::Writer dummyLogWriter; NBLog::Writer *logWriter; status_t timestampStatus; diff --git a/services/audioflinger/Threads.cpp b/services/audioflinger/Threads.cpp index 384bd25..40ab0af 100644 --- a/services/audioflinger/Threads.cpp +++ b/services/audioflinger/Threads.cpp @@ -174,18 +174,6 @@ static int sFastTrackMultiplier = kFastTrackMultiplier; // and that all "fast" AudioRecord clients read from. In either case, the size can be small. static const size_t kRecordThreadReadOnlyHeapSize = 0x2000; -// Returns the source frames needed to resample to destination frames. This is not a precise -// value and depends on the resampler (and possibly how it handles rounding internally). -// If srcSampleRate and dstSampleRate are equal, then it returns destination frames, which -// may not be a true if the resampler is asynchronous. -static inline size_t sourceFramesNeeded( - uint32_t srcSampleRate, size_t dstFramesRequired, uint32_t dstSampleRate) { - // +1 for rounding - always do this even if matched ratio - // +1 for additional sample needed for interpolation - return srcSampleRate == dstSampleRate ? dstFramesRequired : - size_t((uint64_t)dstFramesRequired * srcSampleRate / dstSampleRate + 1 + 1); -} - // ---------------------------------------------------------------------------- static pthread_once_t sFastTrackMultiplierOnce = PTHREAD_ONCE_INIT; @@ -1497,20 +1485,25 @@ sp<AudioFlinger::PlaybackThread::Track> AudioFlinger::PlaybackThread::createTrac audio_is_linear_pcm(format), channelMask, sampleRate, mSampleRate, hasFastMixer(), tid, mFastTrackAvailMask); *flags &= ~IAudioFlinger::TRACK_FAST; - // For compatibility with AudioTrack calculation, buffer depth is forced - // to be at least 2 x the normal mixer frame count and cover audio hardware latency. - // This is probably too conservative, but legacy application code may depend on it. - // If you change this calculation, also review the start threshold which is related. + } + } + // For normal PCM streaming tracks, update minimum frame count. + // For compatibility with AudioTrack calculation, buffer depth is forced + // to be at least 2 x the normal mixer frame count and cover audio hardware latency. + // This is probably too conservative, but legacy application code may depend on it. + // If you change this calculation, also review the start threshold which is related. + if (!(*flags & IAudioFlinger::TRACK_FAST) + && audio_is_linear_pcm(format) && sharedBuffer == 0) { uint32_t latencyMs = mOutput->stream->get_latency(mOutput->stream); uint32_t minBufCount = latencyMs / ((1000 * mNormalFrameCount) / mSampleRate); if (minBufCount < 2) { minBufCount = 2; } - size_t minFrameCount = mNormalFrameCount * minBufCount; - if (frameCount < minFrameCount) { + size_t minFrameCount = + minBufCount * sourceFramesNeeded(sampleRate, mNormalFrameCount, mSampleRate); + if (frameCount < minFrameCount) { // including frameCount == 0 frameCount = minFrameCount; } - } } *pFrameCount = frameCount; diff --git a/services/audioflinger/Tracks.cpp b/services/audioflinger/Tracks.cpp index 78cec31..fa0beaa 100644 --- a/services/audioflinger/Tracks.cpp +++ b/services/audioflinger/Tracks.cpp @@ -1722,28 +1722,7 @@ bool AudioFlinger::PlaybackThread::OutputTrack::write(void* data, uint32_t frame uint32_t waitTimeLeftMs = mSourceThread->waitTimeMs(); if (!mActive && frames != 0) { - start(); - sp<ThreadBase> thread = mThread.promote(); - if (thread != 0) { - MixerThread *mixerThread = (MixerThread *)thread.get(); - if (mFrameCount > frames) { - // For the first write after being inactive, ensure that we have - // enough frames to fill mFrameCount (which should be multiples of - // the minimum buffer requirements of the downstream MixerThread). - // This provides enough frames for the downstream mixer to begin - // (see AudioFlinger::PlaybackThread::Track::isReady()). - if (mBufferQueue.size() < kMaxOverFlowBuffers) { - uint32_t startFrames = (mFrameCount - frames); - pInBuffer = new Buffer; - pInBuffer->mBuffer = calloc(1, startFrames * mFrameSize); - pInBuffer->frameCount = startFrames; - pInBuffer->raw = pInBuffer->mBuffer; - mBufferQueue.add(pInBuffer); - } else { - ALOGW("OutputTrack::write() %p no more buffers in queue", this); - } - } - } + (void) start(); } while (waitTimeLeftMs) { |