summaryrefslogtreecommitdiffstats
path: root/services
diff options
context:
space:
mode:
Diffstat (limited to 'services')
-rw-r--r--services/audioflinger/AudioResamplerFirProcessNeon.h511
-rw-r--r--services/audioflinger/FastCapture.cpp6
-rw-r--r--services/audioflinger/FastMixer.cpp6
-rw-r--r--services/audioflinger/FastThread.cpp20
-rw-r--r--services/audioflinger/FastThread.h6
-rw-r--r--services/audioflinger/Threads.cpp31
-rw-r--r--services/audioflinger/Tracks.cpp23
7 files changed, 41 insertions, 562 deletions
diff --git a/services/audioflinger/AudioResamplerFirProcessNeon.h b/services/audioflinger/AudioResamplerFirProcessNeon.h
index f311cef..d4fa7ad 100644
--- a/services/audioflinger/AudioResamplerFirProcessNeon.h
+++ b/services/audioflinger/AudioResamplerFirProcessNeon.h
@@ -24,10 +24,6 @@ namespace android {
#if USE_NEON
//
// NEON specializations are enabled for Process() and ProcessL()
-//
-// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
-// and looping stride 16 (or vice versa). This has some polyphase coef data alignment
-// issues with S16 coefs. Consider this later.
// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
#define ASSEMBLY_ACCUMULATE_MONO \
@@ -635,513 +631,6 @@ inline void Process<2, 16>(int32_t* const out,
);
}
-template <>
-inline void ProcessL<1, 8>(int32_t* const out,
- int count,
- const int16_t* coefsP,
- const int16_t* coefsN,
- const int16_t* sP,
- const int16_t* sN,
- const int32_t* const volumeLR)
-{
- const int CHANNELS = 1; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
-
- "1: \n"
-
- "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples
- "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples
- "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs
- "vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs
-
- "vrev64.16 d4, d4 \n"// (1) reversed s3, s2, s1, s0, s7, s6, s5, s4
-
- // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
- "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed)samples by coef
- "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
-
- // moving these ARM instructions before neon above seems to be slower
- "subs %[count], %[count], #4 \n"// (1) update loop counter
- "sub %[sP], %[sP], #8 \n"// (0) move pointer to next set of samples
-
- // sP used after branch (warning)
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_MONO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsN0] "+r" (coefsN),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q8", "q10"
- );
-}
-
-template <>
-inline void ProcessL<2, 8>(int32_t* const out,
- int count,
- const int16_t* coefsP,
- const int16_t* coefsN,
- const int16_t* sP,
- const int16_t* sN,
- const int32_t* const volumeLR)
-{
- const int CHANNELS = 2; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "veor q0, q0, q0 \n"// (1) acc_L = 0
- "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
-
- "1: \n"
-
- "vld2.16 {d4, d5}, [%[sP]] \n"// (2+0d) load 8 16-bits stereo samples
- "vld2.16 {d6, d7}, [%[sN]]! \n"// (2) load 8 16-bits stereo samples
- "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs
- "vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 8 16-bits coefs
-
- "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
-
- "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left
- "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right
- "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left
- "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right
-
- // moving these ARM before neon seems to be slower
- "subs %[count], %[count], #4 \n"// (1) update loop counter
- "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
-
- // sP used after branch (warning)
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_STEREO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsN0] "+r" (coefsN),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q4", "q5", "q6",
- "q8", "q10"
- );
-}
-
-template <>
-inline void Process<1, 8>(int32_t* const out,
- int count,
- const int16_t* coefsP,
- const int16_t* coefsN,
- const int16_t* coefsP1,
- const int16_t* coefsN1,
- const int16_t* sP,
- const int16_t* sN,
- uint32_t lerpP,
- const int32_t* const volumeLR)
-{
- const int CHANNELS = 1; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15
- "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
-
- "1: \n"
-
- "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples
- "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples
- "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs
- "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 4 16-bits coefs for interpolation
- "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 4 16-bits coefs
- "vld1.16 {d21}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs for interpolation
-
- "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs
- "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets
-
- "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
- "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
-
- "vrev64.16 d4, d4 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
-
- "vadd.s16 d16, d16, d17 \n"// (1+2d) interpolate (step3) 1st set
- "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set
-
- // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
- "vmlal.s16 q0, d4, d16 \n"// (1+0d) multiply (reversed)by coef
- "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
-
- // moving these ARM instructions before neon above seems to be slower
- "subs %[count], %[count], #4 \n"// (1) update loop counter
- "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples
-
- // sP used after branch (warning)
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_MONO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsN0] "+r" (coefsN),
- [coefsP1] "+r" (coefsP1),
- [coefsN1] "+r" (coefsN1),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [lerpP] "r" (lerpP),
- [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11"
- );
-}
-
-template <>
-inline void Process<2, 8>(int32_t* const out,
- int count,
- const int16_t* coefsP,
- const int16_t* coefsN,
- const int16_t* coefsP1,
- const int16_t* coefsN1,
- const int16_t* sP,
- const int16_t* sN,
- uint32_t lerpP,
- const int32_t* const volumeLR)
-{
- const int CHANNELS = 2; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
- "veor q0, q0, q0 \n"// (1) acc_L = 0
- "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
-
- "1: \n"
-
- "vld2.16 {d4, d5}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples
- "vld2.16 {d6, d7}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples
- "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs
- "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 8 16-bits coefs for interpolation
- "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 8 16-bits coefs
- "vld1.16 {d21}, [%[coefsN0]:64]! \n"// (1) load 8 16-bits coefs for interpolation
-
- "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs
- "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets
-
- "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
- "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
-
- "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
-
- "vadd.s16 d16, d16, d17 \n"// (1+1d) interpolate (step3) 1st set
- "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set
-
- "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left
- "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right
- "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left
- "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right
-
- // moving these ARM before neon seems to be slower
- "subs %[count], %[count], #4 \n"// (1) update loop counter
- "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
-
- // sP used after branch (warning)
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_STEREO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsN0] "+r" (coefsN),
- [coefsP1] "+r" (coefsP1),
- [coefsN1] "+r" (coefsN1),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [lerpP] "r" (lerpP),
- [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q4", "q5", "q6",
- "q8", "q9", "q10", "q11"
- );
-}
-
-template <>
-inline void ProcessL<1, 8>(int32_t* const out,
- int count,
- const int32_t* coefsP,
- const int32_t* coefsN,
- const int16_t* sP,
- const int16_t* sN,
- const int32_t* const volumeLR)
-{
- const int CHANNELS = 1; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "veor q0, q0, q0 \n"// result, initialize to 0
-
- "1: \n"
-
- "vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples
- "vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples
- "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
- "vld1.32 {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
-
- "vrev64.16 d4, d4 \n"// reverse 2 frames of the positive side
-
- "vshll.s16 q12, d4, #15 \n"// (stall) extend samples to 31 bits
- "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
-
- "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
-
- "vadd.s32 q0, q0, q12 \n"// accumulate result
- "vadd.s32 q0, q0, q14 \n"// (stall) accumulate result
-
- "subs %[count], %[count], #4 \n"// update loop counter
- "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples
-
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_MONO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsN0] "+r" (coefsN),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11",
- "q12", "q14"
- );
-}
-
-template <>
-inline void ProcessL<2, 8>(int32_t* const out,
- int count,
- const int32_t* coefsP,
- const int32_t* coefsN,
- const int16_t* sP,
- const int16_t* sN,
- const int32_t* const volumeLR)
-{
- const int CHANNELS = 2; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "veor q0, q0, q0 \n"// result, initialize to 0
- "veor q4, q4, q4 \n"// result, initialize to 0
-
- "1: \n"
-
- "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples
- "vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 16-bits stereo samples
- "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
- "vld1.32 {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
-
- "vrev64.16 q2, q2 \n"// reverse 2 frames of the positive side
-
- "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
- "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
-
- "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
- "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
-
- "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by coef
- "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
- "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
- "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by coef
-
- "vadd.s32 q0, q0, q12 \n"// accumulate result
- "vadd.s32 q4, q4, q13 \n"// accumulate result
- "vadd.s32 q0, q0, q14 \n"// accumulate result
- "vadd.s32 q4, q4, q15 \n"// accumulate result
-
- "subs %[count], %[count], #4 \n"// update loop counter
- "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
-
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_STEREO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsN0] "+r" (coefsN),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15"
- );
-}
-
-template <>
-inline void Process<1, 8>(int32_t* const out,
- int count,
- const int32_t* coefsP,
- const int32_t* coefsN,
- const int32_t* coefsP1,
- const int32_t* coefsN1,
- const int16_t* sP,
- const int16_t* sN,
- uint32_t lerpP,
- const int32_t* const volumeLR)
-{
- const int CHANNELS = 1; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
- "veor q0, q0, q0 \n"// result, initialize to 0
-
- "1: \n"
-
- "vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples
- "vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples
- "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
- "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation
- "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
- "vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
-
- "vrev64.16 d4, d4 \n"// reverse 2 frames of the positive side
-
- "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs
- "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets
- "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
-
- "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs
- "vqrdmulh.s32 q11, q11, d2[0] \n"// interpolate (step2) 2nd set of coefs
- "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
-
- "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set
- "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set
-
- "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
-
- "vadd.s32 q0, q0, q12 \n"// accumulate result
- "vadd.s32 q0, q0, q14 \n"// accumulate result
-
- "subs %[count], %[count], #4 \n"// update loop counter
- "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples
-
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_MONO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsP1] "+r" (coefsP1),
- [coefsN0] "+r" (coefsN),
- [coefsN1] "+r" (coefsN1),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [lerpP] "r" (lerpP),
- [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11",
- "q12", "q14"
- );
-}
-
-template <>
-inline
-void Process<2, 8>(int32_t* const out,
- int count,
- const int32_t* coefsP,
- const int32_t* coefsN,
- const int32_t* coefsP1,
- const int32_t* coefsN1,
- const int16_t* sP,
- const int16_t* sN,
- uint32_t lerpP,
- const int32_t* const volumeLR)
-{
- const int CHANNELS = 2; // template specialization does not preserve params
- const int STRIDE = 8;
- sP -= CHANNELS*((STRIDE>>1)-1);
- asm (
- "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
- "veor q0, q0, q0 \n"// result, initialize to 0
- "veor q4, q4, q4 \n"// result, initialize to 0
-
- "1: \n"
- "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples
- "vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 16-bits stereo samples
- "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
- "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation
- "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
- "vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
-
- "vrev64.16 q2, q2 \n"// (reversed) 2 frames of the positive side
-
- "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs
- "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets
- "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
- "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
-
- "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs
- "vqrdmulh.s32 q11, q11, d2[1] \n"// interpolate (step3) 2nd set of coefs
- "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
- "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
-
- "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set
- "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set
-
- "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
- "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by interpolated coef
-
- "vadd.s32 q0, q0, q12 \n"// accumulate result
- "vadd.s32 q4, q4, q13 \n"// accumulate result
- "vadd.s32 q0, q0, q14 \n"// accumulate result
- "vadd.s32 q4, q4, q15 \n"// accumulate result
-
- "subs %[count], %[count], #4 \n"// update loop counter
- "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
-
- "bne 1b \n"// loop
-
- ASSEMBLY_ACCUMULATE_STEREO
-
- : [out] "=Uv" (out[0]),
- [count] "+r" (count),
- [coefsP0] "+r" (coefsP),
- [coefsP1] "+r" (coefsP1),
- [coefsN0] "+r" (coefsN),
- [coefsN1] "+r" (coefsN1),
- [sP] "+r" (sP),
- [sN] "+r" (sN)
- : [lerpP] "r" (lerpP),
- [vLR] "r" (volumeLR)
- : "cc", "memory",
- "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15"
- );
-}
-
#endif //USE_NEON
}; // namespace android
diff --git a/services/audioflinger/FastCapture.cpp b/services/audioflinger/FastCapture.cpp
index 1c4f670..255496e 100644
--- a/services/audioflinger/FastCapture.cpp
+++ b/services/audioflinger/FastCapture.cpp
@@ -138,13 +138,15 @@ void FastCapture::onStateChange()
underrunNs = (frameCount * 1750000000LL) / sampleRate; // 1.75
overrunNs = (frameCount * 500000000LL) / sampleRate; // 0.50
forceNs = (frameCount * 950000000LL) / sampleRate; // 0.95
- warmupNs = (frameCount * 500000000LL) / sampleRate; // 0.50
+ warmupNsMin = (frameCount * 750000000LL) / sampleRate; // 0.75
+ warmupNsMax = (frameCount * 1250000000LL) / sampleRate; // 1.25
} else {
periodNs = 0;
underrunNs = 0;
overrunNs = 0;
forceNs = 0;
- warmupNs = 0;
+ warmupNsMin = 0;
+ warmupNsMax = LONG_MAX;
}
readBufferState = -1;
dumpState->mFrameCount = frameCount;
diff --git a/services/audioflinger/FastMixer.cpp b/services/audioflinger/FastMixer.cpp
index 67e2e6e..8b12f28 100644
--- a/services/audioflinger/FastMixer.cpp
+++ b/services/audioflinger/FastMixer.cpp
@@ -195,13 +195,15 @@ void FastMixer::onStateChange()
underrunNs = (frameCount * 1750000000LL) / sampleRate; // 1.75
overrunNs = (frameCount * 500000000LL) / sampleRate; // 0.50
forceNs = (frameCount * 950000000LL) / sampleRate; // 0.95
- warmupNs = (frameCount * 500000000LL) / sampleRate; // 0.50
+ warmupNsMin = (frameCount * 750000000LL) / sampleRate; // 0.75
+ warmupNsMax = (frameCount * 1250000000LL) / sampleRate; // 1.25
} else {
periodNs = 0;
underrunNs = 0;
overrunNs = 0;
forceNs = 0;
- warmupNs = 0;
+ warmupNsMin = 0;
+ warmupNsMax = LONG_MAX;
}
mMixerBufferState = UNDEFINED;
#if !LOG_NDEBUG
diff --git a/services/audioflinger/FastThread.cpp b/services/audioflinger/FastThread.cpp
index 3e12cca..b69cc85 100644
--- a/services/audioflinger/FastThread.cpp
+++ b/services/audioflinger/FastThread.cpp
@@ -29,7 +29,8 @@
#define FAST_DEFAULT_NS 999999999L // ~1 sec: default time to sleep
#define FAST_HOT_IDLE_NS 1000000L // 1 ms: time to sleep while hot idling
-#define MIN_WARMUP_CYCLES 2 // minimum number of loop cycles to wait for warmup
+#define MIN_WARMUP_CYCLES 2 // minimum number of consecutive in-range loop cycles
+ // to wait for warmup
#define MAX_WARMUP_CYCLES 10 // maximum number of loop cycles to wait for warmup
namespace android {
@@ -44,7 +45,8 @@ FastThread::FastThread() : Thread(false /*canCallJava*/),
underrunNs(0),
overrunNs(0),
forceNs(0),
- warmupNs(0),
+ warmupNsMin(0),
+ warmupNsMax(LONG_MAX),
// re-initialized to &dummyDumpState by subclass constructor
mDummyDumpState(NULL),
dumpState(NULL),
@@ -60,6 +62,7 @@ FastThread::FastThread() : Thread(false /*canCallJava*/),
isWarm(false),
/* measuredWarmupTs({0, 0}), */
warmupCycles(0),
+ warmupConsecutiveInRangeCycles(0),
// dummyLogWriter
logWriter(&dummyLogWriter),
timestampStatus(INVALID_OPERATION),
@@ -169,6 +172,7 @@ bool FastThread::threadLoop()
measuredWarmupTs.tv_sec = 0;
measuredWarmupTs.tv_nsec = 0;
warmupCycles = 0;
+ warmupConsecutiveInRangeCycles = 0;
sleepNs = -1;
coldGen = current->mColdGen;
#ifdef FAST_MIXER_STATISTICS
@@ -222,7 +226,8 @@ bool FastThread::threadLoop()
// To avoid an initial underrun on fast tracks after exiting standby,
// do not start pulling data from tracks and mixing until warmup is complete.
// Warmup is considered complete after the earlier of:
- // MIN_WARMUP_CYCLES write() attempts and last one blocks for at least warmupNs
+ // MIN_WARMUP_CYCLES consecutive in-range write() attempts,
+ // where "in-range" means warmupNsMin <= cycle time <= warmupNsMax
// MAX_WARMUP_CYCLES write() attempts.
// This is overly conservative, but to get better accuracy requires a new HAL API.
if (!isWarm && attemptedWrite) {
@@ -233,7 +238,14 @@ bool FastThread::threadLoop()
measuredWarmupTs.tv_nsec -= 1000000000;
}
++warmupCycles;
- if ((nsec > warmupNs && warmupCycles >= MIN_WARMUP_CYCLES) ||
+ if (warmupNsMin <= nsec && nsec <= warmupNsMax) {
+ ALOGV("warmup cycle %d in range: %.03f ms", warmupCycles, nsec * 1e-9);
+ ++warmupConsecutiveInRangeCycles;
+ } else {
+ ALOGV("warmup cycle %d out of range: %.03f ms", warmupCycles, nsec * 1e-9);
+ warmupConsecutiveInRangeCycles = 0;
+ }
+ if ((warmupConsecutiveInRangeCycles >= MIN_WARMUP_CYCLES) ||
(warmupCycles >= MAX_WARMUP_CYCLES)) {
isWarm = true;
dumpState->mMeasuredWarmupTs = measuredWarmupTs;
diff --git a/services/audioflinger/FastThread.h b/services/audioflinger/FastThread.h
index 1330334..cb32e9d 100644
--- a/services/audioflinger/FastThread.h
+++ b/services/audioflinger/FastThread.h
@@ -58,7 +58,8 @@ protected:
long underrunNs; // underrun likely when write cycle is greater than this value
long overrunNs; // overrun likely when write cycle is less than this value
long forceNs; // if overrun detected, force the write cycle to take this much time
- long warmupNs; // warmup complete when write cycle is greater than to this value
+ long warmupNsMin; // warmup complete when write cycle is greater than or equal to this value
+ long warmupNsMax; // and less than or equal to this value
FastThreadDumpState *mDummyDumpState;
FastThreadDumpState *dumpState;
bool ignoreNextOverrun; // used to ignore initial overrun and first after an underrun
@@ -74,7 +75,8 @@ protected:
unsigned coldGen; // last observed mColdGen
bool isWarm; // true means ready to mix, false means wait for warmup before mixing
struct timespec measuredWarmupTs; // how long did it take for warmup to complete
- uint32_t warmupCycles; // counter of number of loop cycles required to warmup
+ uint32_t warmupCycles; // counter of number of loop cycles during warmup phase
+ uint32_t warmupConsecutiveInRangeCycles; // number of consecutive cycles in range
NBLog::Writer dummyLogWriter;
NBLog::Writer *logWriter;
status_t timestampStatus;
diff --git a/services/audioflinger/Threads.cpp b/services/audioflinger/Threads.cpp
index 384bd25..40ab0af 100644
--- a/services/audioflinger/Threads.cpp
+++ b/services/audioflinger/Threads.cpp
@@ -174,18 +174,6 @@ static int sFastTrackMultiplier = kFastTrackMultiplier;
// and that all "fast" AudioRecord clients read from. In either case, the size can be small.
static const size_t kRecordThreadReadOnlyHeapSize = 0x2000;
-// Returns the source frames needed to resample to destination frames. This is not a precise
-// value and depends on the resampler (and possibly how it handles rounding internally).
-// If srcSampleRate and dstSampleRate are equal, then it returns destination frames, which
-// may not be a true if the resampler is asynchronous.
-static inline size_t sourceFramesNeeded(
- uint32_t srcSampleRate, size_t dstFramesRequired, uint32_t dstSampleRate) {
- // +1 for rounding - always do this even if matched ratio
- // +1 for additional sample needed for interpolation
- return srcSampleRate == dstSampleRate ? dstFramesRequired :
- size_t((uint64_t)dstFramesRequired * srcSampleRate / dstSampleRate + 1 + 1);
-}
-
// ----------------------------------------------------------------------------
static pthread_once_t sFastTrackMultiplierOnce = PTHREAD_ONCE_INIT;
@@ -1497,20 +1485,25 @@ sp<AudioFlinger::PlaybackThread::Track> AudioFlinger::PlaybackThread::createTrac
audio_is_linear_pcm(format),
channelMask, sampleRate, mSampleRate, hasFastMixer(), tid, mFastTrackAvailMask);
*flags &= ~IAudioFlinger::TRACK_FAST;
- // For compatibility with AudioTrack calculation, buffer depth is forced
- // to be at least 2 x the normal mixer frame count and cover audio hardware latency.
- // This is probably too conservative, but legacy application code may depend on it.
- // If you change this calculation, also review the start threshold which is related.
+ }
+ }
+ // For normal PCM streaming tracks, update minimum frame count.
+ // For compatibility with AudioTrack calculation, buffer depth is forced
+ // to be at least 2 x the normal mixer frame count and cover audio hardware latency.
+ // This is probably too conservative, but legacy application code may depend on it.
+ // If you change this calculation, also review the start threshold which is related.
+ if (!(*flags & IAudioFlinger::TRACK_FAST)
+ && audio_is_linear_pcm(format) && sharedBuffer == 0) {
uint32_t latencyMs = mOutput->stream->get_latency(mOutput->stream);
uint32_t minBufCount = latencyMs / ((1000 * mNormalFrameCount) / mSampleRate);
if (minBufCount < 2) {
minBufCount = 2;
}
- size_t minFrameCount = mNormalFrameCount * minBufCount;
- if (frameCount < minFrameCount) {
+ size_t minFrameCount =
+ minBufCount * sourceFramesNeeded(sampleRate, mNormalFrameCount, mSampleRate);
+ if (frameCount < minFrameCount) { // including frameCount == 0
frameCount = minFrameCount;
}
- }
}
*pFrameCount = frameCount;
diff --git a/services/audioflinger/Tracks.cpp b/services/audioflinger/Tracks.cpp
index 78cec31..fa0beaa 100644
--- a/services/audioflinger/Tracks.cpp
+++ b/services/audioflinger/Tracks.cpp
@@ -1722,28 +1722,7 @@ bool AudioFlinger::PlaybackThread::OutputTrack::write(void* data, uint32_t frame
uint32_t waitTimeLeftMs = mSourceThread->waitTimeMs();
if (!mActive && frames != 0) {
- start();
- sp<ThreadBase> thread = mThread.promote();
- if (thread != 0) {
- MixerThread *mixerThread = (MixerThread *)thread.get();
- if (mFrameCount > frames) {
- // For the first write after being inactive, ensure that we have
- // enough frames to fill mFrameCount (which should be multiples of
- // the minimum buffer requirements of the downstream MixerThread).
- // This provides enough frames for the downstream mixer to begin
- // (see AudioFlinger::PlaybackThread::Track::isReady()).
- if (mBufferQueue.size() < kMaxOverFlowBuffers) {
- uint32_t startFrames = (mFrameCount - frames);
- pInBuffer = new Buffer;
- pInBuffer->mBuffer = calloc(1, startFrames * mFrameSize);
- pInBuffer->frameCount = startFrames;
- pInBuffer->raw = pInBuffer->mBuffer;
- mBufferQueue.add(pInBuffer);
- } else {
- ALOGW("OutputTrack::write() %p no more buffers in queue", this);
- }
- }
- }
+ (void) start();
}
while (waitTimeLeftMs) {