1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
|
// Copyright 2011 Google Inc. All Rights Reserved.
package android.speech.tts;
import android.media.AudioFormat;
import android.media.AudioTrack;
import android.speech.tts.TextToSpeechService.AudioOutputParams;
import android.util.Log;
/**
* Exposes parts of the {@link AudioTrack} API by delegating calls to an
* underlying {@link AudioTrack}. Additionally, provides methods like
* {@link #waitAndRelease()} that will block until all audiotrack
* data has been flushed to the mixer, and is estimated to have completed
* playback.
*/
class BlockingAudioTrack {
    private static final String TAG = "TTS.BlockingAudioTrack";
    private static final boolean DBG = false;

    /**
     * The minimum increment of time to wait for an AudioTrack to finish
     * playing.
     */
    private static final long MIN_SLEEP_TIME_MS = 20;

    /**
     * The maximum increment of time to sleep while waiting for an AudioTrack
     * to finish playing.
     */
    private static final long MAX_SLEEP_TIME_MS = 2500;

    /**
     * The maximum amount of time to wait for an audio track to make progress while
     * it remains in PLAYSTATE_PLAYING. This should never happen in normal usage, but
     * could happen in exceptional circumstances like a media_server crash.
     */
    private static final long MAX_PROGRESS_WAIT_MS = MAX_SLEEP_TIME_MS;

    /**
     * Minimum size of the buffer of the underlying {@link android.media.AudioTrack}
     * we create.
     */
    private static final int MIN_AUDIO_BUFFER_SIZE = 8192;

    private final AudioOutputParams mAudioParams;
    private final int mSampleRateInHz;
    private final int mAudioFormat;
    private final int mChannelCount;
    // Bytes per audio frame: bytes-per-sample for mAudioFormat times the channel count.
    private final int mBytesPerFrame;

    /**
     * A "short utterance" is one that uses fewer bytes than the audio
     * track buffer size (mAudioBufferSize). In this case, we need to call
     * {@link AudioTrack#stop()} to send pending buffers to the mixer, and slightly
     * different logic is required to wait for the track to finish.
     *
     * Not volatile, accessed only from the audio playback thread.
     */
    private boolean mIsShortUtterance;

    /**
     * Size (in bytes) of the buffer of the underlying {@link AudioTrack}.
     * Will be valid after a call to {@link #init()}.
     */
    private int mAudioBufferSize;

    // Total number of bytes accepted by the AudioTrack so far.
    // Accessed only from the audio playback thread.
    private int mBytesWritten = 0;

    // Guards mAudioTrack, which stop() may access from a thread other than the
    // playback thread. mAudioTrack is set to null only after waitAndRelease().
    private final Object mAudioTrackLock = new Object();
    private AudioTrack mAudioTrack;
    private volatile boolean mStopped;

    BlockingAudioTrack(AudioOutputParams audioParams, int sampleRate,
            int audioFormat, int channelCount) {
        mAudioParams = audioParams;
        mSampleRateInHz = sampleRate;
        mAudioFormat = audioFormat;
        mChannelCount = channelCount;
        mBytesPerFrame = AudioFormat.getBytesPerSample(mAudioFormat) * mChannelCount;
        mIsShortUtterance = false;
        mAudioBufferSize = 0;
        mBytesWritten = 0;
        mAudioTrack = null;
        mStopped = false;
    }

    /**
     * Creates the underlying streaming {@link AudioTrack}.
     *
     * @return {@code true} if the track was created successfully.
     */
    public boolean init() {
        AudioTrack track = createStreamingAudioTrack();
        synchronized (mAudioTrackLock) {
            mAudioTrack = track;
        }
        return track != null;
    }

    /**
     * Stops playback. Safe to call from a thread other than the playback thread;
     * write() and waitAndRelease() observe {@link #mStopped} and bail out early.
     */
    public void stop() {
        synchronized (mAudioTrackLock) {
            if (mAudioTrack != null) {
                mAudioTrack.stop();
            }
            mStopped = true;
        }
    }

    /**
     * Feeds PCM data to the track, blocking until it is accepted.
     *
     * @param data audio bytes to enqueue; the entire array is written.
     * @return the number of bytes accepted, or -1 if the track is gone or stopped.
     */
    public int write(byte[] data) {
        AudioTrack track = null;
        synchronized (mAudioTrackLock) {
            track = mAudioTrack;
        }
        if (track == null || mStopped) {
            return -1;
        }
        final int bytesWritten = writeToAudioTrack(track, data);
        mBytesWritten += bytesWritten;
        return bytesWritten;
    }

    /**
     * Blocks until the track is estimated to have finished playing, then
     * releases it. Must be called from the playback thread. A second call
     * (after the track has been released) is a no-op.
     */
    public void waitAndRelease() {
        AudioTrack track = null;
        synchronized (mAudioTrackLock) {
            track = mAudioTrack;
        }
        if (track == null) {
            if (DBG) Log.d(TAG, "Audio track null [duplicate call to waitAndRelease ?]");
            return;
        }
        // For "small" audio tracks, we have to stop() them to make them mixable,
        // else the audio subsystem will wait indefinitely for us to fill the buffer
        // before rendering the track mixable.
        //
        // If mStopped is true, the track has already been stopped, so there is
        // no point in doing it again.
        if (mBytesWritten < mAudioBufferSize && !mStopped) {
            if (DBG) {
                Log.d(TAG, "Stopping audio track to flush audio, state was : " +
                        track.getPlayState() + ",stopped= " + mStopped);
            }
            mIsShortUtterance = true;
            track.stop();
        }
        // Block until the audio track is done only if we haven't stopped yet.
        if (!mStopped) {
            if (DBG) Log.d(TAG, "Waiting for audio track to complete : " + track.hashCode());
            blockUntilDone(track);
        }
        // The last call to AudioTrack.write( ) will return only after
        // all data from the audioTrack has been sent to the mixer, so
        // it's safe to release at this point.
        if (DBG) Log.d(TAG, "Releasing audio track [" + track.hashCode() + "]");
        synchronized (mAudioTrackLock) {
            mAudioTrack = null;
        }
        track.release();
    }

    /**
     * Maps a channel count to the corresponding {@link AudioFormat} output
     * channel mask.
     *
     * @return the channel configuration, or 0 for unsupported counts.
     */
    static int getChannelConfig(int channelCount) {
        if (channelCount == 1) {
            return AudioFormat.CHANNEL_OUT_MONO;
        } else if (channelCount == 2) {
            return AudioFormat.CHANNEL_OUT_STEREO;
        }
        return 0;
    }

    /**
     * Estimates the playback duration, in milliseconds, of {@code numBytes}
     * of audio at this track's frame size and sample rate.
     */
    long getAudioLengthMs(int numBytes) {
        final int unconsumedFrames = numBytes / mBytesPerFrame;
        // 1000L: do the multiplication in 64 bits to avoid int overflow on
        // long utterances.
        return unconsumedFrames * 1000L / mSampleRateInHz;
    }

    /**
     * Writes the whole of {@code bytes} to the track, restarting playback if
     * necessary. Returns the number of bytes actually accepted (less than
     * {@code bytes.length} only if a write fails).
     */
    private static int writeToAudioTrack(AudioTrack audioTrack, byte[] bytes) {
        if (audioTrack.getPlayState() != AudioTrack.PLAYSTATE_PLAYING) {
            if (DBG) Log.d(TAG, "AudioTrack not playing, restarting : " + audioTrack.hashCode());
            audioTrack.play();
        }
        int count = 0;
        while (count < bytes.length) {
            // Note that we don't take bufferCopy.mOffset into account because
            // it is guaranteed to be 0.
            // Write only the REMAINING bytes: passing bytes.length as the size
            // after a partial write would exceed the array bounds and fail
            // with ERROR_BAD_VALUE.
            int written = audioTrack.write(bytes, count, bytes.length - count);
            if (written <= 0) {
                break;
            }
            count += written;
        }
        return count;
    }

    /**
     * Creates and configures a streaming-mode {@link AudioTrack} for this
     * utterance, or returns null if initialization fails.
     */
    private AudioTrack createStreamingAudioTrack() {
        final int channelConfig = getChannelConfig(mChannelCount);
        int minBufferSizeInBytes
                = AudioTrack.getMinBufferSize(mSampleRateInHz, channelConfig, mAudioFormat);
        int bufferSizeInBytes = Math.max(MIN_AUDIO_BUFFER_SIZE, minBufferSizeInBytes);
        AudioFormat audioFormat = (new AudioFormat.Builder())
                .setChannelMask(channelConfig)
                .setEncoding(mAudioFormat)
                .setSampleRate(mSampleRateInHz).build();
        AudioTrack audioTrack = new AudioTrack(mAudioParams.mAudioAttributes,
                audioFormat, bufferSizeInBytes, AudioTrack.MODE_STREAM,
                mAudioParams.mSessionId);
        if (audioTrack.getState() != AudioTrack.STATE_INITIALIZED) {
            Log.w(TAG, "Unable to create audio track.");
            audioTrack.release();
            return null;
        }
        mAudioBufferSize = bufferSizeInBytes;
        setupVolume(audioTrack, mAudioParams.mVolume, mAudioParams.mPan);
        return audioTrack;
    }

    /**
     * Blocks until the track has (or is estimated to have) finished playing
     * everything written so far. No-op if nothing was written.
     */
    private void blockUntilDone(AudioTrack audioTrack) {
        if (mBytesWritten <= 0) {
            return;
        }
        if (mIsShortUtterance) {
            // In this case we would have called AudioTrack#stop() to flush
            // buffers to the mixer. This makes the playback head position
            // unobservable and notification markers do not work reliably. We
            // have no option but to wait until we think the track would finish
            // playing and release it after.
            //
            // This isn't as bad as it looks because (a) We won't end up waiting
            // for much longer than we should because even at 4khz mono, a short
            // utterance weighs in at about 2 seconds, and (b) such short utterances
            // are expected to be relatively infrequent and in a stream of utterances
            // this shows up as a slightly longer pause.
            blockUntilEstimatedCompletion();
        } else {
            blockUntilCompletion(audioTrack);
        }
    }

    /**
     * Sleeps for the estimated playback duration of everything written.
     * Used for short utterances, whose playback head position is unobservable
     * after stop() (see {@link #blockUntilDone}).
     */
    private void blockUntilEstimatedCompletion() {
        final int lengthInFrames = mBytesWritten / mBytesPerFrame;
        // 1000L: 64-bit arithmetic to avoid int overflow for long utterances.
        final long estimatedTimeMs = lengthInFrames * 1000L / mSampleRateInHz;
        if (DBG) Log.d(TAG, "About to sleep for: " + estimatedTimeMs + "ms for a short utterance");
        try {
            Thread.sleep(estimatedTimeMs);
        } catch (InterruptedException ie) {
            // Restore the interrupt status for callers further up the stack.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Polls the playback head position, sleeping in adaptive increments, until
     * the track has consumed everything written, stops playing, is stopped via
     * {@link #stop()}, or fails to make progress for {@link #MAX_PROGRESS_WAIT_MS}.
     */
    private void blockUntilCompletion(AudioTrack audioTrack) {
        final int lengthInFrames = mBytesWritten / mBytesPerFrame;
        int previousPosition = -1;
        int currentPosition = 0;
        long blockedTimeMs = 0;
        while ((currentPosition = audioTrack.getPlaybackHeadPosition()) < lengthInFrames &&
                audioTrack.getPlayState() == AudioTrack.PLAYSTATE_PLAYING && !mStopped) {
            // 1000L: 64-bit arithmetic to avoid int overflow for long utterances.
            final long estimatedTimeMs = ((lengthInFrames - currentPosition) * 1000L) /
                    audioTrack.getSampleRate();
            final long sleepTimeMs = clip(estimatedTimeMs, MIN_SLEEP_TIME_MS, MAX_SLEEP_TIME_MS);
            // Check if the audio track has made progress since the last loop
            // iteration. We should then add in the amount of time that was
            // spent sleeping in the last iteration.
            if (currentPosition == previousPosition) {
                // This works only because the sleep time that would have been calculated
                // would be the same in the previous iteration too.
                blockedTimeMs += sleepTimeMs;
                // If we've taken too long to make progress, bail.
                if (blockedTimeMs > MAX_PROGRESS_WAIT_MS) {
                    Log.w(TAG, "Waited unsuccessfully for " + MAX_PROGRESS_WAIT_MS + "ms " +
                            "for AudioTrack to make progress, Aborting");
                    break;
                }
            } else {
                blockedTimeMs = 0;
            }
            previousPosition = currentPosition;
            if (DBG) {
                Log.d(TAG, "About to sleep for : " + sleepTimeMs + " ms," +
                        " Playback position : " + currentPosition + ", Length in frames : "
                        + lengthInFrames);
            }
            try {
                Thread.sleep(sleepTimeMs);
            } catch (InterruptedException ie) {
                // Restore the interrupt status and stop waiting.
                Thread.currentThread().interrupt();
                break;
            }
        }
    }

    /**
     * Applies the requested volume and pan to the track. Pan > 0 attenuates
     * the left channel, pan < 0 attenuates the right.
     */
    private static void setupVolume(AudioTrack audioTrack, float volume, float pan) {
        final float vol = clip(volume, 0.0f, 1.0f);
        final float panning = clip(pan, -1.0f, 1.0f);
        float volLeft = vol;
        float volRight = vol;
        if (panning > 0.0f) {
            volLeft *= (1.0f - panning);
        } else if (panning < 0.0f) {
            volRight *= (1.0f + panning);
        }
        if (DBG) Log.d(TAG, "volLeft=" + volLeft + ",volRight=" + volRight);
        if (audioTrack.setStereoVolume(volLeft, volRight) != AudioTrack.SUCCESS) {
            Log.e(TAG, "Failed to set volume");
        }
    }

    /** Clamps {@code value} to the inclusive range [{@code min}, {@code max}]. */
    private static long clip(long value, long min, long max) {
        return value < min ? min : (value < max ? value : max);
    }

    /** Clamps {@code value} to the inclusive range [{@code min}, {@code max}]. */
    private static float clip(float value, float min, float max) {
        return value < min ? min : (value < max ? value : max);
    }
}
|