25 files changed, 4504 insertions, 937 deletions
diff --git a/services/audioflinger/Android.mk b/services/audioflinger/Android.mk
index 54377f1..4524d3c 100644
--- a/services/audioflinger/Android.mk
+++ b/services/audioflinger/Android.mk
@@ -23,7 +23,8 @@ LOCAL_SRC_FILES:=               \
     AudioPolicyService.cpp      \
     ServiceUtilities.cpp        \
     AudioResamplerCubic.cpp.arm \
-    AudioResamplerSinc.cpp.arm
+    AudioResamplerSinc.cpp.arm  \
+    AudioResamplerDyn.cpp.arm
 
 LOCAL_SRC_FILES += StateQueue.cpp
 
@@ -74,12 +75,20 @@ include $(BUILD_SHARED_LIBRARY)
 include $(CLEAR_VARS)
 
 LOCAL_SRC_FILES:=               \
-	test-resample.cpp 			\
+    test-resample.cpp           \
     AudioResampler.cpp.arm      \
-	AudioResamplerCubic.cpp.arm \
-    AudioResamplerSinc.cpp.arm
+    AudioResamplerCubic.cpp.arm \
+    AudioResamplerSinc.cpp.arm  \
+    AudioResamplerDyn.cpp.arm
+
+LOCAL_C_INCLUDES := \
+    $(call include-path-for, audio-utils)
+
+LOCAL_STATIC_LIBRARIES := \
+    libsndfile
 
 LOCAL_SHARED_LIBRARIES := \
+    libaudioutils \
     libdl \
     libcutils \
     libutils \
diff --git a/services/audioflinger/AudioFlinger.cpp b/services/audioflinger/AudioFlinger.cpp
index e9c38e3..21bd2f4 100644
--- a/services/audioflinger/AudioFlinger.cpp
+++ b/services/audioflinger/AudioFlinger.cpp
@@ -104,6 +104,27 @@ static const nsecs_t kMinGlobalEffectEnabletimeNs = seconds(7200);
 
 // ----------------------------------------------------------------------------
 
+const char *formatToString(audio_format_t format) {
+    switch(format) {
+    case AUDIO_FORMAT_PCM_SUB_8_BIT: return "pcm8";
+    case AUDIO_FORMAT_PCM_SUB_16_BIT: return "pcm16";
+    case AUDIO_FORMAT_PCM_SUB_32_BIT: return "pcm32";
+    case AUDIO_FORMAT_PCM_SUB_8_24_BIT: return "pcm8.24";
+    case AUDIO_FORMAT_PCM_SUB_24_BIT_PACKED: return "pcm24";
+    case AUDIO_FORMAT_PCM_SUB_FLOAT: return "pcmfloat";
+    case AUDIO_FORMAT_MP3: return "mp3";
+    case AUDIO_FORMAT_AMR_NB: return "amr-nb";
+    case AUDIO_FORMAT_AMR_WB: return "amr-wb";
+    case AUDIO_FORMAT_AAC: return "aac";
+    case AUDIO_FORMAT_HE_AAC_V1: return "he-aac-v1";
+    case AUDIO_FORMAT_HE_AAC_V2: return "he-aac-v2";
+    case AUDIO_FORMAT_VORBIS: return "vorbis";
+    default:
+        break;
+    }
+    return "unknown";
+}
+
 static int load_audio_interface(const char *if_name, audio_hw_device_t **dev)
 {
     const hw_module_t *mod;
@@ -162,12 +183,15 @@ AudioFlinger::AudioFlinger()
         (void) property_get("af.tee", value, "0");
         teeEnabled = atoi(value);
     }
-    if (teeEnabled & 1)
+    if (teeEnabled & 1) {
         mTeeSinkInputEnabled = true;
-    if (teeEnabled & 2)
+    }
+    if (teeEnabled & 2) {
         mTeeSinkOutputEnabled = true;
-    if (teeEnabled & 4)
+    }
+    if (teeEnabled & 4) {
         mTeeSinkTrackEnabled = true;
+    }
 #endif
 }
 
@@ -210,6 +234,18 @@ AudioFlinger::~AudioFlinger()
         audio_hw_device_close(mAudioHwDevs.valueAt(i)->hwDevice());
         delete mAudioHwDevs.valueAt(i);
     }
+
+    // Tell media.log service about any old writers that still need to be unregistered
+    sp<IBinder> binder = defaultServiceManager()->getService(String16("media.log"));
+    if (binder != 0) {
+        sp<IMediaLogService> mediaLogService(interface_cast<IMediaLogService>(binder));
+        for (size_t count = mUnregisteredWriters.size(); count > 0; count--) {
+            sp<IMemory> iMemory(mUnregisteredWriters.top()->getIMemory());
+            mUnregisteredWriters.pop();
+            mediaLogService->unregisterWriter(iMemory);
+        }
+    }
+
 }
 
 static const char * const audio_interfaces[] = {
@@ -249,7 +285,7 @@ AudioFlinger::AudioHwDevice* AudioFlinger::findSuitableHwDev_l(
     return NULL;
 }
 
-void AudioFlinger::dumpClients(int fd, const Vector<String16>& args)
+void AudioFlinger::dumpClients(int fd, const Vector<String16>& args __unused)
 {
     const size_t SIZE = 256;
     char buffer[SIZE];
@@ -271,17 +307,17 @@ void AudioFlinger::dumpClients(int fd, const Vector<String16>& args)
     }
 
     result.append("Global session refs:\n");
-    result.append(" session pid count\n");
+    result.append("  session   pid count\n");
     for (size_t i = 0; i < mAudioSessionRefs.size(); i++) {
         AudioSessionRef *r = mAudioSessionRefs[i];
-        snprintf(buffer, SIZE, " %7d %3d %3d\n", r->mSessionid, r->mPid, r->mCnt);
+        snprintf(buffer, SIZE, "  %7d %5d %5d\n", r->mSessionid, r->mPid, r->mCnt);
         result.append(buffer);
     }
     write(fd, result.string(), result.size());
 }
 
 
-void AudioFlinger::dumpInternals(int fd, const Vector<String16>& args)
+void AudioFlinger::dumpInternals(int fd, const Vector<String16>& args __unused)
 {
     const size_t SIZE = 256;
     char buffer[SIZE];
@@ -296,7 +332,7 @@ void AudioFlinger::dumpInternals(int fd, const Vector<String16>& args)
     write(fd, result.string(), result.size());
 }
 
-void AudioFlinger::dumpPermissionDenial(int fd, const Vector<String16>& args)
+void AudioFlinger::dumpPermissionDenial(int fd, const Vector<String16>& args __unused)
 {
     const size_t SIZE = 256;
     char buffer[SIZE];
@@ -403,16 +439,44 @@ sp<AudioFlinger::Client> AudioFlinger::registerPid_l(pid_t pid)
 
 sp<NBLog::Writer> AudioFlinger::newWriter_l(size_t size, const char *name)
 {
+    // If there is no memory allocated for logs, return a dummy writer that does nothing
     if (mLogMemoryDealer == 0) {
         return new NBLog::Writer();
     }
-    sp<IMemory> shared = mLogMemoryDealer->allocate(NBLog::Timeline::sharedSize(size));
-    sp<NBLog::Writer> writer = new NBLog::Writer(size, shared);
     sp<IBinder> binder = defaultServiceManager()->getService(String16("media.log"));
-    if (binder != 0) {
-        interface_cast<IMediaLogService>(binder)->registerWriter(shared, size, name);
+    // Similarly if we can't contact the media.log service, also return a dummy writer
+    if (binder == 0) {
+        return new NBLog::Writer();
     }
-    return writer;
+    sp<IMediaLogService> mediaLogService(interface_cast<IMediaLogService>(binder));
+    sp<IMemory> shared = mLogMemoryDealer->allocate(NBLog::Timeline::sharedSize(size));
+    // If allocation fails, consult the vector of previously unregistered writers
+    // and garbage-collect one or more them until an allocation succeeds
+    if (shared == 0) {
+        Mutex::Autolock _l(mUnregisteredWritersLock);
+        for (size_t count = mUnregisteredWriters.size(); count > 0; count--) {
+            {
+                // Pick the oldest stale writer to garbage-collect
+                sp<IMemory> iMemory(mUnregisteredWriters[0]->getIMemory());
+                mUnregisteredWriters.removeAt(0);
+                mediaLogService->unregisterWriter(iMemory);
+                // Now the media.log remote reference to IMemory is gone.  When our last local
+                // reference to IMemory also drops to zero at end of this block,
+                // the IMemory destructor will deallocate the region from mLogMemoryDealer.
+            }
+            // Re-attempt the allocation
+            shared = mLogMemoryDealer->allocate(NBLog::Timeline::sharedSize(size));
+            if (shared != 0) {
+                goto success;
+            }
+        }
+        // Even after garbage-collecting all old writers, there is still not enough memory,
+        // so return a dummy writer
+        return new NBLog::Writer();
+    }
+success:
+    mediaLogService->registerWriter(shared, size, name);
+    return new NBLog::Writer(size, shared);
 }
 
 void AudioFlinger::unregisterWriter(const sp<NBLog::Writer>& writer)
@@ -424,13 +488,10 @@ void AudioFlinger::unregisterWriter(const sp<NBLog::Writer>& writer)
     if (iMemory == 0) {
         return;
     }
-    sp<IBinder> binder = defaultServiceManager()->getService(String16("media.log"));
-    if (binder != 0) {
-        interface_cast<IMediaLogService>(binder)->unregisterWriter(iMemory);
-        // Now the media.log remote reference to IMemory is gone.
-        // When our last local reference to IMemory also drops to zero,
-        // the IMemory destructor will deallocate the region from mMemoryDealer.
-    }
+    // Rather than removing the writer immediately, append it to a queue of old writers to
+    // be garbage-collected later.  This allows us to continue to view old logs for a while.
+    Mutex::Autolock _l(mUnregisteredWritersLock);
+    mUnregisteredWriters.push(writer);
 }
 
 // IAudioFlinger interface
@@ -441,7 +502,7 @@ sp<IAudioTrack> AudioFlinger::createTrack(
         uint32_t sampleRate,
         audio_format_t format,
         audio_channel_mask_t channelMask,
-        size_t frameCount,
+        size_t *frameCount,
         IAudioFlinger::track_flags_t *flags,
         const sp<IMemory>& sharedBuffer,
         audio_io_handle_t output,
@@ -468,7 +529,13 @@ sp<IAudioTrack> AudioFlinger::createTrack(
     // client is responsible for conversion of 8-bit PCM to 16-bit PCM,
     // and we don't yet support 8.24 or 32-bit PCM
     if (audio_is_linear_pcm(format) && format != AUDIO_FORMAT_PCM_16_BIT) {
-        ALOGE("createTrack() invalid format %d", format);
+        ALOGE("createTrack() invalid format %#x", format);
+        lStatus = BAD_VALUE;
+        goto Exit;
+    }
+
+    if (sharedBuffer != 0 && sharedBuffer->pointer() == NULL) {
+        ALOGE("createTrack() sharedBuffer is non-0 but has NULL pointer()");
         lStatus = BAD_VALUE;
         goto Exit;
     }
@@ -488,7 +555,7 @@ sp<IAudioTrack> AudioFlinger::createTrack(
         client = registerPid_l(pid);
 
         ALOGV("createTrack() sessionId: %d", (sessionId == NULL) ? -2 : *sessionId);
-        if (sessionId != NULL && *sessionId != AUDIO_SESSION_OUTPUT_MIX) {
+        if (sessionId != NULL && *sessionId != AUDIO_SESSION_ALLOCATE) {
             // check if an effect chain with the same session ID is present on another
             // output thread and move it here.
             for (size_t i = 0; i < mPlaybackThreads.size(); i++) {
@@ -513,10 +580,13 @@ sp<IAudioTrack> AudioFlinger::createTrack(
 
         track = thread->createTrack_l(client, streamType, sampleRate, format,
                 channelMask, frameCount, sharedBuffer, lSessionId, flags, tid, clientUid, &lStatus);
+        LOG_ALWAYS_FATAL_IF((lStatus == NO_ERROR) && (track == 0));
+        // we don't abort yet if lStatus != NO_ERROR; there is still work to be done regardless
 
         // move effect chain to this output thread if an effect on same session was waiting
         // for a track to be created
         if (lStatus == NO_ERROR && effectThread != NULL) {
+            // no risk of deadlock because AudioFlinger::mLock is held
             Mutex::Autolock _dl(thread->mLock);
             Mutex::Autolock _sl(effectThread->mLock);
             moveEffectChain_l(lSessionId, effectThread, thread, true);
@@ -536,7 +606,9 @@ sp<IAudioTrack> AudioFlinger::createTrack(
                 }
             }
         }
+
     }
+
     if (lStatus == NO_ERROR) {
         // s for server's pid, n for normal mixer name, f for fast index
         name = String8::format("s:%d;n:%d;f:%d", getpid_cached, track->name() - AudioMixer::TRACK0,
@@ -550,9 +622,7 @@ sp<IAudioTrack> AudioFlinger::createTrack(
     }
 
 Exit:
-    if (status != NULL) {
-        *status = lStatus;
-    }
+    *status = lStatus;
     return trackHandle;
 }
 
@@ -1210,7 +1280,7 @@ AudioFlinger::NotificationClient::~NotificationClient()
 {
 }
 
-void AudioFlinger::NotificationClient::binderDied(const wp<IBinder>& who)
+void AudioFlinger::NotificationClient::binderDied(const wp<IBinder>& who __unused)
 {
     sp<NotificationClient> keep(this);
     mAudioFlinger->removeNotificationClient(mPid);
@@ -1228,7 +1298,7 @@ sp<IAudioRecord> AudioFlinger::openRecord(
         uint32_t sampleRate,
         audio_format_t format,
         audio_channel_mask_t channelMask,
-        size_t frameCount,
+        size_t *frameCount,
         IAudioFlinger::track_flags_t *flags,
         pid_t tid,
         int *sessionId,
@@ -1250,7 +1320,7 @@ sp<IAudioRecord> AudioFlinger::openRecord(
     }
 
     if (format != AUDIO_FORMAT_PCM_16_BIT) {
-        ALOGE("openRecord() invalid format %d", format);
+        ALOGE("openRecord() invalid format %#x", format);
         lStatus = BAD_VALUE;
         goto Exit;
     }
@@ -1276,7 +1346,7 @@ sp<IAudioRecord> AudioFlinger::openRecord(
         client = registerPid_l(pid);
 
         // If no audio session id is provided, create one here
-        if (sessionId != NULL && *sessionId != AUDIO_SESSION_OUTPUT_MIX) {
+        if (sessionId != NULL && *sessionId != AUDIO_SESSION_ALLOCATE) {
             lSessionId = *sessionId;
         } else {
             lSessionId = nextUniqueId();
@@ -1291,8 +1361,9 @@ sp<IAudioRecord> AudioFlinger::openRecord(
                                                   frameCount, lSessionId,
                                                   IPCThreadState::self()->getCallingUid(),
                                                   flags, tid, &lStatus);
-        LOG_ALWAYS_FATAL_IF((recordTrack != 0) != (lStatus == NO_ERROR));
+        LOG_ALWAYS_FATAL_IF((lStatus == NO_ERROR) && (recordTrack == 0));
     }
+
     if (lStatus != NO_ERROR) {
         // remove local strong reference to Client before deleting the RecordTrack so that the
         // Client destructor is called by the TrackBase destructor with mLock held
@@ -1301,14 +1372,11 @@ sp<IAudioRecord> AudioFlinger::openRecord(
         goto Exit;
     }
 
-    // return to handle to client
+    // return handle to client
     recordHandle = new RecordHandle(recordTrack);
-    lStatus = NO_ERROR;
 
 Exit:
-    if (status) {
-        *status = lStatus;
-    }
+    *status = lStatus;
     return recordHandle;
 }
 
@@ -1449,18 +1517,15 @@ audio_io_handle_t AudioFlinger::openOutput(audio_module_handle_t module,
                                            audio_output_flags_t flags,
                                            const audio_offload_info_t *offloadInfo)
 {
-    PlaybackThread *thread = NULL;
     struct audio_config config;
+    memset(&config, 0, sizeof(config));
     config.sample_rate = (pSamplingRate != NULL) ? *pSamplingRate : 0;
     config.channel_mask = (pChannelMask != NULL) ? *pChannelMask : 0;
     config.format = (pFormat != NULL) ? *pFormat : AUDIO_FORMAT_DEFAULT;
-    if (offloadInfo) {
+    if (offloadInfo != NULL) {
         config.offload_info = *offloadInfo;
     }
 
-    audio_stream_out_t *outStream = NULL;
-    AudioHwDevice *outHwDev;
-
     ALOGV("openOutput(), module %d Device %x, SamplingRate %d, Format %#08x, Channels %x, flags %x",
               module,
               (pDevices != NULL) ? *pDevices : 0,
@@ -1469,7 +1534,7 @@ audio_io_handle_t AudioFlinger::openOutput(audio_module_handle_t module,
               config.channel_mask,
               flags);
     ALOGV("openOutput(), offloadInfo %p version 0x%04x",
-          offloadInfo, offloadInfo == NULL ? -1 : offloadInfo->version );
+          offloadInfo, offloadInfo == NULL ? -1 : offloadInfo->version);
 
     if (pDevices == NULL || *pDevices == 0) {
         return 0;
@@ -1477,15 +1542,17 @@ audio_io_handle_t AudioFlinger::openOutput(audio_module_handle_t module,
 
     Mutex::Autolock _l(mLock);
 
-    outHwDev = findSuitableHwDev_l(module, *pDevices);
-    if (outHwDev == NULL)
+    AudioHwDevice *outHwDev = findSuitableHwDev_l(module, *pDevices);
+    if (outHwDev == NULL) {
         return 0;
+    }
 
     audio_hw_device_t *hwDevHal = outHwDev->hwDevice();
     audio_io_handle_t id = nextUniqueId();
 
     mHardwareStatus = AUDIO_HW_OUTPUT_OPEN;
 
+    audio_stream_out_t *outStream = NULL;
     status_t status = hwDevHal->open_output_stream(hwDevHal,
                                           id,
                                           *pDevices,
@@ -1505,6 +1572,7 @@ audio_io_handle_t AudioFlinger::openOutput(audio_module_handle_t module,
     if (status == NO_ERROR && outStream != NULL) {
         AudioStreamOut *output = new AudioStreamOut(outHwDev, outStream, flags);
 
+        PlaybackThread *thread;
         if (flags & AUDIO_OUTPUT_FLAG_COMPRESS_OFFLOAD) {
             thread = new OffloadThread(this, output, id, *pDevices);
             ALOGV("openOutput() created offload output: ID %d thread %p", id, thread);
@@ -1672,18 +1740,15 @@ audio_io_handle_t AudioFlinger::openInput(audio_module_handle_t module,
                                           audio_format_t *pFormat,
                                           audio_channel_mask_t *pChannelMask)
 {
-    status_t status;
-    RecordThread *thread = NULL;
     struct audio_config config;
+    memset(&config, 0, sizeof(config));
     config.sample_rate = (pSamplingRate != NULL) ? *pSamplingRate : 0;
     config.channel_mask = (pChannelMask != NULL) ? *pChannelMask : 0;
     config.format = (pFormat != NULL) ? *pFormat : AUDIO_FORMAT_DEFAULT;
 
     uint32_t reqSamplingRate = config.sample_rate;
     audio_format_t reqFormat = config.format;
-    audio_channel_mask_t reqChannels = config.channel_mask;
-    audio_stream_in_t *inStream = NULL;
-    AudioHwDevice *inHwDev;
+    audio_channel_mask_t reqChannelMask = config.channel_mask;
 
     if (pDevices == NULL || *pDevices == 0) {
         return 0;
@@ -1691,16 +1756,18 @@ audio_io_handle_t AudioFlinger::openInput(audio_module_handle_t module,
 
     Mutex::Autolock _l(mLock);
 
-    inHwDev = findSuitableHwDev_l(module, *pDevices);
-    if (inHwDev == NULL)
+    AudioHwDevice *inHwDev = findSuitableHwDev_l(module, *pDevices);
+    if (inHwDev == NULL) {
         return 0;
+    }
 
     audio_hw_device_t *inHwHal = inHwDev->hwDevice();
     audio_io_handle_t id = nextUniqueId();
 
-    status = inHwHal->open_input_stream(inHwHal, id, *pDevices, &config,
+    audio_stream_in_t *inStream = NULL;
+    status_t status = inHwHal->open_input_stream(inHwHal, id, *pDevices, &config,
                                         &inStream);
-    ALOGV("openInput() openInputStream returned input %p, SamplingRate %d, Format %d, Channels %x, "
+    ALOGV("openInput() openInputStream returned input %p, SamplingRate %d, Format %#x, Channels %x, "
             "status %d",
             inStream,
             config.sample_rate,
@@ -1714,10 +1781,12 @@ audio_io_handle_t AudioFlinger::openInput(audio_module_handle_t module,
     if (status == BAD_VALUE &&
         reqFormat == config.format && config.format == AUDIO_FORMAT_PCM_16_BIT &&
         (config.sample_rate <= 2 * reqSamplingRate) &&
-        (popcount(config.channel_mask) <= FCC_2) && (popcount(reqChannels) <= FCC_2)) {
+        (popcount(config.channel_mask) <= FCC_2) && (popcount(reqChannelMask) <= FCC_2)) {
+        // FIXME describe the change proposed by HAL (save old values so we can log them here)
         ALOGV("openInput() reopening with proposed sampling rate and channel mask");
         inStream = NULL;
         status = inHwHal->open_input_stream(inHwHal, id, *pDevices, &config, &inStream);
+        // FIXME log this new status; HAL should not propose any further changes
     }
 
     if (status == NO_ERROR && inStream != NULL) {
@@ -1735,7 +1804,7 @@ audio_io_handle_t AudioFlinger::openInput(audio_module_handle_t module,
                                         popcount(inStream->common.get_channels(&inStream->common)));
         if (!mTeeSinkInputEnabled) {
             kind = TEE_SINK_NO;
-        } else if (format == Format_Invalid) {
+        } else if (!Format_isValid(format)) {
             kind = TEE_SINK_NO;
         } else if (mRecordTeeSink == 0) {
             kind = TEE_SINK_NEW;
@@ -1776,10 +1845,10 @@ audio_io_handle_t AudioFlinger::openInput(audio_module_handle_t module,
         // Start record thread
         // RecordThread requires both input and output device indication to forward to audio
         // pre processing modules
-        thread = new RecordThread(this,
+        RecordThread *thread = new RecordThread(this,
                                   input,
                                   reqSamplingRate,
-                                  reqChannels,
+                                  reqChannelMask,
                                   id,
                                   primaryOutputDevice_l(),
                                   *pDevices
@@ -1796,7 +1865,7 @@ audio_io_handle_t AudioFlinger::openInput(audio_module_handle_t module,
             *pFormat = config.format;
         }
         if (pChannelMask != NULL) {
-            *pChannelMask = reqChannels;
+            *pChannelMask = reqChannelMask;
         }
 
         // notify client processes of the new input creation
@@ -1954,7 +2023,7 @@ void AudioFlinger::purgeStaleEffects_l() {
             }
         }
         if (!found) {
-            Mutex::Autolock _l (t->mLock);
+            Mutex::Autolock _l(t->mLock);
             // remove all effects from the chain
             while (ec->mEffects.size()) {
                 sp<EffectModule> effect = ec->mEffects[0];
@@ -2249,9 +2318,7 @@ sp<IEffect> AudioFlinger::createEffect(
     }
 
 Exit:
-    if (status != NULL) {
-        *status = lStatus;
-    }
+    *status = lStatus;
     return handle;
 }
 
diff --git a/services/audioflinger/AudioFlinger.h b/services/audioflinger/AudioFlinger.h
index 7320144..459d2ec 100644
--- a/services/audioflinger/AudioFlinger.h
+++ b/services/audioflinger/AudioFlinger.h
@@ -60,8 +60,8 @@
 
 namespace android {
 
-class audio_track_cblk_t;
-class effect_param_cblk_t;
+struct audio_track_cblk_t;
+struct effect_param_cblk_t;
 class AudioMixer;
 class AudioBuffer;
 class AudioResampler;
@@ -102,7 +102,7 @@ public:
                                 uint32_t sampleRate,
                                 audio_format_t format,
                                 audio_channel_mask_t channelMask,
-                                size_t frameCount,
+                                size_t *pFrameCount,
                                 IAudioFlinger::track_flags_t *flags,
                                 const sp<IMemory>& sharedBuffer,
                                 audio_io_handle_t output,
@@ -110,18 +110,18 @@ public:
                                 int *sessionId,
                                 String8& name,
                                 int clientUid,
-                                status_t *status);
+                                status_t *status /*non-NULL*/);
 
     virtual sp<IAudioRecord> openRecord(
                                 audio_io_handle_t input,
                                 uint32_t sampleRate,
                                 audio_format_t format,
                                 audio_channel_mask_t channelMask,
-                                size_t frameCount,
+                                size_t *pFrameCount,
                                 IAudioFlinger::track_flags_t *flags,
                                 pid_t tid,
                                 int *sessionId,
-                                status_t *status);
+                                status_t *status /*non-NULL*/);
 
     virtual     uint32_t    sampleRate(audio_io_handle_t output) const;
     virtual     int         channelCount(audio_io_handle_t output) const;
@@ -210,7 +210,7 @@ public:
                         int32_t priority,
                         audio_io_handle_t io,
                         int sessionId,
-                        status_t *status,
+                        status_t *status /*non-NULL*/,
                         int *id,
                         int *enabled);
 
@@ -235,8 +235,12 @@ public:
     sp<NBLog::Writer>   newWriter_l(size_t size, const char *name);
     void                unregisterWriter(const sp<NBLog::Writer>& writer);
 private:
-    static const size_t kLogMemorySize = 10 * 1024;
+    static const size_t kLogMemorySize = 40 * 1024;
     sp<MemoryDealer>    mLogMemoryDealer;   // == 0 when NBLog is disabled
+    // When a log writer is unregistered, it is done lazily so that media.log can continue to see it
+    // for as long as possible.  The memory is only freed when it is needed for another log writer.
+    Vector< sp<NBLog::Writer> > mUnregisteredWriters;
+    Mutex               mUnregisteredWritersLock;
 public:
 
     class SyncEvent;
@@ -499,7 +503,7 @@ private:
     private:
         const char * const mModuleName;
         audio_hw_device_t * const mHwDevice;
-        Flags mFlags;
+        const Flags mFlags;
     };
 
     // AudioStreamOut and AudioStreamIn are immutable, so their fields are const.
@@ -509,7 +513,7 @@ private:
     struct AudioStreamOut {
         AudioHwDevice* const audioHwDev;
         audio_stream_out_t* const stream;
-        audio_output_flags_t flags;
+        const audio_output_flags_t flags;
 
         audio_hw_device_t* hwDev() const { return audioHwDev->hwDevice(); }
 
@@ -651,6 +655,8 @@ private:
 
 #undef INCLUDING_FROM_AUDIOFLINGER_H
 
+const char *formatToString(audio_format_t format);
+
 // ----------------------------------------------------------------------------
 
 }; // namespace android
diff --git a/services/audioflinger/AudioMixer.cpp b/services/audioflinger/AudioMixer.cpp
index f92421e..67d83b1 100644
--- a/services/audioflinger/AudioMixer.cpp
+++ b/services/audioflinger/AudioMixer.cpp
@@ -58,7 +58,7 @@ AudioMixer::DownmixerBufferProvider::~DownmixerBufferProvider()
 status_t AudioMixer::DownmixerBufferProvider::getNextBuffer(AudioBufferProvider::Buffer *pBuffer,
         int64_t pts) {
     //ALOGV("DownmixerBufferProvider::getNextBuffer()");
-    if (this->mTrackBufferProvider != NULL) {
+    if (mTrackBufferProvider != NULL) {
         status_t res = mTrackBufferProvider->getNextBuffer(pBuffer, pts);
         if (res == OK) {
             mDownmixConfig.inputCfg.buffer.frameCount = pBuffer->frameCount;
@@ -81,7 +81,7 @@ status_t AudioMixer::DownmixerBufferProvider::getNextBuffer(AudioBufferProvider:
 
 void AudioMixer::DownmixerBufferProvider::releaseBuffer(AudioBufferProvider::Buffer *pBuffer) {
     //ALOGV("DownmixerBufferProvider::releaseBuffer()");
-    if (this->mTrackBufferProvider != NULL) {
+    if (mTrackBufferProvider != NULL) {
         mTrackBufferProvider->releaseBuffer(pBuffer);
     } else {
         ALOGE("DownmixerBufferProvider::releaseBuffer() error: NULL track buffer provider");
@@ -90,9 +90,9 @@ void AudioMixer::DownmixerBufferProvider::releaseBuffer(AudioBufferProvider::Buf
 
 
 // ----------------------------------------------------------------------------
-bool AudioMixer::isMultichannelCapable = false;
+bool AudioMixer::sIsMultichannelCapable = false;
 
-effect_descriptor_t AudioMixer::dwnmFxDesc;
+effect_descriptor_t AudioMixer::sDwnmFxDesc;
 
 // Ensure mConfiguredNames bitmask is initialized properly on all architectures.
 // The value of 1 << x is undefined in C when x >= 32.
@@ -113,8 +113,6 @@ AudioMixer::AudioMixer(size_t frameCount, uint32_t sampleRate, uint32_t maxNumTr
     // AudioMixer is not yet capable of multi-channel output beyond stereo
     ALOG_ASSERT(2 == MAX_NUM_CHANNELS, "bad MAX_NUM_CHANNELS %d", MAX_NUM_CHANNELS);
 
-    LocalClock lc;
-
     pthread_once(&sOnceControl, &sInitRoutine);
 
     mState.enabledTracks= 0;
@@ -136,27 +134,6 @@ AudioMixer::AudioMixer(size_t frameCount, uint32_t sampleRate, uint32_t maxNumTr
         t++;
     }
 
-    // find multichannel downmix effect if we have to play multichannel content
-    uint32_t numEffects = 0;
-    int ret = EffectQueryNumberEffects(&numEffects);
-    if (ret != 0) {
-        ALOGE("AudioMixer() error %d querying number of effects", ret);
-        return;
-    }
-    ALOGV("EffectQueryNumberEffects() numEffects=%d", numEffects);
-
-    for (uint32_t i = 0 ; i < numEffects ; i++) {
-        if (EffectQueryEffect(i, &dwnmFxDesc) == 0) {
-            ALOGV("effect %d is called %s", i, dwnmFxDesc.name);
-            if (memcmp(&dwnmFxDesc.type, EFFECT_UIID_DOWNMIX, sizeof(effect_uuid_t)) == 0) {
-                ALOGI("found effect \"%s\" from %s",
-                        dwnmFxDesc.name, dwnmFxDesc.implementor);
-                isMultichannelCapable = true;
-                break;
-            }
-        }
-    }
-    ALOGE_IF(!isMultichannelCapable, "unable to find downmix effect");
 }
 
 AudioMixer::~AudioMixer()
@@ -229,7 +206,7 @@ int AudioMixer::getTrackName(audio_channel_mask_t channelMask, int sessionId)
 
 void AudioMixer::invalidateState(uint32_t mask)
 {
-    if (mask) {
+    if (mask != 0) {
         mState.needsChanged |= mask;
         mState.hook = process__validate;
     }
@@ -252,7 +229,7 @@ status_t AudioMixer::initTrackDownmix(track_t* pTrack, int trackNum, audio_chann
     return status;
 }
 
-void AudioMixer::unprepareTrackForDownmix(track_t* pTrack, int trackName) {
+void AudioMixer::unprepareTrackForDownmix(track_t* pTrack, int trackName __unused) {
     ALOGV("AudioMixer::unprepareTrackForDownmix(%d)", trackName);
 
     if (pTrack->downmixerBufferProvider != NULL) {
@@ -276,13 +253,13 @@ status_t AudioMixer::prepareTrackForDownmix(track_t* pTrack, int trackName)
     DownmixerBufferProvider* pDbp = new DownmixerBufferProvider();
     int32_t status;
 
-    if (!isMultichannelCapable) {
+    if (!sIsMultichannelCapable) {
         ALOGE("prepareTrackForDownmix(%d) fails: mixer doesn't support multichannel content",
                 trackName);
         goto noDownmixForActiveTrack;
     }
 
-    if (EffectCreate(&dwnmFxDesc.uuid,
+    if (EffectCreate(&sDwnmFxDesc.uuid,
             pTrack->sessionId /*sessionId*/, -2 /*ioId not relevant here, using random value*/,
             &pDbp->mDownmixHandle/*pHandle*/) != 0) {
         ALOGE("prepareTrackForDownmix(%d) fails: error creating downmixer effect", trackName);
@@ -560,14 +537,14 @@ bool AudioMixer::track_t::setResampler(uint32_t value, uint32_t devSampleRate)
                 // Should have a way to distinguish tracks with static ratios vs. dynamic ratios.
                 if (!((value == 44100 && devSampleRate == 48000) ||
                       (value == 48000 && devSampleRate == 44100))) {
-                    quality = AudioResampler::LOW_QUALITY;
+                    quality = AudioResampler::DYN_LOW_QUALITY;
                 } else {
                     quality = AudioResampler::DEFAULT_QUALITY;
                 }
                 resampler = AudioResampler::create(
                         format,
                         // the resampler sees the number of channels after the downmixer, if any
-                        downmixerBufferProvider != NULL ? MAX_NUM_CHANNELS : channelCount,
+                        (int) (downmixerBufferProvider != NULL ? MAX_NUM_CHANNELS : channelCount),
                         devSampleRate, quality);
                 resampler->setLocalTimeFreq(sLocalTimeFreq);
             }
@@ -668,27 +645,29 @@ void AudioMixer::process__validate(state_t* state, int64_t pts)
         countActiveTracks++;
         track_t& t = state->tracks[i];
         uint32_t n = 0;
+        // FIXME can overflow (mask is only 3 bits)
         n |= NEEDS_CHANNEL_1 + t.channelCount - 1;
-        n |= NEEDS_FORMAT_16;
-        n |= t.doesResample() ? NEEDS_RESAMPLE_ENABLED : NEEDS_RESAMPLE_DISABLED;
+        if (t.doesResample()) {
+            n |= NEEDS_RESAMPLE;
+        }
         if (t.auxLevel != 0 && t.auxBuffer != NULL) {
-            n |= NEEDS_AUX_ENABLED;
+            n |= NEEDS_AUX;
         }
 
         if (t.volumeInc[0]|t.volumeInc[1]) {
             volumeRamp = true;
         } else if (!t.doesResample() && t.volumeRL == 0) {
-            n |= NEEDS_MUTE_ENABLED;
+            n |= NEEDS_MUTE;
         }
         t.needs = n;
 
-        if ((n & NEEDS_MUTE__MASK) == NEEDS_MUTE_ENABLED) {
+        if (n & NEEDS_MUTE) {
             t.hook = track__nop;
         } else {
-            if ((n & NEEDS_AUX__MASK) == NEEDS_AUX_ENABLED) {
+            if (n & NEEDS_AUX) {
                 all16BitsStereoNoResample = false;
             }
-            if ((n & NEEDS_RESAMPLE__MASK) == NEEDS_RESAMPLE_ENABLED) {
+            if (n & NEEDS_RESAMPLE) {
                 all16BitsStereoNoResample = false;
                 resampling = true;
                 t.hook = track__genericResample;
@@ -710,7 +689,7 @@ void AudioMixer::process__validate(state_t* state, int64_t pts)
 
     // select the processing hooks
     state->hook = process__nop;
-    if (countActiveTracks) {
+    if (countActiveTracks > 0) {
         if (resampling) {
             if (!state->outputTemp) {
                 state->outputTemp = new int32_t[MAX_NUM_CHANNELS * state->frameCount];
@@ -746,16 +725,15 @@ void AudioMixer::process__validate(state_t* state, int64_t pts)
 
     // Now that the volume ramp has been done, set optimal state and
     // track hooks for subsequent mixer process
-    if (countActiveTracks) {
+    if (countActiveTracks > 0) {
         bool allMuted = true;
         uint32_t en = state->enabledTracks;
         while (en) {
             const int i = 31 - __builtin_clz(en);
             en &= ~(1<<i);
             track_t& t = state->tracks[i];
-            if (!t.doesResample() && t.volumeRL == 0)
-            {
-                t.needs |= NEEDS_MUTE_ENABLED;
+            if (!t.doesResample() && t.volumeRL == 0) {
+                t.needs |= NEEDS_MUTE;
                 t.hook = track__nop;
             } else {
                 allMuted = false;
@@ -806,8 +784,8 @@ void AudioMixer::track__genericResample(track_t* t, int32_t* out, size_t outFram
     }
 }
 
-void AudioMixer::track__nop(track_t* t, int32_t* out, size_t outFrameCount, int32_t* temp,
-        int32_t* aux)
+void AudioMixer::track__nop(track_t* t __unused, int32_t* out __unused,
+        size_t outFrameCount __unused, int32_t* temp __unused, int32_t* aux __unused)
 {
 }
 
@@ -883,8 +861,8 @@ void AudioMixer::volumeStereo(track_t* t, int32_t* out, size_t frameCount, int32
     }
 }
 
-void AudioMixer::track__16BitsStereo(track_t* t, int32_t* out, size_t frameCount, int32_t* temp,
-        int32_t* aux)
+void AudioMixer::track__16BitsStereo(track_t* t, int32_t* out, size_t frameCount,
+        int32_t* temp __unused, int32_t* aux)
 {
     const int16_t *in = static_cast<const int16_t *>(t->in);
 
@@ -974,8 +952,8 @@ void AudioMixer::track__16BitsStereo(track_t* t, int32_t* out, size_t frameCount
     t->in = in;
 }
 
-void AudioMixer::track__16BitsMono(track_t* t, int32_t* out, size_t frameCount, int32_t* temp,
-        int32_t* aux)
+void AudioMixer::track__16BitsMono(track_t* t, int32_t* out, size_t frameCount,
+        int32_t* temp __unused, int32_t* aux)
 {
     const int16_t *in = static_cast<int16_t const *>(t->in);
 
@@ -1154,7 +1132,7 @@ void AudioMixer::process__genericNoResampling(state_t* state, int64_t pts)
                 track_t& t = state->tracks[i];
                 size_t outFrames = BLOCKSIZE;
                 int32_t *aux = NULL;
-                if (CC_UNLIKELY((t.needs & NEEDS_AUX__MASK) == NEEDS_AUX_ENABLED)) {
+                if (CC_UNLIKELY(t.needs & NEEDS_AUX)) {
                     aux = t.auxBuffer + numFrames;
                 }
                 while (outFrames) {
@@ -1166,7 +1144,7 @@ void AudioMixer::process__genericNoResampling(state_t* state, int64_t pts)
                         break;
                     }
                     size_t inFrames = (t.frameCount > outFrames)?outFrames:t.frameCount;
-                    if (inFrames) {
+                    if (inFrames > 0) {
                         t.hook(&t, outTemp + (BLOCKSIZE-outFrames)*MAX_NUM_CHANNELS, inFrames,
                                 state->resampleTemp, aux);
                         t.frameCount -= inFrames;
@@ -1242,14 +1220,14 @@ void AudioMixer::process__genericResampling(state_t* state, int64_t pts)
             e1 &= ~(1<<i);
             track_t& t = state->tracks[i];
             int32_t *aux = NULL;
-            if (CC_UNLIKELY((t.needs & NEEDS_AUX__MASK) == NEEDS_AUX_ENABLED)) {
+            if (CC_UNLIKELY(t.needs & NEEDS_AUX)) {
                 aux = t.auxBuffer;
             }
 
             // this is a little goofy, on the resampling case we don't
             // acquire/release the buffers because it's done by
             // the resampler.
-            if ((t.needs & NEEDS_RESAMPLE__MASK) == NEEDS_RESAMPLE_ENABLED) {
+            if (t.needs & NEEDS_RESAMPLE) {
                 t.resampler->setPTS(pts);
                 t.hook(&t, outTemp, numFrames, state->resampleTemp, aux);
             } else {
@@ -1449,8 +1427,9 @@ void AudioMixer::process__TwoTracks16BitsStereoNoResampling(state_t* state,
 int64_t AudioMixer::calculateOutputPTS(const track_t& t, int64_t basePTS,
                                        int outputFrameIndex)
 {
-    if (AudioBufferProvider::kInvalidPTS == basePTS)
+    if (AudioBufferProvider::kInvalidPTS == basePTS) {
         return AudioBufferProvider::kInvalidPTS;
+    }
 
     return basePTS + ((outputFrameIndex * sLocalTimeFreq) / t.sampleRate);
 }
@@ -1462,6 +1441,28 @@ int64_t AudioMixer::calculateOutputPTS(const track_t& t, int64_t basePTS,
 {
     LocalClock lc;
     sLocalTimeFreq = lc.getLocalFreq();
+
+    // find multichannel downmix effect if we have to play multichannel content
+    uint32_t numEffects = 0;
+    int ret = EffectQueryNumberEffects(&numEffects);
+    if (ret != 0) {
+        ALOGE("AudioMixer() error %d querying number of effects", ret);
+        return;
+    }
+    ALOGV("EffectQueryNumberEffects() numEffects=%d", numEffects);
+
+    for (uint32_t i = 0 ; i < numEffects ; i++) {
+        if (EffectQueryEffect(i, &sDwnmFxDesc) == 0) {
+            ALOGV("effect %d is called %s", i, sDwnmFxDesc.name);
+            if (memcmp(&sDwnmFxDesc.type, EFFECT_UIID_DOWNMIX, sizeof(effect_uuid_t)) == 0) {
+                ALOGI("found effect \"%s\" from %s",
+                        sDwnmFxDesc.name, sDwnmFxDesc.implementor);
+                sIsMultichannelCapable = true;
+                break;
+            }
+        }
+    }
+    ALOGW_IF(!sIsMultichannelCapable, "unable to find downmix effect");
 }
 
 // ----------------------------------------------------------------------------
diff --git a/services/audioflinger/AudioMixer.h b/services/audioflinger/AudioMixer.h
index 43aeb86..d286986 100644
--- a/services/audioflinger/AudioMixer.h
+++ b/services/audioflinger/AudioMixer.h
@@ -120,27 +120,19 @@ public:
 private:
 
     enum {
+        // FIXME this representation permits up to 8 channels
         NEEDS_CHANNEL_COUNT__MASK   = 0x00000007,
-        NEEDS_FORMAT__MASK          = 0x000000F0,
-        NEEDS_MUTE__MASK            = 0x00000100,
-        NEEDS_RESAMPLE__MASK        = 0x00001000,
-        NEEDS_AUX__MASK             = 0x00010000,
     };
 
     enum {
-        NEEDS_CHANNEL_1             = 0x00000000,
-        NEEDS_CHANNEL_2             = 0x00000001,
+        NEEDS_CHANNEL_1             = 0x00000000,   // mono
+        NEEDS_CHANNEL_2             = 0x00000001,   // stereo
 
-        NEEDS_FORMAT_16             = 0x00000010,
+        // sample format is not explicitly specified, and is assumed to be AUDIO_FORMAT_PCM_16_BIT
 
-        NEEDS_MUTE_DISABLED         = 0x00000000,
-        NEEDS_MUTE_ENABLED          = 0x00000100,
-
-        NEEDS_RESAMPLE_DISABLED     = 0x00000000,
-        NEEDS_RESAMPLE_ENABLED      = 0x00001000,
-
-        NEEDS_AUX_DISABLED     = 0x00000000,
-        NEEDS_AUX_ENABLED      = 0x00010000,
+        NEEDS_MUTE                  = 0x00000100,
+        NEEDS_RESAMPLE              = 0x00001000,
+        NEEDS_AUX                   = 0x00010000,
     };
 
     struct state_t;
@@ -224,7 +216,7 @@ private:
         NBLog::Writer*  mLog;
         int32_t         reserved[1];
         // FIXME allocate dynamically to save some memory when maxNumTracks < MAX_NUM_TRACKS
-        track_t         tracks[MAX_NUM_TRACKS]; __attribute__((aligned(32)));
+        track_t         tracks[MAX_NUM_TRACKS] __attribute__((aligned(32)));
     };
 
     // AudioBufferProvider that wraps a track AudioBufferProvider by a call to a downmix effect
@@ -256,9 +248,9 @@ private:
     state_t         mState __attribute__((aligned(32)));
 
     // effect descriptor for the downmixer used by the mixer
-    static effect_descriptor_t dwnmFxDesc;
+    static effect_descriptor_t sDwnmFxDesc;
     // indicates whether a downmix effect has been found and is usable by this mixer
-    static bool                isMultichannelCapable;
+    static bool                sIsMultichannelCapable;
 
     // Call after changing either the enabled status of a track, or parameters of an enabled track.
     // OK to call more often than that, but unnecessary.
diff --git a/services/audioflinger/AudioPolicyService.cpp b/services/audioflinger/AudioPolicyService.cpp
index 646a317..9980344 100644
--- a/services/audioflinger/AudioPolicyService.cpp
+++ b/services/audioflinger/AudioPolicyService.cpp
@@ -60,7 +60,7 @@ namespace {
 // ----------------------------------------------------------------------------
 
 AudioPolicyService::AudioPolicyService()
-    : BnAudioPolicyService() , mpAudioPolicyDev(NULL) , mpAudioPolicy(NULL)
+    : BnAudioPolicyService(), mpAudioPolicyDev(NULL), mpAudioPolicy(NULL)
 {
     char value[PROPERTY_VALUE_MAX];
     const struct hw_module_t *module;
@@ -77,24 +77,28 @@ AudioPolicyService::AudioPolicyService()
     mOutputCommandThread = new AudioCommandThread(String8("ApmOutput"), this);
     /* instantiate the audio policy manager */
     rc = hw_get_module(AUDIO_POLICY_HARDWARE_MODULE_ID, &module);
-    if (rc)
+    if (rc) {
         return;
+    }
 
     rc = audio_policy_dev_open(module, &mpAudioPolicyDev);
     ALOGE_IF(rc, "couldn't open audio policy device (%s)", strerror(-rc));
-    if (rc)
+    if (rc) {
         return;
+    }
 
     rc = mpAudioPolicyDev->create_audio_policy(mpAudioPolicyDev, &aps_ops, this,
                                                &mpAudioPolicy);
     ALOGE_IF(rc, "couldn't create audio policy (%s)", strerror(-rc));
-    if (rc)
+    if (rc) {
         return;
+    }
 
     rc = mpAudioPolicy->init_check(mpAudioPolicy);
     ALOGE_IF(rc, "couldn't init_check the audio policy (%s)", strerror(-rc));
-    if (rc)
+    if (rc) {
         return;
+    }
 
     ALOGI("Loaded audio policy from %s (%s)", module->name, module->id);
 
@@ -126,10 +130,12 @@ AudioPolicyService::~AudioPolicyService()
     }
     mInputs.clear();
 
-    if (mpAudioPolicy != NULL && mpAudioPolicyDev != NULL)
+    if (mpAudioPolicy != NULL && mpAudioPolicyDev != NULL) {
         mpAudioPolicyDev->destroy_audio_policy(mpAudioPolicyDev, mpAudioPolicy);
-    if (mpAudioPolicyDev != NULL)
+    }
+    if (mpAudioPolicyDev != NULL) {
         audio_policy_dev_close(mpAudioPolicyDev);
+    }
 }
 
 status_t AudioPolicyService::setDeviceConnectionState(audio_devices_t device,
@@ -469,8 +475,9 @@ audio_devices_t AudioPolicyService::getDevicesForStream(audio_stream_type_t stre
 
 audio_io_handle_t AudioPolicyService::getOutputForEffect(const effect_descriptor_t *desc)
 {
+    // FIXME change return type to status_t, and return NO_INIT here
     if (mpAudioPolicy == NULL) {
-        return NO_INIT;
+        return 0;
     }
     Mutex::Autolock _l(mLock);
     return mpAudioPolicy->get_output_for_effect(mpAudioPolicy, desc);
@@ -606,7 +613,7 @@ status_t AudioPolicyService::dumpInternals(int fd)
     return NO_ERROR;
 }
 
-status_t AudioPolicyService::dump(int fd, const Vector<String16>& args)
+status_t AudioPolicyService::dump(int fd, const Vector<String16>& args __unused)
 {
     if (!dumpAllowed()) {
         dumpPermissionDenial(fd);
@@ -1114,11 +1121,13 @@ int AudioPolicyService::setStreamVolume(audio_stream_type_t stream,
 int AudioPolicyService::startTone(audio_policy_tone_t tone,
                                   audio_stream_type_t stream)
 {
-    if (tone != AUDIO_POLICY_TONE_IN_CALL_NOTIFICATION)
+    if (tone != AUDIO_POLICY_TONE_IN_CALL_NOTIFICATION) {
         ALOGE("startTone: illegal tone requested (%d)", tone);
-    if (stream != AUDIO_STREAM_VOICE_CALL)
+    }
+    if (stream != AUDIO_STREAM_VOICE_CALL) {
         ALOGE("startTone: illegal stream (%d) requested for tone %d", stream,
             tone);
+    }
     mTonePlaybackThread->startToneCommand(ToneGenerator::TONE_SUP_CALL_WAITING,
                                           AUDIO_STREAM_VOICE_CALL);
     return 0;
@@ -1452,7 +1461,7 @@ status_t AudioPolicyService::loadPreProcessorConfig(const char *path)
 extern "C" {
 
 
-static audio_module_handle_t aps_load_hw_module(void *service,
+static audio_module_handle_t aps_load_hw_module(void *service __unused,
                                              const char *name)
 {
     sp<IAudioFlinger> af = AudioSystem::get_audio_flinger();
@@ -1465,7 +1474,7 @@ static audio_module_handle_t aps_load_hw_module(void *service,
 }
 
 // deprecated: replaced by aps_open_output_on_module()
-static audio_io_handle_t aps_open_output(void *service,
+static audio_io_handle_t aps_open_output(void *service __unused,
                                          audio_devices_t *pDevices,
                                          uint32_t *pSamplingRate,
                                          audio_format_t *pFormat,
@@ -1483,7 +1492,7 @@ static audio_io_handle_t aps_open_output(void *service,
                           pLatencyMs, flags);
 }
 
-static audio_io_handle_t aps_open_output_on_module(void *service,
+static audio_io_handle_t aps_open_output_on_module(void *service __unused,
                                                    audio_module_handle_t module,
                                                    audio_devices_t *pDevices,
                                                    uint32_t *pSamplingRate,
@@ -1502,7 +1511,7 @@ static audio_io_handle_t aps_open_output_on_module(void *service,
                           pLatencyMs, flags, offloadInfo);
 }
 
-static audio_io_handle_t aps_open_dup_output(void *service,
+static audio_io_handle_t aps_open_dup_output(void *service __unused,
                                                  audio_io_handle_t output1,
                                                  audio_io_handle_t output2)
 {
@@ -1514,16 +1523,17 @@ static audio_io_handle_t aps_open_dup_output(void *service,
     return af->openDuplicateOutput(output1, output2);
 }
 
-static int aps_close_output(void *service, audio_io_handle_t output)
+static int aps_close_output(void *service __unused, audio_io_handle_t output)
 {
     sp<IAudioFlinger> af = AudioSystem::get_audio_flinger();
-    if (af == 0)
+    if (af == 0) {
         return PERMISSION_DENIED;
+    }
 
     return af->closeOutput(output);
 }
 
-static int aps_suspend_output(void *service, audio_io_handle_t output)
+static int aps_suspend_output(void *service __unused, audio_io_handle_t output)
 {
     sp<IAudioFlinger> af = AudioSystem::get_audio_flinger();
     if (af == 0) {
@@ -1534,7 +1544,7 @@ static int aps_suspend_output(void *service, audio_io_handle_t output)
     return af->suspendOutput(output);
 }
 
-static int aps_restore_output(void *service, audio_io_handle_t output)
+static int aps_restore_output(void *service __unused, audio_io_handle_t output)
 {
     sp<IAudioFlinger> af = AudioSystem::get_audio_flinger();
     if (af == 0) {
@@ -1546,12 +1556,12 @@ static int aps_restore_output(void *service, audio_io_handle_t output)
 }
 
 // deprecated: replaced by aps_open_input_on_module(), and acoustics parameter is ignored
-static audio_io_handle_t aps_open_input(void *service,
+static audio_io_handle_t aps_open_input(void *service __unused,
                                         audio_devices_t *pDevices,
                                         uint32_t *pSamplingRate,
                                         audio_format_t *pFormat,
                                         audio_channel_mask_t *pChannelMask,
-                                        audio_in_acoustics_t acoustics)
+                                        audio_in_acoustics_t acoustics __unused)
 {
     sp<IAudioFlinger> af = AudioSystem::get_audio_flinger();
     if (af == 0) {
@@ -1562,7 +1572,7 @@ static audio_io_handle_t aps_open_input(void *service,
     return af->openInput((audio_module_handle_t)0, pDevices, pSamplingRate, pFormat, pChannelMask);
 }
 
-static audio_io_handle_t aps_open_input_on_module(void *service,
+static audio_io_handle_t aps_open_input_on_module(void *service __unused,
                                                   audio_module_handle_t module,
                                                   audio_devices_t *pDevices,
                                                   uint32_t *pSamplingRate,
@@ -1578,37 +1588,40 @@ static audio_io_handle_t aps_open_input_on_module(void *service,
     return af->openInput(module, pDevices, pSamplingRate, pFormat, pChannelMask);
 }
 
-static int aps_close_input(void *service, audio_io_handle_t input)
+static int aps_close_input(void *service __unused, audio_io_handle_t input)
 {
     sp<IAudioFlinger> af = AudioSystem::get_audio_flinger();
-    if (af == 0)
+    if (af == 0) {
         return PERMISSION_DENIED;
+    }
 
     return af->closeInput(input);
 }
 
-static int aps_set_stream_output(void *service, audio_stream_type_t stream,
+static int aps_set_stream_output(void *service __unused, audio_stream_type_t stream,
                                      audio_io_handle_t output)
 {
     sp<IAudioFlinger> af = AudioSystem::get_audio_flinger();
-    if (af == 0)
+    if (af == 0) {
         return PERMISSION_DENIED;
+    }
 
     return af->setStreamOutput(stream, output);
 }
 
-static int aps_move_effects(void *service, int session,
+static int aps_move_effects(void *service __unused, int session,
                                 audio_io_handle_t src_output,
                                 audio_io_handle_t dst_output)
 {
     sp<IAudioFlinger> af = AudioSystem::get_audio_flinger();
-    if (af == 0)
+    if (af == 0) {
         return PERMISSION_DENIED;
+    }
 
     return af->moveEffects(session, src_output, dst_output);
 }
 
-static char * aps_get_parameters(void *service, audio_io_handle_t io_handle,
+static char * aps_get_parameters(void *service __unused, audio_io_handle_t io_handle,
                                      const char *keys)
 {
     String8 result = AudioSystem::getParameters(io_handle, String8(keys));
@@ -1659,24 +1672,24 @@ static int aps_set_voice_volume(void *service, float volume, int delay_ms)
 
 namespace {
     struct audio_policy_service_ops aps_ops = {
-        open_output           : aps_open_output,
-        open_duplicate_output : aps_open_dup_output,
-        close_output          : aps_close_output,
-        suspend_output        : aps_suspend_output,
-        restore_output        : aps_restore_output,
-        open_input            : aps_open_input,
-        close_input           : aps_close_input,
-        set_stream_volume     : aps_set_stream_volume,
-        set_stream_output     : aps_set_stream_output,
-        set_parameters        : aps_set_parameters,
-        get_parameters        : aps_get_parameters,
-        start_tone            : aps_start_tone,
-        stop_tone             : aps_stop_tone,
-        set_voice_volume      : aps_set_voice_volume,
-        move_effects          : aps_move_effects,
-        load_hw_module        : aps_load_hw_module,
-        open_output_on_module : aps_open_output_on_module,
-        open_input_on_module  : aps_open_input_on_module,
+        .open_output           = aps_open_output,
+        .open_duplicate_output = aps_open_dup_output,
+        .close_output          = aps_close_output,
+        .suspend_output        = aps_suspend_output,
+        .restore_output        = aps_restore_output,
+        .open_input            = aps_open_input,
+        .close_input           = aps_close_input,
+        .set_stream_volume     = aps_set_stream_volume,
+        .set_stream_output     = aps_set_stream_output,
+        .set_parameters        = aps_set_parameters,
+        .get_parameters        = aps_get_parameters,
+        .start_tone            = aps_start_tone,
+        .stop_tone             = aps_stop_tone,
+        .set_voice_volume      = aps_set_voice_volume,
+        .move_effects          = aps_move_effects,
+        .load_hw_module        = aps_load_hw_module,
+        .open_output_on_module = aps_open_output_on_module,
+        .open_input_on_module  = aps_open_input_on_module,
     };
 }; // namespace <unnamed>
 
diff --git a/services/audioflinger/AudioResampler.cpp b/services/audioflinger/AudioResampler.cpp
index e5cceb1..b206116 100644
--- a/services/audioflinger/AudioResampler.cpp
+++ b/services/audioflinger/AudioResampler.cpp
@@ -25,6 +25,7 @@
 #include "AudioResampler.h"
 #include "AudioResamplerSinc.h"
 #include "AudioResamplerCubic.h"
+#include "AudioResamplerDyn.h"
 
 #ifdef __arm__
 #include <machine/cpu-features.h>
@@ -77,6 +78,9 @@ private:
     int mX0R;
 };
 
+/*static*/
+const double AudioResampler::kPhaseMultiplier = 1L << AudioResampler::kNumPhaseBits;
+
 bool AudioResampler::qualityIsSupported(src_quality quality)
 {
     switch (quality) {
@@ -85,6 +89,9 @@ bool AudioResampler::qualityIsSupported(src_quality quality)
     case MED_QUALITY:
     case HIGH_QUALITY:
     case VERY_HIGH_QUALITY:
+    case DYN_LOW_QUALITY:
+    case DYN_MED_QUALITY:
+    case DYN_HIGH_QUALITY:
         return true;
     default:
         return false;
@@ -105,7 +112,7 @@ void AudioResampler::init_routine()
         if (*endptr == '\0') {
             defaultQuality = (src_quality) l;
             ALOGD("forcing AudioResampler quality to %d", defaultQuality);
-            if (defaultQuality < DEFAULT_QUALITY || defaultQuality > VERY_HIGH_QUALITY) {
+            if (defaultQuality < DEFAULT_QUALITY || defaultQuality > DYN_HIGH_QUALITY) {
                 defaultQuality = DEFAULT_QUALITY;
             }
         }
@@ -125,6 +132,12 @@ uint32_t AudioResampler::qualityMHz(src_quality quality)
         return 20;
     case VERY_HIGH_QUALITY:
         return 34;
+    case DYN_LOW_QUALITY:
+        return 4;
+    case DYN_MED_QUALITY:
+        return 6;
+    case DYN_HIGH_QUALITY:
+        return 12;
     }
 }
 
@@ -148,6 +161,16 @@ AudioResampler* AudioResampler::create(int bitDepth, int inChannelCount,
         atFinalQuality = true;
     }
 
+    /* if the caller requests DEFAULT_QUALITY and af.resampler.property
+     * has not been set, the target resampler quality is set to DYN_MED_QUALITY,
+     * and allowed to "throttle" down to DYN_LOW_QUALITY if necessary
+     * due to estimated CPU load of having too many active resamplers
+     * (the code below the if).
+     */
+    if (quality == DEFAULT_QUALITY) {
+        quality = DYN_MED_QUALITY;
+    }
+
     // naive implementation of CPU load throttling doesn't account for whether resampler is active
     pthread_mutex_lock(&mutex);
     for (;;) {
@@ -162,7 +185,6 @@ AudioResampler* AudioResampler::create(int bitDepth, int inChannelCount,
         // not enough CPU available for proposed quality level, so try next lowest level
         switch (quality) {
         default:
-        case DEFAULT_QUALITY:
         case LOW_QUALITY:
             atFinalQuality = true;
             break;
@@ -175,6 +197,15 @@ AudioResampler* AudioResampler::create(int bitDepth, int inChannelCount,
         case VERY_HIGH_QUALITY:
             quality = HIGH_QUALITY;
             break;
+        case DYN_LOW_QUALITY:
+            atFinalQuality = true;
+            break;
+        case DYN_MED_QUALITY:
+            quality = DYN_LOW_QUALITY;
+            break;
+        case DYN_HIGH_QUALITY:
+            quality = DYN_MED_QUALITY;
+            break;
         }
     }
     pthread_mutex_unlock(&mutex);
@@ -183,7 +214,6 @@ AudioResampler* AudioResampler::create(int bitDepth, int inChannelCount,
 
     switch (quality) {
     default:
-    case DEFAULT_QUALITY:
     case LOW_QUALITY:
         ALOGV("Create linear Resampler");
         resampler = new AudioResamplerOrder1(bitDepth, inChannelCount, sampleRate);
@@ -200,6 +230,12 @@ AudioResampler* AudioResampler::create(int bitDepth, int inChannelCount,
         ALOGV("Create VERY_HIGH_QUALITY sinc Resampler = %d", quality);
         resampler = new AudioResamplerSinc(bitDepth, inChannelCount, sampleRate, quality);
         break;
+    case DYN_LOW_QUALITY:
+    case DYN_MED_QUALITY:
+    case DYN_HIGH_QUALITY:
+        ALOGV("Create dynamic Resampler = %d", quality);
+        resampler = new AudioResamplerDyn(bitDepth, inChannelCount, sampleRate, quality);
+        break;
     }
 
     // initialize resampler
@@ -339,8 +375,9 @@ void AudioResamplerOrder1::resampleStereo16(int32_t* out, size_t outFrameCount,
             out[outputIndex++] += vl * Interp(mX0L, in[0], phaseFraction);
             out[outputIndex++] += vr * Interp(mX0R, in[1], phaseFraction);
             Advance(&inputIndex, &phaseFraction, phaseIncrement);
-            if (outputIndex == outputSampleCount)
+            if (outputIndex == outputSampleCount) {
                 break;
+            }
         }
 
         // process input samples
@@ -434,8 +471,9 @@ void AudioResamplerOrder1::resampleMono16(int32_t* out, size_t outFrameCount,
             out[outputIndex++] += vl * sample;
             out[outputIndex++] += vr * sample;
             Advance(&inputIndex, &phaseFraction, phaseIncrement);
-            if (outputIndex == outputSampleCount)
+            if (outputIndex == outputSampleCount) {
                 break;
+            }
         }
 
         // process input samples
@@ -514,6 +552,16 @@ void AudioResamplerOrder1::AsmMono16Loop(int16_t *in, int32_t* maxOutPt, int32_t
             size_t &outputIndex, int32_t* out, size_t &inputIndex, int32_t vl, int32_t vr,
             uint32_t &phaseFraction, uint32_t phaseIncrement)
 {
+    (void)maxOutPt; // remove unused parameter warnings
+    (void)maxInIdx;
+    (void)outputIndex;
+    (void)out;
+    (void)inputIndex;
+    (void)vl;
+    (void)vr;
+    (void)phaseFraction;
+    (void)phaseIncrement;
+    (void)in;
 #define MO_PARAM5   "36"        // offset of parameter 5 (outputIndex)
 
     asm(
@@ -625,6 +673,16 @@ void AudioResamplerOrder1::AsmStereo16Loop(int16_t *in, int32_t* maxOutPt, int32
             size_t &outputIndex, int32_t* out, size_t &inputIndex, int32_t vl, int32_t vr,
             uint32_t &phaseFraction, uint32_t phaseIncrement)
 {
+    (void)maxOutPt; // remove unused parameter warnings
+    (void)maxInIdx;
+    (void)outputIndex;
+    (void)out;
+    (void)inputIndex;
+    (void)vl;
+    (void)vr;
+    (void)phaseFraction;
+    (void)phaseIncrement;
+    (void)in;
 #define ST_PARAM5    "40"     // offset of parameter 5 (outputIndex)
     asm(
         "stmfd  sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}\n"
diff --git a/services/audioflinger/AudioResampler.h b/services/audioflinger/AudioResampler.h
index 33e64ce..dc33f29 100644
--- a/services/audioflinger/AudioResampler.h
+++ b/services/audioflinger/AudioResampler.h
@@ -41,6 +41,9 @@ public:
         MED_QUALITY=2,
         HIGH_QUALITY=3,
         VERY_HIGH_QUALITY=4,
+        DYN_LOW_QUALITY=5,
+        DYN_MED_QUALITY=6,
+        DYN_HIGH_QUALITY=7,
     };
 
     static AudioResampler* create(int bitDepth, int inChannelCount,
@@ -81,7 +84,7 @@ protected:
     static const uint32_t kPhaseMask = (1LU<<kNumPhaseBits)-1;
 
     // multiplier to calculate fixed point phase increment
-    static const double kPhaseMultiplier = 1L << kNumPhaseBits;
+    static const double kPhaseMultiplier;
 
     AudioResampler(int bitDepth, int inChannelCount, int32_t sampleRate, src_quality quality);
 
diff --git a/services/audioflinger/AudioResamplerCubic.cpp b/services/audioflinger/AudioResamplerCubic.cpp
index 18e59e9..1f9714b 100644
--- a/services/audioflinger/AudioResamplerCubic.cpp
+++ b/services/audioflinger/AudioResamplerCubic.cpp
@@ -66,8 +66,9 @@ void AudioResamplerCubic::resampleStereo16(int32_t* out, size_t outFrameCount,
     if (mBuffer.frameCount == 0) {
         mBuffer.frameCount = inFrameCount;
         provider->getNextBuffer(&mBuffer, mPTS);
-        if (mBuffer.raw == NULL)
+        if (mBuffer.raw == NULL) {
             return;
+        }
         // ALOGW("New buffer: offset=%p, frames=%dn", mBuffer.raw, mBuffer.frameCount);
     }
     int16_t *in = mBuffer.i16;
@@ -97,8 +98,9 @@ void AudioResamplerCubic::resampleStereo16(int32_t* out, size_t outFrameCount,
                 mBuffer.frameCount = inFrameCount;
                 provider->getNextBuffer(&mBuffer,
                                         calculateOutputPTS(outputIndex / 2));
-                if (mBuffer.raw == NULL)
+                if (mBuffer.raw == NULL) {
                     goto save_state;  // ugly, but efficient
+                }
                 in = mBuffer.i16;
                 // ALOGW("New buffer: offset=%p, frames=%d", mBuffer.raw, mBuffer.frameCount);
             }
@@ -132,8 +134,9 @@ void AudioResamplerCubic::resampleMono16(int32_t* out, size_t outFrameCount,
     if (mBuffer.frameCount == 0) {
         mBuffer.frameCount = inFrameCount;
         provider->getNextBuffer(&mBuffer, mPTS);
-        if (mBuffer.raw == NULL)
+        if (mBuffer.raw == NULL) {
             return;
+        }
         // ALOGW("New buffer: offset=%p, frames=%d", mBuffer.raw, mBuffer.frameCount);
     }
     int16_t *in = mBuffer.i16;
@@ -163,8 +166,9 @@ void AudioResamplerCubic::resampleMono16(int32_t* out, size_t outFrameCount,
                 mBuffer.frameCount = inFrameCount;
                 provider->getNextBuffer(&mBuffer,
                                         calculateOutputPTS(outputIndex / 2));
-                if (mBuffer.raw == NULL)
+                if (mBuffer.raw == NULL) {
                     goto save_state;  // ugly, but efficient
+                }
                 // ALOGW("New buffer: offset=%p, frames=%dn", mBuffer.raw, mBuffer.frameCount);
                 in = mBuffer.i16;
             }
diff --git a/services/audioflinger/AudioResamplerDyn.cpp b/services/audioflinger/AudioResamplerDyn.cpp
new file mode 100644
index 0000000..cd67df5
--- /dev/null
+++ b/services/audioflinger/AudioResamplerDyn.cpp
@@ -0,0 +1,547 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define LOG_TAG "AudioResamplerDyn"
+//#define LOG_NDEBUG 0
+
+#include <malloc.h>
+#include <string.h>
+#include <stdlib.h>
+#include <dlfcn.h>
+#include <math.h>
+
+#include <cutils/compiler.h>
+#include <cutils/properties.h>
+#include <utils/Log.h>
+
+#include "AudioResamplerFirOps.h" // USE_NEON and USE_INLINE_ASSEMBLY defined here
+#include "AudioResamplerFirProcess.h"
+#include "AudioResamplerFirProcessNeon.h"
+#include "AudioResamplerFirGen.h" // requires math.h
+#include "AudioResamplerDyn.h"
+
+//#define DEBUG_RESAMPLER
+
+namespace android {
+
+// generate a unique resample type compile-time constant (constexpr)
+#define RESAMPLETYPE(CHANNELS, LOCKED, STRIDE, COEFTYPE) \
+    ((((CHANNELS)-1)&1) | !!(LOCKED)<<1 | (COEFTYPE)<<2 \
+    | ((STRIDE)==8 ? 1 : (STRIDE)==16 ? 2 : 0)<<3)
+
+/*
+ * InBuffer is a type agnostic input buffer.
+ *
+ * Layout of the state buffer for halfNumCoefs=8.
+ *
+ * [rrrrrrppppppppnnnnnnnnrrrrrrrrrrrrrrrrrrr.... rrrrrrr]
+ *  S            I                                R
+ *
+ * S = mState
+ * I = mImpulse
+ * R = mRingFull
+ * p = past samples, convoluted with the (p)ositive side of sinc()
+ * n = future samples, convoluted with the (n)egative side of sinc()
+ * r = extra space for implementing the ring buffer
+ */
+
+template<typename TI>
+AudioResamplerDyn::InBuffer<TI>::InBuffer()
+    : mState(NULL), mImpulse(NULL), mRingFull(NULL), mStateSize(0) {
+}
+
+template<typename TI>
+AudioResamplerDyn::InBuffer<TI>::~InBuffer() {
+    init();
+}
+
+template<typename TI>
+void AudioResamplerDyn::InBuffer<TI>::init() {
+    free(mState);
+    mState = NULL;
+    mImpulse = NULL;
+    mRingFull = NULL;
+    mStateSize = 0;
+}
+
+// resizes the state buffer to accommodate the appropriate filter length
+template<typename TI>
+void AudioResamplerDyn::InBuffer<TI>::resize(int CHANNELS, int halfNumCoefs) {
+    // calculate desired state size
+    int stateSize = halfNumCoefs * CHANNELS * 2
+            * kStateSizeMultipleOfFilterLength;
+
+    // check if buffer needs resizing
+    if (mState
+            && stateSize == mStateSize
+            && mRingFull-mState == mStateSize-halfNumCoefs*CHANNELS) {
+        return;
+    }
+
+    // create new buffer
+    TI* state = (int16_t*)memalign(32, stateSize*sizeof(*state));
+    memset(state, 0, stateSize*sizeof(*state));
+
+    // attempt to preserve state
+    if (mState) {
+        TI* srcLo = mImpulse - halfNumCoefs*CHANNELS;
+        TI* srcHi = mImpulse + halfNumCoefs*CHANNELS;
+        TI* dst = state;
+
+        if (srcLo < mState) {
+            dst += mState-srcLo;
+            srcLo = mState;
+        }
+        if (srcHi > mState + mStateSize) {
+            srcHi = mState + mStateSize;
+        }
+        memcpy(dst, srcLo, (srcHi - srcLo) * sizeof(*srcLo));
+        free(mState);
+    }
+
+    // set class member vars
+    mState = state;
+    mStateSize = stateSize;
+    mImpulse = mState + halfNumCoefs*CHANNELS; // actually one sample greater than needed
+    mRingFull = mState + mStateSize - halfNumCoefs*CHANNELS;
+}
+
+// copy in the input data into the head (impulse+halfNumCoefs) of the buffer.
+template<typename TI>
+template<int CHANNELS>
+void AudioResamplerDyn::InBuffer<TI>::readAgain(TI*& impulse, const int halfNumCoefs,
+        const TI* const in, const size_t inputIndex) {
+    int16_t* head = impulse + halfNumCoefs*CHANNELS;
+    for (size_t i=0 ; i<CHANNELS ; i++) {
+        head[i] = in[inputIndex*CHANNELS + i];
+    }
+}
+
+// advance the impulse pointer, and load in data into the head (impulse+halfNumCoefs)
+template<typename TI>
+template<int CHANNELS>
+void AudioResamplerDyn::InBuffer<TI>::readAdvance(TI*& impulse, const int halfNumCoefs,
+        const TI* const in, const size_t inputIndex) {
+    impulse += CHANNELS;
+
+    if (CC_UNLIKELY(impulse >= mRingFull)) {
+        const size_t shiftDown = mRingFull - mState - halfNumCoefs*CHANNELS;
+        memcpy(mState, mState+shiftDown, halfNumCoefs*CHANNELS*2*sizeof(TI));
+        impulse -= shiftDown;
+    }
+    readAgain<CHANNELS>(impulse, halfNumCoefs, in, inputIndex);
+}
+
+void AudioResamplerDyn::Constants::set(
+        int L, int halfNumCoefs, int inSampleRate, int outSampleRate)
+{
+    int bits = 0;
+    int lscale = inSampleRate/outSampleRate < 2 ? L - 1 :
+            static_cast<int>(static_cast<uint64_t>(L)*inSampleRate/outSampleRate);
+    for (int i=lscale; i; ++bits, i>>=1)
+        ;
+    mL = L;
+    mShift = kNumPhaseBits - bits;
+    mHalfNumCoefs = halfNumCoefs;
+}
+
+AudioResamplerDyn::AudioResamplerDyn(int bitDepth,
+        int inChannelCount, int32_t sampleRate, src_quality quality)
+    : AudioResampler(bitDepth, inChannelCount, sampleRate, quality),
+    mResampleType(0), mFilterSampleRate(0), mFilterQuality(DEFAULT_QUALITY),
+    mCoefBuffer(NULL)
+{
+    mVolumeSimd[0] = mVolumeSimd[1] = 0;
+    mConstants.set(128, 8, mSampleRate, mSampleRate); // TODO: set better
+}
+
+AudioResamplerDyn::~AudioResamplerDyn() {
+    free(mCoefBuffer);
+}
+
+void AudioResamplerDyn::init() {
+    mFilterSampleRate = 0; // always trigger new filter generation
+    mInBuffer.init();
+}
+
+void AudioResamplerDyn::setVolume(int16_t left, int16_t right) {
+    AudioResampler::setVolume(left, right);
+    mVolumeSimd[0] = static_cast<int32_t>(left)<<16;
+    mVolumeSimd[1] = static_cast<int32_t>(right)<<16;
+}
+
+template <typename T> T max(T a, T b) {return a > b ? a : b;}
+
+template <typename T> T absdiff(T a, T b) {return a > b ? a - b : b - a;}
+
+template<typename T>
+void AudioResamplerDyn::createKaiserFir(Constants &c, double stopBandAtten,
+        int inSampleRate, int outSampleRate, double tbwCheat) {
+    T* buf = reinterpret_cast<T*>(memalign(32, (c.mL+1)*c.mHalfNumCoefs*sizeof(T)));
+    static const double atten = 0.9998;   // to avoid ripple overflow
+    double fcr;
+    double tbw = firKaiserTbw(c.mHalfNumCoefs, stopBandAtten);
+
+    if (inSampleRate < outSampleRate) { // upsample
+        fcr = max(0.5*tbwCheat - tbw/2, tbw/2);
+    } else { // downsample
+        fcr = max(0.5*tbwCheat*outSampleRate/inSampleRate - tbw/2, tbw/2);
+    }
+    // create and set filter
+    firKaiserGen(buf, c.mL, c.mHalfNumCoefs, stopBandAtten, fcr, atten);
+    c.setBuf(buf);
+    if (mCoefBuffer) {
+        free(mCoefBuffer);
+    }
+    mCoefBuffer = buf;
+#ifdef DEBUG_RESAMPLER
+    // print basic filter stats
+    printf("L:%d  hnc:%d  stopBandAtten:%lf  fcr:%lf  atten:%lf  tbw:%lf\n",
+            c.mL, c.mHalfNumCoefs, stopBandAtten, fcr, atten, tbw);
+    // test the filter and report results
+    double fp = (fcr - tbw/2)/c.mL;
+    double fs = (fcr + tbw/2)/c.mL;
+    double passMin, passMax, passRipple;
+    double stopMax, stopRipple;
+    testFir(buf, c.mL, c.mHalfNumCoefs, fp, fs, /*passSteps*/ 1000, /*stopSteps*/ 100000,
+            passMin, passMax, passRipple, stopMax, stopRipple);
+    printf("passband(%lf, %lf): %.8lf %.8lf %.8lf\n", 0., fp, passMin, passMax, passRipple);
+    printf("stopband(%lf, %lf): %.8lf %.3lf\n", fs, 0.5, stopMax, stopRipple);
+#endif
+}
+
+// recursive gcd. Using objdump, it appears the tail recursion is converted to a while loop.
+static int gcd(int n, int m) {
+    if (m == 0) {
+        return n;
+    }
+    return gcd(m, n % m);
+}
+
+static bool isClose(int32_t newSampleRate, int32_t prevSampleRate,
+        int32_t filterSampleRate, int32_t outSampleRate) {
+
+    // different upsampling ratios do not need a filter change.
+    if (filterSampleRate != 0
+            && filterSampleRate < outSampleRate
+            && newSampleRate < outSampleRate)
+        return true;
+
+    // check design criteria again if downsampling is detected.
+    int pdiff = absdiff(newSampleRate, prevSampleRate);
+    int adiff = absdiff(newSampleRate, filterSampleRate);
+
+    // allow up to 6% relative change increments.
+    // allow up to 12% absolute change increments (from filter design)
+    return pdiff < prevSampleRate>>4 && adiff < filterSampleRate>>3;
+}
+
+void AudioResamplerDyn::setSampleRate(int32_t inSampleRate) {
+    if (mInSampleRate == inSampleRate) {
+        return;
+    }
+    int32_t oldSampleRate = mInSampleRate;
+    int32_t oldHalfNumCoefs = mConstants.mHalfNumCoefs;
+    uint32_t oldPhaseWrapLimit = mConstants.mL << mConstants.mShift;
+    bool useS32 = false;
+
+    mInSampleRate = inSampleRate;
+
+    // TODO: Add precalculated Equiripple filters
+
+    if (mFilterQuality != getQuality() ||
+            !isClose(inSampleRate, oldSampleRate, mFilterSampleRate, mSampleRate)) {
+        mFilterSampleRate = inSampleRate;
+        mFilterQuality = getQuality();
+
+        // Begin Kaiser Filter computation
+        //
+        // The quantization floor for S16 is about 96db - 10*log_10(#length) + 3dB.
+        // Keep the stop band attenuation no greater than 84-85dB for 32 length S16 filters
+        //
+        // For s32 we keep the stop band attenuation at the same as 16b resolution, about
+        // 96-98dB
+        //
+
+        double stopBandAtten;
+        double tbwCheat = 1.; // how much we "cheat" into aliasing
+        int halfLength;
+        if (mFilterQuality == DYN_HIGH_QUALITY) {
+            // 32b coefficients, 64 length
+            useS32 = true;
+            stopBandAtten = 98.;
+            halfLength = 32;
+        } else if (mFilterQuality == DYN_LOW_QUALITY) {
+            // 16b coefficients, 16-32 length
+            useS32 = false;
+            stopBandAtten = 80.;
+            if (mSampleRate >= inSampleRate * 2) {
+                halfLength = 16;
+            } else {
+                halfLength = 8;
+            }
+            if (mSampleRate >= inSampleRate) {
+                tbwCheat = 1.05;
+            } else {
+                tbwCheat = 1.03;
+            }
+        } else { // DYN_MED_QUALITY
+            // 16b coefficients, 32-64 length
+            // note: > 64 length filters with 16b coefs can have quantization noise problems
+            useS32 = false;
+            stopBandAtten = 84.;
+            if (mSampleRate >= inSampleRate * 4) {
+                halfLength = 32;
+            } else if (mSampleRate >= inSampleRate * 2) {
+                halfLength = 24;
+            } else {
+                halfLength = 16;
+            }
+            if (mSampleRate >= inSampleRate) {
+                tbwCheat = 1.03;
+            } else {
+                tbwCheat = 1.01;
+            }
+        }
+
+        // determine the number of polyphases in the filterbank.
+        // for 16b, it is desirable to have 2^(16/2) = 256 phases.
+        // https://ccrma.stanford.edu/~jos/resample/Relation_Interpolation_Error_Quantization.html
+        //
+        // We are a bit more lax on this.
+
+        int phases = mSampleRate / gcd(mSampleRate, inSampleRate);
+
+        // TODO: Once dynamic sample rate change is an option, the code below
+        // should be modified to execute only when dynamic sample rate change is enabled.
+        //
+        // as above, #phases less than 63 is too few phases for accurate linear interpolation.
+        // we increase the phases to compensate, but more phases means more memory per
+        // filter and more time to compute the filter.
+        //
+        // if we know that the filter will be used for dynamic sample rate changes,
+        // that would allow us skip this part for fixed sample rate resamplers.
+        //
+        while (phases<63) {
+            phases *= 2; // this code only needed to support dynamic rate changes
+        }
+
+        if (phases>=256) {  // too many phases, always interpolate
+            phases = 127;
+        }
+
+        // create the filter
+        mConstants.set(phases, halfLength, inSampleRate, mSampleRate);
+        if (useS32) {
+            createKaiserFir<int32_t>(mConstants, stopBandAtten,
+                    inSampleRate, mSampleRate, tbwCheat);
+        } else {
+            createKaiserFir<int16_t>(mConstants, stopBandAtten,
+                    inSampleRate, mSampleRate, tbwCheat);
+        }
+    } // End Kaiser filter
+
+    // update phase and state based on the new filter.
+    const Constants& c(mConstants);
+    mInBuffer.resize(mChannelCount, c.mHalfNumCoefs);
+    const uint32_t phaseWrapLimit = c.mL << c.mShift;
+    // try to preserve as much of the phase fraction as possible for on-the-fly changes
+    mPhaseFraction = static_cast<unsigned long long>(mPhaseFraction)
+            * phaseWrapLimit / oldPhaseWrapLimit;
+    mPhaseFraction %= phaseWrapLimit; // should not do anything, but just in case.
+    mPhaseIncrement = static_cast<uint32_t>(static_cast<double>(phaseWrapLimit)
+            * inSampleRate / mSampleRate);
+
+    // determine which resampler to use
+    // check if locked phase (works only if mPhaseIncrement has no "fractional phase bits")
+    int locked = (mPhaseIncrement << (sizeof(mPhaseIncrement)*8 - c.mShift)) == 0;
+    int stride = (c.mHalfNumCoefs&7)==0 ? 16 : (c.mHalfNumCoefs&3)==0 ? 8 : 2;
+    if (locked) {
+        mPhaseFraction = mPhaseFraction >> c.mShift << c.mShift; // remove fractional phase
+    }
+
+    mResampleType = RESAMPLETYPE(mChannelCount, locked, stride, !!useS32);
+#ifdef DEBUG_RESAMPLER
+    printf("channels:%d  %s  stride:%d  %s  coef:%d  shift:%d\n",
+            mChannelCount, locked ? "locked" : "interpolated",
+            stride, useS32 ? "S32" : "S16", 2*c.mHalfNumCoefs, c.mShift);
+#endif
+}
+
+void AudioResamplerDyn::resample(int32_t* out, size_t outFrameCount,
+            AudioBufferProvider* provider)
+{
+    // TODO:
+    // 24 cases - this perhaps can be reduced later, as testing might take too long
+    switch (mResampleType) {
+
+    // stride 16 (falls back to stride 2 for machines that do not support NEON)
+    case RESAMPLETYPE(1, true, 16, 0):
+        return resample<1, true, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, true, 16, 0):
+        return resample<2, true, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, false, 16, 0):
+        return resample<1, false, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, false, 16, 0):
+        return resample<2, false, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, true, 16, 1):
+        return resample<1, true, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, true, 16, 1):
+        return resample<2, true, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(1, false, 16, 1):
+        return resample<1, false, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, false, 16, 1):
+        return resample<2, false, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+#if 0
+    // TODO: Remove these?
+    // stride 8
+    case RESAMPLETYPE(1, true, 8, 0):
+        return resample<1, true, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, true, 8, 0):
+        return resample<2, true, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, false, 8, 0):
+        return resample<1, false, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, false, 8, 0):
+        return resample<2, false, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, true, 8, 1):
+        return resample<1, true, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, true, 8, 1):
+        return resample<2, true, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(1, false, 8, 1):
+        return resample<1, false, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, false, 8, 1):
+        return resample<2, false, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    // stride 2 (can handle any filter length)
+    case RESAMPLETYPE(1, true, 2, 0):
+        return resample<1, true, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, true, 2, 0):
+        return resample<2, true, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, false, 2, 0):
+        return resample<1, false, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, false, 2, 0):
+        return resample<2, false, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, true, 2, 1):
+        return resample<1, true, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, true, 2, 1):
+        return resample<2, true, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(1, false, 2, 1):
+        return resample<1, false, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, false, 2, 1):
+        return resample<2, false, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+#endif
+    default:
+        ; // error
+    }
+}
+
+template<int CHANNELS, bool LOCKED, int STRIDE, typename TC>
+void AudioResamplerDyn::resample(int32_t* out, size_t outFrameCount,
+        const TC* const coefs,  AudioBufferProvider* provider)
+{
+    const Constants& c(mConstants);
+    int16_t* impulse = mInBuffer.getImpulse();
+    size_t inputIndex = mInputIndex;
+    uint32_t phaseFraction = mPhaseFraction;
+    const uint32_t phaseIncrement = mPhaseIncrement;
+    size_t outputIndex = 0;
+    size_t outputSampleCount = outFrameCount * 2;   // stereo output
+    size_t inFrameCount = (outFrameCount*mInSampleRate)/mSampleRate;
+    const uint32_t phaseWrapLimit = c.mL << c.mShift;
+
+    // NOTE: be very careful when modifying the code here. register
+    // pressure is very high and a small change might cause the compiler
+    // to generate far less efficient code.
+    // Always sanity check the result with objdump or test-resample.
+
+    // the following logic is a bit convoluted to keep the main processing loop
+    // as tight as possible with register allocation.
+    while (outputIndex < outputSampleCount) {
+        // buffer is empty, fetch a new one
+        while (mBuffer.frameCount == 0) {
+            mBuffer.frameCount = inFrameCount;
+            provider->getNextBuffer(&mBuffer,
+                    calculateOutputPTS(outputIndex / 2));
+            if (mBuffer.raw == NULL) {
+                goto resample_exit;
+            }
+            if (phaseFraction >= phaseWrapLimit) { // read in data
+                mInBuffer.readAdvance<CHANNELS>(
+                        impulse, c.mHalfNumCoefs, mBuffer.i16, inputIndex);
+                phaseFraction -= phaseWrapLimit;
+                while (phaseFraction >= phaseWrapLimit) {
+                    inputIndex++;
+                    if (inputIndex >= mBuffer.frameCount) {
+                        inputIndex -= mBuffer.frameCount;
+                        provider->releaseBuffer(&mBuffer);
+                        break;
+                    }
+                    mInBuffer.readAdvance<CHANNELS>(
+                            impulse, c.mHalfNumCoefs, mBuffer.i16, inputIndex);
+                    phaseFraction -= phaseWrapLimit;
+                }
+            }
+        }
+        const int16_t* const in = mBuffer.i16;
+        const size_t frameCount = mBuffer.frameCount;
+        const int coefShift = c.mShift;
+        const int halfNumCoefs = c.mHalfNumCoefs;
+        const int32_t* const volumeSimd = mVolumeSimd;
+
+        // reread the last input in.
+        mInBuffer.readAgain<CHANNELS>(impulse, halfNumCoefs, in, inputIndex);
+
+        // main processing loop
+        while (CC_LIKELY(outputIndex < outputSampleCount)) {
+            // caution: fir() is inlined and may be large.
+            // output will be loaded with the appropriate values
+            //
+            // from the input samples in impulse[-halfNumCoefs+1]... impulse[halfNumCoefs]
+            // from the polyphase filter of (phaseFraction / phaseWrapLimit) in coefs.
+            //
+            fir<CHANNELS, LOCKED, STRIDE>(
+                    &out[outputIndex],
+                    phaseFraction, phaseWrapLimit,
+                    coefShift, halfNumCoefs, coefs,
+                    impulse, volumeSimd);
+            outputIndex += 2;
+
+            phaseFraction += phaseIncrement;
+            while (phaseFraction >= phaseWrapLimit) {
+                inputIndex++;
+                if (inputIndex >= frameCount) {
+                    goto done;  // need a new buffer
+                }
+                mInBuffer.readAdvance<CHANNELS>(impulse, halfNumCoefs, in, inputIndex);
+                phaseFraction -= phaseWrapLimit;
+            }
+        }
+done:
+        // often arrives here when input buffer runs out
+        if (inputIndex >= frameCount) {
+            inputIndex -= frameCount;
+            provider->releaseBuffer(&mBuffer);
+            // mBuffer.frameCount MUST be zero here.
+        }
+    }
+
+resample_exit:
+    mInBuffer.setImpulse(impulse);
+    mInputIndex = inputIndex;
+    mPhaseFraction = phaseFraction;
+}
+
+// ----------------------------------------------------------------------------
+}; // namespace android
diff --git a/services/audioflinger/AudioResamplerDyn.h b/services/audioflinger/AudioResamplerDyn.h
new file mode 100644
index 0000000..df1fdbe
--- /dev/null
+++ b/services/audioflinger/AudioResamplerDyn.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_DYN_H
+#define ANDROID_AUDIO_RESAMPLER_DYN_H
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <cutils/log.h>
+
+#include "AudioResampler.h"
+
+namespace android {
+
+class AudioResamplerDyn: public AudioResampler {
+public:
+    AudioResamplerDyn(int bitDepth, int inChannelCount, int32_t sampleRate,
+            src_quality quality);
+
+    virtual ~AudioResamplerDyn();
+
+    virtual void init();
+
+    virtual void setSampleRate(int32_t inSampleRate);
+
+    virtual void setVolume(int16_t left, int16_t right);
+
+    virtual void resample(int32_t* out, size_t outFrameCount,
+            AudioBufferProvider* provider);
+
+private:
+
+    class Constants { // stores the filter constants.
+    public:
+        Constants() :
+            mL(0), mShift(0), mHalfNumCoefs(0), mFirCoefsS16(NULL)
+        {}
+        void set(int L, int halfNumCoefs,
+                int inSampleRate, int outSampleRate);
+        inline void setBuf(int16_t* buf) {
+            mFirCoefsS16 = buf;
+        }
+        inline void setBuf(int32_t* buf) {
+            mFirCoefsS32 = buf;
+        }
+
+        int mL;       // interpolation phases in the filter.
+        int mShift;   // right shift to get polyphase index
+        unsigned int mHalfNumCoefs; // filter half #coefs
+        union {       // polyphase filter bank
+            const int16_t* mFirCoefsS16;
+            const int32_t* mFirCoefsS32;
+        };
+    };
+
+    // Input buffer management for a given input type TI, now (int16_t)
+    // Is agnostic of the actual type, can work with int32_t and float.
+    template<typename TI>
+    class InBuffer {
+    public:
+        InBuffer();
+        ~InBuffer();
+        void init();
+        void resize(int CHANNELS, int halfNumCoefs);
+
+        // used for direct management of the mImpulse pointer
+        inline TI* getImpulse() {
+            return mImpulse;
+        }
+        inline void setImpulse(TI *impulse) {
+            mImpulse = impulse;
+        }
+        template<int CHANNELS>
+        inline void readAgain(TI*& impulse, const int halfNumCoefs,
+                const TI* const in, const size_t inputIndex);
+        template<int CHANNELS>
+        inline void readAdvance(TI*& impulse, const int halfNumCoefs,
+                const TI* const in, const size_t inputIndex);
+
+    private:
+        // tuning parameter guidelines: 2 <= multiple <= 8
+        static const int kStateSizeMultipleOfFilterLength = 4;
+
+        TI* mState;    // base pointer for the input buffer storage
+        TI* mImpulse;  // current location of the impulse response (centered)
+        TI* mRingFull; // mState <= mImpulse < mRingFull
+        // in general, mRingFull = mState + mStateSize - halfNumCoefs*CHANNELS.
+        size_t mStateSize; // in units of TI.
+    };
+
+    template<int CHANNELS, bool LOCKED, int STRIDE, typename TC>
+    void resample(int32_t* out, size_t outFrameCount,
+            const TC* const coefs, AudioBufferProvider* provider);
+
+    template<typename T>
+    void createKaiserFir(Constants &c, double stopBandAtten,
+            int inSampleRate, int outSampleRate, double tbwCheat);
+
+    InBuffer<int16_t> mInBuffer;
+    Constants mConstants;  // current set of coefficient parameters
+    int32_t __attribute__ ((aligned (8))) mVolumeSimd[2];
+    int32_t mResampleType; // contains the resample type.
+    int32_t mFilterSampleRate; // designed filter sample rate.
+    src_quality mFilterQuality; // designed filter quality.
+    void* mCoefBuffer; // if a filter is created, this is not null
+};
+
+// ----------------------------------------------------------------------------
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_DYN_H*/
diff --git a/services/audioflinger/AudioResamplerFirGen.h b/services/audioflinger/AudioResamplerFirGen.h
new file mode 100644
index 0000000..fac3001
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirGen.h
@@ -0,0 +1,684 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_GEN_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_GEN_H
+
+namespace android {
+
+/*
+ * generates a sine wave at equal steps.
+ *
+ * As most of our functions use sine or cosine at equal steps,
+ * it is very efficient to compute them that way (single multiply and subtract),
+ * rather than invoking the math library sin() or cos() each time.
+ *
+ * SineGen uses Goertzel's Algorithm (as a generator not a filter)
+ * to calculate sine(wstart + n * wstep) or cosine(wstart + n * wstep)
+ * by stepping through 0, 1, ... n.
+ *
+ * e^i(wstart+wstep) = 2cos(wstep) * e^i(wstart) - e^i(wstart-wstep)
+ *
+ * or looking at just the imaginary sine term, as the cosine follows identically:
+ *
+ * sin(wstart+wstep) = 2cos(wstep) * sin(wstart) - sin(wstart-wstep)
+ *
+ * Goertzel's algorithm is more efficient than the angle addition formula,
+ * e^i(wstart+wstep) = e^i(wstart) * e^i(wstep), which takes up to
+ * 4 multiplies and 2 adds (or 3* and 3+) and requires both sine and
+ * cosine generation due to the complex * complex multiply (full rotation).
+ *
+ * See: http://en.wikipedia.org/wiki/Goertzel_algorithm
+ *
+ */
+
+class SineGen {
+public:
+    SineGen(double wstart, double wstep, bool cosine = false) {
+        if (cosine) {
+            mCurrent = cos(wstart);
+            mPrevious = cos(wstart - wstep);
+        } else {
+            mCurrent = sin(wstart);
+            mPrevious = sin(wstart - wstep);
+        }
+        mTwoCos = 2.*cos(wstep);
+    }
+    SineGen(double expNow, double expPrev, double twoCosStep) {
+        mCurrent = expNow;
+        mPrevious = expPrev;
+        mTwoCos = twoCosStep;
+    }
+    inline double value() const {
+        return mCurrent;
+    }
+    inline void advance() {
+        double tmp = mCurrent;
+        mCurrent = mCurrent*mTwoCos - mPrevious;
+        mPrevious = tmp;
+    }
+    inline double valueAdvance() {
+        double tmp = mCurrent;
+        mCurrent = mCurrent*mTwoCos - mPrevious;
+        mPrevious = tmp;
+        return tmp;
+    }
+
+private:
+    double mCurrent; // current value of sine/cosine
+    double mPrevious; // previous value of sine/cosine
+    double mTwoCos; // stepping factor
+};
+
+/*
+ * generates a series of sine generators, phase offset by fixed steps.
+ *
+ * This is used to generate polyphase sine generators, one per polyphase
+ * in the filter code below.
+ *
+ * The SineGen returned by value() starts at innerStart = outerStart + n*outerStep;
+ * increments by innerStep.
+ *
+ */
+
+class SineGenGen {
+public:
+    SineGenGen(double outerStart, double outerStep, double innerStep, bool cosine = false)
+            : mSineInnerCur(outerStart, outerStep, cosine),
+              mSineInnerPrev(outerStart-innerStep, outerStep, cosine)
+    {
+        mTwoCos = 2.*cos(innerStep);
+    }
+    inline SineGen value() {
+        return SineGen(mSineInnerCur.value(), mSineInnerPrev.value(), mTwoCos);
+    }
+    inline void advance() {
+        mSineInnerCur.advance();
+        mSineInnerPrev.advance();
+    }
+    inline SineGen valueAdvance() {
+        return SineGen(mSineInnerCur.valueAdvance(), mSineInnerPrev.valueAdvance(), mTwoCos);
+    }
+
+private:
+    SineGen mSineInnerCur; // generate the inner sine values (stepped by outerStep).
+    SineGen mSineInnerPrev; // generate the inner sine previous values
+                            // (behind by innerStep, stepped by outerStep).
+    double mTwoCos; // the inner stepping factor for the returned SineGen.
+};
+
+static inline double sqr(double x) {
+    return x * x;
+}
+
+/*
+ * rounds a double to the nearest integer for FIR coefficients.
+ *
+ * One variant uses noise shaping, which must keep error history
+ * to work (the err parameter, initialized to 0).
+ * The other variant is a non-noise shaped version for
+ * S32 coefficients (noise shaping doesn't gain much).
+ *
+ * Caution: No bounds saturation is applied, but isn't needed in this case.
+ *
+ * @param x is the value to round.
+ *
+ * @param maxval is the maximum integer scale factor expressed as an int64 (for headroom).
+ * Typically this may be the maximum positive integer+1 (using the fact that double precision
+ * FIR coefficients generated here are never that close to 1.0 to pose an overflow condition).
+ *
+ * @param err is the previous error (actual - rounded) for the previous rounding op.
+ * For 16b coefficients this can improve stopband dB performance by up to 2dB.
+ *
+ * Many variants exist for the noise shaping: http://en.wikipedia.org/wiki/Noise_shaping
+ *
+ */
+
+static inline int64_t toint(double x, int64_t maxval, double& err) {
+    double val = x * maxval;
+    double ival = floor(val + 0.5 + err*0.2);
+    err = val - ival;
+    return static_cast<int64_t>(ival);
+}
+
+static inline int64_t toint(double x, int64_t maxval) {
+    return static_cast<int64_t>(floor(x * maxval + 0.5));
+}
+
+/*
+ * Modified Bessel function of the first kind
+ * http://en.wikipedia.org/wiki/Bessel_function
+ *
+ * The formulas are taken from Abramowitz and Stegun,
+ * _Handbook of Mathematical Functions_ (links below):
+ *
+ * http://people.math.sfu.ca/~cbm/aands/page_375.htm
+ * http://people.math.sfu.ca/~cbm/aands/page_378.htm
+ *
+ * http://dlmf.nist.gov/10.25
+ * http://dlmf.nist.gov/10.40
+ *
+ * Note we assume x is nonnegative (the function is symmetric,
+ * pass in the absolute value as needed).
+ *
+ * Constants are compile time derived with templates I0Term<> and
+ * I0ATerm<> to the precision of the compiler.  The series can be expanded
+ * to any precision needed, but currently set around 24b precision.
+ *
+ * We use a bit of template math here, constexpr would probably be
+ * more appropriate for a C++11 compiler.
+ *
+ * For the intermediate range 3.75 < x < 15, we use minimax polynomial fit.
+ *
+ */
+
+template <int N>
+struct I0Term {
+    static const double value = I0Term<N-1>::value / (4. * N * N);
+};
+
+template <>
+struct I0Term<0> {
+    static const double value = 1.;
+};
+
+template <int N>
+struct I0ATerm {
+    static const double value = I0ATerm<N-1>::value * (2.*N-1.) * (2.*N-1.) / (8. * N);
+};
+
+template <>
+struct I0ATerm<0> { // 1/sqrt(2*PI);
+    static const double value = 0.398942280401432677939946059934381868475858631164934657665925;
+};
+
+#if USE_HORNERS_METHOD
+/* Polynomial evaluation of A + Bx + Cx^2 + Dx^3 + ...
+ * using Horner's Method: http://en.wikipedia.org/wiki/Horner's_method
+ *
+ * This has fewer multiplications than Estrin's method below, but has back to back
+ * floating point dependencies.
+ *
+ * On ARM this appears to work slower, so USE_HORNERS_METHOD is not default enabled.
+ */
+
+inline double Poly2(double A, double B, double x) {
+    return A + x * B;
+}
+
+inline double Poly4(double A, double B, double C, double D, double x) {
+    return A + x * (B + x * (C + x * (D)));
+}
+
+inline double Poly7(double A, double B, double C, double D, double E, double F, double G,
+        double x) {
+    return A + x * (B + x * (C + x * (D + x * (E + x * (F + x * (G))))));
+}
+
+inline double Poly9(double A, double B, double C, double D, double E, double F, double G,
+        double H, double I, double x) {
+    return A + x * (B + x * (C + x * (D + x * (E + x * (F + x * (G + x * (H + x * (I))))))));
+}
+
+#else
+/* Polynomial evaluation of A + Bx + Cx^2 + Dx^3 + ...
+ * using Estrin's Method: http://en.wikipedia.org/wiki/Estrin's_scheme
+ *
+ * This is typically faster, perhaps gains about 5-10% overall on ARM processors
+ * over Horner's method above.
+ */
+
+inline double Poly2(double A, double B, double x) {
+    return A + B * x;
+}
+
+inline double Poly3(double A, double B, double C, double x, double x2) {
+    return Poly2(A, B, x) + C * x2;
+}
+
+inline double Poly3(double A, double B, double C, double x) {
+    return Poly2(A, B, x) + C * x * x;
+}
+
+inline double Poly4(double A, double B, double C, double D, double x, double x2) {
+    return Poly2(A, B, x) + Poly2(C, D, x) * x2; // same as poly2(poly2, poly2, x2);
+}
+
+inline double Poly4(double A, double B, double C, double D, double x) {
+    return Poly4(A, B, C, D, x, x * x);
+}
+
+inline double Poly7(double A, double B, double C, double D, double E, double F, double G,
+        double x) {
+    double x2 = x * x;
+    return Poly4(A, B, C, D, x, x2) + Poly3(E, F, G, x, x2) * (x2 * x2);
+}
+
+inline double Poly8(double A, double B, double C, double D, double E, double F, double G,
+        double H, double x, double x2, double x4) {
+    return Poly4(A, B, C, D, x, x2) + Poly4(E, F, G, H, x, x2) * x4;
+}
+
+inline double Poly9(double A, double B, double C, double D, double E, double F, double G,
+        double H, double I, double x) {
+    double x2 = x * x;
+#if 1
+    // It does not seem faster to explicitly decompose Poly8 into Poly4, but
+    // could depend on compiler floating point scheduling.
+    double x4 = x2 * x2;
+    return Poly8(A, B, C, D, E, F, G, H, x, x2, x4) + I * (x4 * x4);
+#else
+    double val = Poly4(A, B, C, D, x, x2);
+    double x4 = x2 * x2;
+    return val + Poly4(E, F, G, H, x, x2) * x4 + I * (x4 * x4);
+#endif
+}
+#endif
+
+static inline double I0(double x) {
+    if (x < 3.75) {
+        x *= x;
+        return Poly7(I0Term<0>::value, I0Term<1>::value,
+                I0Term<2>::value, I0Term<3>::value,
+                I0Term<4>::value, I0Term<5>::value,
+                I0Term<6>::value, x); // e < 1.6e-7
+    }
+    if (1) {
+        /*
+         * Series expansion coefs are easy to calculate, but are expanded around 0,
+         * so error is unequal over the interval 0 < x < 3.75, the error being
+         * significantly better near 0.
+         *
+         * A better solution is to use precise minimax polynomial fits.
+         *
+         * We use a slightly more complicated solution for 3.75 < x < 15, based on
+         * the tables in Blair and Edwards, "Stable Rational Minimax Approximations
+         * to the Modified Bessel Functions I0(x) and I1(x)", Chalk Hill Nuclear Laboratory,
+         * AECL-4928.
+         *
+         * http://www.iaea.org/inis/collection/NCLCollectionStore/_Public/06/178/6178667.pdf
+         *
+         * See Table 11 for 0 < x < 15; e < 10^(-7.13).
+         *
+         * Note: Beta cannot exceed 15 (hence Stopband cannot exceed 144dB = 24b).
+         *
+         * This speeds up overall computation by about 40% over using the else clause below,
+         * which requires sqrt and exp.
+         *
+         */
+
+        x *= x;
+        double num = Poly9(-0.13544938430e9, -0.33153754512e8,
+                -0.19406631946e7, -0.48058318783e5,
+                -0.63269783360e3, -0.49520779070e1,
+                -0.24970910370e-1, -0.74741159550e-4,
+                -0.18257612460e-6, x);
+        double y = x - 225.; // reflection around 15 (squared)
+        double den = Poly4(-0.34598737196e8, 0.23852643181e6,
+                -0.70699387620e3, 0.10000000000e1, y);
+        return num / den;
+
+#if IO_EXTENDED_BETA
+        /* Table 42 for x > 15; e < 10^(-8.11).
+         * This is used for Beta>15, but is disabled here as
+         * we never use Beta that high.
+         *
+         * NOTE: This should be enabled only for x > 15.
+         */
+
+        double y = 1./x;
+        double z = y - (1./15);
+        double num = Poly2(0.415079861746e1, -0.5149092496e1, z);
+        double den = Poly3(0.103150763823e2, -0.14181687413e2,
+                0.1000000000e1, z);
+        return exp(x) * sqrt(y) * num / den;
+#endif
+    } else {
+        /*
+         * NOT USED, but reference for large Beta.
+         *
+         * Abramowitz and Stegun asymptotic formula.
+         * works for x > 3.75.
+         */
+        double y = 1./x;
+        return exp(x) * sqrt(y) *
+                // note: reciprocal squareroot may be easier!
+                // http://en.wikipedia.org/wiki/Fast_inverse_square_root
+                Poly9(I0ATerm<0>::value, I0ATerm<1>::value,
+                        I0ATerm<2>::value, I0ATerm<3>::value,
+                        I0ATerm<4>::value, I0ATerm<5>::value,
+                        I0ATerm<6>::value, I0ATerm<7>::value,
+                        I0ATerm<8>::value, y); // (... e) < 1.9e-7
+    }
+}
+
+/*
+ * calculates the transition bandwidth for a Kaiser filter
+ *
+ * Formula 3.2.8, Vaidyanathan, _Multirate Systems and Filter Banks_, p. 48
+ * Formula 7.76, Oppenheim and Schafer, _Discrete-time Signal Processing, 3e_, p. 542
+ *
+ * @param halfNumCoef is half the number of coefficients per filter phase.
+ *
+ * @param stopBandAtten is the stop band attenuation desired.
+ *
+ * @return the transition bandwidth in normalized frequency (0 <= f <= 0.5)
+ */
+static inline double firKaiserTbw(int halfNumCoef, double stopBandAtten) {
+    return (stopBandAtten - 7.95)/((2.*14.36)*halfNumCoef);
+}
+
+/*
+ * calculates the fir transfer response of the overall polyphase filter at w.
+ *
+ * Calculates the DTFT transfer coefficient H(w) for 0 <= w <= PI, utilizing the
+ * fact that h[n] is symmetric (cosines only, no complex arithmetic).
+ *
+ * We use Goertzel's algorithm to accelerate the computation to essentially
+ * a single multiply and 2 adds per filter coefficient h[].
+ *
+ * Be careful be careful to consider that h[n] is the overall polyphase filter,
+ * with L phases, so rescaling H(w)/L is probably what you expect for "unity gain",
+ * as you only use one of the polyphases at a time.
+ */
+template <typename T>
+static inline double firTransfer(const T* coef, int L, int halfNumCoef, double w) {
+    double accum = static_cast<double>(coef[0])*0.5;  // "center coefficient" from first bank
+    coef += halfNumCoef;    // skip first filterbank (picked up by the last filterbank).
+#if SLOW_FIRTRANSFER
+    /* Original code for reference.  This is equivalent to the code below, but slower. */
+    for (int i=1 ; i<=L ; ++i) {
+        for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) {
+            accum += cos(ix*w)*static_cast<double>(*coef++);
+        }
+    }
+#else
+    /*
+     * Our overall filter is stored striped by polyphases, not a contiguous h[n].
+     * We could fetch coefficients in a non-contiguous fashion
+     * but that will not scale to vector processing.
+     *
+     * We apply Goertzel's algorithm directly to each polyphase filter bank instead of
+     * using cosine generation/multiplication, thereby saving one multiply per inner loop.
+     *
+     * See: http://en.wikipedia.org/wiki/Goertzel_algorithm
+     * Also: Oppenheim and Schafer, _Discrete Time Signal Processing, 3e_, p. 720.
+     *
+     * We use the basic recursion to incorporate the cosine steps into real sequence x[n]:
+     * s[n] = x[n] + (2cosw)*s[n-1] + s[n-2]
+     *
+     * y[n] = s[n] - e^(iw)s[n-1]
+     *      = sum_{k=-\infty}^{n} x[k]e^(-iw(n-k))
+     *      = e^(-iwn) sum_{k=0}^{n} x[k]e^(iwk)
+     *
+     * The summation contains the frequency steps we want multiplied by the source
+     * (similar to a DTFT).
+     *
+     * Using symmetry, and just the real part (be careful, this must happen
+     * after any internal complex multiplications), the polyphase filterbank
+     * transfer function is:
+     *
+     * Hpp[n, w, w_0] = sum_{k=0}^{n} x[k] * cos(wk + w_0)
+     *                = Re{ e^(iwn + iw_0) y[n]}
+     *                = cos(wn+w_0) * s[n] - cos(w(n+1)+w_0) * s[n-1]
+     *
+     * using the fact that s[n] of real x[n] is real.
+     *
+     */
+    double dcos = 2. * cos(L*w);
+    int start = ((halfNumCoef)*L + 1);
+    SineGen cc((start - L) * w, w, true); // cosine
+    SineGen cp(start * w, w, true); // cosine
+    for (int i=1 ; i<=L ; ++i) {
+        double sc = 0;
+        double sp = 0;
+        for (int j=0 ; j<halfNumCoef ; ++j) {
+            double tmp = sc;
+            sc  = static_cast<double>(*coef++) + dcos*sc - sp;
+            sp = tmp;
+        }
+        // If we are awfully clever, we can apply Goertzel's algorithm
+        // again on the sc and sp sequences returned here.
+        accum += cc.valueAdvance() * sc - cp.valueAdvance() * sp;
+    }
+#endif
+    return accum*2.;
+}
+
+/*
+ * evaluates the minimum and maximum |H(f)| bound in a band region.
+ *
+ * This is usually done with equally spaced increments in the target band in question.
+ * The passband is often very small, and sampled that way. The stopband is often much
+ * larger.
+ *
+ * We use the fact that the overall polyphase filter has an additional bank at the end
+ * for interpolation; hence it is overspecified for the H(f) computation.  Thus the
+ * first polyphase is never actually checked, excepting its first term.
+ *
+ * In this code we use the firTransfer() evaluator above, which uses Goertzel's
+ * algorithm to calculate the transfer function at each point.
+ *
+ * TODO: An alternative with equal spacing is the FFT/DFT.  An alternative with unequal
+ * spacing is a chirp transform.
+ *
+ * @param coef is the designed polyphase filter banks
+ *
+ * @param L is the number of phases (for interpolation)
+ *
+ * @param halfNumCoef should be half the number of coefficients for a single
+ * polyphase.
+ *
+ * @param fstart is the normalized frequency start.
+ *
+ * @param fend is the normalized frequency end.
+ *
+ * @param steps is the number of steps to take (sampling) between frequency start and end
+ *
+ * @param firMin returns the minimum transfer |H(f)| found
+ *
+ * @param firMax returns the maximum transfer |H(f)| found
+ *
+ * 0 <= f <= 0.5.
+ * This is used to test passband and stopband performance.
+ */
+template <typename T>
+static void testFir(const T* coef, int L, int halfNumCoef,
+        double fstart, double fend, int steps, double &firMin, double &firMax) {
+    double wstart = fstart*(2.*M_PI);
+    double wend = fend*(2.*M_PI);
+    double wstep = (wend - wstart)/steps;
+    double fmax, fmin;
+    double trf = firTransfer(coef, L, halfNumCoef, wstart);
+    if (trf<0) {
+        trf = -trf;
+    }
+    fmin = fmax = trf;
+    wstart += wstep;
+    for (int i=1; i<steps; ++i) {
+        trf = firTransfer(coef, L, halfNumCoef, wstart);
+        if (trf<0) {
+            trf = -trf;
+        }
+        if (trf>fmax) {
+            fmax = trf;
+        }
+        else if (trf<fmin) {
+            fmin = trf;
+        }
+        wstart += wstep;
+    }
+    // renormalize - this is only needed for integer filter types
+    double norm = 1./((1ULL<<(sizeof(T)*8-1))*L);
+
+    firMin = fmin * norm;
+    firMax = fmax * norm;
+}
+
+/*
+ * evaluates the |H(f)| lowpass band characteristics.
+ *
+ * This function tests the lowpass characteristics for the overall polyphase filter,
+ * and is used to verify the design.  For this case, fp should be set to the
+ * passband normalized frequency from 0 to 0.5 for the overall filter (thus it
+ * is the designed polyphase bank value / L).  Likewise for fs.
+ *
+ * @param coef is the designed polyphase filter banks
+ *
+ * @param L is the number of phases (for interpolation)
+ *
+ * @param halfNumCoef should be half the number of coefficients for a single
+ * polyphase.
+ *
+ * @param fp is the passband normalized frequency, 0 < fp < fs < 0.5.
+ *
+ * @param fs is the stopband normalized frequency, 0 < fp < fs < 0.5.
+ *
+ * @param passSteps is the number of passband sampling steps.
+ *
+ * @param stopSteps is the number of stopband sampling steps.
+ *
+ * @param passMin is the minimum value in the passband
+ *
+ * @param passMax is the maximum value in the passband (useful for scaling).  This should
+ * be less than 1., to avoid sine wave test overflow.
+ *
+ * @param passRipple is the passband ripple.  Typically this should be less than 0.1 for
+ * an audio filter.  Generally speaker/headphone device characteristics will dominate
+ * the passband term.
+ *
+ * @param stopMax is the maximum value in the stopband.
+ *
+ * @param stopRipple is the stopband ripple, also known as stopband attenuation.
+ * Typically this should be greater than ~80dB for low quality, and greater than
+ * ~100dB for full 16b quality, otherwise aliasing may become noticeable.
+ *
+ */
+template <typename T>
+static void testFir(const T* coef, int L, int halfNumCoef,
+        double fp, double fs, int passSteps, int stopSteps,
+        double &passMin, double &passMax, double &passRipple,
+        double &stopMax, double &stopRipple) {
+    double fmin, fmax;
+    testFir(coef, L, halfNumCoef, 0., fp, passSteps, fmin, fmax);
+    double d1 = (fmax - fmin)/2.;
+    passMin = fmin;
+    passMax = fmax;
+    passRipple = -20.*log10(1. - d1); // passband ripple
+    testFir(coef, L, halfNumCoef, fs, 0.5, stopSteps, fmin, fmax);
+    // fmin is really not important for the stopband.
+    stopMax = fmax;
+    stopRipple = -20.*log10(fmax); // stopband ripple/attenuation
+}
+
+/*
+ * Calculates the overall polyphase filter based on a windowed sinc function.
+ *
+ * The windowed sinc is an odd length symmetric filter of exactly L*halfNumCoef*2+1
+ * taps for the entire kernel.  This is then decomposed into L+1 polyphase filterbanks.
+ * The last filterbank is used for interpolation purposes (and is mostly composed
+ * of the first bank shifted by one sample), and is unnecessary if one does
+ * not do interpolation.
+ *
+ * We use the last filterbank for some transfer function calculation purposes,
+ * so it needs to be generated anyways.
+ *
+ * @param coef is the caller allocated space for coefficients.  This should be
+ * exactly (L+1)*halfNumCoef in size.
+ *
+ * @param L is the number of phases (for interpolation)
+ *
+ * @param halfNumCoef should be half the number of coefficients for a single
+ * polyphase.
+ *
+ * @param stopBandAtten is the stopband value, should be >50dB.
+ *
+ * @param fcr is cutoff frequency/sampling rate (<0.5).  At this point, the energy
+ * should be 6dB less. (fcr is where the amplitude drops by half).  Use the
+ * firKaiserTbw() to calculate the transition bandwidth.  fcr is the midpoint
+ * between the stop band and the pass band (fstop+fpass)/2.
+ *
+ * @param atten is the attenuation (generally slightly less than 1).
+ */
+
+template <typename T>
+static inline void firKaiserGen(T* coef, int L, int halfNumCoef,
+        double stopBandAtten, double fcr, double atten) {
+    //
+    // Formula 3.2.5, 3.2.7, Vaidyanathan, _Multirate Systems and Filter Banks_, p. 48
+    // Formula 7.75, Oppenheim and Schafer, _Discrete-time Signal Processing, 3e_, p. 542
+    //
+    // See also: http://melodi.ee.washington.edu/courses/ee518/notes/lec17.pdf
+    //
+    // Kaiser window and beta parameter
+    //
+    //         | 0.1102*(A - 8.7)                         A > 50
+    //  beta = | 0.5842*(A - 21)^0.4 + 0.07886*(A - 21)   21 <= A <= 50
+    //         | 0.                                       A < 21
+    //
+    // with A is the desired stop-band attenuation in dBFS
+    //
+    //    30 dB    2.210
+    //    40 dB    3.384
+    //    50 dB    4.538
+    //    60 dB    5.658
+    //    70 dB    6.764
+    //    80 dB    7.865
+    //    90 dB    8.960
+    //   100 dB   10.056
+
+    const int N = L * halfNumCoef; // non-negative half
+    const double beta = 0.1102 * (stopBandAtten - 8.7); // >= 50dB always
+    const double xstep = (2. * M_PI) * fcr / L;
+    const double xfrac = 1. / N;
+    const double yscale = atten * L / (I0(beta) * M_PI);
+
+    // We use sine generators, which computes sines on regular step intervals.
+    // This speeds up overall computation about 40% from computing the sine directly.
+
+    SineGenGen sgg(0., xstep, L*xstep); // generates sine generators (one per polyphase)
+
+    for (int i=0 ; i<=L ; ++i) { // generate an extra set of coefs for interpolation
+
+        // computation for a single polyphase of the overall filter.
+        SineGen sg = sgg.valueAdvance(); // current sine generator for "j" inner loop.
+        double err = 0; // for noise shaping on int16_t coefficients (over each polyphase)
+
+        for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) {
+            double y;
+            if (CC_LIKELY(ix)) {
+                double x = static_cast<double>(ix);
+
+                // sine generator: sg.valueAdvance() returns sin(ix*xstep);
+                y = I0(beta * sqrt(1.0 - sqr(x * xfrac))) * yscale * sg.valueAdvance() / x;
+            } else {
+                y = 2. * atten * fcr; // center of filter, sinc(0) = 1.
+                sg.advance();
+            }
+
+            // (caution!) float version does not need rounding
+            if (is_same<T, int16_t>::value) { // int16_t needs noise shaping
+                *coef++ = static_cast<T>(toint(y, 1ULL<<(sizeof(T)*8-1), err));
+            } else {
+                *coef++ = static_cast<T>(toint(y, 1ULL<<(sizeof(T)*8-1)));
+            }
+        }
+    }
+}
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_GEN_H*/
diff --git a/services/audioflinger/AudioResamplerFirOps.h b/services/audioflinger/AudioResamplerFirOps.h
new file mode 100644
index 0000000..bf2163f
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirOps.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_OPS_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_OPS_H
+
+namespace android {
+
+#if defined(__arm__) && !defined(__thumb__)
+#define USE_INLINE_ASSEMBLY (true)
+#else
+#define USE_INLINE_ASSEMBLY (false)
+#endif
+
+#if USE_INLINE_ASSEMBLY && defined(__ARM_NEON__)
+#define USE_NEON (true)
+#include <arm_neon.h>
+#else
+#define USE_NEON (false)
+#endif
+
+template<typename T, typename U>
+struct is_same
+{
+    static const bool value = false;
+};
+
+template<typename T>
+struct is_same<T, T>  // partial specialization
+{
+    static const bool value = true;
+};
+
+static inline
+int32_t mulRL(int left, int32_t in, uint32_t vRL)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    if (left) {
+        asm( "smultb %[out], %[in], %[vRL] \n"
+             : [out]"=r"(out)
+             : [in]"%r"(in), [vRL]"r"(vRL)
+             : );
+    } else {
+        asm( "smultt %[out], %[in], %[vRL] \n"
+             : [out]"=r"(out)
+             : [in]"%r"(in), [vRL]"r"(vRL)
+             : );
+    }
+    return out;
+#else
+    int16_t v = left ? static_cast<int16_t>(vRL) : static_cast<int16_t>(vRL>>16);
+    return static_cast<int32_t>((static_cast<int64_t>(in) * v) >> 16);
+#endif
+}
+
+static inline
+int32_t mulAdd(int16_t in, int16_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    asm( "smlabb %[out], %[v], %[in], %[a] \n"
+         : [out]"=r"(out)
+         : [in]"%r"(in), [v]"r"(v), [a]"r"(a)
+         : );
+    return out;
+#else
+    return a + v * in;
+#endif
+}
+
+static inline
+int32_t mulAdd(int16_t in, int32_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    asm( "smlawb %[out], %[v], %[in], %[a] \n"
+         : [out]"=r"(out)
+         : [in]"%r"(in), [v]"r"(v), [a]"r"(a)
+         : );
+    return out;
+#else
+    return a + static_cast<int32_t>((static_cast<int64_t>(v) * in) >> 16);
+#endif
+}
+
+static inline
+int32_t mulAdd(int32_t in, int32_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    asm( "smmla %[out], %[v], %[in], %[a] \n"
+         : [out]"=r"(out)
+         : [in]"%r"(in), [v]"r"(v), [a]"r"(a)
+         : );
+    return out;
+#else
+    return a + static_cast<int32_t>((static_cast<int64_t>(v) * in) >> 32);
+#endif
+}
+
+static inline
+int32_t mulAddRL(int left, uint32_t inRL, int16_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    if (left) {
+        asm( "smlabb %[out], %[v], %[inRL], %[a] \n"
+             : [out]"=r"(out)
+             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+             : );
+    } else {
+        asm( "smlabt %[out], %[v], %[inRL], %[a] \n"
+             : [out]"=r"(out)
+             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+             : );
+    }
+    return out;
+#else
+    int16_t s = left ? static_cast<int16_t>(inRL) : static_cast<int16_t>(inRL>>16);
+    return a + v * s;
+#endif
+}
+
+static inline
+int32_t mulAddRL(int left, uint32_t inRL, int32_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    if (left) {
+        asm( "smlawb %[out], %[v], %[inRL], %[a] \n"
+             : [out]"=r"(out)
+             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+             : );
+    } else {
+        asm( "smlawt %[out], %[v], %[inRL], %[a] \n"
+             : [out]"=r"(out)
+             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+             : );
+    }
+    return out;
+#else
+    int16_t s = left ? static_cast<int16_t>(inRL) : static_cast<int16_t>(inRL>>16);
+    return a + static_cast<int32_t>((static_cast<int64_t>(v) * s) >> 16);
+#endif
+}
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_OPS_H*/
diff --git a/services/audioflinger/AudioResamplerFirProcess.h b/services/audioflinger/AudioResamplerFirProcess.h
new file mode 100644
index 0000000..38e387c
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirProcess.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H
+
+namespace android {
+
+// depends on AudioResamplerFirOps.h
+
+template<int CHANNELS, typename TC>
+static inline
+void mac(
+        int32_t& l, int32_t& r,
+        const TC coef,
+        const int16_t* samples)
+{
+    if (CHANNELS == 2) {
+        uint32_t rl = *reinterpret_cast<const uint32_t*>(samples);
+        l = mulAddRL(1, rl, coef, l);
+        r = mulAddRL(0, rl, coef, r);
+    } else {
+        r = l = mulAdd(samples[0], coef, l);
+    }
+}
+
+template<int CHANNELS, typename TC>
+static inline
+void interpolate(
+        int32_t& l, int32_t& r,
+        const TC coef_0, const TC coef_1,
+        const int16_t lerp, const int16_t* samples)
+{
+    TC sinc;
+
+    if (is_same<TC, int16_t>::value) {
+        sinc = (lerp * ((coef_1-coef_0)<<1)>>16) + coef_0;
+    } else {
+        sinc = mulAdd(lerp, (coef_1-coef_0)<<1, coef_0);
+    }
+    if (CHANNELS == 2) {
+        uint32_t rl = *reinterpret_cast<const uint32_t*>(samples);
+        l = mulAddRL(1, rl, sinc, l);
+        r = mulAddRL(0, rl, sinc, r);
+    } else {
+        r = l = mulAdd(samples[0], sinc, l);
+    }
+}
+
+/*
+ * Calculates a single output sample (two stereo frames).
+ *
+ * This function computes both the positive half FIR dot product and
+ * the negative half FIR dot product, accumulates, and then applies the volume.
+ *
+ * This is a locked phase filter (it does not compute the interpolation).
+ *
+ * Use fir() to compute the proper coefficient pointers for a polyphase
+ * filter bank.
+ */
+
+template <int CHANNELS, int STRIDE, typename TC>
+static inline
+void ProcessL(int32_t* const out,
+        int count,
+        const TC* coefsP,
+        const TC* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    int32_t l = 0;
+    int32_t r = 0;
+    do {
+        mac<CHANNELS>(l, r, *coefsP++, sP);
+        sP -= CHANNELS;
+        mac<CHANNELS>(l, r, *coefsN++, sN);
+        sN += CHANNELS;
+    } while (--count > 0);
+    out[0] += 2 * mulRL(0, l, volumeLR[0]); // Note: only use top 16b
+    out[1] += 2 * mulRL(0, r, volumeLR[1]); // Note: only use top 16b
+}
+
+/*
+ * Calculates a single output sample (two stereo frames) interpolating phase.
+ *
+ * This function computes both the positive half FIR dot product and
+ * the negative half FIR dot product, accumulates, and then applies the volume.
+ *
+ * This is an interpolated phase filter.
+ *
+ * Use fir() to compute the proper coefficient pointers for a polyphase
+ * filter bank.
+ */
+
+template <int CHANNELS, int STRIDE, typename TC>
+static inline
+void Process(int32_t* const out,
+        int count,
+        const TC* coefsP,
+        const TC* coefsN,
+        const TC* coefsP1,
+        const TC* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    (void) coefsP1; // suppress unused parameter warning
+    (void) coefsN1;
+    if (sizeof(*coefsP)==4) {
+        lerpP >>= 16;   // ensure lerpP is 16b
+    }
+    int32_t l = 0;
+    int32_t r = 0;
+    for (size_t i = 0; i < count; ++i) {
+        interpolate<CHANNELS>(l, r, coefsP[0], coefsP[count], lerpP, sP);
+        coefsP++;
+        sP -= CHANNELS;
+        interpolate<CHANNELS>(l, r, coefsN[count], coefsN[0], lerpP, sN);
+        coefsN++;
+        sN += CHANNELS;
+    }
+    out[0] += 2 * mulRL(0, l, volumeLR[0]); // Note: only use top 16b
+    out[1] += 2 * mulRL(0, r, volumeLR[1]); // Note: only use top 16b
+}
+
+/*
+ * Calculates a single output sample (two stereo frames) from input sample pointer.
+ *
+ * This sets up the params for the accelerated Process() and ProcessL()
+ * functions to do the appropriate dot products.
+ *
+ * @param out should point to the output buffer with at least enough space for 2 output frames.
+ *
+ * @param phase is the fractional distance between input samples for interpolation:
+ * phase >= 0  && phase < phaseWrapLimit.  It can be thought of as a rational fraction
+ * of phase/phaseWrapLimit.
+ *
+ * @param phaseWrapLimit is #polyphases<<coefShift, where #polyphases is the number of polyphases
+ * in the polyphase filter. Likewise, #polyphases can be obtained as (phaseWrapLimit>>coefShift).
+ *
+ * @param coefShift gives the bit alignment of the polyphase index in the phase parameter.
+ *
+ * @param halfNumCoefs is the half the number of coefficients per polyphase filter. Since the
+ * overall filterbank is odd-length symmetric, only halfNumCoefs need be stored.
+ *
+ * @param coefs is the polyphase filter bank, starting at from polyphase index 0, and ranging to
+ * and including the #polyphases.  Each polyphase of the filter has half-length halfNumCoefs
+ * (due to symmetry).  The total size of the filter bank in coefficients is
+ * (#polyphases+1)*halfNumCoefs.
+ *
+ * The filter bank coefs should be aligned to a minimum of 16 bytes (preferrably to cache line).
+ *
+ * The coefs should be attenuated (to compensate for passband ripple)
+ * if storing back into the native format.
+ *
+ * @param samples are unaligned input samples.  The position is in the "middle" of the
+ * sample array with respect to the FIR filter:
+ * the negative half of the filter is dot product from samples+1 to samples+halfNumCoefs;
+ * the positive half of the filter is dot product from samples to samples-halfNumCoefs+1.
+ *
+ * @param volumeLR is a pointer to an array of two 32 bit volume values, one per stereo channel,
+ * expressed as a S32 integer.  A negative value inverts the channel 180 degrees.
+ * The pointer volumeLR should be aligned to a minimum of 8 bytes.
+ * A typical value for volume is 0x1000 to align to a unity gain output of 20.12.
+ *
+ * In between calls to filterCoefficient, the phase is incremented by phaseIncrement, where
+ * phaseIncrement is calculated as inputSampling * phaseWrapLimit / outputSampling.
+ *
+ * The filter polyphase index is given by indexP = phase >> coefShift. Due to
+ * odd length symmetric filter, the polyphase index of the negative half depends on
+ * whether interpolation is used.
+ *
+ * The fractional siting between the polyphase indices is given by the bits below coefShift:
+ *
+ * lerpP = phase << 32 - coefShift >> 1;  // for 32 bit unsigned phase multiply
+ * lerpP = phase << 32 - coefShift >> 17; // for 16 bit unsigned phase multiply
+ *
+ * For integer types, this is expressed as:
+ *
+ * lerpP = phase << sizeof(phase)*8 - coefShift
+ *              >> (sizeof(phase)-sizeof(*coefs))*8 + 1;
+ *
+ */
+
+template<int CHANNELS, bool LOCKED, int STRIDE, typename TC>
+static inline
+void fir(int32_t* const out,
+        const uint32_t phase, const uint32_t phaseWrapLimit,
+        const int coefShift, const int halfNumCoefs, const TC* const coefs,
+        const int16_t* const samples, const int32_t* const volumeLR)
+{
+    // NOTE: be very careful when modifying the code here. register
+    // pressure is very high and a small change might cause the compiler
+    // to generate far less efficient code.
+    // Always sanity check the result with objdump or test-resample.
+
+    if (LOCKED) {
+        // locked polyphase (no interpolation)
+        // Compute the polyphase filter index on the positive and negative side.
+        uint32_t indexP = phase >> coefShift;
+        uint32_t indexN = (phaseWrapLimit - phase) >> coefShift;
+        const TC* coefsP = coefs + indexP*halfNumCoefs;
+        const TC* coefsN = coefs + indexN*halfNumCoefs;
+        const int16_t* sP = samples;
+        const int16_t* sN = samples + CHANNELS;
+
+        // dot product filter.
+        ProcessL<CHANNELS, STRIDE>(out,
+                halfNumCoefs, coefsP, coefsN, sP, sN, volumeLR);
+    } else {
+        // interpolated polyphase
+        // Compute the polyphase filter index on the positive and negative side.
+        uint32_t indexP = phase >> coefShift;
+        uint32_t indexN = (phaseWrapLimit - phase - 1) >> coefShift; // one's complement.
+        const TC* coefsP = coefs + indexP*halfNumCoefs;
+        const TC* coefsN = coefs + indexN*halfNumCoefs;
+        const TC* coefsP1 = coefsP + halfNumCoefs;
+        const TC* coefsN1 = coefsN + halfNumCoefs;
+        const int16_t* sP = samples;
+        const int16_t* sN = samples + CHANNELS;
+
+        // Interpolation fraction lerpP derived by shifting all the way up and down
+        // to clear the appropriate bits and align to the appropriate level
+        // for the integer multiply.  The constants should resolve in compile time.
+        //
+        // The interpolated filter coefficient is derived as follows for the pos/neg half:
+        //
+        // interpolated[P] = index[P]*lerpP + index[P+1]*(1-lerpP)
+        // interpolated[N] = index[N+1]*lerpP + index[N]*(1-lerpP)
+        uint32_t lerpP = phase << (sizeof(phase)*8 - coefShift)
+                >> ((sizeof(phase)-sizeof(*coefs))*8 + 1);
+
+        // on-the-fly interpolated dot product filter
+        Process<CHANNELS, STRIDE>(out,
+                halfNumCoefs, coefsP, coefsN, coefsP1, coefsN1, sP, sN, lerpP, volumeLR);
+    }
+}
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H*/
diff --git a/services/audioflinger/AudioResamplerFirProcessNeon.h b/services/audioflinger/AudioResamplerFirProcessNeon.h
new file mode 100644
index 0000000..f311cef
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirProcessNeon.h
@@ -0,0 +1,1149 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
+
+namespace android {
+
+// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
+
+#if USE_NEON
+//
+// NEON specializations are enabled for Process() and ProcessL()
+//
+// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
+// and looping stride 16 (or vice versa). This has some polyphase coef data alignment
+// issues with S16 coefs. Consider this later.
+
+// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
+#define ASSEMBLY_ACCUMULATE_MONO \
+        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
+        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
+        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
+        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
+        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
+        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
+        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */
+
+#define ASSEMBLY_ACCUMULATE_STEREO \
+        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes*/\
+        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output*/\
+        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0*/\
+        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4*/\
+        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R*/\
+        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume*/\
+        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating)*/\
+        "vst1.s32       {d3}, %[out]             \n"/* (2+2d)store result*/
+
+template <>
+inline void ProcessL<1, 16>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
+
+        "1:                                      \n"
+
+        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
+        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
+        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
+
+        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed)samples by coef
+        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed)samples by coef
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
+        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
+
+        // moving these ARM instructions before neon above seems to be slower
+        "subs           %[count], %[count], #8   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+         ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q10"
+    );
+}
+
+template <>
+inline void ProcessL<2, 16>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0               \n"// (1) acc_L = 0
+        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
+
+        "1:                                      \n"
+
+        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
+        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
+        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
+        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse right positive
+
+        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
+        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
+        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
+        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
+        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
+        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
+        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
+        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
+
+        // moving these ARM before neon seems to be slower
+        "subs           %[count], %[count], #8   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out] "=Uv" (out[0]),
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q10"
+     );
+}
+
+template <>
+inline void Process<1, 16>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* coefsP1,
+        const int16_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
+        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
+
+        "1:                                      \n"
+
+        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
+        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
+        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
+        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
+
+        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
+        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coets
+
+        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
+        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
+
+        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
+        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
+
+        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
+        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
+        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
+
+        // moving these ARM instructions before neon above seems to be slower
+        "subs           %[count], %[count], #8   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11"
+    );
+}
+
+template <>
+inline void Process<2, 16>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* coefsP1,
+        const int16_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
+        "veor           q0, q0, q0               \n"// (1) acc_L = 0
+        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
+
+        "1:                                      \n"
+
+        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
+        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
+        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
+        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
+
+        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
+        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coets
+
+        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
+        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
+        "vrev64.16      q3, q3                   \n"// (1) reverse 8 frames of the right positive
+
+        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
+        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
+
+        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
+        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
+        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
+        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
+        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
+        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
+        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
+        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
+
+        // moving these ARM before neon seems to be slower
+        "subs           %[count], %[count], #8   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out] "=Uv" (out[0]),
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q9", "q10", "q11"
+    );
+}
+
+template <>
+inline void ProcessL<1, 16>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0                    \n"// result, initialize to 0
+
+        "1:                                           \n"
+
+        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
+        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
+        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
+        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
+
+        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
+
+        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
+        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits
+
+        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
+        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q0, q0, q15                   \n"// accumulate result
+        "vadd.s32       q0, q0, q13                   \n"// accumulate result
+
+        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
+        "subs           %[count], %[count], #8        \n"// update loop counter
+
+        "bne            1b                            \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <>
+inline void ProcessL<2, 16>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0                    \n"// result, initialize to 0
+        "veor           q4, q4, q4                    \n"// result, initialize to 0
+
+        "1:                                           \n"
+
+        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 4 16-bits stereo samples
+        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 4 16-bits stereo samples
+        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 4 32-bits coefs
+        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
+
+        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
+        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side
+
+        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
+        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result
+
+        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q4, q4, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
+        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result
+
+        "subs           %[count], %[count], #8        \n"// update loop counter
+        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples
+
+        "bne            1b                            \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <>
+inline void Process<1, 16>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int32_t* coefsP1,
+        const int32_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
+        "veor           q0, q0, q0                    \n"// result, initialize to 0
+
+        "1:                                           \n"
+
+        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
+        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
+        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
+        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
+        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
+        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
+
+        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
+        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
+        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
+        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
+
+        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
+
+        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
+        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
+        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
+        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
+
+        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
+
+        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q0, q0, q15                   \n"// accumulate result
+        "vadd.s32       q0, q0, q13                   \n"// accumulate result
+
+        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
+        "subs           %[count], %[count], #8        \n"// update loop counter
+
+        "bne            1b                            \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <>
+inline void Process<2, 16>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int32_t* coefsP1,
+        const int32_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
+        "veor           q0, q0, q0                    \n"// result, initialize to 0
+        "veor           q4, q4, q4                    \n"// result, initialize to 0
+
+        "1:                                           \n"
+
+        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 4 16-bits stereo samples
+        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 4 16-bits stereo samples
+        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
+        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
+        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
+        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
+
+        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
+        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
+        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
+        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
+
+        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
+
+        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
+        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
+        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
+        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
+
+        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
+        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side
+
+        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
+        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result
+
+        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q4, q4, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
+        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result
+
+        "subs           %[count], %[count], #8        \n"// update loop counter
+        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples
+
+        "bne            1b                            \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <>
+inline void ProcessL<1, 8>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
+
+        "1:                                      \n"
+
+        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
+        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
+        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
+        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs
+
+        "vrev64.16      d4, d4                   \n"// (1) reversed s3, s2, s1, s0, s7, s6, s5, s4
+
+        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed)samples by coef
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
+
+        // moving these ARM instructions before neon above seems to be slower
+        "subs           %[count], %[count], #4   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #8         \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q10"
+    );
+}
+
+template <>
+inline void ProcessL<2, 8>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0               \n"// (1) acc_L = 0
+        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
+
+        "1:                                      \n"
+
+        "vld2.16        {d4, d5}, [%[sP]]        \n"// (2+0d) load 8 16-bits stereo samples
+        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (2) load 8 16-bits stereo samples
+        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 8 16-bits coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
+
+        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
+        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
+        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right
+
+        // moving these ARM before neon seems to be slower
+        "subs           %[count], %[count], #4   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out] "=Uv" (out[0]),
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q10"
+     );
+}
+
+template <>
+inline void Process<1, 8>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* coefsP1,
+        const int16_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
+        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
+
+        "1:                                      \n"
+
+        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
+        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
+        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
+        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
+        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs
+        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
+
+        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
+        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coets
+
+        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
+        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
+
+        "vrev64.16      d4, d4                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
+
+        "vadd.s16       d16, d16, d17            \n"// (1+2d) interpolate (step3) 1st set
+        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set
+
+        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+        "vmlal.s16      q0, d4, d16              \n"// (1+0d) multiply (reversed)by coef
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
+
+        // moving these ARM instructions before neon above seems to be slower
+        "subs           %[count], %[count], #4   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #8        \n"// move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11"
+    );
+}
+
+template <>
+inline void Process<2, 8>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* coefsP1,
+        const int16_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
+        "veor           q0, q0, q0               \n"// (1) acc_L = 0
+        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
+
+        "1:                                      \n"
+
+        "vld2.16        {d4, d5}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
+        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
+        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 8 16-bits coefs for interpolation
+        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 8 16-bits coefs for interpolation
+
+        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
+        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coets
+
+        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
+        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
+
+        "vadd.s16       d16, d16, d17            \n"// (1+1d) interpolate (step3) 1st set
+        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set
+
+        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
+        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
+        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right
+
+        // moving these ARM before neon seems to be slower
+        "subs           %[count], %[count], #4   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out] "=Uv" (out[0]),
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q9", "q10", "q11"
+    );
+}
+
+template <>
+inline void ProcessL<1, 8>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0               \n"// result, initialize to 0
+
+        "1:                                      \n"
+
+        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
+        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
+        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
+        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
+
+        "vrev64.16      d4, d4                   \n"// reverse 2 frames of the positive side
+
+        "vshll.s16      q12, d4, #15             \n"// (stall) extend samples to 31 bits
+        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12              \n"// accumulate result
+        "vadd.s32       q0, q0, q14              \n"// (stall) accumulate result
+
+        "subs           %[count], %[count], #4   \n"// update loop counter
+        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples
+
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out] "=Uv" (out[0]),
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11",
+          "q12", "q14"
+    );
+}
+
+template <>
+inline void ProcessL<2, 8>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0               \n"// result, initialize to 0
+        "veor           q4, q4, q4               \n"// result, initialize to 0
+
+        "1:                                      \n"
+
+        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples
+        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples
+        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
+        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
+
+        "vrev64.16      q2, q2                   \n"// reverse 2 frames of the positive side
+
+        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
+        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits
+
+        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
+        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
+        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by coef
+        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
+        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by coef
+
+        "vadd.s32       q0, q0, q12              \n"// accumulate result
+        "vadd.s32       q4, q4, q13              \n"// accumulate result
+        "vadd.s32       q0, q0, q14              \n"// accumulate result
+        "vadd.s32       q4, q4, q15              \n"// accumulate result
+
+        "subs           %[count], %[count], #4   \n"// update loop counter
+        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples
+
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3", "q4",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <>
+inline void Process<1, 8>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int32_t* coefsP1,
+        const int32_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
+        "veor           q0, q0, q0               \n"// result, initialize to 0
+
+        "1:                                      \n"
+
+        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
+        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
+        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
+        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
+        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
+        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
+
+        "vrev64.16      d4, d4                   \n"// reverse 2 frames of the positive side
+
+        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
+        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coets
+        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
+        "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set of coefs
+        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
+
+        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
+        "vadd.s32       q10, q10, q11            \n"// interpolate (step4) 2nd set
+
+        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12              \n"// accumulate result
+        "vadd.s32       q0, q0, q14              \n"// accumulate result
+
+        "subs           %[count], %[count], #4   \n"// update loop counter
+        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples
+
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN0] "+r" (coefsN),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11",
+          "q12", "q14"
+    );
+}
+
+template <>
+inline
+void Process<2, 8>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int32_t* coefsP1,
+        const int32_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
+        "veor           q0, q0, q0               \n"// result, initialize to 0
+        "veor           q4, q4, q4               \n"// result, initialize to 0
+
+        "1:                                      \n"
+        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples
+        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples
+        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
+        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
+        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
+        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
+
+        "vrev64.16      q2, q2                   \n"// (reversed) 2 frames of the positive side
+
+        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
+        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coets
+        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
+        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
+        "vqrdmulh.s32   q11, q11, d2[1]          \n"// interpolate (step3) 2nd set of coefs
+        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
+        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits
+
+        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
+        "vadd.s32       q10, q10, q11            \n"// interpolate (step4) 2nd set
+
+        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12              \n"// accumulate result
+        "vadd.s32       q4, q4, q13              \n"// accumulate result
+        "vadd.s32       q0, q0, q14              \n"// accumulate result
+        "vadd.s32       q4, q4, q15              \n"// accumulate result
+
+        "subs           %[count], %[count], #4   \n"// update loop counter
+        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples
+
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN0] "+r" (coefsN),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3", "q4",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+#endif //USE_NEON
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/
diff --git a/services/audioflinger/Effects.cpp b/services/audioflinger/Effects.cpp
index 010e233..29b56db 100644
--- a/services/audioflinger/Effects.cpp
+++ b/services/audioflinger/Effects.cpp
@@ -116,8 +116,9 @@ status_t AudioFlinger::EffectModule::addHandle(EffectHandle *handle)
             continue;
         }
         // first non destroyed handle is considered in control
-        if (controlHandle == NULL)
+        if (controlHandle == NULL) {
             controlHandle = h;
+        }
         if (h->priority() <= priority) {
             break;
         }
@@ -804,7 +805,112 @@ bool AudioFlinger::EffectModule::isOffloaded() const
     return mOffloaded;
 }
 
-void AudioFlinger::EffectModule::dump(int fd, const Vector<String16>& args)
+String8 effectFlagsToString(uint32_t flags) {
+    String8 s;
+
+    s.append("conn. mode: ");
+    switch (flags & EFFECT_FLAG_TYPE_MASK) {
+    case EFFECT_FLAG_TYPE_INSERT: s.append("insert"); break;
+    case EFFECT_FLAG_TYPE_AUXILIARY: s.append("auxiliary"); break;
+    case EFFECT_FLAG_TYPE_REPLACE: s.append("replace"); break;
+    case EFFECT_FLAG_TYPE_PRE_PROC: s.append("preproc"); break;
+    case EFFECT_FLAG_TYPE_POST_PROC: s.append("postproc"); break;
+    default: s.append("unknown/reserved"); break;
+    }
+    s.append(", ");
+
+    s.append("insert pref: ");
+    switch (flags & EFFECT_FLAG_INSERT_MASK) {
+    case EFFECT_FLAG_INSERT_ANY: s.append("any"); break;
+    case EFFECT_FLAG_INSERT_FIRST: s.append("first"); break;
+    case EFFECT_FLAG_INSERT_LAST: s.append("last"); break;
+    case EFFECT_FLAG_INSERT_EXCLUSIVE: s.append("exclusive"); break;
+    default: s.append("unknown/reserved"); break;
+    }
+    s.append(", ");
+
+    s.append("volume mgmt: ");
+    switch (flags & EFFECT_FLAG_VOLUME_MASK) {
+    case EFFECT_FLAG_VOLUME_NONE: s.append("none"); break;
+    case EFFECT_FLAG_VOLUME_CTRL: s.append("implements control"); break;
+    case EFFECT_FLAG_VOLUME_IND: s.append("requires indication"); break;
+    default: s.append("unknown/reserved"); break;
+    }
+    s.append(", ");
+
+    uint32_t devind = flags & EFFECT_FLAG_DEVICE_MASK;
+    if (devind) {
+        s.append("device indication: ");
+        switch (devind) {
+        case EFFECT_FLAG_DEVICE_IND: s.append("requires updates"); break;
+        default: s.append("unknown/reserved"); break;
+        }
+        s.append(", ");
+    }
+
+    s.append("input mode: ");
+    switch (flags & EFFECT_FLAG_INPUT_MASK) {
+    case EFFECT_FLAG_INPUT_DIRECT: s.append("direct"); break;
+    case EFFECT_FLAG_INPUT_PROVIDER: s.append("provider"); break;
+    case EFFECT_FLAG_INPUT_BOTH: s.append("direct+provider"); break;
+    default: s.append("not set"); break;
+    }
+    s.append(", ");
+
+    s.append("output mode: ");
+    switch (flags & EFFECT_FLAG_OUTPUT_MASK) {
+    case EFFECT_FLAG_OUTPUT_DIRECT: s.append("direct"); break;
+    case EFFECT_FLAG_OUTPUT_PROVIDER: s.append("provider"); break;
+    case EFFECT_FLAG_OUTPUT_BOTH: s.append("direct+provider"); break;
+    default: s.append("not set"); break;
+    }
+    s.append(", ");
+
+    uint32_t accel = flags & EFFECT_FLAG_HW_ACC_MASK;
+    if (accel) {
+        s.append("hardware acceleration: ");
+        switch (accel) {
+        case EFFECT_FLAG_HW_ACC_SIMPLE: s.append("non-tunneled"); break;
+        case EFFECT_FLAG_HW_ACC_TUNNEL: s.append("tunneled"); break;
+        default: s.append("unknown/reserved"); break;
+        }
+        s.append(", ");
+    }
+
+    uint32_t modeind = flags & EFFECT_FLAG_AUDIO_MODE_MASK;
+    if (modeind) {
+        s.append("mode indication: ");
+        switch (modeind) {
+        case EFFECT_FLAG_AUDIO_MODE_IND: s.append("required"); break;
+        default: s.append("unknown/reserved"); break;
+        }
+        s.append(", ");
+    }
+
+    uint32_t srcind = flags & EFFECT_FLAG_AUDIO_SOURCE_MASK;
+    if (srcind) {
+        s.append("source indication: ");
+        switch (srcind) {
+        case EFFECT_FLAG_AUDIO_SOURCE_IND: s.append("required"); break;
+        default: s.append("unknown/reserved"); break;
+        }
+        s.append(", ");
+    }
+
+    if (flags & EFFECT_FLAG_OFFLOAD_MASK) {
+        s.append("offloadable, ");
+    }
+
+    int len = s.length();
+    if (s.length() > 2) {
+        char *str = s.lockBuffer(len);
+        s.unlockBuffer(len - 2);
+    }
+    return s;
+}
+
+
+void AudioFlinger::EffectModule::dump(int fd, const Vector<String16>& args __unused)
 {
     const size_t SIZE = 256;
     char buffer[SIZE];
@@ -838,9 +944,10 @@ void AudioFlinger::EffectModule::dump(int fd, const Vector<String16>& args)
                     mDescriptor.type.node[2],
                 mDescriptor.type.node[3],mDescriptor.type.node[4],mDescriptor.type.node[5]);
     result.append(buffer);
-    snprintf(buffer, SIZE, "\t\t- apiVersion: %08X\n\t\t- flags: %08X\n",
+    snprintf(buffer, SIZE, "\t\t- apiVersion: %08X\n\t\t- flags: %08X (%s)\n",
             mDescriptor.apiVersion,
-            mDescriptor.flags);
+            mDescriptor.flags,
+            effectFlagsToString(mDescriptor.flags).string());
     result.append(buffer);
     snprintf(buffer, SIZE, "\t\t- name: %s\n",
             mDescriptor.name);
@@ -851,37 +958,37 @@ void AudioFlinger::EffectModule::dump(int fd, const Vector<String16>& args)
 
     result.append("\t\t- Input configuration:\n");
     result.append("\t\t\tFrames  Smp rate Channels Format Buffer\n");
-    snprintf(buffer, SIZE, "\t\t\t%05zu   %05d    %08x %6d %p\n",
+    snprintf(buffer, SIZE, "\t\t\t%05zu   %05d    %08x %6d (%s) %p\n",
             mConfig.inputCfg.buffer.frameCount,
             mConfig.inputCfg.samplingRate,
             mConfig.inputCfg.channels,
             mConfig.inputCfg.format,
+            formatToString((audio_format_t)mConfig.inputCfg.format),
             mConfig.inputCfg.buffer.raw);
     result.append(buffer);
 
     result.append("\t\t- Output configuration:\n");
     result.append("\t\t\tBuffer     Frames  Smp rate Channels Format\n");
-    snprintf(buffer, SIZE, "\t\t\t%p %05zu   %05d    %08x %d\n",
+    snprintf(buffer, SIZE, "\t\t\t%p %05zu   %05d    %08x %d (%s)\n",
             mConfig.outputCfg.buffer.raw,
             mConfig.outputCfg.buffer.frameCount,
             mConfig.outputCfg.samplingRate,
             mConfig.outputCfg.channels,
-            mConfig.outputCfg.format);
+            mConfig.outputCfg.format,
+            formatToString((audio_format_t)mConfig.outputCfg.format));
     result.append(buffer);
 
     snprintf(buffer, SIZE, "\t\t%zu Clients:\n", mHandles.size());
     result.append(buffer);
-    result.append("\t\t\tPid   Priority Ctrl Locked client server\n");
+    result.append("\t\t\t  Pid Priority Ctrl Locked client server\n");
     for (size_t i = 0; i < mHandles.size(); ++i) {
         EffectHandle *handle = mHandles[i];
         if (handle != NULL && !handle->destroyed_l()) {
-            handle->dump(buffer, SIZE);
+            handle->dumpToBuffer(buffer, SIZE);
             result.append(buffer);
         }
     }
 
-    result.append("\n");
-
     write(fd, result.string(), result.length());
 
     if (locked) {
@@ -911,18 +1018,15 @@ AudioFlinger::EffectHandle::EffectHandle(const sp<EffectModule>& effect,
     }
     int bufOffset = ((sizeof(effect_param_cblk_t) - 1) / sizeof(int) + 1) * sizeof(int);
     mCblkMemory = client->heap()->allocate(EFFECT_PARAM_BUFFER_SIZE + bufOffset);
-    if (mCblkMemory != 0) {
-        mCblk = static_cast<effect_param_cblk_t *>(mCblkMemory->pointer());
-
-        if (mCblk != NULL) {
-            new(mCblk) effect_param_cblk_t();
-            mBuffer = (uint8_t *)mCblk + bufOffset;
-        }
-    } else {
+    if (mCblkMemory == 0 ||
+            (mCblk = static_cast<effect_param_cblk_t *>(mCblkMemory->pointer())) == NULL) {
         ALOGE("not enough memory for Effect size=%u", EFFECT_PARAM_BUFFER_SIZE +
                 sizeof(effect_param_cblk_t));
+        mCblkMemory.clear();
         return;
     }
+    new(mCblk) effect_param_cblk_t();
+    mBuffer = (uint8_t *)mCblk + bufOffset;
 }
 
 AudioFlinger::EffectHandle::~EffectHandle()
@@ -939,6 +1043,11 @@ AudioFlinger::EffectHandle::~EffectHandle()
     disconnect(false);
 }
 
+status_t AudioFlinger::EffectHandle::initCheck()
+{
+    return mClient == 0 || mCblkMemory != 0 ? OK : NO_MEMORY;
+}
+
 status_t AudioFlinger::EffectHandle::enable()
 {
     ALOGV("enable %p", this);
@@ -1179,15 +1288,15 @@ status_t AudioFlinger::EffectHandle::onTransact(
 }
 
 
-void AudioFlinger::EffectHandle::dump(char* buffer, size_t size)
+void AudioFlinger::EffectHandle::dumpToBuffer(char* buffer, size_t size)
 {
     bool locked = mCblk != NULL && AudioFlinger::dumpTryLock(mCblk->lock);
 
-    snprintf(buffer, size, "\t\t\t%05d %05d    %01u    %01u      %05u  %05u\n",
+    snprintf(buffer, size, "\t\t\t%5d    %5d  %3s    %3s  %5u  %5u\n",
             (mClient == 0) ? getpid_cached : mClient->pid(),
             mPriority,
-            mHasControl,
-            !locked,
+            mHasControl ? "yes" : "no",
+            locked ? "yes" : "no",
             mCblk ? mCblk->clientIndex : 0,
             mCblk ? mCblk->serverIndex : 0
             );
@@ -1568,33 +1677,35 @@ void AudioFlinger::EffectChain::dump(int fd, const Vector<String16>& args)
     char buffer[SIZE];
     String8 result;
 
-    snprintf(buffer, SIZE, "Effects for session %d:\n", mSessionId);
+    size_t numEffects = mEffects.size();
+    snprintf(buffer, SIZE, "    %d effects for session %d\n", numEffects, mSessionId);
     result.append(buffer);
 
-    bool locked = AudioFlinger::dumpTryLock(mLock);
-    // failed to lock - AudioFlinger is probably deadlocked
-    if (!locked) {
-        result.append("\tCould not lock mutex:\n");
-    }
+    if (numEffects) {
+        bool locked = AudioFlinger::dumpTryLock(mLock);
+        // failed to lock - AudioFlinger is probably deadlocked
+        if (!locked) {
+            result.append("\tCould not lock mutex:\n");
+        }
 
-    result.append("\tNum fx In buffer   Out buffer   Active tracks:\n");
-    snprintf(buffer, SIZE, "\t%02zu     %p  %p   %d\n",
-            mEffects.size(),
-            mInBuffer,
-            mOutBuffer,
-            mActiveTrackCnt);
-    result.append(buffer);
-    write(fd, result.string(), result.size());
+        result.append("\tIn buffer   Out buffer   Active tracks:\n");
+        snprintf(buffer, SIZE, "\t%p  %p   %d\n",
+                mInBuffer,
+                mOutBuffer,
+                mActiveTrackCnt);
+        result.append(buffer);
+        write(fd, result.string(), result.size());
 
-    for (size_t i = 0; i < mEffects.size(); ++i) {
-        sp<EffectModule> effect = mEffects[i];
-        if (effect != 0) {
-            effect->dump(fd, args);
+        for (size_t i = 0; i < numEffects; ++i) {
+            sp<EffectModule> effect = mEffects[i];
+            if (effect != 0) {
+                effect->dump(fd, args);
+            }
         }
-    }
 
-    if (locked) {
-        mLock.unlock();
+        if (locked) {
+            mLock.unlock();
+        }
     }
 }
 
diff --git a/services/audioflinger/Effects.h b/services/audioflinger/Effects.h
index b717857..ccc4825 100644
--- a/services/audioflinger/Effects.h
+++ b/services/audioflinger/Effects.h
@@ -169,6 +169,7 @@ public:
             const sp<IEffectClient>& effectClient,
             int32_t priority);
     virtual ~EffectHandle();
+    virtual status_t initCheck();
 
     // IEffect
     virtual status_t enable();
@@ -208,7 +209,7 @@ public:
     // destroyed_l() must be called with the associated EffectModule mLock held
     bool destroyed_l() const { return mDestroyed; }
 
-    void dump(char* buffer, size_t size);
+    void dumpToBuffer(char* buffer, size_t size);
 
 protected:
     friend class AudioFlinger;          // for mEffect, mHasControl, mEnabled
diff --git a/services/audioflinger/FastMixer.cpp b/services/audioflinger/FastMixer.cpp
index 85d637e..90122e0 100644
--- a/services/audioflinger/FastMixer.cpp
+++ b/services/audioflinger/FastMixer.cpp
@@ -238,7 +238,7 @@ bool FastMixer::threadLoop()
                 }
             }
 
-            if ((format != previousFormat) || (frameCount != previous->mFrameCount)) {
+            if ((!Format_isEqual(format, previousFormat)) || (frameCount != previous->mFrameCount)) {
                 // FIXME to avoid priority inversion, don't delete here
                 delete mixer;
                 mixer = NULL;
@@ -440,8 +440,9 @@ bool FastMixer::threadLoop()
             }
 
             int64_t pts;
-            if (outputSink == NULL || (OK != outputSink->getNextWriteTimestamp(&pts)))
+            if (outputSink == NULL || (OK != outputSink->getNextWriteTimestamp(&pts))) {
                 pts = AudioBufferProvider::kInvalidPTS;
+            }
 
             // process() is CPU-bound
             mixer->process(pts);
@@ -695,7 +696,7 @@ static int compare_uint32_t(const void *pa, const void *pb)
 void FastMixerDumpState::dump(int fd) const
 {
     if (mCommand == FastMixerState::INITIAL) {
-        fdprintf(fd, "FastMixer not initialized\n");
+        fdprintf(fd, "  FastMixer not initialized\n");
         return;
     }
 #define COMMAND_MAX 32
@@ -729,10 +730,10 @@ void FastMixerDumpState::dump(int fd) const
     double measuredWarmupMs = (mMeasuredWarmupTs.tv_sec * 1000.0) +
             (mMeasuredWarmupTs.tv_nsec / 1000000.0);
     double mixPeriodSec = (double) mFrameCount / (double) mSampleRate;
-    fdprintf(fd, "FastMixer command=%s writeSequence=%u framesWritten=%u\n"
-                 "          numTracks=%u writeErrors=%u underruns=%u overruns=%u\n"
-                 "          sampleRate=%u frameCount=%zu measuredWarmup=%.3g ms, warmupCycles=%u\n"
-                 "          mixPeriod=%.2f ms\n",
+    fdprintf(fd, "  FastMixer command=%s writeSequence=%u framesWritten=%u\n"
+                 "            numTracks=%u writeErrors=%u underruns=%u overruns=%u\n"
+                 "            sampleRate=%u frameCount=%zu measuredWarmup=%.3g ms, warmupCycles=%u\n"
+                 "            mixPeriod=%.2f ms\n",
                  string, mWriteSequence, mFramesWritten,
                  mNumTracks, mWriteErrors, mUnderruns, mOverruns,
                  mSampleRate, mFrameCount, measuredWarmupMs, mWarmupCycles,
@@ -783,14 +784,20 @@ void FastMixerDumpState::dump(int fd) const
         previousCpukHz = sampleCpukHz;
 #endif
     }
-    fdprintf(fd, "Simple moving statistics over last %.1f seconds:\n", wall.n() * mixPeriodSec);
-    fdprintf(fd, "  wall clock time in ms per mix cycle:\n"
-                 "    mean=%.2f min=%.2f max=%.2f stddev=%.2f\n",
-                 wall.mean()*1e-6, wall.minimum()*1e-6, wall.maximum()*1e-6, wall.stddev()*1e-6);
-    fdprintf(fd, "  raw CPU load in us per mix cycle:\n"
-                 "    mean=%.0f min=%.0f max=%.0f stddev=%.0f\n",
-                 loadNs.mean()*1e-3, loadNs.minimum()*1e-3, loadNs.maximum()*1e-3,
-                 loadNs.stddev()*1e-3);
+    if (n) {
+        fdprintf(fd, "  Simple moving statistics over last %.1f seconds:\n",
+                     wall.n() * mixPeriodSec);
+        fdprintf(fd, "    wall clock time in ms per mix cycle:\n"
+                     "      mean=%.2f min=%.2f max=%.2f stddev=%.2f\n",
+                     wall.mean()*1e-6, wall.minimum()*1e-6, wall.maximum()*1e-6,
+                     wall.stddev()*1e-6);
+        fdprintf(fd, "    raw CPU load in us per mix cycle:\n"
+                     "      mean=%.0f min=%.0f max=%.0f stddev=%.0f\n",
+                     loadNs.mean()*1e-3, loadNs.minimum()*1e-3, loadNs.maximum()*1e-3,
+                     loadNs.stddev()*1e-3);
+    } else {
+        fdprintf(fd, "  No FastMixer statistics available currently\n");
+    }
 #ifdef CPU_FREQUENCY_STATISTICS
     fdprintf(fd, "  CPU clock frequency in MHz:\n"
                  "    mean=%.0f min=%.0f max=%.0f stddev=%.0f\n",
@@ -808,9 +815,9 @@ void FastMixerDumpState::dump(int fd) const
             left.sample(tail[i]);
             right.sample(tail[n - (i + 1)]);
         }
-        fdprintf(fd, "Distribution of mix cycle times in ms for the tails (> ~3 stddev outliers):\n"
-                     "  left tail: mean=%.2f min=%.2f max=%.2f stddev=%.2f\n"
-                     "  right tail: mean=%.2f min=%.2f max=%.2f stddev=%.2f\n",
+        fdprintf(fd, "  Distribution of mix cycle times in ms for the tails (> ~3 stddev outliers):\n"
+                     "    left tail: mean=%.2f min=%.2f max=%.2f stddev=%.2f\n"
+                     "    right tail: mean=%.2f min=%.2f max=%.2f stddev=%.2f\n",
                      left.mean()*1e-6, left.minimum()*1e-6, left.maximum()*1e-6, left.stddev()*1e-6,
                      right.mean()*1e-6, right.minimum()*1e-6, right.maximum()*1e-6,
                      right.stddev()*1e-6);
@@ -823,9 +830,9 @@ void FastMixerDumpState::dump(int fd) const
     // Instead we always display all tracks, with an indication
     // of whether we think the track is active.
     uint32_t trackMask = mTrackMask;
-    fdprintf(fd, "Fast tracks: kMaxFastTracks=%u activeMask=%#x\n",
+    fdprintf(fd, "  Fast tracks: kMaxFastTracks=%u activeMask=%#x\n",
             FastMixerState::kMaxFastTracks, trackMask);
-    fdprintf(fd, "Index Active Full Partial Empty  Recent Ready\n");
+    fdprintf(fd, "  Index Active Full Partial Empty  Recent Ready\n");
     for (uint32_t i = 0; i < FastMixerState::kMaxFastTracks; ++i, trackMask >>= 1) {
         bool isActive = trackMask & 1;
         const FastTrackDump *ftDump = &mTracks[i];
@@ -845,7 +852,7 @@ void FastMixerDumpState::dump(int fd) const
             mostRecent = "?";
             break;
         }
-        fdprintf(fd, "%5u %6s %4u %7u %5u %7s %5zu\n", i, isActive ? "yes" : "no",
+        fdprintf(fd, "  %5u %6s %4u %7u %5u %7s %5zu\n", i, isActive ? "yes" : "no",
                 (underruns.mBitFields.mFull) & UNDERRUN_MASK,
                 (underruns.mBitFields.mPartial) & UNDERRUN_MASK,
                 (underruns.mBitFields.mEmpty) & UNDERRUN_MASK,
diff --git a/services/audioflinger/PlaybackTracks.h b/services/audioflinger/PlaybackTracks.h
index 43b77f3..b5e763d 100644
--- a/services/audioflinger/PlaybackTracks.h
+++ b/services/audioflinger/PlaybackTracks.h
@@ -34,9 +34,10 @@ public:
                                 int uid,
                                 IAudioFlinger::track_flags_t flags);
     virtual             ~Track();
+    virtual status_t    initCheck() const;
 
     static  void        appendDumpHeader(String8& result);
-            void        dump(char* buffer, size_t size);
+            void        dump(char* buffer, size_t size, bool active);
     virtual status_t    start(AudioSystem::sync_event_t event =
                                     AudioSystem::SYNC_EVENT_NONE,
                              int triggerSession = 0);
@@ -93,6 +94,8 @@ protected:
     bool isReady() const;
     void setPaused() { mState = PAUSED; }
     void reset();
+    bool isFlushPending() const { return mFlushHwPending; }
+    void flushAck();
 
     bool isOutputTrack() const {
         return (mStreamType == AUDIO_STREAM_CNT);
@@ -154,6 +157,7 @@ private:
     bool                mIsInvalid; // non-resettable latch, set by invalidate()
     AudioTrackServerProxy*  mAudioTrackServerProxy;
     bool                mResumeToStopping; // track was paused in stopping state.
+    bool                mFlushHwPending; // track requests for thread flush
 };  // end of Track
 
 class TimedTrack : public Track {
diff --git a/services/audioflinger/RecordTracks.h b/services/audioflinger/RecordTracks.h
index 57de568..fc3171f 100644
--- a/services/audioflinger/RecordTracks.h
+++ b/services/audioflinger/RecordTracks.h
@@ -45,7 +45,7 @@ public:
                                                 return tmp; }
 
     static  void        appendDumpHeader(String8& result);
-            void        dump(char* buffer, size_t size);
+            void        dump(char* buffer, size_t size, bool active);
 
 private:
     friend class AudioFlinger;  // for mState
@@ -59,5 +59,4 @@ private:
     // releaseBuffer() not overridden
 
     bool                mOverflow;  // overflow on most recent attempt to fill client buffer
-    AudioRecordServerProxy* mAudioRecordServerProxy;
 };
diff --git a/services/audioflinger/Threads.cpp b/services/audioflinger/Threads.cpp
index 498ddb6..b064e89 100644
--- a/services/audioflinger/Threads.cpp
+++ b/services/audioflinger/Threads.cpp
@@ -185,7 +185,11 @@ CpuStats::CpuStats()
 {
 }
 
-void CpuStats::sample(const String8 &title) {
+void CpuStats::sample(const String8 &title
+#ifndef DEBUG_CPU_USAGE
+                __unused
+#endif
+        ) {
 #ifdef DEBUG_CPU_USAGE
     // get current thread's delta CPU time in wall clock ns
     double wcNs;
@@ -269,8 +273,8 @@ AudioFlinger::ThreadBase::ThreadBase(const sp<AudioFlinger>& audioFlinger, audio
     :   Thread(false /*canCallJava*/),
         mType(type),
         mAudioFlinger(audioFlinger),
-        // mSampleRate, mFrameCount, mChannelMask, mChannelCount, mFrameSize, and mFormat are
-        // set by PlaybackThread::readOutputParameters() or RecordThread::readInputParameters()
+        // mSampleRate, mFrameCount, mChannelMask, mChannelCount, mFrameSize, mFormat, mBufferSize
+        // are set by PlaybackThread::readOutputParameters() or RecordThread::readInputParameters()
         mParamStatus(NO_ERROR),
         //FIXME: mStandby should be true here. Is this some kind of hack?
         mStandby(false), mOutDevice(outDevice), mInDevice(inDevice),
@@ -297,6 +301,17 @@ AudioFlinger::ThreadBase::~ThreadBase()
     }
 }
 
+status_t AudioFlinger::ThreadBase::readyToRun()
+{
+    status_t status = initCheck();
+    if (status == NO_ERROR) {
+        ALOGI("AudioFlinger's thread %p ready to run", this);
+    } else {
+        ALOGE("No working audio driver found.");
+    }
+    return status;
+}
+
 void AudioFlinger::ThreadBase::exit()
 {
     ALOGV("ThreadBase::exit");
@@ -369,7 +384,13 @@ void AudioFlinger::ThreadBase::sendPrioConfigEvent_l(pid_t pid, pid_t tid, int32
 
 void AudioFlinger::ThreadBase::processConfigEvents()
 {
-    mLock.lock();
+    Mutex::Autolock _l(mLock);
+    processConfigEvents_l();
+}
+
+// post condition: mConfigEvents.isEmpty()
+void AudioFlinger::ThreadBase::processConfigEvents_l()
+{
     while (!mConfigEvents.isEmpty()) {
         ALOGV("processConfigEvents() remaining events %d", mConfigEvents.size());
         ConfigEvent *event = mConfigEvents[0];
@@ -377,35 +398,81 @@ void AudioFlinger::ThreadBase::processConfigEvents()
         // release mLock before locking AudioFlinger mLock: lock order is always
         // AudioFlinger then ThreadBase to avoid cross deadlock
         mLock.unlock();
-        switch(event->type()) {
-            case CFG_EVENT_PRIO: {
-                PrioConfigEvent *prioEvent = static_cast<PrioConfigEvent *>(event);
-                // FIXME Need to understand why this has be done asynchronously
-                int err = requestPriority(prioEvent->pid(), prioEvent->tid(), prioEvent->prio(),
-                        true /*asynchronous*/);
-                if (err != 0) {
-                    ALOGW("Policy SCHED_FIFO priority %d is unavailable for pid %d tid %d; "
-                          "error %d",
-                          prioEvent->prio(), prioEvent->pid(), prioEvent->tid(), err);
-                }
-            } break;
-            case CFG_EVENT_IO: {
-                IoConfigEvent *ioEvent = static_cast<IoConfigEvent *>(event);
-                mAudioFlinger->mLock.lock();
+        switch (event->type()) {
+        case CFG_EVENT_PRIO: {
+            PrioConfigEvent *prioEvent = static_cast<PrioConfigEvent *>(event);
+            // FIXME Need to understand why this has be done asynchronously
+            int err = requestPriority(prioEvent->pid(), prioEvent->tid(), prioEvent->prio(),
+                    true /*asynchronous*/);
+            if (err != 0) {
+                ALOGW("Policy SCHED_FIFO priority %d is unavailable for pid %d tid %d; error %d",
+                      prioEvent->prio(), prioEvent->pid(), prioEvent->tid(), err);
+            }
+        } break;
+        case CFG_EVENT_IO: {
+            IoConfigEvent *ioEvent = static_cast<IoConfigEvent *>(event);
+            {
+                Mutex::Autolock _l(mAudioFlinger->mLock);
                 audioConfigChanged_l(ioEvent->event(), ioEvent->param());
-                mAudioFlinger->mLock.unlock();
-            } break;
-            default:
-                ALOGE("processConfigEvents() unknown event type %d", event->type());
-                break;
+            }
+        } break;
+        default:
+            ALOGE("processConfigEvents() unknown event type %d", event->type());
+            break;
         }
         delete event;
         mLock.lock();
     }
-    mLock.unlock();
 }
 
-void AudioFlinger::ThreadBase::dumpBase(int fd, const Vector<String16>& args)
+String8 channelMaskToString(audio_channel_mask_t mask, bool output) {
+    String8 s;
+    if (output) {
+        if (mask & AUDIO_CHANNEL_OUT_FRONT_LEFT) s.append("front-left, ");
+        if (mask & AUDIO_CHANNEL_OUT_FRONT_RIGHT) s.append("front-right, ");
+        if (mask & AUDIO_CHANNEL_OUT_FRONT_CENTER) s.append("front-center, ");
+        if (mask & AUDIO_CHANNEL_OUT_LOW_FREQUENCY) s.append("low freq, ");
+        if (mask & AUDIO_CHANNEL_OUT_BACK_LEFT) s.append("back-left, ");
+        if (mask & AUDIO_CHANNEL_OUT_BACK_RIGHT) s.append("back-right, ");
+        if (mask & AUDIO_CHANNEL_OUT_FRONT_LEFT_OF_CENTER) s.append("front-left-of-center, ");
+        if (mask & AUDIO_CHANNEL_OUT_FRONT_RIGHT_OF_CENTER) s.append("front-right-of-center, ");
+        if (mask & AUDIO_CHANNEL_OUT_BACK_CENTER) s.append("back-center, ");
+        if (mask & AUDIO_CHANNEL_OUT_SIDE_LEFT) s.append("side-left, ");
+        if (mask & AUDIO_CHANNEL_OUT_SIDE_RIGHT) s.append("side-right, ");
+        if (mask & AUDIO_CHANNEL_OUT_TOP_CENTER) s.append("top-center ,");
+        if (mask & AUDIO_CHANNEL_OUT_TOP_FRONT_LEFT) s.append("top-front-left, ");
+        if (mask & AUDIO_CHANNEL_OUT_TOP_FRONT_CENTER) s.append("top-front-center, ");
+        if (mask & AUDIO_CHANNEL_OUT_TOP_FRONT_RIGHT) s.append("top-front-right, ");
+        if (mask & AUDIO_CHANNEL_OUT_TOP_BACK_LEFT) s.append("top-back-left, ");
+        if (mask & AUDIO_CHANNEL_OUT_TOP_BACK_CENTER) s.append("top-back-center, " );
+        if (mask & AUDIO_CHANNEL_OUT_TOP_BACK_RIGHT) s.append("top-back-right, " );
+        if (mask & ~AUDIO_CHANNEL_OUT_ALL) s.append("unknown,  ");
+    } else {
+        if (mask & AUDIO_CHANNEL_IN_LEFT) s.append("left, ");
+        if (mask & AUDIO_CHANNEL_IN_RIGHT) s.append("right, ");
+        if (mask & AUDIO_CHANNEL_IN_FRONT) s.append("front, ");
+        if (mask & AUDIO_CHANNEL_IN_BACK) s.append("back, ");
+        if (mask & AUDIO_CHANNEL_IN_LEFT_PROCESSED) s.append("left-processed, ");
+        if (mask & AUDIO_CHANNEL_IN_RIGHT_PROCESSED) s.append("right-processed, ");
+        if (mask & AUDIO_CHANNEL_IN_FRONT_PROCESSED) s.append("front-processed, ");
+        if (mask & AUDIO_CHANNEL_IN_BACK_PROCESSED) s.append("back-processed, ");
+        if (mask & AUDIO_CHANNEL_IN_PRESSURE) s.append("pressure, ");
+        if (mask & AUDIO_CHANNEL_IN_X_AXIS) s.append("X, ");
+        if (mask & AUDIO_CHANNEL_IN_Y_AXIS) s.append("Y, ");
+        if (mask & AUDIO_CHANNEL_IN_Z_AXIS) s.append("Z, ");
+        if (mask & AUDIO_CHANNEL_IN_VOICE_UPLINK) s.append("voice-uplink, ");
+        if (mask & AUDIO_CHANNEL_IN_VOICE_DNLINK) s.append("voice-dnlink, ");
+        if (mask & ~AUDIO_CHANNEL_IN_ALL) s.append("unknown,  ");
+    }
+    int len = s.length();
+    if (s.length() > 2) {
+        char *str = s.lockBuffer(len);
+        s.unlockBuffer(len - 2);
+    }
+    return s;
+}
+
+void AudioFlinger::ThreadBase::dumpBase(int fd, const Vector<String16>& args __unused)
 {
     const size_t SIZE = 256;
     char buffer[SIZE];
@@ -413,47 +480,43 @@ void AudioFlinger::ThreadBase::dumpBase(int fd, const Vector<String16>& args)
 
     bool locked = AudioFlinger::dumpTryLock(mLock);
     if (!locked) {
-        snprintf(buffer, SIZE, "thread %p maybe dead locked\n", this);
-        write(fd, buffer, strlen(buffer));
-    }
-
-    snprintf(buffer, SIZE, "io handle: %d\n", mId);
-    result.append(buffer);
-    snprintf(buffer, SIZE, "TID: %d\n", getTid());
-    result.append(buffer);
-    snprintf(buffer, SIZE, "standby: %d\n", mStandby);
-    result.append(buffer);
-    snprintf(buffer, SIZE, "Sample rate: %u\n", mSampleRate);
-    result.append(buffer);
-    snprintf(buffer, SIZE, "HAL frame count: %zu\n", mFrameCount);
-    result.append(buffer);
-    snprintf(buffer, SIZE, "Channel Count: %u\n", mChannelCount);
-    result.append(buffer);
-    snprintf(buffer, SIZE, "Channel Mask: 0x%08x\n", mChannelMask);
-    result.append(buffer);
-    snprintf(buffer, SIZE, "Format: %d\n", mFormat);
-    result.append(buffer);
-    snprintf(buffer, SIZE, "Frame size: %zu\n", mFrameSize);
-    result.append(buffer);
-
-    snprintf(buffer, SIZE, "\nPending setParameters commands: \n");
-    result.append(buffer);
-    result.append(" Index Command");
-    for (size_t i = 0; i < mNewParameters.size(); ++i) {
-        snprintf(buffer, SIZE, "\n %02zu    ", i);
-        result.append(buffer);
-        result.append(mNewParameters[i]);
+        fdprintf(fd, "thread %p maybe dead locked\n", this);
+    }
+
+    fdprintf(fd, "  I/O handle: %d\n", mId);
+    fdprintf(fd, "  TID: %d\n", getTid());
+    fdprintf(fd, "  Standby: %s\n", mStandby ? "yes" : "no");
+    fdprintf(fd, "  Sample rate: %u\n", mSampleRate);
+    fdprintf(fd, "  HAL frame count: %zu\n", mFrameCount);
+    fdprintf(fd, "  HAL buffer size: %u bytes\n", mBufferSize);
+    fdprintf(fd, "  Channel Count: %u\n", mChannelCount);
+    fdprintf(fd, "  Channel Mask: 0x%08x (%s)\n", mChannelMask,
+            channelMaskToString(mChannelMask, mType != RECORD).string());
+    fdprintf(fd, "  Format: 0x%x (%s)\n", mFormat, formatToString(mFormat));
+    fdprintf(fd, "  Frame size: %zu\n", mFrameSize);
+    fdprintf(fd, "  Pending setParameters commands:");
+    size_t numParams = mNewParameters.size();
+    if (numParams) {
+        fdprintf(fd, "\n   Index Command");
+        for (size_t i = 0; i < numParams; ++i) {
+            fdprintf(fd, "\n   %02zu    ", i);
+            fdprintf(fd, mNewParameters[i]);
+        }
+        fdprintf(fd, "\n");
+    } else {
+        fdprintf(fd, " none\n");
     }
-
-    snprintf(buffer, SIZE, "\n\nPending config events: \n");
-    result.append(buffer);
-    for (size_t i = 0; i < mConfigEvents.size(); i++) {
-        mConfigEvents[i]->dump(buffer, SIZE);
-        result.append(buffer);
+    fdprintf(fd, "  Pending config events:");
+    size_t numConfig = mConfigEvents.size();
+    if (numConfig) {
+        for (size_t i = 0; i < numConfig; i++) {
+            mConfigEvents[i]->dump(buffer, SIZE);
+            fdprintf(fd, "\n    %s", buffer);
+        }
+        fdprintf(fd, "\n");
+    } else {
+        fdprintf(fd, " none\n");
     }
-    result.append("\n");
-
-    write(fd, result.string(), result.size());
 
     if (locked) {
         mLock.unlock();
@@ -466,10 +529,11 @@ void AudioFlinger::ThreadBase::dumpEffectChains(int fd, const Vector<String16>&
     char buffer[SIZE];
     String8 result;
 
-    snprintf(buffer, SIZE, "\n- %zu Effect Chains:\n", mEffectChains.size());
+    size_t numEffectChains = mEffectChains.size();
+    snprintf(buffer, SIZE, "  %zu Effect Chains\n", numEffectChains);
     write(fd, buffer, strlen(buffer));
 
-    for (size_t i = 0; i < mEffectChains.size(); ++i) {
+    for (size_t i = 0; i < numEffectChains; ++i) {
         sp<EffectChain> chain = mEffectChains[i];
         if (chain != 0) {
             chain->dump(fd, args);
@@ -586,7 +650,7 @@ void AudioFlinger::ThreadBase::clearPowerManager()
     mPowerManager.clear();
 }
 
-void AudioFlinger::ThreadBase::PMDeathRecipient::binderDied(const wp<IBinder>& who)
+void AudioFlinger::ThreadBase::PMDeathRecipient::binderDied(const wp<IBinder>& who __unused)
 {
     sp<ThreadBase> thread = mThread.promote();
     if (thread != 0) {
@@ -739,8 +803,7 @@ sp<AudioFlinger::EffectHandle> AudioFlinger::ThreadBase::createEffect_l(
         int sessionId,
         effect_descriptor_t *desc,
         int *enabled,
-        status_t *status
-        )
+        status_t *status)
 {
     sp<EffectModule> effect;
     sp<EffectHandle> handle;
@@ -829,7 +892,10 @@ sp<AudioFlinger::EffectHandle> AudioFlinger::ThreadBase::createEffect_l(
         }
         // create effect handle and connect it to effect module
         handle = new EffectHandle(effect, client, effectClient, priority);
-        lStatus = effect->addHandle(handle.get());
+        lStatus = handle->initCheck();
+        if (lStatus == OK) {
+            lStatus = effect->addHandle(handle.get());
+        }
         if (enabled != NULL) {
             *enabled = (int)effect->isEnabled();
         }
@@ -850,9 +916,7 @@ Exit:
         handle.clear();
     }
 
-    if (status != NULL) {
-        *status = lStatus;
-    }
+    *status = lStatus;
     return handle;
 }
 
@@ -1002,7 +1066,7 @@ AudioFlinger::PlaybackThread::PlaybackThread(const sp<AudioFlinger>& audioFlinge
                                              type_t type)
     :   ThreadBase(audioFlinger, id, device, AUDIO_DEVICE_NONE, type),
         mNormalFrameCount(0), mMixBuffer(NULL),
-        mAllocMixBuffer(NULL), mSuspended(0), mBytesWritten(0),
+        mSuspended(0), mBytesWritten(0),
         mActiveTracksGeneration(0),
         // mStreamTypes[] initialized in constructor body
         mOutput(output),
@@ -1060,7 +1124,7 @@ AudioFlinger::PlaybackThread::PlaybackThread(const sp<AudioFlinger>& audioFlinge
 AudioFlinger::PlaybackThread::~PlaybackThread()
 {
     mAudioFlinger->unregisterWriter(mNBLogWriter);
-    delete [] mAllocMixBuffer;
+    delete[] mMixBuffer;
 }
 
 void AudioFlinger::PlaybackThread::dump(int fd, const Vector<String16>& args)
@@ -1070,13 +1134,13 @@ void AudioFlinger::PlaybackThread::dump(int fd, const Vector<String16>& args)
     dumpEffectChains(fd, args);
 }
 
-void AudioFlinger::PlaybackThread::dumpTracks(int fd, const Vector<String16>& args)
+void AudioFlinger::PlaybackThread::dumpTracks(int fd, const Vector<String16>& args __unused)
 {
     const size_t SIZE = 256;
     char buffer[SIZE];
     String8 result;
 
-    result.appendFormat("Output thread %p stream volumes in dB:\n    ", this);
+    result.appendFormat("  Stream volumes in dB: ");
     for (int i = 0; i < AUDIO_STREAM_CNT; ++i) {
         const stream_type_t *st = &mStreamTypes[i];
         if (i > 0) {
@@ -1091,75 +1155,67 @@ void AudioFlinger::PlaybackThread::dumpTracks(int fd, const Vector<String16>& ar
     write(fd, result.string(), result.length());
     result.clear();
 
-    snprintf(buffer, SIZE, "Output thread %p tracks\n", this);
-    result.append(buffer);
-    Track::appendDumpHeader(result);
-    for (size_t i = 0; i < mTracks.size(); ++i) {
-        sp<Track> track = mTracks[i];
-        if (track != 0) {
-            track->dump(buffer, SIZE);
-            result.append(buffer);
+    // These values are "raw"; they will wrap around.  See prepareTracks_l() for a better way.
+    FastTrackUnderruns underruns = getFastTrackUnderruns(0);
+    fdprintf(fd, "  Normal mixer raw underrun counters: partial=%u empty=%u\n",
+            underruns.mBitFields.mPartial, underruns.mBitFields.mEmpty);
+
+    size_t numtracks = mTracks.size();
+    size_t numactive = mActiveTracks.size();
+    fdprintf(fd, "  %d Tracks", numtracks);
+    size_t numactiveseen = 0;
+    if (numtracks) {
+        fdprintf(fd, " of which %d are active\n", numactive);
+        Track::appendDumpHeader(result);
+        for (size_t i = 0; i < numtracks; ++i) {
+            sp<Track> track = mTracks[i];
+            if (track != 0) {
+                bool active = mActiveTracks.indexOf(track) >= 0;
+                if (active) {
+                    numactiveseen++;
+                }
+                track->dump(buffer, SIZE, active);
+                result.append(buffer);
+            }
         }
+    } else {
+        result.append("\n");
     }
-
-    snprintf(buffer, SIZE, "Output thread %p active tracks\n", this);
-    result.append(buffer);
-    Track::appendDumpHeader(result);
-    for (size_t i = 0; i < mActiveTracks.size(); ++i) {
-        sp<Track> track = mActiveTracks[i].promote();
-        if (track != 0) {
-            track->dump(buffer, SIZE);
-            result.append(buffer);
+    if (numactiveseen != numactive) {
+        // some tracks in the active list were not in the tracks list
+        snprintf(buffer, SIZE, "  The following tracks are in the active list but"
+                " not in the track list\n");
+        result.append(buffer);
+        Track::appendDumpHeader(result);
+        for (size_t i = 0; i < numactive; ++i) {
+            sp<Track> track = mActiveTracks[i].promote();
+            if (track != 0 && mTracks.indexOf(track) < 0) {
+                track->dump(buffer, SIZE, true);
+                result.append(buffer);
+            }
         }
     }
+
     write(fd, result.string(), result.size());
 
-    // These values are "raw"; they will wrap around.  See prepareTracks_l() for a better way.
-    FastTrackUnderruns underruns = getFastTrackUnderruns(0);
-    fdprintf(fd, "Normal mixer raw underrun counters: partial=%u empty=%u\n",
-            underruns.mBitFields.mPartial, underruns.mBitFields.mEmpty);
 }
 
 void AudioFlinger::PlaybackThread::dumpInternals(int fd, const Vector<String16>& args)
 {
-    const size_t SIZE = 256;
-    char buffer[SIZE];
-    String8 result;
-
-    snprintf(buffer, SIZE, "\nOutput thread %p internals\n", this);
-    result.append(buffer);
-    snprintf(buffer, SIZE, "Normal frame count: %zu\n", mNormalFrameCount);
-    result.append(buffer);
-    snprintf(buffer, SIZE, "last write occurred (msecs): %llu\n",
-            ns2ms(systemTime() - mLastWriteTime));
-    result.append(buffer);
-    snprintf(buffer, SIZE, "total writes: %d\n", mNumWrites);
-    result.append(buffer);
-    snprintf(buffer, SIZE, "delayed writes: %d\n", mNumDelayedWrites);
-    result.append(buffer);
-    snprintf(buffer, SIZE, "blocked in write: %d\n", mInWrite);
-    result.append(buffer);
-    snprintf(buffer, SIZE, "suspend count: %d\n", mSuspended);
-    result.append(buffer);
-    snprintf(buffer, SIZE, "mix buffer : %p\n", mMixBuffer);
-    result.append(buffer);
-    write(fd, result.string(), result.size());
-    fdprintf(fd, "Fast track availMask=%#x\n", mFastTrackAvailMask);
+    fdprintf(fd, "\nOutput thread %p:\n", this);
+    fdprintf(fd, "  Normal frame count: %zu\n", mNormalFrameCount);
+    fdprintf(fd, "  Last write occurred (msecs): %llu\n", ns2ms(systemTime() - mLastWriteTime));
+    fdprintf(fd, "  Total writes: %d\n", mNumWrites);
+    fdprintf(fd, "  Delayed writes: %d\n", mNumDelayedWrites);
+    fdprintf(fd, "  Blocked in write: %s\n", mInWrite ? "yes" : "no");
+    fdprintf(fd, "  Suspend count: %d\n", mSuspended);
+    fdprintf(fd, "  Mix buffer : %p\n", mMixBuffer);
+    fdprintf(fd, "  Fast track availMask=%#x\n", mFastTrackAvailMask);
 
     dumpBase(fd, args);
 }
 
 // Thread virtuals
-status_t AudioFlinger::PlaybackThread::readyToRun()
-{
-    status_t status = initCheck();
-    if (status == NO_ERROR) {
-        ALOGI("AudioFlinger's thread %p ready to run", this);
-    } else {
-        ALOGE("No working audio driver found.");
-    }
-    return status;
-}
 
 void AudioFlinger::PlaybackThread::onFirstRef()
 {
@@ -1182,7 +1238,7 @@ sp<AudioFlinger::PlaybackThread::Track> AudioFlinger::PlaybackThread::createTrac
         uint32_t sampleRate,
         audio_format_t format,
         audio_channel_mask_t channelMask,
-        size_t frameCount,
+        size_t *pFrameCount,
         const sp<IMemory>& sharedBuffer,
         int sessionId,
         IAudioFlinger::track_flags_t *flags,
@@ -1190,6 +1246,7 @@ sp<AudioFlinger::PlaybackThread::Track> AudioFlinger::PlaybackThread::createTrac
         int uid,
         status_t *status)
 {
+    size_t frameCount = *pFrameCount;
     sp<Track> track;
     status_t lStatus;
 
@@ -1256,12 +1313,13 @@ sp<AudioFlinger::PlaybackThread::Track> AudioFlinger::PlaybackThread::createTrac
         }
       }
     }
+    *pFrameCount = frameCount;
 
     if (mType == DIRECT) {
         if ((format & AUDIO_FORMAT_MAIN_MASK) == AUDIO_FORMAT_PCM) {
             if (sampleRate != mSampleRate || format != mFormat || channelMask != mChannelMask) {
-                ALOGE("createTrack_l() Bad parameter: sampleRate %u format %d, channelMask 0x%08x "
-                        "for output %p with format %d",
+                ALOGE("createTrack_l() Bad parameter: sampleRate %u format %#x, channelMask 0x%08x "
+                        "for output %p with format %#x",
                         sampleRate, format, channelMask, mOutput, mFormat);
                 lStatus = BAD_VALUE;
                 goto Exit;
@@ -1269,16 +1327,16 @@ sp<AudioFlinger::PlaybackThread::Track> AudioFlinger::PlaybackThread::createTrac
         }
     } else if (mType == OFFLOAD) {
         if (sampleRate != mSampleRate || format != mFormat || channelMask != mChannelMask) {
-            ALOGE("createTrack_l() Bad parameter: sampleRate %d format %d, channelMask 0x%08x \""
-                    "for output %p with format %d",
+            ALOGE("createTrack_l() Bad parameter: sampleRate %d format %#x, channelMask 0x%08x \""
+                    "for output %p with format %#x",
                     sampleRate, format, channelMask, mOutput, mFormat);
             lStatus = BAD_VALUE;
             goto Exit;
         }
     } else {
         if ((format & AUDIO_FORMAT_MAIN_MASK) != AUDIO_FORMAT_PCM) {
-                ALOGE("createTrack_l() Bad parameter: format %d \""
-                        "for output %p with format %d",
+                ALOGE("createTrack_l() Bad parameter: format %#x \""
+                        "for output %p with format %#x",
                         format, mOutput, mFormat);
                 lStatus = BAD_VALUE;
                 goto Exit;
@@ -1324,8 +1382,13 @@ sp<AudioFlinger::PlaybackThread::Track> AudioFlinger::PlaybackThread::createTrac
             track = TimedTrack::create(this, client, streamType, sampleRate, format,
                     channelMask, frameCount, sharedBuffer, sessionId, uid);
         }
-        if (track == 0 || track->getCblk() == NULL || track->name() < 0) {
-            lStatus = NO_MEMORY;
+
+        // new Track always returns non-NULL,
+        // but TimedTrack::create() is a factory that could fail by returning NULL
+        lStatus = track != 0 ? track->initCheck() : (status_t) NO_MEMORY;
+        if (lStatus != NO_ERROR) {
+            ALOGE("createTrack_l() initCheck failed %d; no control block?", lStatus);
+            // track must be cleared from the caller as the caller has the AF lock
             goto Exit;
         }
 
@@ -1350,9 +1413,7 @@ sp<AudioFlinger::PlaybackThread::Track> AudioFlinger::PlaybackThread::createTrac
     lStatus = NO_ERROR;
 
 Exit:
-    if (status) {
-        *status = lStatus;
-    }
+    *status = lStatus;
     return track;
 }
 
@@ -1471,9 +1532,7 @@ status_t AudioFlinger::PlaybackThread::addTrack_l(const sp<Track>& track)
         status = NO_ERROR;
     }
 
-    ALOGV("signal playback thread");
-    broadcast_l();
-
+    onAddNewTrack_l();
     return status;
 }
 
@@ -1599,7 +1658,7 @@ void AudioFlinger::PlaybackThread::resetDraining(uint32_t sequence)
 
 // static
 int AudioFlinger::PlaybackThread::asyncCallback(stream_callback_event_t event,
-                                                void *param,
+                                                void *param __unused,
                                                 void *cookie)
 {
     AudioFlinger::PlaybackThread *me = (AudioFlinger::PlaybackThread *)cookie;
@@ -1633,14 +1692,15 @@ void AudioFlinger::PlaybackThread::readOutputParameters()
     mChannelCount = popcount(mChannelMask);
     mFormat = mOutput->stream->common.get_format(&mOutput->stream->common);
     if (!audio_is_valid_format(mFormat)) {
-        LOG_FATAL("HAL format %d not valid for output", mFormat);
+        LOG_FATAL("HAL format %#x not valid for output", mFormat);
     }
     if ((mType == MIXER || mType == DUPLICATING) && mFormat != AUDIO_FORMAT_PCM_16_BIT) {
-        LOG_FATAL("HAL format %d not supported for mixed output; must be AUDIO_FORMAT_PCM_16_BIT",
+        LOG_FATAL("HAL format %#x not supported for mixed output; must be AUDIO_FORMAT_PCM_16_BIT",
                 mFormat);
     }
     mFrameSize = audio_stream_frame_size(&mOutput->stream->common);
-    mFrameCount = mOutput->stream->common.get_buffer_size(&mOutput->stream->common) / mFrameSize;
+    mBufferSize = mOutput->stream->common.get_buffer_size(&mOutput->stream->common);
+    mFrameCount = mBufferSize / mFrameSize;
     if (mFrameCount & 15) {
         ALOGW("HAL output buffer size is %u frames but AudioMixer requires multiples of 16 frames",
                 mFrameCount);
@@ -1697,11 +1757,11 @@ void AudioFlinger::PlaybackThread::readOutputParameters()
     ALOGI("HAL output buffer size %u frames, normal mix buffer size %u frames", mFrameCount,
             mNormalFrameCount);
 
-    delete[] mAllocMixBuffer;
-    size_t align = (mFrameSize < sizeof(int16_t)) ? sizeof(int16_t) : mFrameSize;
-    mAllocMixBuffer = new int8_t[mNormalFrameCount * mFrameSize + align - 1];
-    mMixBuffer = (int16_t *) ((((size_t)mAllocMixBuffer + align - 1) / align) * align);
-    memset(mMixBuffer, 0, mNormalFrameCount * mFrameSize);
+    delete[] mMixBuffer;
+    size_t normalBufferSize = mNormalFrameCount * mFrameSize;
+    // For historical reasons mMixBuffer is int16_t[], but mFrameSize can be odd (such as 1)
+    mMixBuffer = new int16_t[(normalBufferSize + 1) >> 1];
+    memset(mMixBuffer, 0, normalBufferSize);
 
     // force reconfiguration of effect chains and engines to take new buffer size and audio
     // parameters into account
@@ -1839,7 +1899,7 @@ void AudioFlinger::PlaybackThread::threadLoop_removeTracks(
         const Vector< sp<Track> >& tracksToRemove)
 {
     size_t count = tracksToRemove.size();
-    if (count) {
+    if (count > 0) {
         for (size_t i = 0 ; i < count ; i++) {
             const sp<Track>& track = tracksToRemove.itemAt(i);
             if (!track->isOutputTrack()) {
@@ -1915,7 +1975,7 @@ ssize_t AudioFlinger::PlaybackThread::threadLoop_write()
     // otherwise use the HAL / AudioStreamOut directly
     } else {
         // Direct output and offload threads
-        size_t offset = (mCurrentWriteLength - mBytesRemaining) / sizeof(int16_t);
+        size_t offset = (mCurrentWriteLength - mBytesRemaining);
         if (mUseAsyncWrite) {
             ALOGW_IF(mWriteAckSequence & 1, "threadLoop_write(): out of sequence write request");
             mWriteAckSequence += 2;
@@ -1926,7 +1986,7 @@ ssize_t AudioFlinger::PlaybackThread::threadLoop_write()
         // FIXME We should have an implementation of timestamps for direct output threads.
         // They are used e.g for multichannel PCM playback over HDMI.
         bytesWritten = mOutput->stream->write(mOutput->stream,
-                                                   mMixBuffer + offset, mBytesRemaining);
+                                                   (char *)mMixBuffer + offset, mBytesRemaining);
         if (mUseAsyncWrite &&
                 ((bytesWritten < 0) || (bytesWritten == (ssize_t)mBytesRemaining))) {
             // do not wait for async callback in case of error of full write
@@ -2346,20 +2406,20 @@ bool AudioFlinger::PlaybackThread::threadLoop()
                         (mMixerStatus == MIXER_DRAIN_ALL)) {
                     threadLoop_drain();
                 }
-if (mType == MIXER) {
-                // write blocked detection
-                nsecs_t now = systemTime();
-                nsecs_t delta = now - mLastWriteTime;
-                if (!mStandby && delta > maxPeriod) {
-                    mNumDelayedWrites++;
-                    if ((now - lastWarning) > kWarningThrottleNs) {
-                        ATRACE_NAME("underrun");
-                        ALOGW("write blocked for %llu msecs, %d delayed writes, thread %p",
-                                ns2ms(delta), mNumDelayedWrites, this);
-                        lastWarning = now;
+                if (mType == MIXER) {
+                    // write blocked detection
+                    nsecs_t now = systemTime();
+                    nsecs_t delta = now - mLastWriteTime;
+                    if (!mStandby && delta > maxPeriod) {
+                        mNumDelayedWrites++;
+                        if ((now - lastWarning) > kWarningThrottleNs) {
+                            ATRACE_NAME("underrun");
+                            ALOGW("write blocked for %llu msecs, %d delayed writes, thread %p",
+                                    ns2ms(delta), mNumDelayedWrites, this);
+                            lastWarning = now;
+                        }
                     }
                 }
-}
 
             } else {
                 usleep(sleepTime);
@@ -2407,7 +2467,7 @@ if (mType == MIXER) {
 void AudioFlinger::PlaybackThread::removeTracks_l(const Vector< sp<Track> >& tracksToRemove)
 {
     size_t count = tracksToRemove.size();
-    if (count) {
+    if (count > 0) {
         for (size_t i=0 ; i<count ; i++) {
             const sp<Track>& track = tracksToRemove.itemAt(i);
             mActiveTracks.remove(track);
@@ -2711,12 +2771,6 @@ void AudioFlinger::MixerThread::threadLoop_standby()
     PlaybackThread::threadLoop_standby();
 }
 
-// Empty implementation for standard mixer
-// Overridden for offloaded playback
-void AudioFlinger::PlaybackThread::flushOutput_l()
-{
-}
-
 bool AudioFlinger::PlaybackThread::waitingAsyncCallback_l()
 {
     return false;
@@ -2748,6 +2802,12 @@ void AudioFlinger::PlaybackThread::threadLoop_standby()
     }
 }
 
+void AudioFlinger::PlaybackThread::onAddNewTrack_l()
+{
+    ALOGV("signal playback thread");
+    broadcast_l();
+}
+
 void AudioFlinger::MixerThread::threadLoop_mix()
 {
     // obtain the presentation timestamp of the next output buffer
@@ -2800,7 +2860,7 @@ void AudioFlinger::MixerThread::threadLoop_sleepTime()
             sleepTime = idleSleepTime;
         }
     } else if (mBytesWritten != 0 || (mMixerStatus == MIXER_TRACKS_ENABLED)) {
-        memset (mMixBuffer, 0, mixBufferSize);
+        memset(mMixBuffer, 0, mixBufferSize);
         sleepTime = 0;
         ALOGV_IF(mBytesWritten == 0 && (mMixerStatus == MIXER_TRACKS_ENABLED),
                 "anticipated start");
@@ -3025,12 +3085,14 @@ AudioFlinger::PlaybackThread::mixer_state AudioFlinger::MixerThread::prepareTrac
             // +1 for rounding and +1 for additional sample needed for interpolation
             desiredFrames = (mNormalFrameCount * sr) / mSampleRate + 1 + 1;
             // add frames already consumed but not yet released by the resampler
-            // because cblk->framesReady() will include these frames
+            // because mAudioTrackServerProxy->framesReady() will include these frames
             desiredFrames += mAudioMixer->getUnreleasedFrames(track->name());
+#if 0
             // the minimum track buffer size is normally twice the number of frames necessary
             // to fill one buffer and the resampler should not leave more than one buffer worth
             // of unreleased frames after each pass, but just in case...
             ALOG_ASSERT(desiredFrames <= cblk->frameCount_);
+#endif
         }
         uint32_t minFrames = 1;
         if ((track->sharedBuffer() == 0) && !track->isStopped() && !track->isPausing() &&
@@ -3356,6 +3418,7 @@ bool AudioFlinger::MixerThread::checkForNewParameters_l()
             if ((audio_format_t) value != AUDIO_FORMAT_PCM_16_BIT) {
                 status = BAD_VALUE;
             } else {
+                // no need to save value, since it's constant
                 reconfig = true;
             }
         }
@@ -3363,6 +3426,7 @@ bool AudioFlinger::MixerThread::checkForNewParameters_l()
             if ((audio_channel_mask_t) value != AUDIO_CHANNEL_OUT_STEREO) {
                 status = BAD_VALUE;
             } else {
+                // no need to save value, since it's constant
                 reconfig = true;
             }
         }
@@ -3466,9 +3530,7 @@ void AudioFlinger::MixerThread::dumpInternals(int fd, const Vector<String16>& ar
 
     PlaybackThread::dumpInternals(fd, args);
 
-    snprintf(buffer, SIZE, "AudioMixer tracks: %08x\n", mAudioMixer->trackNames());
-    result.append(buffer);
-    write(fd, result.string(), result.size());
+    fdprintf(fd, "  AudioMixer tracks: 0x%08x\n", mAudioMixer->trackNames());
 
     // Make a non-atomic copy of fast mixer dump state so it won't change underneath us
     const FastMixerDumpState copy(mFastMixerDumpState);
@@ -3722,14 +3784,14 @@ void AudioFlinger::DirectOutputThread::threadLoop_sleepTime()
 }
 
 // getTrackName_l() must be called with ThreadBase::mLock held
-int AudioFlinger::DirectOutputThread::getTrackName_l(audio_channel_mask_t channelMask,
-        int sessionId)
+int AudioFlinger::DirectOutputThread::getTrackName_l(audio_channel_mask_t channelMask __unused,
+        int sessionId __unused)
 {
     return 0;
 }
 
 // deleteTrackName_l() must be called with ThreadBase::mLock held
-void AudioFlinger::DirectOutputThread::deleteTrackName_l(int name)
+void AudioFlinger::DirectOutputThread::deleteTrackName_l(int name __unused)
 {
 }
 
@@ -3982,6 +4044,17 @@ AudioFlinger::PlaybackThread::mixer_state AudioFlinger::OffloadThread::prepareTr
         sp<Track> l = mLatestActiveTrack.promote();
         bool last = l.get() == track;
 
+        if (track->isInvalid()) {
+            ALOGW("An invalidated track shouldn't be in active list");
+            tracksToRemove->add(track);
+            continue;
+        }
+
+        if (track->mState == TrackBase::IDLE) {
+            ALOGW("An idle track shouldn't be in active list");
+            continue;
+        }
+
         if (track->isPausing()) {
             track->setPaused();
             if (last) {
@@ -4000,6 +4073,11 @@ AudioFlinger::PlaybackThread::mixer_state AudioFlinger::OffloadThread::prepareTr
                 mBytesRemaining = 0;    // stop writing
             }
             tracksToRemove->add(track);
+        } else if (track->isFlushPending()) {
+            track->flushAck();
+            if (last) {
+                mFlushPending = true;
+            }
         } else if (track->framesReady() && track->isReady() &&
                 !track->isPaused() && !track->isTerminated() && !track->isStopping_2()) {
             ALOGVV("OffloadThread: track %d s=%08x [OK]", track->name(), cblk->mServer);
@@ -4049,7 +4127,6 @@ AudioFlinger::PlaybackThread::mixer_state AudioFlinger::OffloadThread::prepareTr
                         // seek when resuming.
                         if (previousTrack->sessionId() != track->sessionId()) {
                             previousTrack->invalidate();
-                            mFlushPending = true;
                         }
                     }
                 }
@@ -4125,9 +4202,6 @@ AudioFlinger::PlaybackThread::mixer_state AudioFlinger::OffloadThread::prepareTr
     // if resume is received before pause is executed.
     if (!mStandby && (doHwPause || (mFlushPending && !mHwPaused && (count != 0)))) {
         mOutput->stream->pause(mOutput->stream);
-        if (!doHwPause) {
-            doHwResume = true;
-        }
     }
     if (mFlushPending) {
         flushHw_l();
@@ -4143,11 +4217,6 @@ AudioFlinger::PlaybackThread::mixer_state AudioFlinger::OffloadThread::prepareTr
     return mixerStatus;
 }
 
-void AudioFlinger::OffloadThread::flushOutput_l()
-{
-    mFlushPending = true;
-}
-
 // must be called with thread mutex locked
 bool AudioFlinger::OffloadThread::waitingAsyncCallback_l()
 {
@@ -4162,15 +4231,15 @@ bool AudioFlinger::OffloadThread::waitingAsyncCallback_l()
 // must be called with thread mutex locked
 bool AudioFlinger::OffloadThread::shouldStandby_l()
 {
-    bool TrackPaused = false;
+    bool trackPaused = false;
 
     // do not put the HAL in standby when paused. AwesomePlayer clear the offloaded AudioTrack
     // after a timeout and we will enter standby then.
     if (mTracks.size() > 0) {
-        TrackPaused = mTracks[mTracks.size() - 1]->isPaused();
+        trackPaused = mTracks[mTracks.size() - 1]->isPaused();
     }
 
-    return !mStandby && !TrackPaused;
+    return !mStandby && !trackPaused;
 }
 
 
@@ -4188,6 +4257,8 @@ void AudioFlinger::OffloadThread::flushHw_l()
     mBytesRemaining = 0;
     mPausedWriteLength = 0;
     mPausedBytesRemaining = 0;
+    mHwPaused = false;
+
     if (mUseAsyncWrite) {
         // discard any pending drain or write ack by incrementing sequence
         mWriteAckSequence = (mWriteAckSequence + 2) & ~1;
@@ -4198,6 +4269,18 @@ void AudioFlinger::OffloadThread::flushHw_l()
     }
 }
 
+void AudioFlinger::OffloadThread::onAddNewTrack_l()
+{
+    sp<Track> previousTrack = mPreviousTrack.promote();
+    sp<Track> latestTrack = mLatestActiveTrack.promote();
+
+    if (previousTrack != 0 && latestTrack != 0 &&
+        (previousTrack->sessionId() != latestTrack->sessionId())) {
+        mFlushPending = true;
+    }
+    PlaybackThread::onAddNewTrack_l();
+}
+
 // ----------------------------------------------------------------------------
 
 AudioFlinger::DuplicatingThread::DuplicatingThread(const sp<AudioFlinger>& audioFlinger,
@@ -4377,8 +4460,10 @@ AudioFlinger::RecordThread::RecordThread(const sp<AudioFlinger>& audioFlinger,
 #endif
                                          ) :
     ThreadBase(audioFlinger, id, outDevice, inDevice, RECORD),
-    mInput(input), mResampler(NULL), mRsmpOutBuffer(NULL), mRsmpInBuffer(NULL),
-    // mRsmpInIndex and mBufferSize set by readInputParameters()
+    mInput(input), mActiveTracksGen(0), mResampler(NULL), mRsmpOutBuffer(NULL), mRsmpInBuffer(NULL),
+    // mRsmpInFrames, mRsmpInFramesP2, mRsmpInUnrel, mRsmpInFront, and mRsmpInRear
+    //      are set by readInputParameters()
+    // mRsmpInIndex LEGACY
     mReqChannelCount(popcount(channelMask)),
     mReqSampleRate(sampleRate)
     // mBytesRead is only meaningful while active, and so is cleared in start()
@@ -4388,6 +4473,7 @@ AudioFlinger::RecordThread::RecordThread(const sp<AudioFlinger>& audioFlinger,
 #endif
 {
     snprintf(mName, kNameLength, "AudioIn_%X", id);
+    mNBLogWriter = audioFlinger->newWriter_l(kLogSize, mName);
 
     readInputParameters();
 }
@@ -4395,6 +4481,7 @@ AudioFlinger::RecordThread::RecordThread(const sp<AudioFlinger>& audioFlinger,
 
 AudioFlinger::RecordThread::~RecordThread()
 {
+    mAudioFlinger->unregisterWriter(mNBLogWriter);
     delete[] mRsmpInBuffer;
     delete mResampler;
     delete[] mRsmpOutBuffer;
@@ -4405,230 +4492,323 @@ void AudioFlinger::RecordThread::onFirstRef()
     run(mName, PRIORITY_URGENT_AUDIO);
 }
 
-status_t AudioFlinger::RecordThread::readyToRun()
-{
-    status_t status = initCheck();
-    ALOGW_IF(status != NO_ERROR,"RecordThread %p could not initialize", this);
-    return status;
-}
-
 bool AudioFlinger::RecordThread::threadLoop()
 {
-    AudioBufferProvider::Buffer buffer;
-    sp<RecordTrack> activeTrack;
-    Vector< sp<EffectChain> > effectChains;
-
     nsecs_t lastWarning = 0;
 
     inputStandBy();
-    {
-        Mutex::Autolock _l(mLock);
-        activeTrack = mActiveTrack;
-        acquireWakeLock_l(activeTrack != 0 ? activeTrack->uid() : -1);
-    }
 
     // used to verify we've read at least once before evaluating how many bytes were read
     bool readOnce = false;
 
+    // used to request a deferred sleep, to be executed later while mutex is unlocked
+    bool doSleep = false;
+
+reacquire_wakelock:
+    sp<RecordTrack> activeTrack;
+    int activeTracksGen;
+    {
+        Mutex::Autolock _l(mLock);
+        size_t size = mActiveTracks.size();
+        activeTracksGen = mActiveTracksGen;
+        if (size > 0) {
+            // FIXME an arbitrary choice
+            activeTrack = mActiveTracks[0];
+            acquireWakeLock_l(activeTrack->uid());
+            if (size > 1) {
+                SortedVector<int> tmp;
+                for (size_t i = 0; i < size; i++) {
+                    tmp.add(mActiveTracks[i]->uid());
+                }
+                updateWakeLockUids_l(tmp);
+            }
+        } else {
+            acquireWakeLock_l(-1);
+        }
+    }
+
     // start recording
-    while (!exitPending()) {
+    for (;;) {
+        TrackBase::track_state activeTrackState;
+        Vector< sp<EffectChain> > effectChains;
 
-        processConfigEvents();
+        // sleep with mutex unlocked
+        if (doSleep) {
+            doSleep = false;
+            usleep(kRecordThreadSleepUs);
+        }
 
         { // scope for mLock
             Mutex::Autolock _l(mLock);
-            checkForNewParameters_l();
-            if (mActiveTrack != 0 && activeTrack != mActiveTrack) {
-                SortedVector<int> tmp;
-                tmp.add(mActiveTrack->uid());
-                updateWakeLockUids_l(tmp);
-            }
-            activeTrack = mActiveTrack;
-            if (mActiveTrack == 0 && mConfigEvents.isEmpty()) {
-                standby();
 
-                if (exitPending()) {
-                    break;
-                }
+            processConfigEvents_l();
+            // return value 'reconfig' is currently unused
+            bool reconfig = checkForNewParameters_l();
 
+            // check exitPending here because checkForNewParameters_l() and
+            // checkForNewParameters_l() can temporarily release mLock
+            if (exitPending()) {
+                break;
+            }
+
+            // if no active track(s), then standby and release wakelock
+            size_t size = mActiveTracks.size();
+            if (size == 0) {
+                standbyIfNotAlreadyInStandby();
+                // exitPending() can't become true here
                 releaseWakeLock_l();
                 ALOGV("RecordThread: loop stopping");
                 // go to sleep
                 mWaitWorkCV.wait(mLock);
                 ALOGV("RecordThread: loop starting");
-                acquireWakeLock_l(mActiveTrack != 0 ? mActiveTrack->uid() : -1);
+                goto reacquire_wakelock;
+            }
+
+            if (mActiveTracksGen != activeTracksGen) {
+                activeTracksGen = mActiveTracksGen;
+                SortedVector<int> tmp;
+                for (size_t i = 0; i < size; i++) {
+                    tmp.add(mActiveTracks[i]->uid());
+                }
+                updateWakeLockUids_l(tmp);
+                // FIXME an arbitrary choice
+                activeTrack = mActiveTracks[0];
+            }
+
+            if (activeTrack->isTerminated()) {
+                removeTrack_l(activeTrack);
+                mActiveTracks.remove(activeTrack);
+                mActiveTracksGen++;
                 continue;
             }
-            if (mActiveTrack != 0) {
-                if (mActiveTrack->isTerminated()) {
-                    removeTrack_l(mActiveTrack);
-                    mActiveTrack.clear();
-                } else if (mActiveTrack->mState == TrackBase::PAUSING) {
-                    standby();
-                    mActiveTrack.clear();
+
+            activeTrackState = activeTrack->mState;
+            switch (activeTrackState) {
+            case TrackBase::PAUSING:
+                standbyIfNotAlreadyInStandby();
+                mActiveTracks.remove(activeTrack);
+                mActiveTracksGen++;
+                mStartStopCond.broadcast();
+                doSleep = true;
+                continue;
+
+            case TrackBase::RESUMING:
+                mStandby = false;
+                if (mReqChannelCount != activeTrack->channelCount()) {
+                    mActiveTracks.remove(activeTrack);
+                    mActiveTracksGen++;
                     mStartStopCond.broadcast();
-                } else if (mActiveTrack->mState == TrackBase::RESUMING) {
-                    if (mReqChannelCount != mActiveTrack->channelCount()) {
-                        mActiveTrack.clear();
-                        mStartStopCond.broadcast();
-                    } else if (readOnce) {
-                        // record start succeeds only if first read from audio input
-                        // succeeds
-                        if (mBytesRead >= 0) {
-                            mActiveTrack->mState = TrackBase::ACTIVE;
-                        } else {
-                            mActiveTrack.clear();
-                        }
-                        mStartStopCond.broadcast();
+                    continue;
+                }
+                if (readOnce) {
+                    mStartStopCond.broadcast();
+                    // record start succeeds only if first read from audio input succeeds
+                    if (mBytesRead < 0) {
+                        mActiveTracks.remove(activeTrack);
+                        mActiveTracksGen++;
+                        continue;
                     }
-                    mStandby = false;
+                    activeTrack->mState = TrackBase::ACTIVE;
                 }
+                break;
+
+            case TrackBase::ACTIVE:
+                break;
+
+            case TrackBase::IDLE:
+                doSleep = true;
+                continue;
+
+            default:
+                LOG_FATAL("Unexpected activeTrackState %d", activeTrackState);
             }
 
             lockEffectChains_l(effectChains);
         }
 
-        if (mActiveTrack != 0) {
-            if (mActiveTrack->mState != TrackBase::ACTIVE &&
-                mActiveTrack->mState != TrackBase::RESUMING) {
-                unlockEffectChains(effectChains);
-                usleep(kRecordThreadSleepUs);
-                continue;
-            }
-            for (size_t i = 0; i < effectChains.size(); i ++) {
-                effectChains[i]->process_l();
-            }
+        // thread mutex is now unlocked, mActiveTracks unknown, activeTrack != 0, kept, immutable
+        // activeTrack->mState unknown, activeTrackState immutable and is ACTIVE or RESUMING
 
-            buffer.frameCount = mFrameCount;
-            status_t status = mActiveTrack->getNextBuffer(&buffer);
-            if (status == NO_ERROR) {
-                readOnce = true;
-                size_t framesOut = buffer.frameCount;
-                if (mResampler == NULL) {
-                    // no resampling
-                    while (framesOut) {
-                        size_t framesIn = mFrameCount - mRsmpInIndex;
-                        if (framesIn) {
-                            int8_t *src = (int8_t *)mRsmpInBuffer + mRsmpInIndex * mFrameSize;
-                            int8_t *dst = buffer.i8 + (buffer.frameCount - framesOut) *
-                                    mActiveTrack->mFrameSize;
-                            if (framesIn > framesOut)
-                                framesIn = framesOut;
-                            mRsmpInIndex += framesIn;
-                            framesOut -= framesIn;
-                            if (mChannelCount == mReqChannelCount) {
-                                memcpy(dst, src, framesIn * mFrameSize);
-                            } else {
-                                if (mChannelCount == 1) {
-                                    upmix_to_stereo_i16_from_mono_i16((int16_t *)dst,
-                                            (int16_t *)src, framesIn);
-                                } else {
-                                    downmix_to_mono_i16_from_stereo_i16((int16_t *)dst,
-                                            (int16_t *)src, framesIn);
-                                }
-                            }
+        for (size_t i = 0; i < effectChains.size(); i ++) {
+            // thread mutex is not locked, but effect chain is locked
+            effectChains[i]->process_l();
+        }
+
+        AudioBufferProvider::Buffer buffer;
+        buffer.frameCount = mFrameCount;
+        status_t status = activeTrack->getNextBuffer(&buffer);
+        if (status == NO_ERROR) {
+            readOnce = true;
+            size_t framesOut = buffer.frameCount;
+            if (mResampler == NULL) {
+                // no resampling
+                while (framesOut) {
+                    size_t framesIn = mFrameCount - mRsmpInIndex;
+                    if (framesIn > 0) {
+                        int8_t *src = (int8_t *)mRsmpInBuffer + mRsmpInIndex * mFrameSize;
+                        int8_t *dst = buffer.i8 + (buffer.frameCount - framesOut) *
+                                activeTrack->mFrameSize;
+                        if (framesIn > framesOut) {
+                            framesIn = framesOut;
                         }
-                        if (framesOut && mFrameCount == mRsmpInIndex) {
-                            void *readInto;
-                            if (framesOut == mFrameCount && mChannelCount == mReqChannelCount) {
-                                readInto = buffer.raw;
-                                framesOut = 0;
+                        mRsmpInIndex += framesIn;
+                        framesOut -= framesIn;
+                        if (mChannelCount == mReqChannelCount) {
+                            memcpy(dst, src, framesIn * mFrameSize);
+                        } else {
+                            if (mChannelCount == 1) {
+                                upmix_to_stereo_i16_from_mono_i16((int16_t *)dst,
+                                        (int16_t *)src, framesIn);
                             } else {
-                                readInto = mRsmpInBuffer;
-                                mRsmpInIndex = 0;
+                                downmix_to_mono_i16_from_stereo_i16((int16_t *)dst,
+                                        (int16_t *)src, framesIn);
                             }
-                            mBytesRead = mInput->stream->read(mInput->stream, readInto,
-                                    mBufferSize);
-                            if (mBytesRead <= 0) {
-                                if ((mBytesRead < 0) && (mActiveTrack->mState == TrackBase::ACTIVE))
-                                {
-                                    ALOGE("Error reading audio input");
-                                    // Force input into standby so that it tries to
-                                    // recover at next read attempt
-                                    inputStandBy();
-                                    usleep(kRecordThreadSleepUs);
-                                }
-                                mRsmpInIndex = mFrameCount;
-                                framesOut = 0;
-                                buffer.frameCount = 0;
+                        }
+                    }
+                    if (framesOut > 0 && mFrameCount == mRsmpInIndex) {
+                        void *readInto;
+                        if (framesOut == mFrameCount && mChannelCount == mReqChannelCount) {
+                            readInto = buffer.raw;
+                            framesOut = 0;
+                        } else {
+                            readInto = mRsmpInBuffer;
+                            mRsmpInIndex = 0;
+                        }
+                        mBytesRead = mInput->stream->read(mInput->stream, readInto, mBufferSize);
+                        if (mBytesRead <= 0) {
+                            // TODO: verify that it's benign to use a stale track state
+                            if ((mBytesRead < 0) && (activeTrackState == TrackBase::ACTIVE))
+                            {
+                                ALOGE("Error reading audio input");
+                                // Force input into standby so that it tries to
+                                // recover at next read attempt
+                                inputStandBy();
+                                doSleep = true;
                             }
+                            mRsmpInIndex = mFrameCount;
+                            framesOut = 0;
+                            buffer.frameCount = 0;
+                        }
 #ifdef TEE_SINK
-                            else if (mTeeSink != 0) {
-                                (void) mTeeSink->write(readInto,
-                                        mBytesRead >> Format_frameBitShift(mTeeSink->format()));
-                            }
-#endif
+                        else if (mTeeSink != 0) {
+                            (void) mTeeSink->write(readInto,
+                                    mBytesRead >> Format_frameBitShift(mTeeSink->format()));
                         }
+#endif
                     }
-                } else {
-                    // resampling
-
-                    // resampler accumulates, but we only have one source track
-                    memset(mRsmpOutBuffer, 0, framesOut * FCC_2 * sizeof(int32_t));
-                    // alter output frame count as if we were expecting stereo samples
-                    if (mChannelCount == 1 && mReqChannelCount == 1) {
-                        framesOut >>= 1;
+                }
+            } else {
+                // resampling
+
+                // avoid busy-waiting if client doesn't keep up
+                bool madeProgress = false;
+
+                // keep mRsmpInBuffer full so resampler always has sufficient input
+                for (;;) {
+                    int32_t rear = mRsmpInRear;
+                    ssize_t filled = rear - mRsmpInFront;
+                    ALOG_ASSERT(0 <= filled && (size_t) filled <= mRsmpInFramesP2);
+                    // exit once there is enough data in buffer for resampler
+                    if ((size_t) filled >= mRsmpInFrames) {
+                        break;
                     }
-                    mResampler->resample(mRsmpOutBuffer, framesOut,
-                            this /* AudioBufferProvider* */);
-                    // ditherAndClamp() works as long as all buffers returned by
-                    // mActiveTrack->getNextBuffer() are 32 bit aligned which should be always true.
-                    if (mChannelCount == 2 && mReqChannelCount == 1) {
-                        // temporarily type pun mRsmpOutBuffer from Q19.12 to int16_t
-                        ditherAndClamp(mRsmpOutBuffer, mRsmpOutBuffer, framesOut);
-                        // the resampler always outputs stereo samples:
-                        // do post stereo to mono conversion
-                        downmix_to_mono_i16_from_stereo_i16(buffer.i16, (int16_t *)mRsmpOutBuffer,
-                                framesOut);
-                    } else {
-                        ditherAndClamp((int32_t *)buffer.raw, mRsmpOutBuffer, framesOut);
+                    size_t avail = mRsmpInFramesP2 - filled;
+                    // Only try to read full HAL buffers.
+                    // But if the HAL read returns a partial buffer, use it.
+                    if (avail < mFrameCount) {
+                        ALOGE("insufficient space to read: avail %d < mFrameCount %d",
+                                avail, mFrameCount);
+                        break;
+                    }
+                    // If 'avail' is non-contiguous, first read past the nominal end of buffer, then
+                    // copy to the right place.  Permitted because mRsmpInBuffer was over-allocated.
+                    rear &= mRsmpInFramesP2 - 1;
+                    mBytesRead = mInput->stream->read(mInput->stream,
+                            &mRsmpInBuffer[rear * mChannelCount], mBufferSize);
+                    if (mBytesRead <= 0) {
+                        ALOGE("read failed: mBytesRead=%d < %u", mBytesRead, mBufferSize);
+                        break;
                     }
-                    // now done with mRsmpOutBuffer
+                    ALOG_ASSERT((size_t) mBytesRead <= mBufferSize);
+                    size_t framesRead = mBytesRead / mFrameSize;
+                    ALOG_ASSERT(framesRead > 0);
+                    madeProgress = true;
+                    // If 'avail' was non-contiguous, we now correct for reading past end of buffer.
+                    size_t part1 = mRsmpInFramesP2 - rear;
+                    if (framesRead > part1) {
+                        memcpy(mRsmpInBuffer, &mRsmpInBuffer[mRsmpInFramesP2 * mChannelCount],
+                                (framesRead - part1) * mFrameSize);
+                    }
+                    mRsmpInRear += framesRead;
+                }
 
+                if (!madeProgress) {
+                    ALOGV("Did not make progress");
+                    usleep(((mFrameCount * 1000) / mSampleRate) * 1000);
                 }
-                if (mFramestoDrop == 0) {
-                    mActiveTrack->releaseBuffer(&buffer);
+
+                // resampler accumulates, but we only have one source track
+                memset(mRsmpOutBuffer, 0, framesOut * FCC_2 * sizeof(int32_t));
+                mResampler->resample(mRsmpOutBuffer, framesOut,
+                        this /* AudioBufferProvider* */);
+                // ditherAndClamp() works as long as all buffers returned by
+                // activeTrack->getNextBuffer() are 32 bit aligned which should be always true.
+                if (mReqChannelCount == 1) {
+                    // temporarily type pun mRsmpOutBuffer from Q19.12 to int16_t
+                    ditherAndClamp(mRsmpOutBuffer, mRsmpOutBuffer, framesOut);
+                    // the resampler always outputs stereo samples:
+                    // do post stereo to mono conversion
+                    downmix_to_mono_i16_from_stereo_i16(buffer.i16, (int16_t *)mRsmpOutBuffer,
+                            framesOut);
                 } else {
-                    if (mFramestoDrop > 0) {
-                        mFramestoDrop -= buffer.frameCount;
-                        if (mFramestoDrop <= 0) {
-                            clearSyncStartEvent();
-                        }
-                    } else {
-                        mFramestoDrop += buffer.frameCount;
-                        if (mFramestoDrop >= 0 || mSyncStartEvent == 0 ||
-                                mSyncStartEvent->isCancelled()) {
-                            ALOGW("Synced record %s, session %d, trigger session %d",
-                                  (mFramestoDrop >= 0) ? "timed out" : "cancelled",
-                                  mActiveTrack->sessionId(),
-                                  (mSyncStartEvent != 0) ? mSyncStartEvent->triggerSession() : 0);
-                            clearSyncStartEvent();
-                        }
-                    }
+                    ditherAndClamp((int32_t *)buffer.raw, mRsmpOutBuffer, framesOut);
                 }
-                mActiveTrack->clearOverflow();
+                // now done with mRsmpOutBuffer
+
             }
-            // client isn't retrieving buffers fast enough
-            else {
-                if (!mActiveTrack->setOverflow()) {
-                    nsecs_t now = systemTime();
-                    if ((now - lastWarning) > kWarningThrottleNs) {
-                        ALOGW("RecordThread: buffer overflow");
-                        lastWarning = now;
+            if (mFramestoDrop == 0) {
+                activeTrack->releaseBuffer(&buffer);
+            } else {
+                if (mFramestoDrop > 0) {
+                    mFramestoDrop -= buffer.frameCount;
+                    if (mFramestoDrop <= 0) {
+                        clearSyncStartEvent();
+                    }
+                } else {
+                    mFramestoDrop += buffer.frameCount;
+                    if (mFramestoDrop >= 0 || mSyncStartEvent == 0 ||
+                            mSyncStartEvent->isCancelled()) {
+                        ALOGW("Synced record %s, session %d, trigger session %d",
+                              (mFramestoDrop >= 0) ? "timed out" : "cancelled",
+                              activeTrack->sessionId(),
+                              (mSyncStartEvent != 0) ? mSyncStartEvent->triggerSession() : 0);
+                        clearSyncStartEvent();
                     }
                 }
-                // Release the processor for a while before asking for a new buffer.
-                // This will give the application more chance to read from the buffer and
-                // clear the overflow.
-                usleep(kRecordThreadSleepUs);
             }
+            activeTrack->clearOverflow();
+        }
+        // client isn't retrieving buffers fast enough
+        else {
+            if (!activeTrack->setOverflow()) {
+                nsecs_t now = systemTime();
+                if ((now - lastWarning) > kWarningThrottleNs) {
+                    ALOGW("RecordThread: buffer overflow");
+                    lastWarning = now;
+                }
+            }
+            // Release the processor for a while before asking for a new buffer.
+            // This will give the application more chance to read from the buffer and
+            // clear the overflow.
+            doSleep = true;
         }
+
         // enable changes in effect chain
         unlockEffectChains(effectChains);
-        effectChains.clear();
+        // effectChains doesn't need to be cleared, since it is cleared by destructor at scope end
     }
 
-    standby();
+    standbyIfNotAlreadyInStandby();
 
     {
         Mutex::Autolock _l(mLock);
@@ -4636,7 +4816,8 @@ bool AudioFlinger::RecordThread::threadLoop()
             sp<RecordTrack> track = mTracks[i];
             track->invalidate();
         }
-        mActiveTrack.clear();
+        mActiveTracks.clear();
+        mActiveTracksGen++;
         mStartStopCond.broadcast();
     }
 
@@ -4646,7 +4827,7 @@ bool AudioFlinger::RecordThread::threadLoop()
     return false;
 }
 
-void AudioFlinger::RecordThread::standby()
+void AudioFlinger::RecordThread::standbyIfNotAlreadyInStandby()
 {
     if (!mStandby) {
         inputStandBy();
@@ -4659,18 +4840,19 @@ void AudioFlinger::RecordThread::inputStandBy()
     mInput->stream->common.standby(&mInput->stream->common);
 }
 
-sp<AudioFlinger::RecordThread::RecordTrack>  AudioFlinger::RecordThread::createRecordTrack_l(
+sp<AudioFlinger::RecordThread::RecordTrack> AudioFlinger::RecordThread::createRecordTrack_l(
         const sp<AudioFlinger::Client>& client,
         uint32_t sampleRate,
         audio_format_t format,
         audio_channel_mask_t channelMask,
-        size_t frameCount,
+        size_t *pFrameCount,
         int sessionId,
         int uid,
         IAudioFlinger::track_flags_t *flags,
         pid_t tid,
         status_t *status)
 {
+    size_t frameCount = *pFrameCount;
     sp<RecordTrack> track;
     status_t lStatus;
 
@@ -4679,6 +4861,7 @@ sp<AudioFlinger::RecordThread::RecordTrack>  AudioFlinger::RecordThread::createR
         ALOGE("createRecordTrack_l() audio driver not initialized");
         goto Exit;
     }
+
     // client expresses a preference for FAST, but we get the final say
     if (*flags & IAudioFlinger::TRACK_FAST) {
       if (
@@ -4729,6 +4912,7 @@ sp<AudioFlinger::RecordThread::RecordTrack>  AudioFlinger::RecordThread::createR
         }
       }
     }
+    *pFrameCount = frameCount;
 
     // FIXME use flags and tid similar to createTrack_l()
 
@@ -4738,10 +4922,10 @@ sp<AudioFlinger::RecordThread::RecordTrack>  AudioFlinger::RecordThread::createR
         track = new RecordTrack(this, client, sampleRate,
                       format, channelMask, frameCount, sessionId, uid);
 
-        if (track->getCblk() == 0) {
-            ALOGE("createRecordTrack_l() no control block");
-            lStatus = NO_MEMORY;
-            track.clear();
+        lStatus = track->initCheck();
+        if (lStatus != NO_ERROR) {
+            ALOGE("createRecordTrack_l() initCheck failed %d; no control block?", lStatus);
+            // track must be cleared from the caller as the caller has the AF lock
             goto Exit;
         }
         mTracks.add(track);
@@ -4762,9 +4946,7 @@ sp<AudioFlinger::RecordThread::RecordTrack>  AudioFlinger::RecordThread::createR
     lStatus = NO_ERROR;
 
 Exit:
-    if (status) {
-        *status = lStatus;
-    }
+    *status = lStatus;
     return track;
 }
 
@@ -4795,43 +4977,57 @@ status_t AudioFlinger::RecordThread::start(RecordThread::RecordTrack* recordTrac
     }
 
     {
+        // This section is a rendezvous between binder thread executing start() and RecordThread
         AutoMutex lock(mLock);
-        if (mActiveTrack != 0) {
-            if (recordTrack != mActiveTrack.get()) {
+        if (mActiveTracks.size() > 0) {
+            // FIXME does not work for multiple active tracks
+            if (mActiveTracks.indexOf(recordTrack) != 0) {
                 status = -EBUSY;
-            } else if (mActiveTrack->mState == TrackBase::PAUSING) {
-                mActiveTrack->mState = TrackBase::ACTIVE;
+            } else if (recordTrack->mState == TrackBase::PAUSING) {
+                recordTrack->mState = TrackBase::ACTIVE;
             }
             return status;
         }
 
+        // FIXME why? already set in constructor, 'STARTING_1' would be more accurate
         recordTrack->mState = TrackBase::IDLE;
-        mActiveTrack = recordTrack;
+        mActiveTracks.add(recordTrack);
+        mActiveTracksGen++;
         mLock.unlock();
         status_t status = AudioSystem::startInput(mId);
         mLock.lock();
+        // FIXME should verify that mActiveTrack is still == recordTrack
         if (status != NO_ERROR) {
-            mActiveTrack.clear();
+            mActiveTracks.remove(recordTrack);
+            mActiveTracksGen++;
             clearSyncStartEvent();
             return status;
         }
+        // FIXME LEGACY
         mRsmpInIndex = mFrameCount;
+        mRsmpInFront = 0;
+        mRsmpInRear = 0;
+        mRsmpInUnrel = 0;
         mBytesRead = 0;
         if (mResampler != NULL) {
             mResampler->reset();
         }
-        mActiveTrack->mState = TrackBase::RESUMING;
+        // FIXME hijacking a playback track state name which was intended for start after pause;
+        //       here 'STARTING_2' would be more accurate
+        recordTrack->mState = TrackBase::RESUMING;
         // signal thread to start
         ALOGV("Signal record thread");
         mWaitWorkCV.broadcast();
         // do not wait for mStartStopCond if exiting
         if (exitPending()) {
-            mActiveTrack.clear();
+            mActiveTracks.remove(recordTrack);
+            mActiveTracksGen++;
             status = INVALID_OPERATION;
             goto startError;
         }
+        // FIXME incorrect usage of wait: no explicit predicate or loop
         mStartStopCond.wait(mLock);
-        if (mActiveTrack == 0) {
+        if (mActiveTracks.indexOf(recordTrack) < 0) {
             ALOGV("Record failed to start");
             status = BAD_VALUE;
             goto startError;
@@ -4877,29 +5073,31 @@ void AudioFlinger::RecordThread::handleSyncStartEvent(const sp<SyncEvent>& event
 bool AudioFlinger::RecordThread::stop(RecordThread::RecordTrack* recordTrack) {
     ALOGV("RecordThread::stop");
     AutoMutex _l(mLock);
-    if (recordTrack != mActiveTrack.get() || recordTrack->mState == TrackBase::PAUSING) {
+    if (mActiveTracks.indexOf(recordTrack) != 0 || recordTrack->mState == TrackBase::PAUSING) {
         return false;
     }
+    // note that threadLoop may still be processing the track at this point [without lock]
     recordTrack->mState = TrackBase::PAUSING;
     // do not wait for mStartStopCond if exiting
     if (exitPending()) {
         return true;
     }
+    // FIXME incorrect usage of wait: no explicit predicate or loop
     mStartStopCond.wait(mLock);
-    // if we have been restarted, recordTrack == mActiveTrack.get() here
-    if (exitPending() || recordTrack != mActiveTrack.get()) {
+    // if we have been restarted, recordTrack is in mActiveTracks here
+    if (exitPending() || mActiveTracks.indexOf(recordTrack) != 0) {
         ALOGV("Record stopped OK");
         return true;
     }
     return false;
 }
 
-bool AudioFlinger::RecordThread::isValidSyncEvent(const sp<SyncEvent>& event) const
+bool AudioFlinger::RecordThread::isValidSyncEvent(const sp<SyncEvent>& event __unused) const
 {
     return false;
 }
 
-status_t AudioFlinger::RecordThread::setSyncEvent(const sp<SyncEvent>& event)
+status_t AudioFlinger::RecordThread::setSyncEvent(const sp<SyncEvent>& event __unused)
 {
 #if 0   // This branch is currently dead code, but is preserved in case it will be needed in future
     if (!isValidSyncEvent(event)) {
@@ -4930,7 +5128,7 @@ void AudioFlinger::RecordThread::destroyTrack_l(const sp<RecordTrack>& track)
     track->terminate();
     track->mState = TrackBase::STOPPED;
     // active tracks are removed by threadLoop()
-    if (mActiveTrack != track) {
+    if (mActiveTracks.indexOf(track) < 0) {
         removeTrack_l(track);
     }
 }
@@ -4950,104 +5148,110 @@ void AudioFlinger::RecordThread::dump(int fd, const Vector<String16>& args)
 
 void AudioFlinger::RecordThread::dumpInternals(int fd, const Vector<String16>& args)
 {
-    const size_t SIZE = 256;
-    char buffer[SIZE];
-    String8 result;
+    fdprintf(fd, "\nInput thread %p:\n", this);
 
-    snprintf(buffer, SIZE, "\nInput thread %p internals\n", this);
-    result.append(buffer);
-
-    if (mActiveTrack != 0) {
-        snprintf(buffer, SIZE, "In index: %zu\n", mRsmpInIndex);
-        result.append(buffer);
-        snprintf(buffer, SIZE, "Buffer size: %zu bytes\n", mBufferSize);
-        result.append(buffer);
-        snprintf(buffer, SIZE, "Resampling: %d\n", (mResampler != NULL));
-        result.append(buffer);
-        snprintf(buffer, SIZE, "Out channel count: %u\n", mReqChannelCount);
-        result.append(buffer);
-        snprintf(buffer, SIZE, "Out sample rate: %u\n", mReqSampleRate);
-        result.append(buffer);
+    if (mActiveTracks.size() > 0) {
+        fdprintf(fd, "  In index: %zu\n", mRsmpInIndex);
+        fdprintf(fd, "  Buffer size: %zu bytes\n", mBufferSize);
+        fdprintf(fd, "  Resampling: %d\n", (mResampler != NULL));
+        fdprintf(fd, "  Out channel count: %u\n", mReqChannelCount);
+        fdprintf(fd, "  Out sample rate: %u\n", mReqSampleRate);
     } else {
-        result.append("No active record client\n");
+        fdprintf(fd, "  No active record client\n");
     }
 
-    write(fd, result.string(), result.size());
-
     dumpBase(fd, args);
 }
 
-void AudioFlinger::RecordThread::dumpTracks(int fd, const Vector<String16>& args)
+void AudioFlinger::RecordThread::dumpTracks(int fd, const Vector<String16>& args __unused)
 {
     const size_t SIZE = 256;
     char buffer[SIZE];
     String8 result;
 
-    snprintf(buffer, SIZE, "Input thread %p tracks\n", this);
-    result.append(buffer);
-    RecordTrack::appendDumpHeader(result);
-    for (size_t i = 0; i < mTracks.size(); ++i) {
-        sp<RecordTrack> track = mTracks[i];
-        if (track != 0) {
-            track->dump(buffer, SIZE);
-            result.append(buffer);
+    size_t numtracks = mTracks.size();
+    size_t numactive = mActiveTracks.size();
+    size_t numactiveseen = 0;
+    fdprintf(fd, "  %d Tracks", numtracks);
+    if (numtracks) {
+        fdprintf(fd, " of which %d are active\n", numactive);
+        RecordTrack::appendDumpHeader(result);
+        for (size_t i = 0; i < numtracks ; ++i) {
+            sp<RecordTrack> track = mTracks[i];
+            if (track != 0) {
+                bool active = mActiveTracks.indexOf(track) >= 0;
+                if (active) {
+                    numactiveseen++;
+                }
+                track->dump(buffer, SIZE, active);
+                result.append(buffer);
+            }
         }
+    } else {
+        fdprintf(fd, "\n");
     }
 
-    if (mActiveTrack != 0) {
-        snprintf(buffer, SIZE, "\nInput thread %p active tracks\n", this);
+    if (numactiveseen != numactive) {
+        snprintf(buffer, SIZE, "  The following tracks are in the active list but"
+                " not in the track list\n");
         result.append(buffer);
         RecordTrack::appendDumpHeader(result);
-        mActiveTrack->dump(buffer, SIZE);
-        result.append(buffer);
+        for (size_t i = 0; i < numactive; ++i) {
+            sp<RecordTrack> track = mActiveTracks[i];
+            if (mTracks.indexOf(track) < 0) {
+                track->dump(buffer, SIZE, true);
+                result.append(buffer);
+            }
+        }
 
     }
     write(fd, result.string(), result.size());
 }
 
 // AudioBufferProvider interface
-status_t AudioFlinger::RecordThread::getNextBuffer(AudioBufferProvider::Buffer* buffer, int64_t pts)
-{
-    size_t framesReq = buffer->frameCount;
-    size_t framesReady = mFrameCount - mRsmpInIndex;
-    int channelCount;
-
-    if (framesReady == 0) {
-        mBytesRead = mInput->stream->read(mInput->stream, mRsmpInBuffer, mBufferSize);
-        if (mBytesRead <= 0) {
-            if ((mBytesRead < 0) && (mActiveTrack->mState == TrackBase::ACTIVE)) {
-                ALOGE("RecordThread::getNextBuffer() Error reading audio input");
-                // Force input into standby so that it tries to
-                // recover at next read attempt
-                inputStandBy();
-                usleep(kRecordThreadSleepUs);
-            }
-            buffer->raw = NULL;
-            buffer->frameCount = 0;
-            return NOT_ENOUGH_DATA;
-        }
-        mRsmpInIndex = 0;
-        framesReady = mFrameCount;
-    }
-
-    if (framesReq > framesReady) {
-        framesReq = framesReady;
-    }
-
-    if (mChannelCount == 1 && mReqChannelCount == 2) {
-        channelCount = 1;
-    } else {
-        channelCount = 2;
-    }
-    buffer->raw = mRsmpInBuffer + mRsmpInIndex * channelCount;
-    buffer->frameCount = framesReq;
+status_t AudioFlinger::RecordThread::getNextBuffer(AudioBufferProvider::Buffer* buffer, int64_t pts __unused)
+{
+    int32_t rear = mRsmpInRear;
+    int32_t front = mRsmpInFront;
+    ssize_t filled = rear - front;
+    ALOG_ASSERT(0 <= filled && (size_t) filled <= mRsmpInFramesP2);
+    // 'filled' may be non-contiguous, so return only the first contiguous chunk
+    front &= mRsmpInFramesP2 - 1;
+    size_t part1 = mRsmpInFramesP2 - front;
+    if (part1 > (size_t) filled) {
+        part1 = filled;
+    }
+    size_t ask = buffer->frameCount;
+    ALOG_ASSERT(ask > 0);
+    if (part1 > ask) {
+        part1 = ask;
+    }
+    if (part1 == 0) {
+        // Higher-level should keep mRsmpInBuffer full, and not call resampler if empty
+        ALOGE("RecordThread::getNextBuffer() starved");
+        buffer->raw = NULL;
+        buffer->frameCount = 0;
+        mRsmpInUnrel = 0;
+        return NOT_ENOUGH_DATA;
+    }
+
+    buffer->raw = mRsmpInBuffer + front * mChannelCount;
+    buffer->frameCount = part1;
+    mRsmpInUnrel = part1;
     return NO_ERROR;
 }
 
 // AudioBufferProvider interface
 void AudioFlinger::RecordThread::releaseBuffer(AudioBufferProvider::Buffer* buffer)
 {
-    mRsmpInIndex += buffer->frameCount;
+    size_t stepCount = buffer->frameCount;
+    if (stepCount == 0) {
+        return;
+    }
+    ALOG_ASSERT(stepCount <= mRsmpInUnrel);
+    mRsmpInUnrel -= stepCount;
+    mRsmpInFront += stepCount;
+    buffer->raw = NULL;
     buffer->frameCount = 0;
 }
 
@@ -5062,7 +5266,7 @@ bool AudioFlinger::RecordThread::checkForNewParameters_l()
         int value;
         audio_format_t reqFormat = mFormat;
         uint32_t reqSamplingRate = mReqSampleRate;
-        uint32_t reqChannelCount = mReqChannelCount;
+        audio_channel_mask_t reqChannelMask = audio_channel_in_mask_from_count(mReqChannelCount);
 
         if (param.getInt(String8(AudioParameter::keySamplingRate), value) == NO_ERROR) {
             reqSamplingRate = value;
@@ -5077,14 +5281,19 @@ bool AudioFlinger::RecordThread::checkForNewParameters_l()
             }
         }
         if (param.getInt(String8(AudioParameter::keyChannels), value) == NO_ERROR) {
-            reqChannelCount = popcount(value);
-            reconfig = true;
+            audio_channel_mask_t mask = (audio_channel_mask_t) value;
+            if (mask != AUDIO_CHANNEL_IN_MONO && mask != AUDIO_CHANNEL_IN_STEREO) {
+                status = BAD_VALUE;
+            } else {
+                reqChannelMask = mask;
+                reconfig = true;
+            }
         }
         if (param.getInt(String8(AudioParameter::keyFrameCount), value) == NO_ERROR) {
             // do not accept frame count changes if tracks are open as the track buffer
             // size depends on frame count and correct behavior would not be guaranteed
             // if frame count is changed after track creation
-            if (mActiveTrack != 0) {
+            if (mActiveTracks.size() > 0) {
                 status = INVALID_OPERATION;
             } else {
                 reconfig = true;
@@ -5127,6 +5336,7 @@ bool AudioFlinger::RecordThread::checkForNewParameters_l()
             }
             mAudioSource = (audio_source_t)value;
         }
+
         if (status == NO_ERROR) {
             status = mInput->stream->common.set_parameters(&mInput->stream->common,
                     keyValuePair.string());
@@ -5143,7 +5353,8 @@ bool AudioFlinger::RecordThread::checkForNewParameters_l()
                             <= (2 * reqSamplingRate)) &&
                     popcount(mInput->stream->common.get_channels(&mInput->stream->common))
                             <= FCC_2 &&
-                    (reqChannelCount <= FCC_2)) {
+                    (reqChannelMask == AUDIO_CHANNEL_IN_MONO ||
+                            reqChannelMask == AUDIO_CHANNEL_IN_STEREO)) {
                     status = NO_ERROR;
                 }
                 if (status == NO_ERROR) {
@@ -5177,9 +5388,9 @@ String8 AudioFlinger::RecordThread::getParameters(const String8& keys)
     return out_s8;
 }
 
-void AudioFlinger::RecordThread::audioConfigChanged_l(int event, int param) {
+void AudioFlinger::RecordThread::audioConfigChanged_l(int event, int param __unused) {
     AudioSystem::OutputDescriptor desc;
-    void *param2 = NULL;
+    const void *param2 = NULL;
 
     switch (event) {
     case AudioSystem::INPUT_OPENED:
@@ -5213,39 +5424,32 @@ void AudioFlinger::RecordThread::readInputParameters()
     mChannelCount = popcount(mChannelMask);
     mFormat = mInput->stream->common.get_format(&mInput->stream->common);
     if (mFormat != AUDIO_FORMAT_PCM_16_BIT) {
-        ALOGE("HAL format %d not supported; must be AUDIO_FORMAT_PCM_16_BIT", mFormat);
+        ALOGE("HAL format %#x not supported; must be AUDIO_FORMAT_PCM_16_BIT", mFormat);
     }
     mFrameSize = audio_stream_frame_size(&mInput->stream->common);
     mBufferSize = mInput->stream->common.get_buffer_size(&mInput->stream->common);
     mFrameCount = mBufferSize / mFrameSize;
-    mRsmpInBuffer = new int16_t[mFrameCount * mChannelCount];
-
-    if (mSampleRate != mReqSampleRate && mChannelCount <= FCC_2 && mReqChannelCount <= FCC_2)
-    {
-        int channelCount;
-        // optimization: if mono to mono, use the resampler in stereo to stereo mode to avoid
-        // stereo to mono post process as the resampler always outputs stereo.
-        if (mChannelCount == 1 && mReqChannelCount == 2) {
-            channelCount = 1;
-        } else {
-            channelCount = 2;
-        }
-        mResampler = AudioResampler::create(16, channelCount, mReqSampleRate);
+    // With 3 HAL buffers, we can guarantee ability to down-sample the input by ratio of 2:1 to
+    // 1 full output buffer, regardless of the alignment of the available input.
+    mRsmpInFrames = mFrameCount * 3;
+    mRsmpInFramesP2 = roundup(mRsmpInFrames);
+    // Over-allocate beyond mRsmpInFramesP2 to permit a HAL read past end of buffer
+    mRsmpInBuffer = new int16_t[(mRsmpInFramesP2 + mFrameCount - 1) * mChannelCount];
+    mRsmpInFront = 0;
+    mRsmpInRear = 0;
+    mRsmpInUnrel = 0;
+
+    if (mSampleRate != mReqSampleRate && mChannelCount <= FCC_2 && mReqChannelCount <= FCC_2) {
+        mResampler = AudioResampler::create(16, (int) mChannelCount, mReqSampleRate);
         mResampler->setSampleRate(mSampleRate);
         mResampler->setVolume(AudioMixer::UNITY_GAIN, AudioMixer::UNITY_GAIN);
+        // resampler always outputs stereo
         mRsmpOutBuffer = new int32_t[mFrameCount * FCC_2];
-
-        // optmization: if mono to mono, alter input frame count as if we were inputing
-        // stereo samples
-        if (mChannelCount == 1 && mReqChannelCount == 1) {
-            mFrameCount >>= 1;
-        }
-
     }
     mRsmpInIndex = mFrameCount;
 }
 
-unsigned int AudioFlinger::RecordThread::getInputFramesLost()
+uint32_t AudioFlinger::RecordThread::getInputFramesLost()
 {
     Mutex::Autolock _l(mLock);
     if (initCheck() != NO_ERROR) {
diff --git a/services/audioflinger/Threads.h b/services/audioflinger/Threads.h
index a2fb874..999fea3 100644
--- a/services/audioflinger/Threads.h
+++ b/services/audioflinger/Threads.h
@@ -36,6 +36,8 @@ public:
                 audio_devices_t outDevice, audio_devices_t inDevice, type_t type);
     virtual             ~ThreadBase();
 
+    virtual status_t    readyToRun();
+
     void dumpBase(int fd, const Vector<String16>& args);
     void dumpEffectChains(int fd, const Vector<String16>& args);
 
@@ -63,7 +65,7 @@ public:
     class IoConfigEvent : public ConfigEvent {
     public:
         IoConfigEvent(int event, int param) :
-            ConfigEvent(CFG_EVENT_IO), mEvent(event), mParam(event) {}
+            ConfigEvent(CFG_EVENT_IO), mEvent(event), mParam(param) {}
         virtual ~IoConfigEvent() {}
 
                 int event() const { return mEvent; }
@@ -141,6 +143,7 @@ public:
                 void        sendIoConfigEvent_l(int event, int param = 0);
                 void        sendPrioConfigEvent_l(pid_t pid, pid_t tid, int32_t prio);
                 void        processConfigEvents();
+                void        processConfigEvents_l();
 
                 // see note at declaration of mStandby, mOutDevice and mInDevice
                 bool        standby() const { return mStandby; }
@@ -156,7 +159,7 @@ public:
                                     int sessionId,
                                     effect_descriptor_t *desc,
                                     int *enabled,
-                                    status_t *status);
+                                    status_t *status /*non-NULL*/);
                 void disconnectEffect(const sp< EffectModule>& effect,
                                       EffectHandle *handle,
                                       bool unpinIfLast);
@@ -198,13 +201,13 @@ public:
                 // effect
                 void removeEffect_l(const sp< EffectModule>& effect);
                 // detach all tracks connected to an auxiliary effect
-    virtual     void detachAuxEffect_l(int effectId) {}
+    virtual     void detachAuxEffect_l(int effectId __unused) {}
                 // returns either EFFECT_SESSION if effects on this audio session exist in one
                 // chain, or TRACK_SESSION if tracks on this audio session exist, or both
                 virtual uint32_t hasAudioSession(int sessionId) const = 0;
                 // the value returned by default implementation is not important as the
                 // strategy is only meaningful for PlaybackThread which implements this method
-                virtual uint32_t getStrategyForSession_l(int sessionId) { return 0; }
+                virtual uint32_t getStrategyForSession_l(int sessionId __unused) { return 0; }
 
                 // suspend or restore effect according to the type of effect passed. a NULL
                 // type pointer means suspend all effects in the session
@@ -275,6 +278,7 @@ protected:
                 uint32_t                mChannelCount;
                 size_t                  mFrameSize;
                 audio_format_t          mFormat;
+                size_t                  mBufferSize;       // HAL buffer size for read() or write()
 
                 // Parameter sequence by client: binder thread calling setParameters():
                 //  1. Lock mLock
@@ -303,12 +307,12 @@ protected:
                 Vector<ConfigEvent *>     mConfigEvents;
 
                 // These fields are written and read by thread itself without lock or barrier,
-                // and read by other threads without lock or barrier via standby() , outDevice()
+                // and read by other threads without lock or barrier via standby(), outDevice()
                 // and inDevice().
                 // Because of the absence of a lock or barrier, any other thread that reads
                 // these fields must use the information in isolation, or be prepared to deal
                 // with possibility that it might be inconsistent with other information.
-                bool                    mStandby;   // Whether thread is currently in standby.
+                bool                    mStandby;     // Whether thread is currently in standby.
                 audio_devices_t         mOutDevice;   // output device
                 audio_devices_t         mInDevice;    // input device
                 audio_source_t          mAudioSource; // (see audio.h, audio_source_t)
@@ -358,7 +362,6 @@ public:
                 void        dump(int fd, const Vector<String16>& args);
 
     // Thread virtuals
-    virtual     status_t    readyToRun();
     virtual     bool        threadLoop();
 
     // RefBase
@@ -391,7 +394,7 @@ protected:
     virtual     bool        waitingAsyncCallback();
     virtual     bool        waitingAsyncCallback_l();
     virtual     bool        shouldStandby_l();
-
+    virtual     void        onAddNewTrack_l();
 
     // ThreadBase virtuals
     virtual     void        preExit();
@@ -419,13 +422,13 @@ public:
                                 uint32_t sampleRate,
                                 audio_format_t format,
                                 audio_channel_mask_t channelMask,
-                                size_t frameCount,
+                                size_t *pFrameCount,
                                 const sp<IMemory>& sharedBuffer,
                                 int sessionId,
                                 IAudioFlinger::track_flags_t *flags,
                                 pid_t tid,
                                 int uid,
-                                status_t *status);
+                                status_t *status /*non-NULL*/);
 
                 AudioStreamOut* getOutput() const;
                 AudioStreamOut* clearOutput();
@@ -479,7 +482,6 @@ protected:
     size_t                          mNormalFrameCount;  // normal mixer and effects
 
     int16_t*                        mMixBuffer;         // frame size aligned mix buffer
-    int8_t*                         mAllocMixBuffer;    // mixer buffer allocation address
 
     // suspend count, > 0 means suspended.  While suspended, the thread continues to pull from
     // tracks and mix, but doesn't write to HAL.  A2DP and SCO HAL implementations can't handle
@@ -623,13 +625,12 @@ private:
     sp<NBLog::Writer>       mFastMixerNBLogWriter;
 public:
     virtual     bool        hasFastMixer() const = 0;
-    virtual     FastTrackUnderruns getFastTrackUnderruns(size_t fastIndex) const
+    virtual     FastTrackUnderruns getFastTrackUnderruns(size_t fastIndex __unused) const
                                 { FastTrackUnderruns dummy; return dummy; }
 
 protected:
                 // accessed by both binder threads and within threadLoop(), lock on mutex needed
                 unsigned    mFastTrackAvailMask;    // bit i set if fast track [i] is available
-    virtual     void        flushOutput_l();
 
 private:
     // timestamp latch:
@@ -748,11 +749,11 @@ protected:
     // threadLoop snippets
     virtual     mixer_state prepareTracks_l(Vector< sp<Track> > *tracksToRemove);
     virtual     void        threadLoop_exit();
-    virtual     void        flushOutput_l();
 
     virtual     bool        waitingAsyncCallback();
     virtual     bool        waitingAsyncCallback_l();
     virtual     bool        shouldStandby_l();
+    virtual     void        onAddNewTrack_l();
 
 private:
                 void        flushHw_l();
@@ -867,23 +868,23 @@ public:
 
     // Thread virtuals
     virtual bool        threadLoop();
-    virtual status_t    readyToRun();
 
     // RefBase
     virtual void        onFirstRef();
 
     virtual status_t    initCheck() const { return (mInput == NULL) ? NO_INIT : NO_ERROR; }
+
             sp<AudioFlinger::RecordThread::RecordTrack>  createRecordTrack_l(
                     const sp<AudioFlinger::Client>& client,
                     uint32_t sampleRate,
                     audio_format_t format,
                     audio_channel_mask_t channelMask,
-                    size_t frameCount,
+                    size_t *pFrameCount,
                     int sessionId,
                     int uid,
                     IAudioFlinger::track_flags_t *flags,
                     pid_t tid,
-                    status_t *status);
+                    status_t *status /*non-NULL*/);
 
             status_t    start(RecordTrack* recordTrack,
                               AudioSystem::sync_event_t event,
@@ -905,7 +906,7 @@ public:
     virtual String8     getParameters(const String8& keys);
     virtual void        audioConfigChanged_l(int event, int param = 0);
             void        readInputParameters();
-    virtual unsigned int  getInputFramesLost();
+    virtual uint32_t    getInputFramesLost();
 
     virtual status_t addEffectChain_l(const sp<EffectChain>& chain);
     virtual size_t removeEffectChain_l(const sp<EffectChain>& chain);
@@ -926,30 +927,43 @@ public:
             bool        hasFastRecorder() const { return false; }
 
 private:
-            void clearSyncStartEvent();
+            void    clearSyncStartEvent();
 
             // Enter standby if not already in standby, and set mStandby flag
-            void standby();
+            void    standbyIfNotAlreadyInStandby();
 
             // Call the HAL standby method unconditionally, and don't change mStandby flag
-            void inputStandBy();
+            void    inputStandBy();
 
             AudioStreamIn                       *mInput;
             SortedVector < sp<RecordTrack> >    mTracks;
-            // mActiveTrack has dual roles:  it indicates the current active track, and
+            // mActiveTracks has dual roles:  it indicates the current active track(s), and
             // is used together with mStartStopCond to indicate start()/stop() progress
-            sp<RecordTrack>                     mActiveTrack;
+            SortedVector< sp<RecordTrack> >     mActiveTracks;
+            // generation counter for mActiveTracks
+            int                                 mActiveTracksGen;
             Condition                           mStartStopCond;
 
             // updated by RecordThread::readInputParameters()
             AudioResampler                      *mResampler;
             // interleaved stereo pairs of fixed-point signed Q19.12
             int32_t                             *mRsmpOutBuffer;
-            int16_t                             *mRsmpInBuffer; // [mFrameCount * mChannelCount]
-            size_t                              mRsmpInIndex;
-            size_t                              mBufferSize;    // stream buffer size for read()
+
+            // resampler converts input at HAL Hz to output at AudioRecord client Hz
+            int16_t                             *mRsmpInBuffer; // see new[] for details on the size
+            size_t                              mRsmpInFrames;  // size of resampler input in frames
+            size_t                              mRsmpInFramesP2;// size rounded up to a power-of-2
+            size_t                              mRsmpInUnrel;   // unreleased frames remaining from
+                                                                // most recent getNextBuffer
+            // these are rolling counters that are never cleared
+            int32_t                             mRsmpInFront;   // next available frame
+            int32_t                             mRsmpInRear;    // last filled frame + 1
+            size_t                              mRsmpInIndex;   // FIXME legacy
+
+            // client's requested configuration, which may differ from the HAL configuration
             const uint32_t                      mReqChannelCount;
             const uint32_t                      mReqSampleRate;
+
             ssize_t                             mBytesRead;
             // sync event triggering actual audio capture. Frames read before this event will
             // be dropped and therefore not read by the application.
diff --git a/services/audioflinger/TrackBase.h b/services/audioflinger/TrackBase.h
index cd201d9..05fde7c 100644
--- a/services/audioflinger/TrackBase.h
+++ b/services/audioflinger/TrackBase.h
@@ -48,6 +48,7 @@ public:
                                 int uid,
                                 bool isOut);
     virtual             ~TrackBase();
+    virtual status_t    initCheck() const { return getCblk() != 0 ? NO_ERROR : NO_MEMORY; }
 
     virtual status_t    start(AudioSystem::sync_event_t event,
                              int triggerSession) = 0;
@@ -78,15 +79,6 @@ protected:
 
     virtual uint32_t sampleRate() const { return mSampleRate; }
 
-    // Return a pointer to the start of a contiguous slice of the track buffer.
-    // Parameter 'offset' is the requested start position, expressed in
-    // monotonically increasing frame units relative to the track epoch.
-    // Parameter 'frames' is the requested length, also in frame units.
-    // Always returns non-NULL.  It is the caller's responsibility to
-    // verify that this will be successful; the result of calling this
-    // function with invalid 'offset' or 'frames' is undefined.
-    void* getBuffer(uint32_t offset, uint32_t frames) const;
-
     bool isStopped() const {
         return (mState == STOPPED || mState == FLUSHED);
     }
diff --git a/services/audioflinger/Tracks.cpp b/services/audioflinger/Tracks.cpp
index fccc7b8..e5152b8 100644
--- a/services/audioflinger/Tracks.cpp
+++ b/services/audioflinger/Tracks.cpp
@@ -116,12 +116,11 @@ AudioFlinger::ThreadBase::TrackBase::TrackBase(
 
     if (client != 0) {
         mCblkMemory = client->heap()->allocate(size);
-        if (mCblkMemory != 0) {
-            mCblk = static_cast<audio_track_cblk_t *>(mCblkMemory->pointer());
-            // can't assume mCblk != NULL
-        } else {
+        if (mCblkMemory == 0 ||
+                (mCblk = static_cast<audio_track_cblk_t *>(mCblkMemory->pointer())) == NULL) {
             ALOGE("not enough memory for AudioTrack size=%u", size);
             client->heap()->dump("AudioTrack");
+            mCblkMemory.clear();
             return;
         }
     } else {
@@ -134,7 +133,6 @@ AudioFlinger::ThreadBase::TrackBase::TrackBase(
     if (mCblk != NULL) {
         new(mCblk) audio_track_cblk_t();
         // clear all buffers
-        mCblk->frameCount_ = frameCount;
         if (sharedBuffer == 0) {
             mBuffer = (char*)mCblk + sizeof(audio_track_cblk_t);
             memset(mBuffer, 0, bufferSize);
@@ -148,7 +146,7 @@ AudioFlinger::ThreadBase::TrackBase::TrackBase(
 #ifdef TEE_SINK
         if (mTeeSinkTrackEnabled) {
             NBAIO_Format pipeFormat = Format_from_SR_C(mSampleRate, mChannelCount);
-            if (pipeFormat != Format_Invalid) {
+            if (Format_isValid(pipeFormat)) {
                 Pipe *pipe = new Pipe(mTeeSinkTrackFrames, pipeFormat);
                 size_t numCounterOffers = 0;
                 const NBAIO_Format offers[1] = {pipeFormat};
@@ -275,6 +273,11 @@ status_t AudioFlinger::TrackHandle::queueTimedBuffer(const sp<IMemory>& buffer,
     if (!mTrack->isTimedTrack())
         return INVALID_OPERATION;
 
+    if (buffer == 0 || buffer->pointer() == NULL) {
+        ALOGE("queueTimedBuffer() buffer is 0 or has NULL pointer()");
+        return BAD_VALUE;
+    }
+
     PlaybackThread::TimedTrack* tt =
             reinterpret_cast<PlaybackThread::TimedTrack*>(mTrack.get());
     return tt->queueTimedBuffer(buffer, pts);
@@ -344,7 +347,8 @@ AudioFlinger::PlaybackThread::Track::Track(
     mCachedVolume(1.0),
     mIsInvalid(false),
     mAudioTrackServerProxy(NULL),
-    mResumeToStopping(false)
+    mResumeToStopping(false),
+    mFlushHwPending(false)
 {
     if (mCblk != NULL) {
         if (sharedBuffer == 0) {
@@ -396,6 +400,15 @@ AudioFlinger::PlaybackThread::Track::~Track()
     }
 }
 
+status_t AudioFlinger::PlaybackThread::Track::initCheck() const
+{
+    status_t status = TrackBase::initCheck();
+    if (status == NO_ERROR && mName < 0) {
+        status = NO_MEMORY;
+    }
+    return status;
+}
+
 void AudioFlinger::PlaybackThread::Track::destroy()
 {
     // NOTE: destroyTrack_l() can remove a strong reference to this Track
@@ -422,17 +435,19 @@ void AudioFlinger::PlaybackThread::Track::destroy()
 
 /*static*/ void AudioFlinger::PlaybackThread::Track::appendDumpHeader(String8& result)
 {
-    result.append("   Name Client Type      Fmt Chn mask Session fCount S F SRate  "
+    result.append("    Name Active Client Type      Fmt Chn mask Session fCount S F SRate  "
                   "L dB  R dB    Server Main buf  Aux Buf Flags UndFrmCnt\n");
 }
 
-void AudioFlinger::PlaybackThread::Track::dump(char* buffer, size_t size)
+void AudioFlinger::PlaybackThread::Track::dump(char* buffer, size_t size, bool active)
 {
     uint32_t vlr = mAudioTrackServerProxy->getVolumeLR();
     if (isFastTrack()) {
-        sprintf(buffer, "   F %2d", mFastIndex);
+        sprintf(buffer, "    F %2d", mFastIndex);
+    } else if (mName >= AudioMixer::TRACK0) {
+        sprintf(buffer, "    %4d", mName - AudioMixer::TRACK0);
     } else {
-        sprintf(buffer, "   %4d", mName - AudioMixer::TRACK0);
+        sprintf(buffer, "    none");
     }
     track_state state = mState;
     char stateChar;
@@ -487,8 +502,9 @@ void AudioFlinger::PlaybackThread::Track::dump(char* buffer, size_t size)
         nowInUnderrun = '?';
         break;
     }
-    snprintf(&buffer[7], size-7, " %6u %4u %08X %08X %7u %6zu %1c %1d %5u %5.2g %5.2g  "
+    snprintf(&buffer[8], size-8, " %6s %6u %4u %08X %08X %7u %6zu %1c %1d %5u %5.2g %5.2g  "
                                  "%08X %p %p 0x%03X %9u%c\n",
+            active ? "yes" : "no",
             (mClient == 0) ? getpid_cached : mClient->pid(),
             mStreamType,
             mFormat,
@@ -514,7 +530,7 @@ uint32_t AudioFlinger::PlaybackThread::Track::sampleRate() const {
 
 // AudioBufferProvider interface
 status_t AudioFlinger::PlaybackThread::Track::getNextBuffer(
-        AudioBufferProvider::Buffer* buffer, int64_t pts)
+        AudioBufferProvider::Buffer* buffer, int64_t pts __unused)
 {
     ServerProxy::Buffer buf;
     size_t desiredFrames = buffer->frameCount;
@@ -551,7 +567,7 @@ size_t AudioFlinger::PlaybackThread::Track::framesReleased() const
 
 // Don't call for fast tracks; the framesReady() could result in priority inversion
 bool AudioFlinger::PlaybackThread::Track::isReady() const {
-    if (mFillingUpStatus != FS_FILLING || isStopped() || isPausing()) {
+    if (mFillingUpStatus != FS_FILLING || isStopped() || isPausing() || isStopping()) {
         return true;
     }
 
@@ -564,8 +580,8 @@ bool AudioFlinger::PlaybackThread::Track::isReady() const {
     return false;
 }
 
-status_t AudioFlinger::PlaybackThread::Track::start(AudioSystem::sync_event_t event,
-                                                    int triggerSession)
+status_t AudioFlinger::PlaybackThread::Track::start(AudioSystem::sync_event_t event __unused,
+                                                    int triggerSession __unused)
 {
     status_t status = NO_ERROR;
     ALOGV("start(%d), calling pid %d session %d",
@@ -719,6 +735,7 @@ void AudioFlinger::PlaybackThread::Track::flush()
                 mRetryCount = PlaybackThread::kMaxTrackRetriesOffload;
             }
 
+            mFlushHwPending = true;
             mResumeToStopping = false;
         } else {
             if (mState != STOPPING_1 && mState != STOPPING_2 && mState != STOPPED &&
@@ -739,11 +756,19 @@ void AudioFlinger::PlaybackThread::Track::flush()
         // Prevent flush being lost if the track is flushed and then resumed
         // before mixer thread can run. This is important when offloading
         // because the hardware buffer could hold a large amount of audio
-        playbackThread->flushOutput_l();
         playbackThread->broadcast_l();
     }
 }
 
+// must be called with thread lock held
+void AudioFlinger::PlaybackThread::Track::flushAck()
+{
+    if (!isOffloaded())
+        return;
+
+    mFlushHwPending = false;
+}
+
 void AudioFlinger::PlaybackThread::Track::reset()
 {
     // Do not reset twice to avoid discarding data written just after a flush and before
@@ -979,7 +1004,8 @@ AudioFlinger::PlaybackThread::TimedTrack::create(
             size_t frameCount,
             const sp<IMemory>& sharedBuffer,
             int sessionId,
-            int uid) {
+            int uid)
+{
     if (!client->reserveTimedTrack())
         return 0;
 
@@ -1045,15 +1071,14 @@ status_t AudioFlinger::PlaybackThread::TimedTrack::allocateTimedBuffer(
 
         mTimedMemoryDealer = new MemoryDealer(kTimedBufferHeapSize,
                                               "AudioFlingerTimed");
-        if (mTimedMemoryDealer == NULL)
+        if (mTimedMemoryDealer == NULL) {
             return NO_MEMORY;
+        }
     }
 
     sp<IMemory> newBuffer = mTimedMemoryDealer->allocate(size);
-    if (newBuffer == NULL) {
-        newBuffer = mTimedMemoryDealer->allocate(size);
-        if (newBuffer == NULL)
-            return NO_MEMORY;
+    if (newBuffer == 0 || newBuffer->pointer() == NULL) {
+        return NO_MEMORY;
     }
 
     *buffer = newBuffer;
@@ -1152,7 +1177,7 @@ void AudioFlinger::PlaybackThread::TimedTrack::trimTimedBufferQueueHead_l(
 
 void AudioFlinger::PlaybackThread::TimedTrack::updateFramesPendingAfterTrim_l(
         const TimedBuffer& buf,
-        const char* logTag) {
+        const char* logTag __unused) {
     uint32_t bufBytes        = buf.buffer()->size();
     uint32_t consumedAlready = buf.position();
 
@@ -1504,9 +1529,9 @@ AudioFlinger::PlaybackThread::OutputTrack::OutputTrack(
         mOutBuffer.frameCount = 0;
         playbackThread->mTracks.add(this);
         ALOGV("OutputTrack constructor mCblk %p, mBuffer %p, "
-                "mCblk->frameCount_ %u, mChannelMask 0x%08x",
+                "frameCount %u, mChannelMask 0x%08x",
                 mCblk, mBuffer,
-                mCblk->frameCount_, mChannelMask);
+                frameCount, mChannelMask);
         // since client and server are in the same process,
         // the buffer has the same virtual address on both sides
         mClientProxy = new AudioTrackClientProxy(mCblk, mBuffer, mFrameCount, mFrameSize);
@@ -1764,9 +1789,7 @@ AudioFlinger::RecordThread::RecordTrack::RecordTrack(
 {
     ALOGV("RecordTrack constructor");
     if (mCblk != NULL) {
-        mAudioRecordServerProxy = new AudioRecordServerProxy(mCblk, mBuffer, frameCount,
-                mFrameSize);
-        mServerProxy = mAudioRecordServerProxy;
+        mServerProxy = new AudioRecordServerProxy(mCblk, mBuffer, frameCount, mFrameSize);
     }
 }
 
@@ -1777,7 +1800,7 @@ AudioFlinger::RecordThread::RecordTrack::~RecordTrack()
 
 // AudioBufferProvider interface
 status_t AudioFlinger::RecordThread::RecordTrack::getNextBuffer(AudioBufferProvider::Buffer* buffer,
-        int64_t pts)
+        int64_t pts __unused)
 {
     ServerProxy::Buffer buf;
     buf.mFrameCount = buffer->frameCount;
@@ -1845,12 +1868,13 @@ void AudioFlinger::RecordThread::RecordTrack::invalidate()
 
 /*static*/ void AudioFlinger::RecordThread::RecordTrack::appendDumpHeader(String8& result)
 {
-    result.append("Client Fmt Chn mask Session S   Server fCount\n");
+    result.append("    Active Client Fmt Chn mask Session S   Server fCount\n");
 }
 
-void AudioFlinger::RecordThread::RecordTrack::dump(char* buffer, size_t size)
+void AudioFlinger::RecordThread::RecordTrack::dump(char* buffer, size_t size, bool active)
 {
-    snprintf(buffer, size, "%6u %3u %08X %7u %1d %08X %6zu\n",
+    snprintf(buffer, size, "    %6s %6u %3u %08X %7u %1d %08X %6zu\n",
+            active ? "yes" : "no",
             (mClient == 0) ? getpid_cached : mClient->pid(),
             mFormat,
             mChannelMask,
diff --git a/services/audioflinger/test-resample.cpp b/services/audioflinger/test-resample.cpp
index 7a314cf..66fcd90 100644
--- a/services/audioflinger/test-resample.cpp
+++ b/services/audioflinger/test-resample.cpp
@@ -26,54 +26,30 @@
 #include <errno.h>
 #include <time.h>
 #include <math.h>
+#include <audio_utils/sndfile.h>
 
 using namespace android;
 
-struct HeaderWav {
-    HeaderWav(size_t size, int nc, int sr, int bits) {
-        strncpy(RIFF, "RIFF", 4);
-        chunkSize = size + sizeof(HeaderWav);
-        strncpy(WAVE, "WAVE", 4);
-        strncpy(fmt,  "fmt ", 4);
-        fmtSize = 16;
-        audioFormat = 1;
-        numChannels = nc;
-        samplesRate = sr;
-        byteRate = sr * numChannels * (bits/8);
-        align = nc*(bits/8);
-        bitsPerSample = bits;
-        strncpy(data, "data", 4);
-        dataSize = size;
-    }
-
-    char RIFF[4];           // RIFF
-    uint32_t chunkSize;     // File size
-    char WAVE[4];        // WAVE
-    char fmt[4];            // fmt\0
-    uint32_t fmtSize;       // fmt size
-    uint16_t audioFormat;   // 1=PCM
-    uint16_t numChannels;   // num channels
-    uint32_t samplesRate;   // sample rate in hz
-    uint32_t byteRate;      // Bps
-    uint16_t align;         // 2=16-bit mono, 4=16-bit stereo
-    uint16_t bitsPerSample; // bits per sample
-    char data[4];           // "data"
-    uint32_t dataSize;      // size
-};
+bool gVerbose = false;
 
 static int usage(const char* name) {
-    fprintf(stderr,"Usage: %s [-p] [-h] [-s] [-q {dq|lq|mq|hq|vhq}] [-i input-sample-rate] "
-                   "[-o output-sample-rate] [<input-file>] <output-file>\n", name);
+    fprintf(stderr,"Usage: %s [-p] [-h] [-v] [-s] [-q {dq|lq|mq|hq|vhq|dlq|dmq|dhq}]"
+                   " [-i input-sample-rate] [-o output-sample-rate] [<input-file>]"
+                   " <output-file>\n", name);
     fprintf(stderr,"    -p    enable profiling\n");
     fprintf(stderr,"    -h    create wav file\n");
-    fprintf(stderr,"    -s    stereo\n");
+    fprintf(stderr,"    -v    verbose : log buffer provider calls\n");
+    fprintf(stderr,"    -s    stereo (ignored if input file is specified)\n");
     fprintf(stderr,"    -q    resampler quality\n");
     fprintf(stderr,"              dq  : default quality\n");
     fprintf(stderr,"              lq  : low quality\n");
     fprintf(stderr,"              mq  : medium quality\n");
     fprintf(stderr,"              hq  : high quality\n");
     fprintf(stderr,"              vhq : very high quality\n");
-    fprintf(stderr,"    -i    input file sample rate\n");
+    fprintf(stderr,"              dlq : dynamic low quality\n");
+    fprintf(stderr,"              dmq : dynamic medium quality\n");
+    fprintf(stderr,"              dhq : dynamic high quality\n");
+    fprintf(stderr,"    -i    input file sample rate (ignored if input file is specified)\n");
     fprintf(stderr,"    -o    output file sample rate\n");
     return -1;
 }
@@ -81,7 +57,8 @@ static int usage(const char* name) {
 int main(int argc, char* argv[]) {
 
     const char* const progname = argv[0];
-    bool profiling = false;
+    bool profileResample = false;
+    bool profileFilter = false;
     bool writeHeader = false;
     int channels = 1;
     int input_freq = 0;
@@ -89,14 +66,20 @@ int main(int argc, char* argv[]) {
     AudioResampler::src_quality quality = AudioResampler::DEFAULT_QUALITY;
 
     int ch;
-    while ((ch = getopt(argc, argv, "phsq:i:o:")) != -1) {
+    while ((ch = getopt(argc, argv, "pfhvsq:i:o:")) != -1) {
         switch (ch) {
         case 'p':
-            profiling = true;
+            profileResample = true;
+            break;
+        case 'f':
+            profileFilter = true;
             break;
         case 'h':
             writeHeader = true;
             break;
+        case 'v':
+            gVerbose = true;
+            break;
         case 's':
             channels = 2;
             break;
@@ -111,6 +94,12 @@ int main(int argc, char* argv[]) {
                 quality = AudioResampler::HIGH_QUALITY;
             else if (!strcmp(optarg, "vhq"))
                 quality = AudioResampler::VERY_HIGH_QUALITY;
+            else if (!strcmp(optarg, "dlq"))
+                quality = AudioResampler::DYN_LOW_QUALITY;
+            else if (!strcmp(optarg, "dmq"))
+                quality = AudioResampler::DYN_MED_QUALITY;
+            else if (!strcmp(optarg, "dhq"))
+                quality = AudioResampler::DYN_HIGH_QUALITY;
             else {
                 usage(progname);
                 return -1;
@@ -148,25 +137,22 @@ int main(int argc, char* argv[]) {
     size_t input_size;
     void* input_vaddr;
     if (argc == 2) {
-        struct stat st;
-        if (stat(file_in, &st) < 0) {
-            fprintf(stderr, "stat: %s\n", strerror(errno));
-            return -1;
-        }
-
-        int input_fd = open(file_in, O_RDONLY);
-        if (input_fd < 0) {
-            fprintf(stderr, "open: %s\n", strerror(errno));
-            return -1;
-        }
-
-        input_size = st.st_size;
-        input_vaddr = mmap(0, input_size, PROT_READ, MAP_PRIVATE, input_fd, 0);
-        if (input_vaddr == MAP_FAILED ) {
-            fprintf(stderr, "mmap: %s\n", strerror(errno));
-            return -1;
+        SF_INFO info;
+        info.format = 0;
+        SNDFILE *sf = sf_open(file_in, SFM_READ, &info);
+        if (sf == NULL) {
+            perror(file_in);
+            return EXIT_FAILURE;
         }
+        input_size = info.frames * info.channels * sizeof(short);
+        input_vaddr = malloc(input_size);
+        (void) sf_readf_short(sf, (short *) input_vaddr, info.frames);
+        sf_close(sf);
+        channels = info.channels;
+        input_freq = info.samplerate;
     } else {
+        // data for testing is exactly (input sampling rate/1000)/2 seconds
+        // so 44.1khz input is 22.05 seconds
         double k = 1000; // Hz / s
         double time = (input_freq / 2) / k;
         size_t input_frames = size_t(input_freq * time);
@@ -178,7 +164,7 @@ int main(int argc, char* argv[]) {
             double y = sin(M_PI * k * t * t);
             int16_t yi = floor(y * 32767.0 + 0.5);
             for (size_t j=0 ; j<(size_t)channels ; j++) {
-                in[i*channels + j] = yi / (1+j);
+                in[i*channels + j] = yi / (1+j); // right ch. 1/2 left ch.
             }
         }
     }
@@ -186,89 +172,238 @@ int main(int argc, char* argv[]) {
     // ----------------------------------------------------------
 
     class Provider: public AudioBufferProvider {
-        int16_t* mAddr;
-        size_t mNumFrames;
+        int16_t* const  mAddr;      // base address
+        const size_t    mNumFrames; // total frames
+        const int       mChannels;
+        size_t          mNextFrame; // index of next frame to provide
+        size_t          mUnrel;     // number of frames not yet released
     public:
-        Provider(const void* addr, size_t size, int channels) {
-            mAddr = (int16_t*) addr;
-            mNumFrames = size / (channels*sizeof(int16_t));
+        Provider(const void* addr, size_t size, int channels)
+          : mAddr((int16_t*) addr),
+            mNumFrames(size / (channels*sizeof(int16_t))),
+            mChannels(channels),
+            mNextFrame(0), mUnrel(0) {
         }
         virtual status_t getNextBuffer(Buffer* buffer,
                 int64_t pts = kInvalidPTS) {
-            buffer->frameCount = mNumFrames;
-            buffer->i16 = mAddr;
-            return NO_ERROR;
+            (void)pts; // suppress warning
+            size_t requestedFrames = buffer->frameCount;
+            if (requestedFrames > mNumFrames - mNextFrame) {
+                buffer->frameCount = mNumFrames - mNextFrame;
+            }
+            if (gVerbose) {
+                printf("getNextBuffer() requested %u frames out of %u frames available,"
+                        " and returned %u frames\n",
+                        requestedFrames, mNumFrames - mNextFrame, buffer->frameCount);
+            }
+            mUnrel = buffer->frameCount;
+            if (buffer->frameCount > 0) {
+                buffer->i16 = &mAddr[mChannels * mNextFrame];
+                return NO_ERROR;
+            } else {
+                buffer->i16 = NULL;
+                return NOT_ENOUGH_DATA;
+            }
         }
         virtual void releaseBuffer(Buffer* buffer) {
+            if (buffer->frameCount > mUnrel) {
+                fprintf(stderr, "ERROR releaseBuffer() released %u frames but only %u available "
+                        "to release\n", buffer->frameCount, mUnrel);
+                mNextFrame += mUnrel;
+                mUnrel = 0;
+            } else {
+                if (gVerbose) {
+                    printf("releaseBuffer() released %u frames out of %u frames available "
+                            "to release\n", buffer->frameCount, mUnrel);
+                }
+                mNextFrame += buffer->frameCount;
+                mUnrel -= buffer->frameCount;
+            }
+            buffer->frameCount = 0;
+            buffer->i16 = NULL;
+        }
+        void reset() {
+            mNextFrame = 0;
         }
     } provider(input_vaddr, input_size, channels);
 
     size_t input_frames = input_size / (channels * sizeof(int16_t));
+    if (gVerbose) {
+        printf("%u input frames\n", input_frames);
+    }
     size_t output_size = 2 * 4 * ((int64_t) input_frames * output_freq) / input_freq;
     output_size &= ~7; // always stereo, 32-bits
 
-    void* output_vaddr = malloc(output_size);
-
-    if (profiling) {
+    if (profileFilter) {
+        // Check how fast sample rate changes are that require filter changes.
+        // The delta sample rate changes must indicate a downsampling ratio,
+        // and must be larger than 10% changes.
+        //
+        // On fast devices, filters should be generated between 0.1ms - 1ms.
+        // (single threaded).
         AudioResampler* resampler = AudioResampler::create(16, channels,
-                output_freq, quality);
-
-        size_t out_frames = output_size/8;
-        resampler->setSampleRate(input_freq);
-        resampler->setVolume(0x1000, 0x1000);
-
-        memset(output_vaddr, 0, output_size);
+                8000, quality);
+        int looplimit = 100;
         timespec start, end;
         clock_gettime(CLOCK_MONOTONIC, &start);
-        resampler->resample((int*) output_vaddr, out_frames, &provider);
-        resampler->resample((int*) output_vaddr, out_frames, &provider);
-        resampler->resample((int*) output_vaddr, out_frames, &provider);
-        resampler->resample((int*) output_vaddr, out_frames, &provider);
+        for (int i = 0; i < looplimit; ++i) {
+            resampler->setSampleRate(9000);
+            resampler->setSampleRate(12000);
+            resampler->setSampleRate(20000);
+            resampler->setSampleRate(30000);
+        }
         clock_gettime(CLOCK_MONOTONIC, &end);
         int64_t start_ns = start.tv_sec * 1000000000LL + start.tv_nsec;
         int64_t end_ns = end.tv_sec * 1000000000LL + end.tv_nsec;
-        int64_t time = (end_ns - start_ns)/4;
-        printf("%f Mspl/s\n", out_frames/(time/1e9)/1e6);
+        int64_t time = end_ns - start_ns;
+        printf("%.2f sample rate changes with filter calculation/sec\n",
+                looplimit * 4 / (time / 1e9));
 
+        // Check how fast sample rate changes are without filter changes.
+        // This should be very fast, probably 0.1us - 1us per sample rate
+        // change.
+        resampler->setSampleRate(1000);
+        looplimit = 1000;
+        clock_gettime(CLOCK_MONOTONIC, &start);
+        for (int i = 0; i < looplimit; ++i) {
+            resampler->setSampleRate(1000+i);
+        }
+        clock_gettime(CLOCK_MONOTONIC, &end);
+        start_ns = start.tv_sec * 1000000000LL + start.tv_nsec;
+        end_ns = end.tv_sec * 1000000000LL + end.tv_nsec;
+        time = end_ns - start_ns;
+        printf("%.2f sample rate changes without filter calculation/sec\n",
+                looplimit / (time / 1e9));
+        resampler->reset();
         delete resampler;
     }
 
+    void* output_vaddr = malloc(output_size);
     AudioResampler* resampler = AudioResampler::create(16, channels,
             output_freq, quality);
     size_t out_frames = output_size/8;
+
+    /* set volume precision to 12 bits, so the volume scale is 1<<12.
+     * This means the "integer" part fits in the Q19.12 precision
+     * representation of output int32_t.
+     *
+     * Generally 0 < volumePrecision <= 14 (due to the limits of
+     * int16_t values for Volume). volumePrecision cannot be 0 due
+     * to rounding and shifts.
+     */
+    const int volumePrecision = 12; // in bits
+
     resampler->setSampleRate(input_freq);
-    resampler->setVolume(0x1000, 0x1000);
+    resampler->setVolume(1 << volumePrecision, 1 << volumePrecision);
+
+    if (profileResample) {
+        /*
+         * For profiling on mobile devices, upon experimentation
+         * it is better to run a few trials with a shorter loop limit,
+         * and take the minimum time.
+         *
+         * Long tests can cause CPU temperature to build up and thermal throttling
+         * to reduce CPU frequency.
+         *
+         * For frequency checks (index=0, or 1, etc.):
+         * "cat /sys/devices/system/cpu/cpu${index}/cpufreq/scaling_*_freq"
+         *
+         * For temperature checks (index=0, or 1, etc.):
+         * "cat /sys/class/thermal/thermal_zone${index}/temp"
+         *
+         * Another way to avoid thermal throttling is to fix the CPU frequency
+         * at a lower level which prevents excessive temperatures.
+         */
+        const int trials = 4;
+        const int looplimit = 4;
+        timespec start, end;
+        int64_t time;
+
+        for (int n = 0; n < trials; ++n) {
+            clock_gettime(CLOCK_MONOTONIC, &start);
+            for (int i = 0; i < looplimit; ++i) {
+                resampler->resample((int*) output_vaddr, out_frames, &provider);
+                provider.reset(); //  during benchmarking reset only the provider
+            }
+            clock_gettime(CLOCK_MONOTONIC, &end);
+            int64_t start_ns = start.tv_sec * 1000000000LL + start.tv_nsec;
+            int64_t end_ns = end.tv_sec * 1000000000LL + end.tv_nsec;
+            int64_t diff_ns = end_ns - start_ns;
+            if (n == 0 || diff_ns < time) {
+                time = diff_ns;   // save the best out of our trials.
+            }
+        }
+        // Mfrms/s is "Millions of output frames per second".
+        printf("quality: %d  channels: %d  msec: %lld  Mfrms/s: %.2lf\n",
+                quality, channels, time/1000000, out_frames * looplimit / (time / 1e9) / 1e6);
+        resampler->reset();
+    }
 
     memset(output_vaddr, 0, output_size);
+    if (gVerbose) {
+        printf("resample() %u output frames\n", out_frames);
+    }
     resampler->resample((int*) output_vaddr, out_frames, &provider);
+    if (gVerbose) {
+        printf("resample() complete\n");
+    }
+    resampler->reset();
+    if (gVerbose) {
+        printf("reset() complete\n");
+    }
+    delete resampler;
+    resampler = NULL;
 
-    // down-mix (we just truncate and keep the left channel)
+    // mono takes left channel only
+    // stereo right channel is half amplitude of stereo left channel (due to input creation)
     int32_t* out = (int32_t*) output_vaddr;
     int16_t* convert = (int16_t*) malloc(out_frames * channels * sizeof(int16_t));
+
+    // round to half towards zero and saturate at int16 (non-dithered)
+    const int roundVal = (1<<(volumePrecision-1)) - 1; // volumePrecision > 0
+
     for (size_t i = 0; i < out_frames; i++) {
-        for (int j=0 ; j<channels ; j++) {
-            int32_t s = out[i * 2 + j] >> 12;
-            if (s > 32767)       s =  32767;
-            else if (s < -32768) s = -32768;
+        for (int j = 0; j < channels; j++) {
+            int32_t s = out[i * 2 + j] + roundVal; // add offset here
+            if (s < 0) {
+                s = (s + 1) >> volumePrecision; // round to 0
+                if (s < -32768) {
+                    s = -32768;
+                }
+            } else {
+                s = s >> volumePrecision;
+                if (s > 32767) {
+                    s = 32767;
+                }
+            }
             convert[i * channels + j] = int16_t(s);
         }
     }
 
     // write output to disk
-    int output_fd = open(file_out, O_WRONLY | O_CREAT | O_TRUNC,
-            S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
-    if (output_fd < 0) {
-        fprintf(stderr, "open: %s\n", strerror(errno));
-        return -1;
-    }
-
     if (writeHeader) {
-        HeaderWav wav(out_frames * channels * sizeof(int16_t), channels, output_freq, 16);
-        write(output_fd, &wav, sizeof(wav));
+        SF_INFO info;
+        info.frames = 0;
+        info.samplerate = output_freq;
+        info.channels = channels;
+        info.format = SF_FORMAT_WAV | SF_FORMAT_PCM_16;
+        SNDFILE *sf = sf_open(file_out, SFM_WRITE, &info);
+        if (sf == NULL) {
+            perror(file_out);
+            return EXIT_FAILURE;
+        }
+        (void) sf_writef_short(sf, convert, out_frames);
+        sf_close(sf);
+    } else {
+        int output_fd = open(file_out, O_WRONLY | O_CREAT | O_TRUNC,
+                S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+        if (output_fd < 0) {
+            perror(file_out);
+            return EXIT_FAILURE;
+        }
+        write(output_fd, convert, out_frames * channels * sizeof(int16_t));
+        close(output_fd);
     }
 
-    write(output_fd, convert, out_frames * channels * sizeof(int16_t));
-    close(output_fd);
-
-    return 0;
+    return EXIT_SUCCESS;
 }