43 files changed, 3734 insertions, 250 deletions
diff --git a/cmds/screenrecord/screenrecord.cpp b/cmds/screenrecord/screenrecord.cpp
index 61f83e3..b6f150c 100644
--- a/cmds/screenrecord/screenrecord.cpp
+++ b/cmds/screenrecord/screenrecord.cpp
@@ -45,6 +45,7 @@
 #include <signal.h>
 #include <getopt.h>
 #include <sys/wait.h>
+#include <termios.h>
 #include <assert.h>
 
 #include "screenrecord.h"
@@ -61,6 +62,7 @@ static const uint32_t kFallbackHeight = 720;
 // Command-line parameters.
 static bool gVerbose = false;           // chatty on stdout
 static bool gRotate = false;            // rotate 90 degrees
+static bool gRawOutput = false;         // generate raw H.264 byte stream output
 static bool gSizeSpecified = false;     // was size explicitly requested?
 static bool gWantInfoScreen = false;    // do we want initial info screen?
 static bool gWantFrameTime = false;     // do we want times on each frame?
@@ -298,10 +300,12 @@ static status_t prepareVirtualDisplay(const DisplayInfo& mainDpyInfo,
  * input frames are coming from the virtual display as fast as SurfaceFlinger
  * wants to send them.
  *
+ * Exactly one of muxer or rawFp must be non-null.
+ *
  * The muxer must *not* have been started before calling.
  */
 static status_t runEncoder(const sp<MediaCodec>& encoder,
-        const sp<MediaMuxer>& muxer, const sp<IBinder>& mainDpy,
+        const sp<MediaMuxer>& muxer, FILE* rawFp, const sp<IBinder>& mainDpy,
         const sp<IBinder>& virtualDpy, uint8_t orientation) {
     static int kTimeout = 250000;   // be responsive on signal
     status_t err;
@@ -311,6 +315,8 @@ static status_t runEncoder(const sp<MediaCodec>& encoder,
     int64_t endWhenNsec = startWhenNsec + seconds_to_nanoseconds(gTimeLimitSec);
     DisplayInfo mainDpyInfo;
 
+    assert((rawFp == NULL && muxer != NULL) || (rawFp != NULL && muxer == NULL));
+
     Vector<sp<ABuffer> > buffers;
     err = encoder->getOutputBuffers(&buffers);
     if (err != NO_ERROR) {
@@ -342,15 +348,16 @@ static status_t runEncoder(const sp<MediaCodec>& encoder,
         case NO_ERROR:
             // got a buffer
             if ((flags & MediaCodec::BUFFER_FLAG_CODECCONFIG) != 0) {
-                // ignore this -- we passed the CSD into MediaMuxer when
-                // we got the format change notification
-                ALOGV("Got codec config buffer (%u bytes); ignoring", size);
-                size = 0;
+                ALOGV("Got codec config buffer (%u bytes)", size);
+                if (muxer != NULL) {
+                    // ignore this -- we passed the CSD into MediaMuxer when
+                    // we got the format change notification
+                    size = 0;
+                }
             }
             if (size != 0) {
                 ALOGV("Got data in buffer %d, size=%d, pts=%lld",
                         bufIndex, size, ptsUsec);
-                assert(trackIdx != -1);
 
                 { // scope
                     ATRACE_NAME("orientation");
@@ -379,14 +386,23 @@ static status_t runEncoder(const sp<MediaCodec>& encoder,
                     ptsUsec = systemTime(SYSTEM_TIME_MONOTONIC) / 1000;
                 }
 
-                // The MediaMuxer docs are unclear, but it appears that we
-                // need to pass either the full set of BufferInfo flags, or
-                // (flags & BUFFER_FLAG_SYNCFRAME).
-                //
-                // If this blocks for too long we could drop frames.  We may
-                // want to queue these up and do them on a different thread.
-                { // scope
+                if (muxer == NULL) {
+                    fwrite(buffers[bufIndex]->data(), 1, size, rawFp);
+                    // Flush the data immediately in case we're streaming.
+                    // We don't want to do this if all we've written is
+                    // the SPS/PPS data because mplayer gets confused.
+                    if ((flags & MediaCodec::BUFFER_FLAG_CODECCONFIG) == 0) {
+                        fflush(rawFp);
+                    }
+                } else {
+                    // The MediaMuxer docs are unclear, but it appears that we
+                    // need to pass either the full set of BufferInfo flags, or
+                    // (flags & BUFFER_FLAG_SYNCFRAME).
+                    //
+                    // If this blocks for too long we could drop frames.  We may
+                    // want to queue these up and do them on a different thread.
                     ATRACE_NAME("write sample");
+                    assert(trackIdx != -1);
                     err = muxer->writeSampleData(buffers[bufIndex], trackIdx,
                             ptsUsec, flags);
                     if (err != NO_ERROR) {
@@ -418,12 +434,14 @@ static status_t runEncoder(const sp<MediaCodec>& encoder,
                 ALOGV("Encoder format changed");
                 sp<AMessage> newFormat;
                 encoder->getOutputFormat(&newFormat);
-                trackIdx = muxer->addTrack(newFormat);
-                ALOGV("Starting muxer");
-                err = muxer->start();
-                if (err != NO_ERROR) {
-                    fprintf(stderr, "Unable to start muxer (err=%d)\n", err);
-                    return err;
+                if (muxer != NULL) {
+                    trackIdx = muxer->addTrack(newFormat);
+                    ALOGV("Starting muxer");
+                    err = muxer->start();
+                    if (err != NO_ERROR) {
+                        fprintf(stderr, "Unable to start muxer (err=%d)\n", err);
+                        return err;
+                    }
                 }
             }
             break;
@@ -457,6 +475,44 @@ static status_t runEncoder(const sp<MediaCodec>& encoder,
 }
 
 /*
+ * Opens raw H.264 byte stream output: stdout if fileName is "-" (rejected
+ * in verbose mode), else the named file.  A tty is switched to raw mode to
+ * avoid the CRLF translation seen with "adb shell".  Returns NULL on error.
+ */
+static FILE* prepareRawOutput(const char* fileName) {
+    FILE* rawFp = NULL;
+
+    if (strcmp(fileName, "-") == 0) {
+        if (gVerbose) {     // verbose chatter shares stdout with the stream
+            fprintf(stderr, "ERROR: verbose output and '-' not compatible\n");
+            return NULL;
+        }
+        rawFp = stdout;
+    } else {
+        rawFp = fopen(fileName, "w");
+        if (rawFp == NULL) {
+            fprintf(stderr, "fopen raw failed: %s\n", strerror(errno));
+            return NULL;
+        }
+    }
+
+    int fd = fileno(rawFp);
+    if (isatty(fd)) {
+        // best effort -- failures here are ignored and output proceeds as-is
+        ALOGD("raw video output to tty (fd=%d)", fd);
+        struct termios term;
+        if (tcgetattr(fd, &term) == 0) {
+            cfmakeraw(&term);
+            if (tcsetattr(fd, TCSANOW, &term) == 0) {
+                ALOGD("tty successfully configured for raw");
+            }
+        }
+    }
+
+    return rawFp;
+}
+
+/*
  * Main "do work" method.
  *
  * Configures codec, muxer, and virtual display, then starts moving bits
@@ -558,16 +614,26 @@ static status_t recordScreen(const char* fileName) {
         return err;
     }
 
-    // Configure muxer.  We have to wait for the CSD blob from the encoder
-    // before we can start it.
-    sp<MediaMuxer> muxer = new MediaMuxer(fileName,
-            MediaMuxer::OUTPUT_FORMAT_MPEG_4);
-    if (gRotate) {
-        muxer->setOrientationHint(90);  // TODO: does this do anything?
+    sp<MediaMuxer> muxer = NULL;
+    FILE* rawFp = NULL;
+    if (gRawOutput) {
+        rawFp = prepareRawOutput(fileName);
+        if (rawFp == NULL) {
+            encoder->release();
+            return -1;
+        }
+    } else {
+        // Configure muxer.  We have to wait for the CSD blob from the encoder
+        // before we can start it.
+        muxer = new MediaMuxer(fileName, MediaMuxer::OUTPUT_FORMAT_MPEG_4);
+        if (gRotate) {
+            muxer->setOrientationHint(90);  // TODO: does this do anything?
+        }
     }
 
     // Main encoder loop.
-    err = runEncoder(encoder, muxer, mainDpy, dpy, mainDpyInfo.orientation);
+    err = runEncoder(encoder, muxer, rawFp, mainDpy, dpy,
+            mainDpyInfo.orientation);
     if (err != NO_ERROR) {
         fprintf(stderr, "Encoder failed (err=%d)\n", err);
         // fall through to cleanup
@@ -584,9 +650,13 @@ static status_t recordScreen(const char* fileName) {
         overlay->stop();
     }
     encoder->stop();
-    // If we don't stop muxer explicitly, i.e. let the destructor run,
-    // it may hang (b/11050628).
-    muxer->stop();
+    if (muxer != NULL) {
+        // If we don't stop muxer explicitly, i.e. let the destructor run,
+        // it may hang (b/11050628).
+        muxer->stop();
+    } else if (rawFp != stdout) {
+        fclose(rawFp);
+    }
     encoder->release();
 
     return err;
@@ -753,6 +823,7 @@ int main(int argc, char* const argv[]) {
         { "show-frame-time",    no_argument,        NULL, 'f' },
         { "bugreport",          no_argument,        NULL, 'u' },
         { "rotate",             no_argument,        NULL, 'r' },
+        { "raw",                no_argument,        NULL, 'w' },
         { NULL,                 0,                  NULL, 0 }
     };
 
@@ -818,6 +889,10 @@ int main(int argc, char* const argv[]) {
             // experimental feature
             gRotate = true;
             break;
+        case 'w':
+            // experimental feature
+            gRawOutput = true;
+            break;
         default:
             if (ic != '?') {
                 fprintf(stderr, "getopt_long returned unexpected value 0x%x\n", ic);
@@ -831,17 +906,19 @@ int main(int argc, char* const argv[]) {
         return 2;
     }
 
-    // MediaMuxer tries to create the file in the constructor, but we don't
-    // learn about the failure until muxer.start(), which returns a generic
-    // error code without logging anything.  We attempt to create the file
-    // now for better diagnostics.
     const char* fileName = argv[optind];
-    int fd = open(fileName, O_CREAT | O_RDWR, 0644);
-    if (fd < 0) {
-        fprintf(stderr, "Unable to open '%s': %s\n", fileName, strerror(errno));
-        return 1;
+    if (!gRawOutput) {
+        // MediaMuxer tries to create the file in the constructor, but we don't
+        // learn about the failure until muxer.start(), which returns a generic
+        // error code without logging anything.  We attempt to create the file
+        // now for better diagnostics.
+        int fd = open(fileName, O_CREAT | O_RDWR, 0644);
+        if (fd < 0) {
+            fprintf(stderr, "Unable to open '%s': %s\n", fileName, strerror(errno));
+            return 1;
+        }
+        close(fd);
     }
-    close(fd);
 
     status_t err = recordScreen(fileName);
     if (err == NO_ERROR) {
diff --git a/cmds/screenrecord/screenrecord.h b/cmds/screenrecord/screenrecord.h
index 95e8a68..9b058c2 100644
--- a/cmds/screenrecord/screenrecord.h
+++ b/cmds/screenrecord/screenrecord.h
@@ -18,6 +18,6 @@
 #define SCREENRECORD_SCREENRECORD_H
 
 #define kVersionMajor 1
-#define kVersionMinor 1
+#define kVersionMinor 2
 
 #endif /*SCREENRECORD_SCREENRECORD_H*/
diff --git a/include/media/AudioBufferProvider.h b/include/media/AudioBufferProvider.h
index ef392f0..7be449c 100644
--- a/include/media/AudioBufferProvider.h
+++ b/include/media/AudioBufferProvider.h
@@ -61,6 +61,17 @@ public:
     //  buffer->frameCount  0
     virtual status_t getNextBuffer(Buffer* buffer, int64_t pts = kInvalidPTS) = 0;
 
+    // Release (a portion of) the buffer previously obtained by getNextBuffer().
+    // It is permissible to call releaseBuffer() multiple times per getNextBuffer().
+    // On entry:
+    //  buffer->frameCount  number of frames to release, must be <= number of frames
+    //                      obtained but not yet released
+    //  buffer->raw         unused
+    // On return:
+    //  buffer->frameCount  0; implementation MUST set to zero
+    //  buffer->raw         undefined; implementation is PERMITTED to set to any value,
+    //                      so if caller needs to continue using this buffer it must
+    //                      keep track of the pointer itself
     virtual void releaseBuffer(Buffer* buffer) = 0;
 };
 
diff --git a/include/media/AudioRecord.h b/include/media/AudioRecord.h
index b192bd3..0439cb0 100644
--- a/include/media/AudioRecord.h
+++ b/include/media/AudioRecord.h
@@ -412,6 +412,7 @@ private:
         bool                mPaused;    // whether thread is requested to pause at next loop entry
         bool                mPausedInt; // whether thread internally requests pause
         nsecs_t             mPausedNs;  // if mPausedInt then associated timeout, otherwise ignored
+        bool                mIgnoreNextPausedInt;   // whether to ignore next mPausedInt request
     };
 
             // body of AudioRecordThread::threadLoop()
@@ -422,7 +423,7 @@ private:
             //      NS_INACTIVE inactive so don't run again until re-started
             //      NS_NEVER    never again
             static const nsecs_t NS_WHENEVER = -1, NS_INACTIVE = -2, NS_NEVER = -3;
-            nsecs_t processAudioBuffer(const sp<AudioRecordThread>& thread);
+            nsecs_t processAudioBuffer();
 
             // caller must hold lock on mLock for all _l methods
             status_t openRecord_l(size_t epoch);
diff --git a/include/media/AudioSystem.h b/include/media/AudioSystem.h
index b96b8a1..ca9aaf7 100644
--- a/include/media/AudioSystem.h
+++ b/include/media/AudioSystem.h
@@ -67,20 +67,24 @@ public:
 
     // returns true in *state if tracks are active on the specified stream or have been active
     // in the past inPastMs milliseconds
-    static status_t isStreamActive(audio_stream_type_t stream, bool *state, uint32_t inPastMs = 0);
+    static status_t isStreamActive(audio_stream_type_t stream, bool *state, uint32_t inPastMs);
     // returns true in *state if tracks are active for what qualifies as remote playback
     // on the specified stream or have been active in the past inPastMs milliseconds. Remote
     // playback isn't mutually exclusive with local playback.
     static status_t isStreamActiveRemotely(audio_stream_type_t stream, bool *state,
-            uint32_t inPastMs = 0);
+            uint32_t inPastMs);
     // returns true in *state if a recorder is currently recording with the specified source
     static status_t isSourceActive(audio_source_t source, bool *state);
 
     // set/get audio hardware parameters. The function accepts a list of parameters
     // key value pairs in the form: key1=value1;key2=value2;...
     // Some keys are reserved for standard parameters (See AudioParameter class).
+    // The versions with audio_io_handle_t are intended for internal media framework use only.
     static status_t setParameters(audio_io_handle_t ioHandle, const String8& keyValuePairs);
     static String8  getParameters(audio_io_handle_t ioHandle, const String8& keys);
+    // The versions without audio_io_handle_t are intended for JNI.
+    static status_t setParameters(const String8& keyValuePairs);
+    static String8  getParameters(const String8& keys);
 
     static void setErrorCallback(audio_error_callback cb);
 
@@ -90,12 +94,14 @@ public:
     static float linearToLog(int volume);
     static int logToLinear(float volume);
 
+    // Returned samplingRate and frameCount output values are guaranteed
+    // to be non-zero if status == NO_ERROR
     static status_t getOutputSamplingRate(uint32_t* samplingRate,
-            audio_stream_type_t stream = AUDIO_STREAM_DEFAULT);
+            audio_stream_type_t stream);
     static status_t getOutputFrameCount(size_t* frameCount,
-            audio_stream_type_t stream = AUDIO_STREAM_DEFAULT);
+            audio_stream_type_t stream);
     static status_t getOutputLatency(uint32_t* latency,
-            audio_stream_type_t stream = AUDIO_STREAM_DEFAULT);
+            audio_stream_type_t stream);
     static status_t getSamplingRate(audio_io_handle_t output,
                                           audio_stream_type_t streamType,
                                           uint32_t* samplingRate);
@@ -202,16 +208,16 @@ public:
                                         const audio_offload_info_t *offloadInfo = NULL);
     static status_t startOutput(audio_io_handle_t output,
                                 audio_stream_type_t stream,
-                                int session = 0);
+                                int session);
     static status_t stopOutput(audio_io_handle_t output,
                                audio_stream_type_t stream,
-                               int session = 0);
+                               int session);
     static void releaseOutput(audio_io_handle_t output);
     static audio_io_handle_t getInput(audio_source_t inputSource,
-                                    uint32_t samplingRate = 0,
-                                    audio_format_t format = AUDIO_FORMAT_DEFAULT,
-                                    audio_channel_mask_t channelMask = AUDIO_CHANNEL_IN_MONO,
-                                    int sessionId = 0);
+                                    uint32_t samplingRate,
+                                    audio_format_t format,
+                                    audio_channel_mask_t channelMask,
+                                    int sessionId);
     static status_t startInput(audio_io_handle_t input);
     static status_t stopInput(audio_io_handle_t input);
     static void releaseInput(audio_io_handle_t input);
diff --git a/include/media/AudioTrack.h b/include/media/AudioTrack.h
index 2717549..96135a6 100644
--- a/include/media/AudioTrack.h
+++ b/include/media/AudioTrack.h
@@ -123,6 +123,8 @@ public:
      *  - NO_ERROR: successful operation
      *  - NO_INIT: audio server or audio hardware not initialized
      *  - BAD_VALUE: unsupported configuration
+     * frameCount is guaranteed to be non-zero if status is NO_ERROR,
+     * and is undefined otherwise.
      */
 
     static status_t getMinFrameCount(size_t* frameCount,
@@ -566,7 +568,7 @@ public:
             uint32_t    getUnderrunFrames() const;
 
     /* Get the flags */
-            audio_output_flags_t getFlags() const { return mFlags; }
+            audio_output_flags_t getFlags() const { AutoMutex _l(mLock); return mFlags; }
 
     /* Set parameters - only possible when using direct output */
             status_t    setParameters(const String8& keyValuePairs);
@@ -626,9 +628,9 @@ protected:
             //      NS_INACTIVE inactive so don't run again until re-started
             //      NS_NEVER    never again
             static const nsecs_t NS_WHENEVER = -1, NS_INACTIVE = -2, NS_NEVER = -3;
-            nsecs_t processAudioBuffer(const sp<AudioTrackThread>& thread);
-            status_t processStreamEnd(int32_t waitCount);
+            nsecs_t processAudioBuffer();
 
+            bool     isOffloaded() const;
 
             // caller must hold lock on mLock for all _l methods
 
@@ -650,7 +652,7 @@ protected:
             // FIXME enum is faster than strcmp() for parameter 'from'
             status_t restoreTrack_l(const char *from);
 
-            bool     isOffloaded() const
+            bool     isOffloaded_l() const
                 { return (mFlags & AUDIO_OUTPUT_FLAG_COMPRESS_OFFLOAD) != 0; }
 
     // Next 3 fields may be changed if IAudioTrack is re-created, but always != 0
@@ -720,6 +722,9 @@ protected:
     uint32_t                mUpdatePeriod;          // in frames, zero means no EVENT_NEW_POS
 
     audio_output_flags_t    mFlags;
+        // const after set(), except for bits AUDIO_OUTPUT_FLAG_FAST and
+        // AUDIO_OUTPUT_FLAG_COMPRESS_OFFLOAD; mLock must be held to read or write them reliably.
+
     int                     mSessionId;
     int                     mAuxEffectId;
 
diff --git a/include/media/stagefright/MetaData.h b/include/media/stagefright/MetaData.h
index de3fc36..3a87474 100644
--- a/include/media/stagefright/MetaData.h
+++ b/include/media/stagefright/MetaData.h
@@ -134,6 +134,7 @@ enum {
     kKeyRequiresSecureBuffers = 'secu',  // bool (int32_t)
 
     kKeyIsADTS            = 'adts',  // bool (int32_t)
+    kKeyAACAOT            = 'aaot',  // int32_t
 
     // If a MediaBuffer's data represents (at least partially) encrypted
     // data, the following fields aid in decryption.
diff --git a/libvideoeditor/lvpp/VideoEditorPreviewController.cpp b/libvideoeditor/lvpp/VideoEditorPreviewController.cpp
index 149c4ea..c3cd3d0 100755
--- a/libvideoeditor/lvpp/VideoEditorPreviewController.cpp
+++ b/libvideoeditor/lvpp/VideoEditorPreviewController.cpp
@@ -1248,7 +1248,7 @@ void VideoEditorPreviewController::notify(
         case MEDIA_SET_VIDEO_SIZE:
             ALOGV("MEDIA_SET_VIDEO_SIZE; New video size %d x %d", ext1, ext2);
             break;
-        case 0xAAAAAAAA:
+        case static_cast<int>(0xAAAAAAAA):
             ALOGV("VIDEO PLAYBACK ALMOST over, prepare next player");
             // Select next player and prepare it
             // If there is a clip after this one
@@ -1268,7 +1268,7 @@ void VideoEditorPreviewController::notify(
                 }
             }
             break;
-        case 0xBBBBBBBB:
+        case static_cast<int>(0xBBBBBBBB):
         {
             ALOGV("VIDEO PLAYBACK, Update Overlay");
             int overlayIndex = ext2;
diff --git a/libvideoeditor/osal/inc/M4OSA_Error.h b/libvideoeditor/osal/inc/M4OSA_Error.h
index 4d59529..75c3177 100755
--- a/libvideoeditor/osal/inc/M4OSA_Error.h
+++ b/libvideoeditor/osal/inc/M4OSA_Error.h
@@ -57,7 +57,7 @@ typedef M4OSA_UInt32   M4OSA_ERR;
   * @arg coreID: (IN) [M4OSA_UInt32] CoreID to put in the error code
   * @arg errorID: (IN) [M4OSA_UInt32] ErrorID to put in the error code*/
 #define M4OSA_ERR_CREATE(severity, coreID, errorID)\
-   (M4OSA_Int32)((((M4OSA_UInt32)severity)<<30)+((((M4OSA_UInt32)coreID)&0x003FFF)<<16)+(((M4OSA_UInt32)errorID)&0x00FFFF))
+   (M4OSA_UInt32)((((M4OSA_UInt32)severity)<<30)+((((M4OSA_UInt32)coreID)&0x003FFF)<<16)+(((M4OSA_UInt32)errorID)&0x00FFFF))
 
 /** This macro extracts the 3 fields from the error:
   * @arg error: (IN) [M4OSA_ERR] Error code
diff --git a/libvideoeditor/vss/stagefrightshells/src/VideoEditor3gpReader.cpp b/libvideoeditor/vss/stagefrightshells/src/VideoEditor3gpReader.cpp
index 3c8915a..99cf9ec 100755
--- a/libvideoeditor/vss/stagefrightshells/src/VideoEditor3gpReader.cpp
+++ b/libvideoeditor/vss/stagefrightshells/src/VideoEditor3gpReader.cpp
@@ -776,16 +776,16 @@ M4OSA_ERR VideoEditor3gpReader_setOption(M4OSA_Context context,
         case M4READER_kOptionID_SetOsaFileReaderFctsPtr:
         break;
 
-        case M4READER_3GP_kOptionID_AudioOnly:
+        case static_cast<M4OSA_OptionID>(M4READER_3GP_kOptionID_AudioOnly):
         break;
 
-        case M4READER_3GP_kOptionID_VideoOnly:
+        case static_cast<M4OSA_OptionID>(M4READER_3GP_kOptionID_VideoOnly):
         break;
 
-        case M4READER_3GP_kOptionID_FastOpenMode:
+        case static_cast<M4OSA_OptionID>(M4READER_3GP_kOptionID_FastOpenMode):
         break;
 
-        case M4READER_kOptionID_MaxMetadataSize:
+        case static_cast<M4OSA_OptionID>(M4READER_kOptionID_MaxMetadataSize):
         break;
 
         default:
diff --git a/libvideoeditor/vss/stagefrightshells/src/VideoEditorAudioDecoder.cpp b/libvideoeditor/vss/stagefrightshells/src/VideoEditorAudioDecoder.cpp
index 9b35d07..e4c7ea1 100755
--- a/libvideoeditor/vss/stagefrightshells/src/VideoEditorAudioDecoder.cpp
+++ b/libvideoeditor/vss/stagefrightshells/src/VideoEditorAudioDecoder.cpp
@@ -809,7 +809,7 @@ M4OSA_ERR VideoEditorAudioDecoder_setOption(M4AD_Context pContext,
     pDecoderContext = (VideoEditorAudioDecoder_Context*)pContext;
 
     switch( optionID ) {
-        case M4AD_kOptionID_UserParam:
+        case static_cast<M4OSA_UInt32>(M4AD_kOptionID_UserParam):
             ALOGV("VideoEditorAudioDecodersetOption UserParam is not supported");
             err = M4ERR_NOT_IMPLEMENTED;
             break;
diff --git a/media/libeffects/loudness/EffectLoudnessEnhancer.cpp b/media/libeffects/loudness/EffectLoudnessEnhancer.cpp
index 91ed677..3c2b320 100644
--- a/media/libeffects/loudness/EffectLoudnessEnhancer.cpp
+++ b/media/libeffects/loudness/EffectLoudnessEnhancer.cpp
@@ -453,13 +453,13 @@ const struct effect_interface_s gLEInterface = {
 // This is the only symbol that needs to be exported
 __attribute__ ((visibility ("default")))
 audio_effect_library_t AUDIO_EFFECT_LIBRARY_INFO_SYM = {
-    tag : AUDIO_EFFECT_LIBRARY_TAG,
-    version : EFFECT_LIBRARY_API_VERSION,
-    name : "Loudness Enhancer Library",
-    implementor : "The Android Open Source Project",
-    create_effect : LELib_Create,
-    release_effect : LELib_Release,
-    get_descriptor : LELib_GetDescriptor,
+    .tag = AUDIO_EFFECT_LIBRARY_TAG,
+    .version = EFFECT_LIBRARY_API_VERSION,
+    .name = "Loudness Enhancer Library",
+    .implementor = "The Android Open Source Project",
+    .create_effect = LELib_Create,
+    .release_effect = LELib_Release,
+    .get_descriptor = LELib_GetDescriptor,
 };
 
 }; // extern "C"
diff --git a/media/libeffects/proxy/EffectProxy.cpp b/media/libeffects/proxy/EffectProxy.cpp
index dd4ad08..e6faaaf 100644
--- a/media/libeffects/proxy/EffectProxy.cpp
+++ b/media/libeffects/proxy/EffectProxy.cpp
@@ -329,11 +329,11 @@ int Effect_getDescriptor(effect_handle_t   self,
 
 __attribute__ ((visibility ("default")))
 audio_effect_library_t AUDIO_EFFECT_LIBRARY_INFO_SYM = {
-    tag : AUDIO_EFFECT_LIBRARY_TAG,
-    version : EFFECT_LIBRARY_API_VERSION,
-    name : "Effect Proxy",
-    implementor : "AOSP",
-    create_effect : android::EffectProxyCreate,
-    release_effect : android::EffectProxyRelease,
-    get_descriptor : android::EffectProxyGetDescriptor,
+    .tag = AUDIO_EFFECT_LIBRARY_TAG,
+    .version = EFFECT_LIBRARY_API_VERSION,
+    .name = "Effect Proxy",
+    .implementor = "AOSP",
+    .create_effect = android::EffectProxyCreate,
+    .release_effect = android::EffectProxyRelease,
+    .get_descriptor = android::EffectProxyGetDescriptor,
 };
diff --git a/media/libmedia/AudioRecord.cpp b/media/libmedia/AudioRecord.cpp
index df99147..e39a475 100644
--- a/media/libmedia/AudioRecord.cpp
+++ b/media/libmedia/AudioRecord.cpp
@@ -692,7 +692,7 @@ ssize_t AudioRecord::read(void* buffer, size_t userSize)
 
 // -------------------------------------------------------------------------
 
-nsecs_t AudioRecord::processAudioBuffer(const sp<AudioRecordThread>& thread)
+nsecs_t AudioRecord::processAudioBuffer()
 {
     mLock.lock();
     if (mAwaitBoost) {
@@ -954,7 +954,7 @@ status_t AudioRecord::restoreRecord_l(const char *from)
 
 // =========================================================================
 
-void AudioRecord::DeathNotifier::binderDied(const wp<IBinder>& who)
+void AudioRecord::DeathNotifier::binderDied(const wp<IBinder>& who __unused)
 {
     sp<AudioRecord> audioRecord = mAudioRecord.promote();
     if (audioRecord != 0) {
@@ -966,7 +966,8 @@ void AudioRecord::DeathNotifier::binderDied(const wp<IBinder>& who)
 // =========================================================================
 
 AudioRecord::AudioRecordThread::AudioRecordThread(AudioRecord& receiver, bool bCanCallJava)
-    : Thread(bCanCallJava), mReceiver(receiver), mPaused(true), mPausedInt(false), mPausedNs(0LL)
+    : Thread(bCanCallJava), mReceiver(receiver), mPaused(true), mPausedInt(false), mPausedNs(0LL),
+      mIgnoreNextPausedInt(false)
 {
 }
 
@@ -983,6 +984,10 @@ bool AudioRecord::AudioRecordThread::threadLoop()
             // caller will check for exitPending()
             return true;
         }
+        if (mIgnoreNextPausedInt) {
+            mIgnoreNextPausedInt = false;
+            mPausedInt = false;
+        }
         if (mPausedInt) {
             if (mPausedNs > 0) {
                 (void) mMyCond.waitRelative(mMyLock, mPausedNs);
@@ -993,7 +998,7 @@ bool AudioRecord::AudioRecordThread::threadLoop()
             return true;
         }
     }
-    nsecs_t ns =  mReceiver.processAudioBuffer(this);
+    nsecs_t ns =  mReceiver.processAudioBuffer();
     switch (ns) {
     case 0:
         return true;
@@ -1017,12 +1022,7 @@ void AudioRecord::AudioRecordThread::requestExit()
 {
     // must be in this order to avoid a race condition
     Thread::requestExit();
-    AutoMutex _l(mMyLock);
-    if (mPaused || mPausedInt) {
-        mPaused = false;
-        mPausedInt = false;
-        mMyCond.signal();
-    }
+    resume();
 }
 
 void AudioRecord::AudioRecordThread::pause()
@@ -1034,6 +1034,7 @@ void AudioRecord::AudioRecordThread::pause()
 void AudioRecord::AudioRecordThread::resume()
 {
     AutoMutex _l(mMyLock);
+    mIgnoreNextPausedInt = true;
     if (mPaused || mPausedInt) {
         mPaused = false;
         mPausedInt = false;
diff --git a/media/libmedia/AudioSystem.cpp b/media/libmedia/AudioSystem.cpp
index 8033c2c..4580030 100644
--- a/media/libmedia/AudioSystem.cpp
+++ b/media/libmedia/AudioSystem.cpp
@@ -40,10 +40,10 @@ audio_error_callback AudioSystem::gAudioErrorCallback = NULL;
 DefaultKeyedVector<audio_io_handle_t, AudioSystem::OutputDescriptor *> AudioSystem::gOutputs(0);
 
 // Cached values for recording queries, all protected by gLock
-uint32_t AudioSystem::gPrevInSamplingRate = 16000;
-audio_format_t AudioSystem::gPrevInFormat = AUDIO_FORMAT_PCM_16_BIT;
-audio_channel_mask_t AudioSystem::gPrevInChannelMask = AUDIO_CHANNEL_IN_MONO;
-size_t AudioSystem::gInBuffSize = 0;
+uint32_t AudioSystem::gPrevInSamplingRate;
+audio_format_t AudioSystem::gPrevInFormat;
+audio_channel_mask_t AudioSystem::gPrevInChannelMask;
+size_t AudioSystem::gInBuffSize = 0;    // zero indicates cache is invalid
 
 
 // establish binder interface to AudioFlinger service
@@ -190,6 +190,16 @@ String8 AudioSystem::getParameters(audio_io_handle_t ioHandle, const String8& ke
     return result;
 }
 
+status_t AudioSystem::setParameters(const String8& keyValuePairs)
+{
+    return setParameters((audio_io_handle_t) 0, keyValuePairs);
+}
+
+String8 AudioSystem::getParameters(const String8& keys)
+{
+    return getParameters((audio_io_handle_t) 0, keys);
+}
+
 // convert volume steps to natural log scale
 
 // change this value to change volume scaling
@@ -249,6 +259,11 @@ status_t AudioSystem::getSamplingRate(audio_io_handle_t output,
         *samplingRate = outputDesc->samplingRate;
         gLock.unlock();
     }
+    if (*samplingRate == 0) {
+        ALOGE("AudioSystem::getSamplingRate failed for output %d stream type %d",
+                output, streamType);
+        return BAD_VALUE;
+    }
 
     ALOGV("getSamplingRate() streamType %d, output %d, sampling rate %u", streamType, output,
             *samplingRate);
@@ -289,6 +304,11 @@ status_t AudioSystem::getFrameCount(audio_io_handle_t output,
         *frameCount = outputDesc->frameCount;
         gLock.unlock();
     }
+    if (*frameCount == 0) {
+        ALOGE("AudioSystem::getFrameCount failed for output %d stream type %d",
+                output, streamType);
+        return BAD_VALUE;
+    }
 
     ALOGV("getFrameCount() streamType %d, output %d, frameCount %d", streamType, output,
             *frameCount);
@@ -349,6 +369,12 @@ status_t AudioSystem::getInputBufferSize(uint32_t sampleRate, audio_format_t for
             return PERMISSION_DENIED;
         }
         inBuffSize = af->getInputBufferSize(sampleRate, format, channelMask);
+        if (inBuffSize == 0) {
+            ALOGE("AudioSystem::getInputBufferSize failed sampleRate %u format %x channelMask %x",
+                    sampleRate, format, channelMask);
+            return BAD_VALUE;
+        }
+        // A benign race is possible here: we could overwrite a fresher cache entry
         gLock.lock();
         // save the request params
         gPrevInSamplingRate = sampleRate;
@@ -419,7 +445,7 @@ void AudioSystem::releaseAudioSessionId(int audioSession) {
 
 // ---------------------------------------------------------------------------
 
-void AudioSystem::AudioFlingerClient::binderDied(const wp<IBinder>& who) {
+void AudioSystem::AudioFlingerClient::binderDied(const wp<IBinder>& who __unused) {
     Mutex::Autolock _l(AudioSystem::gLock);
 
     AudioSystem::gAudioFlinger.clear();
@@ -804,7 +830,7 @@ bool AudioSystem::isOffloadSupported(const audio_offload_info_t& info)
 
 // ---------------------------------------------------------------------------
 
-void AudioSystem::AudioPolicyServiceClient::binderDied(const wp<IBinder>& who) {
+void AudioSystem::AudioPolicyServiceClient::binderDied(const wp<IBinder>& who __unused) {
     Mutex::Autolock _l(AudioSystem::gLock);
     AudioSystem::gAudioPolicyService.clear();
 
diff --git a/media/libmedia/AudioTrack.cpp b/media/libmedia/AudioTrack.cpp
index b48cb1f..8954d9f 100644
--- a/media/libmedia/AudioTrack.cpp
+++ b/media/libmedia/AudioTrack.cpp
@@ -44,9 +44,6 @@ status_t AudioTrack::getMinFrameCount(
         return BAD_VALUE;
     }
 
-    // default to 0 in case of error
-    *frameCount = 0;
-
     // FIXME merge with similar code in createTrack_l(), except we're missing
     //       some information here that is available in createTrack_l():
     //          audio_io_handle_t output
@@ -54,16 +51,20 @@ status_t AudioTrack::getMinFrameCount(
     //          audio_channel_mask_t channelMask
     //          audio_output_flags_t flags
     uint32_t afSampleRate;
-    if (AudioSystem::getOutputSamplingRate(&afSampleRate, streamType) != NO_ERROR) {
-        return NO_INIT;
+    status_t status;
+    status = AudioSystem::getOutputSamplingRate(&afSampleRate, streamType);
+    if (status != NO_ERROR) {
+        return status;
     }
     size_t afFrameCount;
-    if (AudioSystem::getOutputFrameCount(&afFrameCount, streamType) != NO_ERROR) {
-        return NO_INIT;
+    status = AudioSystem::getOutputFrameCount(&afFrameCount, streamType);
+    if (status != NO_ERROR) {
+        return status;
     }
     uint32_t afLatency;
-    if (AudioSystem::getOutputLatency(&afLatency, streamType) != NO_ERROR) {
-        return NO_INIT;
+    status = AudioSystem::getOutputLatency(&afLatency, streamType);
+    if (status != NO_ERROR) {
+        return status;
     }
 
     // Ensure that buffer depth covers at least audio hardware latency
@@ -74,6 +75,13 @@ status_t AudioTrack::getMinFrameCount(
 
     *frameCount = (sampleRate == 0) ? afFrameCount * minBufCount :
             afFrameCount * minBufCount * sampleRate / afSampleRate;
+    // The formula above should always produce a non-zero value, but return an error
+    // in the unlikely event that it does not, as that's part of the API contract.
+    if (*frameCount == 0) {
+        ALOGE("AudioTrack::getMinFrameCount failed for streamType %d, sampleRate %d",
+                streamType, sampleRate);
+        return BAD_VALUE;
+    }
     ALOGV("getMinFrameCount=%d: afFrameCount=%d, minBufCount=%d, afSampleRate=%d, afLatency=%d",
             *frameCount, afFrameCount, minBufCount, afSampleRate, afLatency);
     return NO_ERROR;
@@ -237,12 +245,14 @@ status_t AudioTrack::set(
         streamType = AUDIO_STREAM_MUSIC;
     }
 
+    status_t status;
     if (sampleRate == 0) {
-        uint32_t afSampleRate;
-        if (AudioSystem::getOutputSamplingRate(&afSampleRate, streamType) != NO_ERROR) {
-            return NO_INIT;
+        status = AudioSystem::getOutputSamplingRate(&sampleRate, streamType);
+        if (status != NO_ERROR) {
+            ALOGE("Could not get output sample rate for stream type %d; status %d",
+                    streamType, status);
+            return status;
         }
-        sampleRate = afSampleRate;
     }
     mSampleRate = sampleRate;
 
@@ -330,7 +340,7 @@ status_t AudioTrack::set(
     }
 
     // create the IAudioTrack
-    status_t status = createTrack_l(streamType,
+    status = createTrack_l(streamType,
                                   sampleRate,
                                   format,
                                   frameCount,
@@ -447,7 +457,7 @@ void AudioTrack::stop()
         return;
     }
 
-    if (isOffloaded()) {
+    if (isOffloaded_l()) {
         mState = STATE_STOPPING;
     } else {
         mState = STATE_STOPPED;
@@ -469,7 +479,7 @@ void AudioTrack::stop()
 
     sp<AudioTrackThread> t = mAudioTrackThread;
     if (t != 0) {
-        if (!isOffloaded()) {
+        if (!isOffloaded_l()) {
             t->pause();
         }
     } else {
@@ -507,7 +517,7 @@ void AudioTrack::flush_l()
     mRefreshRemaining = true;
 
     mState = STATE_FLUSHED;
-    if (isOffloaded()) {
+    if (isOffloaded_l()) {
         mProxy->interrupt();
     }
     mProxy->flush();
@@ -540,7 +550,7 @@ status_t AudioTrack::setVolume(float left, float right)
 
     mProxy->setVolumeLR((uint32_t(uint16_t(right * 0x1000)) << 16) | uint16_t(left * 0x1000));
 
-    if (isOffloaded()) {
+    if (isOffloaded_l()) {
         mAudioTrack->signal();
     }
     return NO_ERROR;
@@ -604,7 +614,7 @@ uint32_t AudioTrack::getSampleRate() const
     // sample rate can be updated during playback by the offloaded decoder so we need to
     // query the HAL and update if needed.
 // FIXME use Proxy return channel to update the rate from server and avoid polling here
-    if (isOffloaded()) {
+    if (isOffloaded_l()) {
         if (mOutput != 0) {
             uint32_t sampleRate = 0;
             status_t status = AudioSystem::getSamplingRate(mOutput, mStreamType, &sampleRate);
@@ -741,7 +751,7 @@ status_t AudioTrack::getPosition(uint32_t *position) const
     }
 
     AutoMutex lock(mLock);
-    if (isOffloaded()) {
+    if (isOffloaded_l()) {
         uint32_t dspFrames = 0;
 
         if (mOutput != 0) {
@@ -1341,7 +1351,7 @@ status_t TimedAudioTrack::setMediaTimeTransform(const LinearTransform& xform,
 
 // -------------------------------------------------------------------------
 
-nsecs_t AudioTrack::processAudioBuffer(const sp<AudioTrackThread>& thread)
+nsecs_t AudioTrack::processAudioBuffer()
 {
     // Currently the AudioTrack thread is not created if there are no callbacks.
     // Would it ever make sense to run the thread, even without callbacks?
@@ -1379,7 +1389,7 @@ nsecs_t AudioTrack::processAudioBuffer(const sp<AudioTrackThread>& thread)
         // for offloaded tracks restoreTrack_l() will just update the sequence and clear
         // AudioSystem cache. We should not exit here but after calling the callback so
         // that the upper layers can recreate the track
-        if (!isOffloaded() || (mSequence == mObservedSequence)) {
+        if (!isOffloaded_l() || (mSequence == mObservedSequence)) {
             status_t status = restoreTrack_l("processAudioBuffer");
             mLock.unlock();
             // Run again immediately, but with a new IAudioTrack
@@ -1666,7 +1676,7 @@ nsecs_t AudioTrack::processAudioBuffer(const sp<AudioTrackThread>& thread)
 status_t AudioTrack::restoreTrack_l(const char *from)
 {
     ALOGW("dead IAudioTrack, %s, creating a new one from %s()",
-          isOffloaded() ? "Offloaded" : "PCM", from);
+          isOffloaded_l() ? "Offloaded" : "PCM", from);
     ++mSequence;
     status_t result;
 
@@ -1674,7 +1684,8 @@ status_t AudioTrack::restoreTrack_l(const char *from)
     // output parameters in getOutput_l() and createTrack_l()
     AudioSystem::clearAudioConfigCache();
 
-    if (isOffloaded()) {
+    if (isOffloaded_l()) {
+        // FIXME re-creation of offloaded tracks is not yet implemented
         return DEAD_OBJECT;
     }
 
@@ -1760,14 +1771,21 @@ status_t AudioTrack::getTimestamp(AudioTimestamp& timestamp)
 
 String8 AudioTrack::getParameters(const String8& keys)
 {
-    if (mOutput) {
-        return AudioSystem::getParameters(mOutput, keys);
+    audio_io_handle_t output = getOutput();  // go through the accessor instead of reading mOutput directly; NOTE(review): presumably getOutput() locks - confirm
+    if (output != 0) {  // a handle of 0 is treated as "no output" and yields the empty string below
+        return AudioSystem::getParameters(output, keys);  // query the parameters of this track's output
     } else {
         return String8::empty();
     }
 }
 
-status_t AudioTrack::dump(int fd, const Vector<String16>& args) const
+bool AudioTrack::isOffloaded() const  // locking wrapper around isOffloaded_l()
+{
+    AutoMutex lock(mLock);  // mLock is held until this function returns
+    return isOffloaded_l();  // NOTE(review): the _l suffix presumably means "caller already holds mLock" - confirm
+}
+
+status_t AudioTrack::dump(int fd, const Vector<String16>& args __unused) const
 {
 
     const size_t SIZE = 256;
@@ -1797,7 +1815,7 @@ uint32_t AudioTrack::getUnderrunFrames() const
 
 // =========================================================================
 
-void AudioTrack::DeathNotifier::binderDied(const wp<IBinder>& who)
+void AudioTrack::DeathNotifier::binderDied(const wp<IBinder>& who __unused)
 {
     sp<AudioTrack> audioTrack = mAudioTrack.promote();
     if (audioTrack != 0) {
@@ -1841,7 +1859,7 @@ bool AudioTrack::AudioTrackThread::threadLoop()
             return true;
         }
     }
-    nsecs_t ns = mReceiver.processAudioBuffer(this);
+    nsecs_t ns = mReceiver.processAudioBuffer();
     switch (ns) {
     case 0:
         return true;
diff --git a/media/libmedia/AudioTrackShared.cpp b/media/libmedia/AudioTrackShared.cpp
index caa7900..7a1e207 100644
--- a/media/libmedia/AudioTrackShared.cpp
+++ b/media/libmedia/AudioTrackShared.cpp
@@ -765,7 +765,7 @@ ssize_t StaticAudioTrackServerProxy::pollPosition()
     return (ssize_t) position;
 }
 
-status_t StaticAudioTrackServerProxy::obtainBuffer(Buffer* buffer, bool ackFlush)
+status_t StaticAudioTrackServerProxy::obtainBuffer(Buffer* buffer, bool ackFlush __unused)
 {
     if (mIsShutdown) {
         buffer->mFrameCount = 0;
@@ -847,7 +847,7 @@ void StaticAudioTrackServerProxy::releaseBuffer(Buffer* buffer)
     buffer->mNonContig = 0;
 }
 
-void StaticAudioTrackServerProxy::tallyUnderrunFrames(uint32_t frameCount)
+void StaticAudioTrackServerProxy::tallyUnderrunFrames(uint32_t frameCount __unused)
 {
     // Unlike AudioTrackServerProxy::tallyUnderrunFrames() used for streaming tracks,
     // we don't have a location to count underrun frames.  The underrun frame counter
diff --git a/media/libmedia/IMediaDeathNotifier.cpp b/media/libmedia/IMediaDeathNotifier.cpp
index 9db5b1b..10b4934 100644
--- a/media/libmedia/IMediaDeathNotifier.cpp
+++ b/media/libmedia/IMediaDeathNotifier.cpp
@@ -75,7 +75,7 @@ IMediaDeathNotifier::removeObitRecipient(const wp<IMediaDeathNotifier>& recipien
 }
 
 void
-IMediaDeathNotifier::DeathNotifier::binderDied(const wp<IBinder>& who) {
+IMediaDeathNotifier::DeathNotifier::binderDied(const wp<IBinder>& who __unused) {
     ALOGW("media server died");
 
     // Need to do this with the lock held
diff --git a/media/libmedia/SoundPool.cpp b/media/libmedia/SoundPool.cpp
index b420c95..98acd1f 100644
--- a/media/libmedia/SoundPool.cpp
+++ b/media/libmedia/SoundPool.cpp
@@ -199,7 +199,7 @@ SoundChannel* SoundPool::findNextChannel(int channelID)
     return NULL;
 }
 
-int SoundPool::load(const char* path, int priority)
+int SoundPool::load(const char* path, int priority __unused)
 {
     ALOGV("load: path=%s, priority=%d", path, priority);
     Mutex::Autolock lock(&mLock);
@@ -209,7 +209,7 @@ int SoundPool::load(const char* path, int priority)
     return sample->sampleID();
 }
 
-int SoundPool::load(int fd, int64_t offset, int64_t length, int priority)
+int SoundPool::load(int fd, int64_t offset, int64_t length, int priority __unused)
 {
     ALOGV("load: fd=%d, offset=%lld, length=%lld, priority=%d",
             fd, offset, length, priority);
diff --git a/media/libmedia/mediametadataretriever.cpp b/media/libmedia/mediametadataretriever.cpp
index 110b94c..bad2494 100644
--- a/media/libmedia/mediametadataretriever.cpp
+++ b/media/libmedia/mediametadataretriever.cpp
@@ -157,7 +157,7 @@ sp<IMemory> MediaMetadataRetriever::extractAlbumArt()
     return mRetriever->extractAlbumArt();
 }
 
-void MediaMetadataRetriever::DeathNotifier::binderDied(const wp<IBinder>& who) {
+void MediaMetadataRetriever::DeathNotifier::binderDied(const wp<IBinder>& who __unused) {
     Mutex::Autolock lock(MediaMetadataRetriever::sServiceLock);
     MediaMetadataRetriever::sService.clear();
     ALOGW("MediaMetadataRetriever server died!");
diff --git a/media/libmedia/mediaplayer.cpp b/media/libmedia/mediaplayer.cpp
index 0f6d897..7a6f31d 100644
--- a/media/libmedia/mediaplayer.cpp
+++ b/media/libmedia/mediaplayer.cpp
@@ -654,7 +654,7 @@ status_t MediaPlayer::setRetransmitEndpoint(const char* addrString,
         return BAD_VALUE;
     }
 
-    memset(&mRetransmitEndpoint, 0, sizeof(&mRetransmitEndpoint));
+    memset(&mRetransmitEndpoint, 0, sizeof(mRetransmitEndpoint));
     mRetransmitEndpoint.sin_family = AF_INET;
     mRetransmitEndpoint.sin_addr   = saddr;
     mRetransmitEndpoint.sin_port   = htons(port);
diff --git a/media/libnbaio/MonoPipe.cpp b/media/libnbaio/MonoPipe.cpp
index de0ad28..3c61b60 100644
--- a/media/libnbaio/MonoPipe.cpp
+++ b/media/libnbaio/MonoPipe.cpp
@@ -183,7 +183,7 @@ ssize_t MonoPipe::write(const void *buffer, size_t count)
             }
         }
         if (ns > 0) {
-            const struct timespec req = {0, ns};
+            const struct timespec req = {0, static_cast<long>(ns)};
             nanosleep(&req, NULL);
         }
         // record the time that this write() completed
diff --git a/media/libstagefright/MPEG4Extractor.cpp b/media/libstagefright/MPEG4Extractor.cpp
index 9b36b6a..9c89e82 100644
--- a/media/libstagefright/MPEG4Extractor.cpp
+++ b/media/libstagefright/MPEG4Extractor.cpp
@@ -2372,6 +2372,58 @@ status_t MPEG4Extractor::verifyTrack(Track *track) {
     return OK;
 }
 
+typedef enum {  // audio object type (AOT) codes; only the values the ESDS parser below compares against are active
+    //AOT_NONE             = -1,
+    //AOT_NULL_OBJECT      = 0,
+    //AOT_AAC_MAIN         = 1, /**< Main profile                              */
+    AOT_AAC_LC           = 2,   /**< Low Complexity object                     */
+    //AOT_AAC_SSR          = 3,
+    //AOT_AAC_LTP          = 4,
+    AOT_SBR              = 5,
+    //AOT_AAC_SCAL         = 6,
+    //AOT_TWIN_VQ          = 7,
+    //AOT_CELP             = 8,
+    //AOT_HVXC             = 9,
+    //AOT_RSVD_10          = 10, /**< (reserved)                                */
+    //AOT_RSVD_11          = 11, /**< (reserved)                                */
+    //AOT_TTSI             = 12, /**< TTSI Object                               */
+    //AOT_MAIN_SYNTH       = 13, /**< Main Synthetic object                     */
+    //AOT_WAV_TAB_SYNTH    = 14, /**< Wavetable Synthesis object                */
+    //AOT_GEN_MIDI         = 15, /**< General MIDI object                       */
+    //AOT_ALG_SYNTH_AUD_FX = 16, /**< Algorithmic Synthesis and Audio FX object */
+    AOT_ER_AAC_LC        = 17,   /**< Error Resilient(ER) AAC Low Complexity    */
+    //AOT_RSVD_18          = 18, /**< (reserved)                                */
+    //AOT_ER_AAC_LTP       = 19, /**< Error Resilient(ER) AAC LTP object        */
+    AOT_ER_AAC_SCAL      = 20,   /**< Error Resilient(ER) AAC Scalable object   */
+    //AOT_ER_TWIN_VQ       = 21, /**< Error Resilient(ER) TwinVQ object         */
+    AOT_ER_BSAC          = 22,   /**< Error Resilient(ER) BSAC object           */
+    AOT_ER_AAC_LD        = 23,   /**< Error Resilient(ER) AAC LowDelay object   */
+    //AOT_ER_CELP          = 24, /**< Error Resilient(ER) CELP object           */
+    //AOT_ER_HVXC          = 25, /**< Error Resilient(ER) HVXC object           */
+    //AOT_ER_HILN          = 26, /**< Error Resilient(ER) HILN object           */
+    //AOT_ER_PARA          = 27, /**< Error Resilient(ER) Parametric object     */
+    //AOT_RSVD_28          = 28, /**< might become SSC                          */
+    AOT_PS               = 29,   /**< PS, Parametric Stereo (includes SBR)      */
+    //AOT_MPEGS            = 30, /**< MPEG Surround                             */
+
+    AOT_ESCAPE           = 31,   /**< Signal AOT uses more than 5 bits          */
+
+    //AOT_MP3ONMP4_L1      = 32, /**< MPEG-Layer1 in mp4                        */
+    //AOT_MP3ONMP4_L2      = 33, /**< MPEG-Layer2 in mp4                        */
+    //AOT_MP3ONMP4_L3      = 34, /**< MPEG-Layer3 in mp4                        */
+    //AOT_RSVD_35          = 35, /**< might become DST                          */
+    //AOT_RSVD_36          = 36, /**< might become ALS                          */
+    //AOT_AAC_SLS          = 37, /**< AAC + SLS                                 */
+    //AOT_SLS              = 38, /**< SLS                                       */
+    //AOT_ER_AAC_ELD       = 39, /**< AAC Enhanced Low Delay                    */
+
+    //AOT_USAC             = 42, /**< USAC                                      */
+    //AOT_SAOC             = 43, /**< SAOC                                      */
+    //AOT_LD_MPEGS         = 44, /**< Low Delay MPEG Surround                   */
+
+    //AOT_RSVD50           = 50,  /**< Interim AOT for Rsvd50                   */
+} AUDIO_OBJECT_TYPE;  // inactive entries are kept commented out so the numbering stays visible
+
 status_t MPEG4Extractor::updateAudioTrackInfoFromESDS_MPEG4Audio(
         const void *esds_data, size_t esds_size) {
     ESDS esds(esds_data, esds_size);
@@ -2419,6 +2471,11 @@ status_t MPEG4Extractor::updateAudioTrackInfoFromESDS_MPEG4Audio(
         return ERROR_MALFORMED;
     }
 
+    static uint32_t kSamplingRate[] = {
+        96000, 88200, 64000, 48000, 44100, 32000, 24000, 22050,
+        16000, 12000, 11025, 8000, 7350
+    };
+
     ABitReader br(csd, csd_size);
     uint32_t objectType = br.getBits(5);
 
@@ -2426,6 +2483,9 @@ status_t MPEG4Extractor::updateAudioTrackInfoFromESDS_MPEG4Audio(
         objectType = 32 + br.getBits(6);
     }
 
+    //keep AOT type
+    mLastTrack->meta->setInt32(kKeyAACAOT, objectType);
+
     uint32_t freqIndex = br.getBits(4);
 
     int32_t sampleRate = 0;
@@ -2438,28 +2498,134 @@ status_t MPEG4Extractor::updateAudioTrackInfoFromESDS_MPEG4Audio(
         numChannels = br.getBits(4);
     } else {
         numChannels = br.getBits(4);
-        if (objectType == 5) {
-            // SBR specific config per 14496-3 table 1.13
-            freqIndex = br.getBits(4);
-            if (freqIndex == 15) {
-                if (csd_size < 8) {
-                    return ERROR_MALFORMED;
-                }
-                sampleRate = br.getBits(24);
-            }
+
+        if (freqIndex == 13 || freqIndex == 14) {
+            return ERROR_MALFORMED;
         }
 
-        if (sampleRate == 0) {
-            static uint32_t kSamplingRate[] = {
-                96000, 88200, 64000, 48000, 44100, 32000, 24000, 22050,
-                16000, 12000, 11025, 8000, 7350
-            };
+        sampleRate = kSamplingRate[freqIndex];
+    }
 
-            if (freqIndex == 13 || freqIndex == 14) {
+    if (objectType == AOT_SBR || objectType == AOT_PS) {//SBR specific config per 14496-3 table 1.13
+        uint32_t extFreqIndex = br.getBits(4);
+        int32_t extSampleRate;
+        if (extFreqIndex == 15) {
+            if (csd_size < 8) {
+                return ERROR_MALFORMED;
+            }
+            extSampleRate = br.getBits(24);
+        } else {
+            if (extFreqIndex == 13 || extFreqIndex == 14) {
                 return ERROR_MALFORMED;
             }
+            extSampleRate = kSamplingRate[extFreqIndex];
+        }
+        //TODO: save the extension sampling rate value in meta data =>
+        //      mLastTrack->meta->setInt32(kKeyExtSampleRate, extSampleRate);
+    }
+
+    switch (numChannels) {
+        // values defined in 14496-3_2009 amendment-4 Table 1.19 - Channel Configuration
+        case 0:
+        case 1:// FC
+        case 2:// FL FR
+        case 3:// FC, FL FR
+        case 4:// FC, FL FR, RC
+        case 5:// FC, FL FR, SL SR
+        case 6:// FC, FL FR, SL SR, LFE
+            //numChannels already contains the right value
+            break;
+        case 11:// FC, FL FR, SL SR, RC, LFE
+            numChannels = 7;
+            break;
+        case 7: // FC, FCL FCR, FL FR, SL SR, LFE
+        case 12:// FC, FL  FR,  SL SR, RL RR, LFE
+        case 14:// FC, FL  FR,  SL SR, LFE, FHL FHR
+            numChannels = 8;
+            break;
+        default:
+            return ERROR_UNSUPPORTED;
+    }
+
+    {
+        if (objectType == AOT_SBR || objectType == AOT_PS) {
+            // The extension sampling frequency index was already consumed by the SBR/PS block above; do not read it twice.
+            objectType = br.getBits(5);  // object type that follows the extension fields
+
+            if (objectType == AOT_ESCAPE) {
+                objectType = 32 + br.getBits(6);  // escape code: the real type is 32 plus the next 6 bits
+            }
+        }
+        if (objectType == AOT_AAC_LC || objectType == AOT_ER_AAC_LC ||
+                objectType == AOT_ER_AAC_LD || objectType == AOT_ER_AAC_SCAL ||
+                objectType == AOT_ER_BSAC) {
+            const int32_t frameLengthFlag = br.getBits(1);
+
+            const int32_t dependsOnCoreCoder = br.getBits(1);
+
+            if (dependsOnCoreCoder ) {
+                const int32_t coreCoderDelay = br.getBits(14);
+            }
+
+            const int32_t extensionFlag = br.getBits(1);
+
+            if (numChannels == 0 ) {
+                int32_t channelsEffectiveNum = 0;
+                int32_t channelsNum = 0;
+                const int32_t ElementInstanceTag = br.getBits(4);
+                const int32_t Profile = br.getBits(2);
+                const int32_t SamplingFrequencyIndex = br.getBits(4);
+                const int32_t NumFrontChannelElements = br.getBits(4);
+                const int32_t NumSideChannelElements = br.getBits(4);
+                const int32_t NumBackChannelElements = br.getBits(4);
+                const int32_t NumLfeChannelElements = br.getBits(2);
+                const int32_t NumAssocDataElements = br.getBits(3);
+                const int32_t NumValidCcElements = br.getBits(4);
+
+                const int32_t MonoMixdownPresent = br.getBits(1);
+                if (MonoMixdownPresent != 0) {
+                    const int32_t MonoMixdownElementNumber = br.getBits(4);
+                }
+
+                const int32_t StereoMixdownPresent = br.getBits(1);
+                if (StereoMixdownPresent != 0) {
+                    const int32_t StereoMixdownElementNumber = br.getBits(4);
+                }
+
+                const int32_t MatrixMixdownIndexPresent = br.getBits(1);
+                if (MatrixMixdownIndexPresent != 0) {
+                    const int32_t MatrixMixdownIndex = br.getBits(2);
+                    const int32_t PseudoSurroundEnable = br.getBits(1);
+                }
 
-            sampleRate = kSamplingRate[freqIndex];
+                int i;
+                for (i=0; i < NumFrontChannelElements; i++) {
+                    const int32_t FrontElementIsCpe = br.getBits(1);
+                    const int32_t FrontElementTagSelect = br.getBits(4);
+                    channelsNum += FrontElementIsCpe ? 2 : 1;
+                }
+
+                for (i=0; i < NumSideChannelElements; i++) {
+                    const int32_t SideElementIsCpe = br.getBits(1);
+                    const int32_t SideElementTagSelect = br.getBits(4);
+                    channelsNum += SideElementIsCpe ? 2 : 1;
+                }
+
+                for (i=0; i < NumBackChannelElements; i++) {
+                    const int32_t BackElementIsCpe = br.getBits(1);
+                    const int32_t BackElementTagSelect = br.getBits(4);
+                    channelsNum += BackElementIsCpe ? 2 : 1;
+                }
+                channelsEffectiveNum = channelsNum;
+
+                for (i=0; i < NumLfeChannelElements; i++) {
+                    const int32_t LfeElementTagSelect = br.getBits(4);
+                    channelsNum += 1;
+                }
+                ALOGV("mpeg4 audio channelsNum = %d", channelsNum);
+                ALOGV("mpeg4 audio channelsEffectiveNum = %d", channelsEffectiveNum);
+                numChannels = channelsNum;
+            }
         }
     }
 
diff --git a/media/libstagefright/Utils.cpp b/media/libstagefright/Utils.cpp
index 9041c21..216a329 100644
--- a/media/libstagefright/Utils.cpp
+++ b/media/libstagefright/Utils.cpp
@@ -562,6 +562,17 @@ bool canOffloadStream(const sp<MetaData>& meta, bool hasVideo,
         return false;
     }
 
+    // check whether it is ELD/LD content -> no offloading
+    // FIXME: this should depend on audio DSP capabilities. mapMimeToAudioFormat() should use the
+    // metadata to refine the AAC format and the audio HAL should only list supported profiles.
+    int32_t aacaot = -1;
+    if (meta->findInt32(kKeyAACAOT, &aacaot)) {
+        if (aacaot == 23 || aacaot == 39 ) {
+            ALOGV("track of type '%s' is ELD/LD content", mime);
+            return false;
+        }
+    }
+
     int32_t srate = -1;
     if (!meta->findInt32(kKeySampleRate, &srate)) {
         ALOGV("track of type '%s' does not publish sample rate", mime);
diff --git a/media/libstagefright/codecs/aacdec/SoftAAC2.cpp b/media/libstagefright/codecs/aacdec/SoftAAC2.cpp
index f842e27..2f5eff4 100644
--- a/media/libstagefright/codecs/aacdec/SoftAAC2.cpp
+++ b/media/libstagefright/codecs/aacdec/SoftAAC2.cpp
@@ -30,7 +30,7 @@
 #define DRC_DEFAULT_MOBILE_REF_LEVEL 64  /* 64*-0.25dB = -16 dB below full scale for mobile conf */
 #define DRC_DEFAULT_MOBILE_DRC_CUT   127 /* maximum compression of dynamic range for mobile conf */
 #define DRC_DEFAULT_MOBILE_DRC_BOOST 127 /* maximum compression of dynamic range for mobile conf */
-#define MAX_CHANNEL_COUNT            6  /* maximum number of audio channels that can be decoded */
+#define MAX_CHANNEL_COUNT            8  /* maximum number of audio channels that can be decoded */
 // names of properties that can be used to override the default DRC settings
 #define PROP_DRC_OVERRIDE_REF_LEVEL  "aac_drc_reference_level"
 #define PROP_DRC_OVERRIDE_CUT        "aac_drc_cut"
@@ -296,8 +296,11 @@ void SoftAAC2::maybeConfigureDownmix() const {
         if (!(property_get("media.aac_51_output_enabled", value, NULL) &&
                 (!strcmp(value, "1") || !strcasecmp(value, "true")))) {
             ALOGI("Downmixing multichannel AAC to stereo");
-            aacDecoder_SetParam(mAACDecoder, AAC_PCM_OUTPUT_CHANNELS, 2);
+            aacDecoder_SetParam(mAACDecoder, AAC_PCM_MAX_OUTPUT_CHANNELS, 2);
             mStreamInfo->numChannels = 2;
+            // By default, the decoder creates a 5.1 channel downmix signal
+            // for seven and eight channel input streams. To enable 6.1 and 7.1 channel output
+            // use aacDecoder_SetParam(mAACDecoder, AAC_PCM_MAX_OUTPUT_CHANNELS, -1)
         }
     }
 }
diff --git a/media/libstagefright/httplive/M3UParser.cpp b/media/libstagefright/httplive/M3UParser.cpp
index 243888c..dd248cb 100644
--- a/media/libstagefright/httplive/M3UParser.cpp
+++ b/media/libstagefright/httplive/M3UParser.cpp
@@ -608,7 +608,7 @@ status_t M3UParser::parseMetaDataDuration(
     if (meta->get() == NULL) {
         *meta = new AMessage;
     }
-    (*meta)->setInt64(key, (int64_t)x * 1E6);
+    (*meta)->setInt64(key, (int64_t)(x * 1E6));
 
     return OK;
 }
diff --git a/media/libstagefright/httplive/PlaylistFetcher.cpp b/media/libstagefright/httplive/PlaylistFetcher.cpp
index 1754bf2..f095987 100644
--- a/media/libstagefright/httplive/PlaylistFetcher.cpp
+++ b/media/libstagefright/httplive/PlaylistFetcher.cpp
@@ -861,12 +861,13 @@ status_t PlaylistFetcher::extractAndQueueAccessUnits(
                     && source->dequeueAccessUnit(&accessUnit) == OK) {
                 // Note that we do NOT dequeue any discontinuities.
 
+                // for simplicity, store a reference to the format in each unit
+                sp<MetaData> format = source->getFormat();
+                if (format != NULL) {
+                    accessUnit->meta()->setObject("format", format);
+                }
                 packetSource->queueAccessUnit(accessUnit);
             }
-
-            if (packetSource->getFormat() == NULL) {
-                packetSource->setFormat(source->getFormat());
-            }
         }
 
         return OK;
diff --git a/media/libstagefright/mpeg2ts/AnotherPacketSource.cpp b/media/libstagefright/mpeg2ts/AnotherPacketSource.cpp
index 3153c8b..52fb2a5 100644
--- a/media/libstagefright/mpeg2ts/AnotherPacketSource.cpp
+++ b/media/libstagefright/mpeg2ts/AnotherPacketSource.cpp
@@ -70,7 +70,27 @@ status_t AnotherPacketSource::stop() {
 }
 
 sp<MetaData> AnotherPacketSource::getFormat() {
-    return mFormat;
+    Mutex::Autolock autoLock(mLock);  // hold mLock while reading mFormat and walking mBuffers
+    if (mFormat != NULL) {
+        return mFormat;  // a format has already been recorded; prefer it
+    }
+
+    List<sp<ABuffer> >::iterator it = mBuffers.begin();  // otherwise look through the queued buffers in order
+    while (it != mBuffers.end()) {
+        sp<ABuffer> buffer = *it;
+        int32_t discontinuity;
+        if (buffer->meta()->findInt32("discontinuity", &discontinuity)) {
+            break;  // stop at the first buffer carrying a "discontinuity" entry; nothing past it is examined
+        }
+
+        sp<RefBase> object;
+        if (buffer->meta()->findObject("format", &object)) {
+            return static_cast<MetaData*>(object.get());  // first "format" object found in a buffer's metadata
+        }
+
+        ++it;
+    }
+    return NULL;  // no recorded format, and none found before a discontinuity or the end of the queue
 }
 
 status_t AnotherPacketSource::dequeueAccessUnit(sp<ABuffer> *buffer) {
@@ -94,6 +114,11 @@ status_t AnotherPacketSource::dequeueAccessUnit(sp<ABuffer> *buffer) {
             return INFO_DISCONTINUITY;
         }
 
+        sp<RefBase> object;
+        if ((*buffer)->meta()->findObject("format", &object)) {
+            mFormat = static_cast<MetaData*>(object.get());
+        }
+
         return OK;
     }
 
@@ -120,17 +145,22 @@ status_t AnotherPacketSource::read(
             }
 
             return INFO_DISCONTINUITY;
-        } else {
-            int64_t timeUs;
-            CHECK(buffer->meta()->findInt64("timeUs", &timeUs));
+        }
 
-            MediaBuffer *mediaBuffer = new MediaBuffer(buffer);
+        sp<RefBase> object;
+        if (buffer->meta()->findObject("format", &object)) {
+            mFormat = static_cast<MetaData*>(object.get());
+        }
 
-            mediaBuffer->meta_data()->setInt64(kKeyTime, timeUs);
+        int64_t timeUs;
+        CHECK(buffer->meta()->findInt64("timeUs", &timeUs));
 
-            *out = mediaBuffer;
-            return OK;
-        }
+        MediaBuffer *mediaBuffer = new MediaBuffer(buffer);
+
+        mediaBuffer->meta_data()->setInt64(kKeyTime, timeUs);
+
+        *out = mediaBuffer;
+        return OK;
     }
 
     return mEOSResult;
diff --git a/services/audioflinger/Android.mk b/services/audioflinger/Android.mk
index 3ec9285..4524d3c 100644
--- a/services/audioflinger/Android.mk
+++ b/services/audioflinger/Android.mk
@@ -23,7 +23,8 @@ LOCAL_SRC_FILES:=               \
     AudioPolicyService.cpp      \
     ServiceUtilities.cpp        \
     AudioResamplerCubic.cpp.arm \
-    AudioResamplerSinc.cpp.arm
+    AudioResamplerSinc.cpp.arm  \
+    AudioResamplerDyn.cpp.arm
 
 LOCAL_SRC_FILES += StateQueue.cpp
 
@@ -74,10 +75,11 @@ include $(BUILD_SHARED_LIBRARY)
 include $(CLEAR_VARS)
 
 LOCAL_SRC_FILES:=               \
-	test-resample.cpp 			\
+    test-resample.cpp           \
     AudioResampler.cpp.arm      \
-	AudioResamplerCubic.cpp.arm \
-    AudioResamplerSinc.cpp.arm
+    AudioResamplerCubic.cpp.arm \
+    AudioResamplerSinc.cpp.arm  \
+    AudioResamplerDyn.cpp.arm
 
 LOCAL_C_INCLUDES := \
     $(call include-path-for, audio-utils)
diff --git a/services/audioflinger/AudioPolicyService.cpp b/services/audioflinger/AudioPolicyService.cpp
index c5ad2c0..c8f0730 100644
--- a/services/audioflinger/AudioPolicyService.cpp
+++ b/services/audioflinger/AudioPolicyService.cpp
@@ -1441,6 +1441,14 @@ status_t AudioPolicyService::loadPreProcessorConfig(const char *path)
     loadEffects(root, effects);
     loadInputSources(root, effects);
 
+    // Free the effect descriptors now that loading is finished. 'effects' is a local
+    // container, so its elements would otherwise be leaked (valgrind reports this).
+    // This normally runs only at mediaserver init, but free them in case mediaserver restarts.
+    size_t i;
+    for (i = 0; i < effects.size(); i++) {
+      delete effects[i];
+    }
+
     config_free(root);
     free(root);
     free(data);
diff --git a/services/audioflinger/AudioResampler.cpp b/services/audioflinger/AudioResampler.cpp
index 323f1a4..3b5a8c1 100644
--- a/services/audioflinger/AudioResampler.cpp
+++ b/services/audioflinger/AudioResampler.cpp
@@ -25,6 +25,7 @@
 #include "AudioResampler.h"
 #include "AudioResamplerSinc.h"
 #include "AudioResamplerCubic.h"
+#include "AudioResamplerDyn.h"
 
 #ifdef __arm__
 #include <machine/cpu-features.h>
@@ -85,6 +86,9 @@ bool AudioResampler::qualityIsSupported(src_quality quality)
     case MED_QUALITY:
     case HIGH_QUALITY:
     case VERY_HIGH_QUALITY:
+    case DYN_LOW_QUALITY:
+    case DYN_MED_QUALITY:
+    case DYN_HIGH_QUALITY:
         return true;
     default:
         return false;
@@ -105,7 +109,7 @@ void AudioResampler::init_routine()
         if (*endptr == '\0') {
             defaultQuality = (src_quality) l;
             ALOGD("forcing AudioResampler quality to %d", defaultQuality);
-            if (defaultQuality < DEFAULT_QUALITY || defaultQuality > VERY_HIGH_QUALITY) {
+            if (defaultQuality < DEFAULT_QUALITY || defaultQuality > DYN_HIGH_QUALITY) {
                 defaultQuality = DEFAULT_QUALITY;
             }
         }
@@ -125,6 +129,12 @@ uint32_t AudioResampler::qualityMHz(src_quality quality)
         return 20;
     case VERY_HIGH_QUALITY:
         return 34;
+    case DYN_LOW_QUALITY:
+        return 4;
+    case DYN_MED_QUALITY:
+        return 6;
+    case DYN_HIGH_QUALITY:
+        return 12;
     }
 }
 
@@ -175,6 +185,15 @@ AudioResampler* AudioResampler::create(int bitDepth, int inChannelCount,
         case VERY_HIGH_QUALITY:
             quality = HIGH_QUALITY;
             break;
+        case DYN_LOW_QUALITY:
+            atFinalQuality = true;
+            break;
+        case DYN_MED_QUALITY:
+            quality = DYN_LOW_QUALITY;
+            break;
+        case DYN_HIGH_QUALITY:
+            quality = DYN_MED_QUALITY;
+            break;
         }
     }
     pthread_mutex_unlock(&mutex);
@@ -200,6 +219,12 @@ AudioResampler* AudioResampler::create(int bitDepth, int inChannelCount,
         ALOGV("Create VERY_HIGH_QUALITY sinc Resampler = %d", quality);
         resampler = new AudioResamplerSinc(bitDepth, inChannelCount, sampleRate, quality);
         break;
+    case DYN_LOW_QUALITY:
+    case DYN_MED_QUALITY:
+    case DYN_HIGH_QUALITY:
+        ALOGV("Create dynamic Resampler = %d", quality);
+        resampler = new AudioResamplerDyn(bitDepth, inChannelCount, sampleRate, quality);
+        break;
     }
 
     // initialize resampler
diff --git a/services/audioflinger/AudioResampler.h b/services/audioflinger/AudioResampler.h
index 33e64ce..c341325 100644
--- a/services/audioflinger/AudioResampler.h
+++ b/services/audioflinger/AudioResampler.h
@@ -41,6 +41,9 @@ public:
         MED_QUALITY=2,
         HIGH_QUALITY=3,
         VERY_HIGH_QUALITY=4,
+        DYN_LOW_QUALITY=5,
+        DYN_MED_QUALITY=6,
+        DYN_HIGH_QUALITY=7,
     };
 
     static AudioResampler* create(int bitDepth, int inChannelCount,
diff --git a/services/audioflinger/AudioResamplerDyn.cpp b/services/audioflinger/AudioResamplerDyn.cpp
new file mode 100644
index 0000000..984548d
--- /dev/null
+++ b/services/audioflinger/AudioResamplerDyn.cpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define LOG_TAG "AudioResamplerDyn"
+//#define LOG_NDEBUG 0
+
+#include <malloc.h>
+#include <string.h>
+#include <stdlib.h>
+#include <dlfcn.h>
+#include <math.h>
+
+#include <cutils/compiler.h>
+#include <cutils/properties.h>
+#include <utils/Log.h>
+
+#include "AudioResamplerFirOps.h" // USE_NEON and USE_INLINE_ASSEMBLY defined here
+#include "AudioResamplerFirProcess.h"
+#include "AudioResamplerFirProcessNeon.h"
+#include "AudioResamplerFirGen.h" // requires math.h
+#include "AudioResamplerDyn.h"
+
+//#define DEBUG_RESAMPLER
+
+namespace android {
+
+// generate a unique resample type compile-time constant (constexpr); COEFTYPE should be 0 or 1
+#define RESAMPLETYPE(CHANNELS, LOCKED, STRIDE, COEFTYPE) \
+    ((((CHANNELS)-1)&1) | !!(LOCKED)<<1 | (COEFTYPE)<<2 /* bit 0, bit 1, bit 2 */ \
+    | ((STRIDE)==8 ? 1 : (STRIDE)==16 ? 2 : 0)<<3) /* bits 3-4; other strides encode as 0 */
+
+/*
+ * InBuffer is a type agnostic input buffer.
+ *
+ * Layout of the state buffer for halfNumCoefs=8.
+ *
+ * [rrrrrrppppppppnnnnnnnnrrrrrrrrrrrrrrrrrrr.... rrrrrrr]
+ *  S            I                                R
+ *
+ * S = mState
+ * I = mImpulse
+ * R = mRingFull
+ * p = past samples, convoluted with the (p)ositive side of sinc()
+ * n = future samples, convoluted with the (n)egative side of sinc()
+ * r = extra space for implementing the ring buffer
+ */
+
+template<typename TI>
+AudioResamplerDyn::InBuffer<TI>::InBuffer()
+    : mState(NULL), mImpulse(NULL), mRingFull(NULL), mStateSize(0) { // no storage until resize()
+}
+
+template<typename TI>
+AudioResamplerDyn::InBuffer<TI>::~InBuffer() {
+    init(); // init() frees mState and clears the pointers
+}
+
+template<typename TI>
+void AudioResamplerDyn::InBuffer<TI>::init() {
+    // release the state storage (free(NULL) is harmless) ...
+    free(mState);
+    // ... and put every member back to its freshly-constructed value
+    mStateSize = 0;
+    mState = mImpulse = mRingFull = NULL;
+}
+
+// resizes the state buffer to accommodate the appropriate filter length, keeping recent input
+template<typename TI>
+void AudioResamplerDyn::InBuffer<TI>::resize(int CHANNELS, int halfNumCoefs) {
+    // calculate desired state size (in units of TI)
+    const int halfSpan = halfNumCoefs * CHANNELS;
+    const int stateSize = halfSpan * 2 * kStateSizeMultipleOfFilterLength;
+
+    // check if buffer needs resizing
+    if (mState
+            && static_cast<size_t>(stateSize) == mStateSize
+            && mRingFull-mState == stateSize-halfSpan) {
+        return;
+    }
+
+    // create new zeroed buffer; cast to TI* (not int16_t*) so any sample type instantiates
+    TI* state = static_cast<TI*>(memalign(32, stateSize*sizeof(*state)));
+    LOG_ALWAYS_FATAL_IF(state == NULL, "cannot allocate %d sample state buffer", stateSize);
+    memset(state, 0, stateSize*sizeof(*state));
+
+    // attempt to preserve state: old samples around mImpulse are re-centered on state + halfSpan
+    if (mState) {
+        TI* srcLo = mImpulse - halfSpan;
+        TI* srcHi = mImpulse + halfSpan;
+        TI* dst = state;
+        if (srcLo < mState) {
+            dst += mState-srcLo;
+            srcLo = mState;
+        }
+        if (srcHi > mState + mStateSize) {
+            srcHi = mState + mStateSize;
+        }
+        memcpy(dst, srcLo, (srcHi - srcLo) * sizeof(*srcLo));
+        free(mState);
+    }
+
+    // set class member vars
+    mState = state;
+    mStateSize = stateSize;
+    mImpulse = mState + halfSpan; // actually one sample greater than needed
+    mRingFull = mState + mStateSize - halfSpan;
+}
+
+// copy in the input data into the head (impulse+halfNumCoefs) of the buffer.
+template<typename TI>
+template<int CHANNELS>
+void AudioResamplerDyn::InBuffer<TI>::readAgain(TI*& impulse, const int halfNumCoefs,
+        const TI* const in, const size_t inputIndex) {
+    TI* head = impulse + halfNumCoefs*CHANNELS; // TI, not int16_t: stay sample-type agnostic
+    for (size_t i=0 ; i<CHANNELS ; i++) { // one interleaved frame of 'in' at inputIndex
+        head[i] = in[inputIndex*CHANNELS + i];
+    }
+}
+
+// advance the impulse pointer one frame (rewinding a full ring), then load the new head frame
+template<typename TI>
+template<int CHANNELS>
+void AudioResamplerDyn::InBuffer<TI>::readAdvance(TI*& impulse, const int halfNumCoefs,
+        const TI* const in, const size_t inputIndex) {
+    const int halfSpan = halfNumCoefs*CHANNELS;
+    impulse += CHANNELS;
+    if (CC_UNLIKELY(impulse >= mRingFull)) {
+        const size_t rewind = mRingFull - mState - halfSpan;
+        memcpy(mState, mState+rewind, halfSpan*2*sizeof(TI));
+        impulse -= rewind;
+    }
+    readAgain<CHANNELS>(impulse, halfNumCoefs, in, inputIndex);
+}
+
+void AudioResamplerDyn::Constants::set(
+        int L, int halfNumCoefs, int inSampleRate, int outSampleRate)
+{
+    const bool ratioAtLeast2 = inSampleRate/outSampleRate >= 2;
+    int lscale = !ratioAtLeast2 ? L - 1 :
+            static_cast<int>(static_cast<uint64_t>(L)*inSampleRate/outSampleRate);
+    int bits = 0; // becomes the bit length of lscale
+    for (; lscale != 0; lscale >>= 1) { ++bits; }
+    mL = L;
+    mHalfNumCoefs = halfNumCoefs;
+    mShift = kNumPhaseBits - bits; // right shift to get polyphase index
+}
+
+AudioResamplerDyn::AudioResamplerDyn(int bitDepth,
+        int inChannelCount, int32_t sampleRate, src_quality quality)
+    : AudioResampler(bitDepth, inChannelCount, sampleRate, quality),
+    mResampleType(0), mFilterSampleRate(0), mFilterQuality(DEFAULT_QUALITY), mCoefBuffer(NULL)
+{
+    mVolumeSimd[0] = 0;
+    mVolumeSimd[1] = 0;
+    mConstants.set(128, 8, mSampleRate, mSampleRate); // TODO: set better
+}
+
+AudioResamplerDyn::~AudioResamplerDyn() {
+    free(mCoefBuffer); // releases the coefficient table, if one was created (free(NULL) is a no-op)
+}
+
+void AudioResamplerDyn::init() {
+    mInBuffer.init();      // release the buffered input state
+    mFilterSampleRate = 0; // always trigger new filter generation
+}
+
+void AudioResamplerDyn::setVolume(int16_t left, int16_t right) {
+    AudioResampler::setVolume(left, right);
+    mVolumeSimd[0] = static_cast<int32_t>(left) * 65536;  // == left<<16, but well defined
+    mVolumeSimd[1] = static_cast<int32_t>(right) * 65536; // even when the volume is negative
+}
+
+template <typename T> T max(T a, T b) { if (a > b) return a; return b; }
+
+template <typename T> T absdiff(T a, T b) { if (a > b) return a - b; return b - a; }
+
+// allocates and generates (firKaiserGen) a filter of coefficient type T and installs it in c
+template<typename T>
+void AudioResamplerDyn::createKaiserFir(Constants &c, double stopBandAtten,
+        int inSampleRate, int outSampleRate, double tbwCheat) {
+    T* buf = reinterpret_cast<T*>(memalign(32, (c.mL+1)*c.mHalfNumCoefs*sizeof(T)));
+    LOG_ALWAYS_FATAL_IF(buf == NULL, "cannot allocate filter coefficient buffer");
+    static const double atten = 0.9998;   // to avoid ripple overflow
+    double fcr;
+    double tbw = firKaiserTbw(c.mHalfNumCoefs, stopBandAtten);
+
+    if (inSampleRate < outSampleRate) { // upsample
+        fcr = max(0.5*tbwCheat - tbw/2, tbw/2);
+    } else { // downsample
+        fcr = max(0.5*tbwCheat*outSampleRate/inSampleRate - tbw/2, tbw/2);
+    }
+    // create and set filter
+    firKaiserGen(buf, c.mL, c.mHalfNumCoefs, stopBandAtten, fcr, atten);
+    c.setBuf(buf);
+    free(mCoefBuffer); // release the previous filter, if any (free(NULL) is a no-op)
+    mCoefBuffer = buf;
+#ifdef DEBUG_RESAMPLER
+    // print basic filter stats
+    printf("L:%d  hnc:%d  stopBandAtten:%lf  fcr:%lf  atten:%lf  tbw:%lf\n",
+            c.mL, c.mHalfNumCoefs, stopBandAtten, fcr, atten, tbw);
+    // test the filter and report results
+    double fp = (fcr - tbw/2)/c.mL;
+    double fs = (fcr + tbw/2)/c.mL;
+    double passMin, passMax, passRipple;
+    double stopMax, stopRipple;
+    testFir(buf, c.mL, c.mHalfNumCoefs, fp, fs, /*passSteps*/ 1000, /*stopSteps*/ 100000,
+            passMin, passMax, passRipple, stopMax, stopRipple);
+    printf("passband(%lf, %lf): %.8lf %.8lf %.8lf\n", 0., fp, passMin, passMax, passRipple);
+    printf("stopband(%lf, %lf): %.8lf %.3lf\n", fs, 0.5, stopMax, stopRipple);
+#endif
+}
+
+// iterative Euclidean gcd; same result as the tail-recursive form gcd(m, n % m).
+static int gcd(int n, int m) {
+    for (int r; m != 0; n = m, m = r) {
+        r = n % m;
+    }
+    return n;
+}
+
+// returns true when the filter designed for filterSampleRate may be kept for newSampleRate
+static bool isClose(int32_t newSampleRate, int32_t prevSampleRate,
+        int32_t filterSampleRate, int32_t outSampleRate) {
+    // different upsampling ratios do not need a filter change.
+    const bool haveFilter = filterSampleRate != 0;
+    const bool bothUpsample = filterSampleRate < outSampleRate && newSampleRate < outSampleRate;
+    if (haveFilter && bothUpsample) {
+        return true;
+    }
+    // check design criteria again if downsampling is detected.
+    // allow up to 6% relative change increments.
+    const int relLimit = prevSampleRate>>4;
+    // allow up to 12% absolute change increments (from filter design)
+    const int absLimit = filterSampleRate>>3;
+    return absdiff(newSampleRate, prevSampleRate) < relLimit
+            && absdiff(newSampleRate, filterSampleRate) < absLimit;
+}
+
+// sets a new input sample rate: redesigns the filter if required, then updates phase state
+void AudioResamplerDyn::setSampleRate(int32_t inSampleRate) {
+    if (mInSampleRate == inSampleRate) {
+        return;
+    }
+    int32_t oldSampleRate = mInSampleRate;
+    uint32_t oldPhaseWrapLimit = mConstants.mL << mConstants.mShift;
+    bool useS32 = mFilterQuality == DYN_HIGH_QUALITY; // coef type if the filter is reused
+
+    mInSampleRate = inSampleRate;
+
+    // TODO: Add precalculated Equiripple filters
+
+    if (mFilterQuality != getQuality() ||
+            !isClose(inSampleRate, oldSampleRate, mFilterSampleRate, mSampleRate)) {
+        mFilterSampleRate = inSampleRate;
+        mFilterQuality = getQuality();
+
+        // Begin Kaiser Filter computation
+        //
+        // The quantization floor for S16 is about 96db - 10*log_10(#length) + 3dB.
+        // Keep the stop band attenuation no greater than 84-85dB for 32 length S16 filters
+        //
+        // For s32 we keep the stop band attenuation at the same as 16b resolution, about
+        // 96-98dB
+        //
+
+        double stopBandAtten;
+        double tbwCheat = 1.; // how much we "cheat" into aliasing
+        int halfLength;
+        if (mFilterQuality == DYN_HIGH_QUALITY) {
+            // 32b coefficients, 64 length
+            useS32 = true;
+            stopBandAtten = 98.;
+            halfLength = 32;
+        } else if (mFilterQuality == DYN_LOW_QUALITY) {
+            // 16b coefficients, 16-32 length
+            useS32 = false;
+            stopBandAtten = 80.;
+            if (mSampleRate >= inSampleRate * 2) {
+                halfLength = 16;
+            } else {
+                halfLength = 8;
+            }
+            if (mSampleRate >= inSampleRate) {
+                tbwCheat = 1.05;
+            } else {
+                tbwCheat = 1.03;
+            }
+        } else { // DYN_MED_QUALITY
+            // 16b coefficients, 32-64 length
+            // note: > 64 length filters with 16b coefs can have quantization noise problems
+            useS32 = false;
+            stopBandAtten = 84.;
+            if (mSampleRate >= inSampleRate * 4) {
+                halfLength = 32;
+            } else if (mSampleRate >= inSampleRate * 2) {
+                halfLength = 24;
+            } else {
+                halfLength = 16;
+            }
+            if (mSampleRate >= inSampleRate) {
+                tbwCheat = 1.03;
+            } else {
+                tbwCheat = 1.01;
+            }
+        }
+
+        // determine the number of polyphases in the filterbank.
+        // for 16b, it is desirable to have 2^(16/2) = 256 phases.
+        // https://ccrma.stanford.edu/~jos/resample/Relation_Interpolation_Error_Quantization.html
+        //
+        // We are a bit more lax on this.
+
+        int phases = mSampleRate / gcd(mSampleRate, inSampleRate);
+
+        // TODO: Once dynamic sample rate change is an option, the code below
+        // should be modified to execute only when dynamic sample rate change is enabled.
+        //
+        // as above, #phases less than 63 is too few phases for accurate linear interpolation.
+        // we increase the phases to compensate, but more phases means more memory per
+        // filter and more time to compute the filter.
+        //
+        // if we know that the filter will be used for dynamic sample rate changes,
+        // that would allow us skip this part for fixed sample rate resamplers.
+        //
+        while (phases<63) {
+            phases *= 2; // this code only needed to support dynamic rate changes
+        }
+
+        if (phases>=256) {  // too many phases, always interpolate
+            phases = 127;
+        }
+
+        // create the filter
+        mConstants.set(phases, halfLength, inSampleRate, mSampleRate);
+        if (useS32) {
+            createKaiserFir<int32_t>(mConstants, stopBandAtten,
+                    inSampleRate, mSampleRate, tbwCheat);
+        } else {
+            createKaiserFir<int16_t>(mConstants, stopBandAtten,
+                    inSampleRate, mSampleRate, tbwCheat);
+        }
+    } // End Kaiser filter
+
+    // update phase and state based on the new filter.
+    const Constants& c(mConstants);
+    mInBuffer.resize(mChannelCount, c.mHalfNumCoefs);
+    const uint32_t phaseWrapLimit = c.mL << c.mShift;
+    // try to preserve as much of the phase fraction as possible for on-the-fly changes
+    mPhaseFraction = static_cast<unsigned long long>(mPhaseFraction)
+            * phaseWrapLimit / oldPhaseWrapLimit;
+    mPhaseFraction %= phaseWrapLimit; // should not do anything, but just in case.
+    mPhaseIncrement = static_cast<uint32_t>(static_cast<double>(phaseWrapLimit)
+            * inSampleRate / mSampleRate);
+
+    // determine which resampler to use
+    // check if locked phase (works only if mPhaseIncrement has no "fractional phase bits")
+    int locked = (mPhaseIncrement << (sizeof(mPhaseIncrement)*8 - c.mShift)) == 0;
+    int stride = (c.mHalfNumCoefs&7)==0 ? 16 : (c.mHalfNumCoefs&3)==0 ? 8 : 2;
+    if (locked) {
+        mPhaseFraction = mPhaseFraction >> c.mShift << c.mShift; // remove fractional phase
+    }
+    // keep the filter-length-derived stride even without NEON: resample() only dispatches
+    // stride 16 types (its stride 8 and 2 cases are compiled out), so forcing stride = 2
+    // here would select a type that no case handles and leave the output unwritten.
+    // TODO: Remove this for testing
+    //stride = 2;
+    mResampleType = RESAMPLETYPE(mChannelCount, locked, stride, !!useS32);
+#ifdef DEBUG_RESAMPLER
+    printf("channels:%d  %s  stride:%d  %s  coef:%d  shift:%d\n",
+            mChannelCount, locked ? "locked" : "interpolated",
+            stride, useS32 ? "S32" : "S16", 2*c.mHalfNumCoefs, c.mShift);
+#endif
+}
+
+void AudioResamplerDyn::resample(int32_t* out, size_t outFrameCount,
+            AudioBufferProvider* provider)
+{
+    // dispatch on mResampleType (set by setSampleRate()) to the matching template instantiation.
+    // TODO: 24 cases - this perhaps can be reduced later, as testing might take too long
+    switch (mResampleType) {
+
+    // stride 16 (stride 2 for machines that do not support NEON)
+    case RESAMPLETYPE(1, true, 16, 0):
+        return resample<1, true, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, true, 16, 0):
+        return resample<2, true, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, false, 16, 0):
+        return resample<1, false, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, false, 16, 0):
+        return resample<2, false, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, true, 16, 1):
+        return resample<1, true, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, true, 16, 1):
+        return resample<2, true, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(1, false, 16, 1):
+        return resample<1, false, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, false, 16, 1):
+        return resample<2, false, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+#if 0
+    // TODO: Remove these?
+    // stride 8
+    case RESAMPLETYPE(1, true, 8, 0):
+        return resample<1, true, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, true, 8, 0):
+        return resample<2, true, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, false, 8, 0):
+        return resample<1, false, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, false, 8, 0):
+        return resample<2, false, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, true, 8, 1):
+        return resample<1, true, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, true, 8, 1):
+        return resample<2, true, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(1, false, 8, 1):
+        return resample<1, false, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, false, 8, 1):
+        return resample<2, false, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    // stride 2 (can handle any filter length)
+    case RESAMPLETYPE(1, true, 2, 0):
+        return resample<1, true, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, true, 2, 0):
+        return resample<2, true, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, false, 2, 0):
+        return resample<1, false, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, false, 2, 0):
+        return resample<2, false, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, true, 2, 1):
+        return resample<1, true, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, true, 2, 1):
+        return resample<2, true, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(1, false, 2, 1):
+        return resample<1, false, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, false, 2, 1):
+        return resample<2, false, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+#endif
+    default:
+        ; // error: unhandled type (includes the stride 8 and 2 types disabled above); out is not written
+    }
+}
+
+template<int CHANNELS, bool LOCKED, int STRIDE, typename TC>
+void AudioResamplerDyn::resample(int32_t* out, size_t outFrameCount,
+        const TC* const coefs,  AudioBufferProvider* provider)
+{ // template arguments are forwarded to fir<>; coefs is the filter table passed to fir<>
+    const Constants& c(mConstants);
+    int16_t* impulse = mInBuffer.getImpulse();
+    size_t inputIndex = mInputIndex;
+    uint32_t phaseFraction = mPhaseFraction;
+    const uint32_t phaseIncrement = mPhaseIncrement;
+    size_t outputIndex = 0;
+    size_t outputSampleCount = outFrameCount * 2;   // stereo output
+    size_t inFrameCount = (outFrameCount*mInSampleRate)/mSampleRate; // frames asked per fetch
+    const uint32_t phaseWrapLimit = c.mL << c.mShift; // phase span of one input frame
+
+    // NOTE: be very careful when modifying the code here. register
+    // pressure is very high and a small change might cause the compiler
+    // to generate far less efficient code.
+    // Always sanity check the result with objdump or test-resample.
+
+    // the following logic is a bit convoluted to keep the main processing loop
+    // as tight as possible with register allocation.
+    while (outputIndex < outputSampleCount) {
+        // buffer is empty, fetch a new one
+        while (mBuffer.frameCount == 0) {
+            mBuffer.frameCount = inFrameCount;
+            provider->getNextBuffer(&mBuffer,
+                    calculateOutputPTS(outputIndex / 2));
+            if (mBuffer.raw == NULL) { // no buffer obtained: save state and return
+                goto resample_exit;
+            }
+            if (phaseFraction >= phaseWrapLimit) { // read in data
+                mInBuffer.readAdvance<CHANNELS>(
+                        impulse, c.mHalfNumCoefs, mBuffer.i16, inputIndex);
+                phaseFraction -= phaseWrapLimit;
+                while (phaseFraction >= phaseWrapLimit) {
+                    inputIndex++;
+                    if (inputIndex >= mBuffer.frameCount) {
+                        inputIndex -= mBuffer.frameCount;
+                        provider->releaseBuffer(&mBuffer);
+                        break;
+                    }
+                    mInBuffer.readAdvance<CHANNELS>(
+                            impulse, c.mHalfNumCoefs, mBuffer.i16, inputIndex);
+                    phaseFraction -= phaseWrapLimit;
+                }
+            }
+        }
+        const int16_t* const in = mBuffer.i16;
+        const size_t frameCount = mBuffer.frameCount;
+        const int coefShift = c.mShift;
+        const int halfNumCoefs = c.mHalfNumCoefs;
+        const int32_t* const volumeSimd = mVolumeSimd;
+
+        // reread the last input in.
+        mInBuffer.readAgain<CHANNELS>(impulse, halfNumCoefs, in, inputIndex);
+
+        // main processing loop
+        while (CC_LIKELY(outputIndex < outputSampleCount)) {
+            // caution: fir() is inlined and may be large.
+            // output will be loaded with the appropriate values
+            //
+            // from the input samples in impulse[-halfNumCoefs+1]... impulse[halfNumCoefs]
+            // from the polyphase filter of (phaseFraction / phaseWrapLimit) in coefs.
+            //
+            fir<CHANNELS, LOCKED, STRIDE>(
+                    &out[outputIndex],
+                    phaseFraction, phaseWrapLimit,
+                    coefShift, halfNumCoefs, coefs,
+                    impulse, volumeSimd);
+            outputIndex += 2; // one stereo output frame
+
+            phaseFraction += phaseIncrement;
+            while (phaseFraction >= phaseWrapLimit) {
+                inputIndex++;
+                if (inputIndex >= frameCount) {
+                    goto done;  // need a new buffer
+                }
+                mInBuffer.readAdvance<CHANNELS>(impulse, halfNumCoefs, in, inputIndex);
+                phaseFraction -= phaseWrapLimit;
+            }
+        }
+done:
+        // often arrives here when input buffer runs out
+        if (inputIndex >= frameCount) {
+            inputIndex -= frameCount;
+            provider->releaseBuffer(&mBuffer);
+            // mBuffer.frameCount MUST be zero here.
+        }
+    }
+
+resample_exit: // store the local working copies back into the members for the next call
+    mInBuffer.setImpulse(impulse);
+    mInputIndex = inputIndex;
+    mPhaseFraction = phaseFraction;
+}
+
+// ----------------------------------------------------------------------------
+}; // namespace android
diff --git a/services/audioflinger/AudioResamplerDyn.h b/services/audioflinger/AudioResamplerDyn.h
new file mode 100644
index 0000000..df1fdbe
--- /dev/null
+++ b/services/audioflinger/AudioResamplerDyn.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_DYN_H
+#define ANDROID_AUDIO_RESAMPLER_DYN_H
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <cutils/log.h>
+
+#include "AudioResampler.h"
+
+namespace android {
+
+class AudioResamplerDyn: public AudioResampler { // resampler whose filter is created at run time
+public:
+    AudioResamplerDyn(int bitDepth, int inChannelCount, int32_t sampleRate,
+            src_quality quality);
+
+    virtual ~AudioResamplerDyn();
+
+    virtual void init(); // forces a new filter on the next rate change, clears the input buffer
+
+    virtual void setSampleRate(int32_t inSampleRate); // may regenerate the filter
+
+    virtual void setVolume(int16_t left, int16_t right); // also updates mVolumeSimd
+
+    virtual void resample(int32_t* out, size_t outFrameCount,
+            AudioBufferProvider* provider);
+
+private:
+
+    class Constants { // stores the filter constants.
+    public:
+        Constants() :
+            mL(0), mShift(0), mHalfNumCoefs(0), mFirCoefsS16(NULL)
+        {}
+        void set(int L, int halfNumCoefs,
+                int inSampleRate, int outSampleRate);
+        inline void setBuf(int16_t* buf) { // both overloads store into the same union below
+            mFirCoefsS16 = buf;
+        }
+        inline void setBuf(int32_t* buf) {
+            mFirCoefsS32 = buf;
+        }
+
+        int mL;       // interpolation phases in the filter.
+        int mShift;   // right shift to get polyphase index
+        unsigned int mHalfNumCoefs; // filter half #coefs
+        union {       // polyphase filter bank
+            const int16_t* mFirCoefsS16;
+            const int32_t* mFirCoefsS32;
+        };
+    };
+
+    // Input buffer management for a given input type TI, now (int16_t)
+    // Is agnostic of the actual type, can work with int32_t and float.
+    template<typename TI>
+    class InBuffer {
+    public:
+        InBuffer();
+        ~InBuffer();
+        void init(); // frees the storage and resets all members
+        void resize(int CHANNELS, int halfNumCoefs); // (re)allocates for the given filter size
+
+        // used for direct management of the mImpulse pointer
+        inline TI* getImpulse() {
+            return mImpulse;
+        }
+        inline void setImpulse(TI *impulse) {
+            mImpulse = impulse;
+        }
+        template<int CHANNELS>
+        inline void readAgain(TI*& impulse, const int halfNumCoefs,
+                const TI* const in, const size_t inputIndex);
+        template<int CHANNELS>
+        inline void readAdvance(TI*& impulse, const int halfNumCoefs,
+                const TI* const in, const size_t inputIndex);
+
+    private:
+        // tuning parameter guidelines: 2 <= multiple <= 8
+        static const int kStateSizeMultipleOfFilterLength = 4;
+
+        TI* mState;    // base pointer for the input buffer storage
+        TI* mImpulse;  // current location of the impulse response (centered)
+        TI* mRingFull; // mState <= mImpulse < mRingFull
+        // in general, mRingFull = mState + mStateSize - halfNumCoefs*CHANNELS.
+        size_t mStateSize; // in units of TI.
+    };
+
+    template<int CHANNELS, bool LOCKED, int STRIDE, typename TC>
+    void resample(int32_t* out, size_t outFrameCount,
+            const TC* const coefs, AudioBufferProvider* provider);
+
+    template<typename T>
+    void createKaiserFir(Constants &c, double stopBandAtten,
+            int inSampleRate, int outSampleRate, double tbwCheat);
+
+    InBuffer<int16_t> mInBuffer;
+    Constants mConstants;  // current set of coefficient parameters
+    int32_t __attribute__ ((aligned (8))) mVolumeSimd[2]; // left, right volume shifted up 16 bits
+    int32_t mResampleType; // contains the resample type.
+    int32_t mFilterSampleRate; // designed filter sample rate.
+    src_quality mFilterQuality; // designed filter quality.
+    void* mCoefBuffer; // if a filter is created, this is not null; freed in the destructor
+};
+
+// ----------------------------------------------------------------------------
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_DYN_H*/
diff --git a/services/audioflinger/AudioResamplerFirGen.h b/services/audioflinger/AudioResamplerFirGen.h
new file mode 100644
index 0000000..fac3001
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirGen.h
@@ -0,0 +1,684 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_GEN_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_GEN_H
+
+namespace android {
+
+/*
+ * generates a sine wave at equal steps.
+ *
+ * As most of our functions use sine or cosine at equal steps,
+ * it is very efficient to compute them that way (single multiply and subtract),
+ * rather than invoking the math library sin() or cos() each time.
+ *
+ * SineGen uses Goertzel's Algorithm (as a generator not a filter)
+ * to calculate sine(wstart + n * wstep) or cosine(wstart + n * wstep)
+ * by stepping through 0, 1, ... n.
+ *
+ * e^i(wstart+wstep) = 2cos(wstep) * e^i(wstart) - e^i(wstart-wstep)
+ *
+ * or looking at just the imaginary sine term, as the cosine follows identically:
+ *
+ * sin(wstart+wstep) = 2cos(wstep) * sin(wstart) - sin(wstart-wstep)
+ *
+ * Goertzel's algorithm is more efficient than the angle addition formula,
+ * e^i(wstart+wstep) = e^i(wstart) * e^i(wstep), which takes up to
+ * 4 multiplies and 2 adds (or 3* and 3+) and requires both sine and
+ * cosine generation due to the complex * complex multiply (full rotation).
+ *
+ * See: http://en.wikipedia.org/wiki/Goertzel_algorithm
+ *
+ */
+
+class SineGen {
+public:
+    // Seeds the recurrence directly from sin()/cos() (cosine selects which one):
+    // the current sample is taken at wstart, the previous one at wstart - wstep.
+    SineGen(double wstart, double wstep, bool cosine = false)
+            : mCurrent(cosine ? cos(wstart) : sin(wstart)),
+              mPrevious(cosine ? cos(wstart - wstep) : sin(wstart - wstep)),
+              mTwoCos(2.*cos(wstep)) {
+    }
+    // Seeds the recurrence from already computed values and stepping factor.
+    SineGen(double expNow, double expPrev, double twoCosStep)
+            : mCurrent(expNow),
+              mPrevious(expPrev),
+              mTwoCos(twoCosStep) {
+    }
+    // Returns the current sample without stepping.
+    inline double value() const {
+        return mCurrent;
+    }
+    // Steps once: next = current * mTwoCos - previous.
+    inline void advance() {
+        const double next = mCurrent*mTwoCos - mPrevious;
+        mPrevious = mCurrent;
+        mCurrent = next;
+    }
+    // Returns the current sample, then steps once.
+    inline double valueAdvance() {
+        const double now = mCurrent;
+        advance();
+        return now;
+    }
+
+private:
+    double mCurrent; // sample at the present step
+    double mPrevious; // sample one step behind mCurrent
+    double mTwoCos; // 2*cos(wstep), the recurrence multiplier
+};
+
+/*
+ * generates a series of sine generators, phase offset by fixed steps.
+ *
+ * This is used to generate polyphase sine generators, one per polyphase
+ * in the filter code below.
+ *
+ * The SineGen returned by value() starts at innerStart = outerStart + n*outerStep;
+ * increments by innerStep.
+ *
+ */
+
+class SineGenGen {
+public:
+    SineGenGen(double outerStart, double outerStep, double innerStep, bool cosine = false)
+            : mSineInnerCur(outerStart, outerStep, cosine),
+              mSineInnerPrev(outerStart-innerStep, outerStep, cosine),
+              mTwoCos(2.*cos(innerStep)) {
+    }
+    inline SineGen value() { // snapshot only; the outer generators are not stepped
+        return SineGen(mSineInnerCur.value(), mSineInnerPrev.value(), mTwoCos);
+    }
+    inline void advance() { // moves both outer generators forward by outerStep
+        mSineInnerPrev.advance();
+        mSineInnerCur.advance();
+    }
+    inline SineGen valueAdvance() { // snapshot first, then step
+        const SineGen snapshot = value();
+        advance();
+        return snapshot;
+    }
+
+private:
+    SineGen mSineInnerCur; // supplies the current value of each returned SineGen
+    SineGen mSineInnerPrev; // seeded innerStep earlier; supplies the previous value
+    double mTwoCos; // 2*cos(innerStep), handed to every returned SineGen
+};
+
+static inline double sqr(double x) { // returns x squared
+    return x * x;
+}
+
+/*
+ * rounds a double to the nearest integer for FIR coefficients.
+ *
+ * One variant uses noise shaping, which must keep error history
+ * to work (the err parameter, initialized to 0).
+ * The other variant is a non-noise shaped version for
+ * S32 coefficients (noise shaping doesn't gain much).
+ *
+ * Caution: No bounds saturation is applied, but isn't needed in this case.
+ *
+ * @param x is the value to round.
+ *
+ * @param maxval is the maximum integer scale factor expressed as an int64 (for headroom).
+ * Typically this may be the maximum positive integer+1 (using the fact that double precision
+ * FIR coefficients generated here are never that close to 1.0 to pose an overflow condition).
+ *
+ * @param err is the previous error (actual - rounded) for the previous rounding op.
+ * For 16b coefficients this can improve stopband dB performance by up to 2dB.
+ *
+ * Many variants exist for the noise shaping: http://en.wikipedia.org/wiki/Noise_shaping
+ *
+ */
+
+static inline int64_t toint(double x, int64_t maxval, double& err) {
+    const double scaled = x * maxval;
+    const double rounded = floor(scaled + 0.5 + err*0.2); // 0.2 of the prior error is fed back
+    err = scaled - rounded; // residual handed to the next call through err
+    return static_cast<int64_t>(rounded);
+}
+
+static inline int64_t toint(double x, int64_t maxval) {
+    return static_cast<int64_t>(floor(x * maxval + 0.5)); // round half up, no error feedback
+}
+
+/*
+ * Modified Bessel function of the first kind
+ * http://en.wikipedia.org/wiki/Bessel_function
+ *
+ * The formulas are taken from Abramowitz and Stegun,
+ * _Handbook of Mathematical Functions_ (links below):
+ *
+ * http://people.math.sfu.ca/~cbm/aands/page_375.htm
+ * http://people.math.sfu.ca/~cbm/aands/page_378.htm
+ *
+ * http://dlmf.nist.gov/10.25
+ * http://dlmf.nist.gov/10.40
+ *
+ * Note we assume x is nonnegative (the function is symmetric,
+ * pass in the absolute value as needed).
+ *
+ * Constants are compile time derived with templates I0Term<> and
+ * I0ATerm<> to the precision of the compiler.  The series can be expanded
+ * to any precision needed, but currently set around 24b precision.
+ *
+ * We use a bit of template math here, constexpr would probably be
+ * more appropriate for a C++11 compiler.
+ *
+ * For the intermediate range 3.75 < x < 15, we use minimax polynomial fit.
+ *
+ */
+
+template <int N>
+struct I0Term { // N-th coefficient of the power series in x^2: 1 / (4^N * (N!)^2)
+    static const double value = I0Term<N-1>::value / (4. * N * N);
+};
+
+template <>
+struct I0Term<0> { // recursion base
+    static const double value = 1.;
+};
+
+template <int N>
+struct I0ATerm { // recurrence: previous term * (2N-1)^2 / (8N)
+    static const double value = I0ATerm<N-1>::value * (2.*N-1.) * (2.*N-1.) / (8. * N);
+};
+
+template <>
+struct I0ATerm<0> { // 1/sqrt(2*PI);
+    static const double value = 0.398942280401432677939946059934381868475858631164934657665925;
+};
+
+#if USE_HORNERS_METHOD
+/* Polynomial evaluation of A + Bx + Cx^2 + Dx^3 + ...
+ * using Horner's Method: http://en.wikipedia.org/wiki/Horner's_method
+ *
+ * This has fewer multiplications than Estrin's method below, but has back to back
+ * floating point dependencies.
+ *
+ * On ARM this appears to work slower, so USE_HORNERS_METHOD is not default enabled.
+ */
+
+inline double Poly2(double A, double B, double x) { // A + B*x
+    return A + x * B;
+}
+
+inline double Poly4(double A, double B, double C, double D, double x) { // degree 3, 4 coefs
+    return A + x * (B + x * (C + x * (D)));
+}
+
+inline double Poly7(double A, double B, double C, double D, double E, double F, double G,
+        double x) { // degree 6, 7 coefs (A is the constant term)
+    return A + x * (B + x * (C + x * (D + x * (E + x * (F + x * (G))))));
+}
+
+inline double Poly9(double A, double B, double C, double D, double E, double F, double G,
+        double H, double I, double x) { // degree 8, 9 coefs (A is the constant term)
+    return A + x * (B + x * (C + x * (D + x * (E + x * (F + x * (G + x * (H + x * (I))))))));
+}
+
+#else
+/* Polynomial evaluation of A + Bx + Cx^2 + Dx^3 + ...
+ * using Estrin's Method: http://en.wikipedia.org/wiki/Estrin's_scheme
+ *
+ * This is typically faster, perhaps gains about 5-10% overall on ARM processors
+ * over Horner's method above.
+ */
+
+inline double Poly2(double A, double B, double x) { // A + B*x
+    return A + B * x;
+}
+
+inline double Poly3(double A, double B, double C, double x, double x2) { // caller passes x2
+    return Poly2(A, B, x) + C * x2;
+}
+
+inline double Poly3(double A, double B, double C, double x) { // no Horner-variant Poly3 exists
+    return Poly2(A, B, x) + C * x * x;
+}
+
+inline double Poly4(double A, double B, double C, double D, double x, double x2) {
+    return Poly2(A, B, x) + Poly2(C, D, x) * x2; // same as poly2(poly2, poly2, x2);
+}
+
+inline double Poly4(double A, double B, double C, double D, double x) { // computes x*x itself
+    return Poly4(A, B, C, D, x, x * x);
+}
+
+inline double Poly7(double A, double B, double C, double D, double E, double F, double G,
+        double x) { // degree 6: low four terms + (next three terms) * x^4
+    double x2 = x * x;
+    return Poly4(A, B, C, D, x, x2) + Poly3(E, F, G, x, x2) * (x2 * x2);
+}
+
+inline double Poly8(double A, double B, double C, double D, double E, double F, double G,
+        double H, double x, double x2, double x4) { // caller supplies x, x^2 and x^4
+    return Poly4(A, B, C, D, x, x2) + Poly4(E, F, G, H, x, x2) * x4;
+}
+
+inline double Poly9(double A, double B, double C, double D, double E, double F, double G,
+        double H, double I, double x) { // degree 8: Poly8 part + I * x^8
+    double x2 = x * x;
+#if 1
+    // It does not seem faster to explicitly decompose Poly8 into Poly4, but
+    // could depend on compiler floating point scheduling.
+    double x4 = x2 * x2;
+    return Poly8(A, B, C, D, E, F, G, H, x, x2, x4) + I * (x4 * x4);
+#else
+    double val = Poly4(A, B, C, D, x, x2);
+    double x4 = x2 * x2;
+    return val + Poly4(E, F, G, H, x, x2) * x4 + I * (x4 * x4);
+#endif
+}
+#endif
+
+static inline double I0(double x) {
+    if (x < 3.75) {
+        x *= x; // the power series below is evaluated in x^2
+        return Poly7(I0Term<0>::value, I0Term<1>::value,
+                I0Term<2>::value, I0Term<3>::value,
+                I0Term<4>::value, I0Term<5>::value,
+                I0Term<6>::value, x); // e < 1.6e-7
+    }
+    if (1) { // always taken; the else branch below is kept for reference only
+        /*
+         * Series expansion coefs are easy to calculate, but are expanded around 0,
+         * so error is unequal over the interval 0 < x < 3.75, the error being
+         * significantly better near 0.
+         *
+         * A better solution is to use precise minimax polynomial fits.
+         *
+         * We use a slightly more complicated solution for 3.75 < x < 15, based on
+         * the tables in Blair and Edwards, "Stable Rational Minimax Approximations
+         * to the Modified Bessel Functions I0(x) and I1(x)", Chalk Hill Nuclear Laboratory,
+         * AECL-4928.
+         *
+         * http://www.iaea.org/inis/collection/NCLCollectionStore/_Public/06/178/6178667.pdf
+         *
+         * See Table 11 for 0 < x < 15; e < 10^(-7.13).
+         *
+         * Note: Beta cannot exceed 15 (hence Stopband cannot exceed 144dB = 24b).
+         *
+         * This speeds up overall computation by about 40% over using the else clause below,
+         * which requires sqrt and exp.
+         *
+         */
+
+        x *= x; // numerator and denominator below are polynomials in x^2
+        double num = Poly9(-0.13544938430e9, -0.33153754512e8,
+                -0.19406631946e7, -0.48058318783e5,
+                -0.63269783360e3, -0.49520779070e1,
+                -0.24970910370e-1, -0.74741159550e-4,
+                -0.18257612460e-6, x);
+        double y = x - 225.; // reflection around 15 (squared)
+        double den = Poly4(-0.34598737196e8, 0.23852643181e6,
+                -0.70699387620e3, 0.10000000000e1, y);
+        return num / den;
+
+#if IO_EXTENDED_BETA
+        /* Table 42 for x > 15; e < 10^(-8.11).
+         * This is used for Beta>15, but is disabled here as we never use Beta that high.
+         *
+         * NOTE: This should be enabled only for x > 15.  As written it follows the return
+         * above, redeclares y/num/den, and sees x already squared: restructure before enabling.
+         */
+
+        double y = 1./x;
+        double z = y - (1./15);
+        double num = Poly2(0.415079861746e1, -0.5149092496e1, z);
+        double den = Poly3(0.103150763823e2, -0.14181687413e2,
+                0.1000000000e1, z);
+        return exp(x) * sqrt(y) * num / den;
+#endif
+    } else {
+        /*
+         * NOT USED, but reference for large Beta.
+         *
+         * Abramowitz and Stegun asymptotic formula.
+         * works for x > 3.75.
+         */
+        double y = 1./x;
+        return exp(x) * sqrt(y) *
+                // note: reciprocal squareroot may be easier!
+                // http://en.wikipedia.org/wiki/Fast_inverse_square_root
+                Poly9(I0ATerm<0>::value, I0ATerm<1>::value,
+                        I0ATerm<2>::value, I0ATerm<3>::value,
+                        I0ATerm<4>::value, I0ATerm<5>::value,
+                        I0ATerm<6>::value, I0ATerm<7>::value,
+                        I0ATerm<8>::value, y); // (... e) < 1.9e-7
+    }
+}
+
+/*
+ * calculates the transition bandwidth for a Kaiser filter
+ *
+ * Formula 3.2.8, Vaidyanathan, _Multirate Systems and Filter Banks_, p. 48
+ * Formula 7.76, Oppenheim and Schafer, _Discrete-time Signal Processing, 3e_, p. 542
+ *
+ * @param halfNumCoef is half the number of coefficients per filter phase.
+ *
+ * @param stopBandAtten is the stop band attenuation desired.
+ *
+ * @return the transition bandwidth in normalized frequency (0 <= f <= 0.5)
+ */
+static inline double firKaiserTbw(int halfNumCoef, double stopBandAtten) {
+    return (stopBandAtten - 7.95)/((2.*14.36)*halfNumCoef); // (A - 7.95)/(14.36 * 2*halfNumCoef)
+}
+
+/*
+ * calculates the fir transfer response of the overall polyphase filter at w.
+ *
+ * Calculates the DTFT transfer coefficient H(w) for 0 <= w <= PI, utilizing the
+ * fact that h[n] is symmetric (cosines only, no complex arithmetic).
+ *
+ * We use Goertzel's algorithm to accelerate the computation to essentially
+ * a single multiply and 2 adds per filter coefficient h[].
+ *
+ * Be careful to consider that h[n] is the overall polyphase filter,
+ * with L phases, so rescaling H(w)/L is probably what you expect for "unity gain",
+ * as you only use one of the polyphases at a time.
+ */
+template <typename T>
+static inline double firTransfer(const T* coef, int L, int halfNumCoef, double w) {
+    double accum = static_cast<double>(coef[0])*0.5;  // "center coefficient" from first bank
+    coef += halfNumCoef;    // skip first filterbank (picked up by the last filterbank).
+#if SLOW_FIRTRANSFER
+    /* Original code for reference.  This is equivalent to the code below, but slower. */
+    for (int i=1 ; i<=L ; ++i) {
+        for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) {
+            accum += cos(ix*w)*static_cast<double>(*coef++);
+        }
+    }
+#else
+    /*
+     * Our overall filter is stored striped by polyphases, not a contiguous h[n].
+     * We could fetch coefficients in a non-contiguous fashion
+     * but that will not scale to vector processing.
+     *
+     * We apply Goertzel's algorithm directly to each polyphase filter bank instead of
+     * using cosine generation/multiplication, thereby saving one multiply per inner loop.
+     *
+     * See: http://en.wikipedia.org/wiki/Goertzel_algorithm
+     * Also: Oppenheim and Schafer, _Discrete Time Signal Processing, 3e_, p. 720.
+     *
+     * We use the basic recursion to incorporate the cosine steps into real sequence x[n]:
+     * s[n] = x[n] + (2cosw)*s[n-1] - s[n-2]
+     *
+     * y[n] = s[n] - e^(iw)s[n-1]
+     *      = sum_{k=-\infty}^{n} x[k]e^(-iw(n-k))
+     *      = e^(-iwn) sum_{k=0}^{n} x[k]e^(iwk)
+     *
+     * The summation contains the frequency steps we want multiplied by the source
+     * (similar to a DTFT).
+     *
+     * Using symmetry, and just the real part (be careful, this must happen
+     * after any internal complex multiplications), the polyphase filterbank
+     * transfer function is:
+     *
+     * Hpp[n, w, w_0] = sum_{k=0}^{n} x[k] * cos(wk + w_0)
+     *                = Re{ e^(iwn + iw_0) y[n]}
+     *                = cos(wn+w_0) * s[n] - cos(w(n+1)+w_0) * s[n-1]
+     *
+     * using the fact that s[n] of real x[n] is real.
+     *
+     */
+    double dcos = 2. * cos(L*w); // recursion multiplier; taps within one bank are L apart
+    int start = ((halfNumCoef)*L + 1); // tap index one bank-step past the last tap of bank 1
+    SineGen cc((start - L) * w, w, true); // cosine at the last tap index of each bank
+    SineGen cp(start * w, w, true); // cosine L taps beyond that
+    for (int i=1 ; i<=L ; ++i) {
+        double sc = 0;
+        double sp = 0;
+        for (int j=0 ; j<halfNumCoef ; ++j) {
+            double tmp = sc;
+            sc  = static_cast<double>(*coef++) + dcos*sc - sp;
+            sp = tmp;
+        }
+        // If we are awfully clever, we can apply Goertzel's algorithm
+        // again on the sc and sp sequences returned here.
+        accum += cc.valueAdvance() * sc - cp.valueAdvance() * sp;
+    }
+#endif
+    return accum*2.;
+}
+
+/*
+ * evaluates the minimum and maximum |H(f)| bound in a band region.
+ *
+ * This is usually done with equally spaced increments in the target band in question.
+ * The passband is often very small, and sampled that way. The stopband is often much
+ * larger.
+ *
+ * We use the fact that the overall polyphase filter has an additional bank at the end
+ * for interpolation; hence it is overspecified for the H(f) computation.  Thus the
+ * first polyphase is never actually checked, excepting its first term.
+ *
+ * In this code we use the firTransfer() evaluator above, which uses Goertzel's
+ * algorithm to calculate the transfer function at each point.
+ *
+ * TODO: An alternative with equal spacing is the FFT/DFT.  An alternative with unequal
+ * spacing is a chirp transform.
+ *
+ * @param coef is the designed polyphase filter banks
+ *
+ * @param L is the number of phases (for interpolation)
+ *
+ * @param halfNumCoef should be half the number of coefficients for a single
+ * polyphase.
+ *
+ * @param fstart is the normalized frequency start.
+ *
+ * @param fend is the normalized frequency end.
+ *
+ * @param steps is the number of steps to take (sampling) between frequency start and end
+ *
+ * @param firMin returns the minimum transfer |H(f)| found
+ *
+ * @param firMax returns the maximum transfer |H(f)| found
+ *
+ * 0 <= f <= 0.5.
+ * This is used to test passband and stopband performance.
+ */
+template <typename T>
+static void testFir(const T* coef, int L, int halfNumCoef,
+        double fstart, double fend, int steps, double &firMin, double &firMax) {
+    // angular frequencies: w = 2*PI*f, stepped in equal increments from fstart
+    double w = fstart*(2.*M_PI);
+    const double wend = fend*(2.*M_PI);
+    const double wstep = (wend - w)/steps;
+    // the first sample (always taken, at fstart) seeds both extremes
+    double mag = firTransfer(coef, L, halfNumCoef, w);
+    if (mag<0) {
+        mag = -mag;
+    }
+    double lo = mag;
+    double hi = mag;
+    // remaining steps-1 samples; the last one lands one step short of fend
+    for (int i=1; i<steps; ++i) {
+        w += wstep;
+        mag = firTransfer(coef, L, halfNumCoef, w);
+        if (mag<0) {
+            mag = -mag;
+        }
+        if (mag>hi) {
+            hi = mag;
+        } else if (mag<lo) {
+            lo = mag;
+        }
+    }
+    // renormalize - this is only needed for integer filter types
+    const double scale = 1./((1ULL<<(sizeof(T)*8-1))*L);
+    firMin = lo * scale;
+    firMax = hi * scale;
+}
+
+/*
+ * evaluates the |H(f)| lowpass band characteristics.
+ *
+ * This function tests the lowpass characteristics for the overall polyphase filter,
+ * and is used to verify the design.  For this case, fp should be set to the
+ * passband normalized frequency from 0 to 0.5 for the overall filter (thus it
+ * is the designed polyphase bank value / L).  Likewise for fs.
+ *
+ * @param coef is the designed polyphase filter banks
+ *
+ * @param L is the number of phases (for interpolation)
+ *
+ * @param halfNumCoef should be half the number of coefficients for a single
+ * polyphase.
+ *
+ * @param fp is the passband normalized frequency, 0 < fp < fs < 0.5.
+ *
+ * @param fs is the stopband normalized frequency, 0 < fp < fs < 0.5.
+ *
+ * @param passSteps is the number of passband sampling steps.
+ *
+ * @param stopSteps is the number of stopband sampling steps.
+ *
+ * @param passMin is the minimum value in the passband
+ *
+ * @param passMax is the maximum value in the passband (useful for scaling).  This should
+ * be less than 1., to avoid sine wave test overflow.
+ *
+ * @param passRipple is the passband ripple.  Typically this should be less than 0.1 for
+ * an audio filter.  Generally speaker/headphone device characteristics will dominate
+ * the passband term.
+ *
+ * @param stopMax is the maximum value in the stopband.
+ *
+ * @param stopRipple is the stopband ripple, also known as stopband attenuation.
+ * Typically this should be greater than ~80dB for low quality, and greater than
+ * ~100dB for full 16b quality, otherwise aliasing may become noticeable.
+ *
+ */
+template <typename T>
+static void testFir(const T* coef, int L, int halfNumCoef,
+        double fp, double fs, int passSteps, int stopSteps,
+        double &passMin, double &passMax, double &passRipple,
+        double &stopMax, double &stopRipple) {
+    double lo, hi;
+    testFir(coef, L, halfNumCoef, 0., fp, passSteps, lo, hi); // passband scan: 0 .. fp
+    const double halfSpread = (hi - lo)/2.;
+    passMin = lo;
+    passMax = hi;
+    passRipple = -20.*log10(1. - halfSpread); // in dB
+    // stopband scan: fs .. 0.5; only the peak is used, lo is ignored
+    testFir(coef, L, halfNumCoef, fs, 0.5, stopSteps, lo, hi);
+    stopMax = hi;
+    stopRipple = -20.*log10(hi); // attenuation in dB
+}
+
+/*
+ * Calculates the overall polyphase filter based on a windowed sinc function.
+ *
+ * The windowed sinc is an odd length symmetric filter of exactly L*halfNumCoef*2+1
+ * taps for the entire kernel.  This is then decomposed into L+1 polyphase filterbanks.
+ * The last filterbank is used for interpolation purposes (and is mostly composed
+ * of the first bank shifted by one sample), and is unnecessary if one does
+ * not do interpolation.
+ *
+ * We use the last filterbank for some transfer function calculation purposes,
+ * so it needs to be generated anyways.
+ *
+ * @param coef is the caller allocated space for coefficients.  This should be
+ * exactly (L+1)*halfNumCoef in size.
+ *
+ * @param L is the number of phases (for interpolation)
+ *
+ * @param halfNumCoef should be half the number of coefficients for a single
+ * polyphase.
+ *
+ * @param stopBandAtten is the stopband value, should be >50dB.
+ *
+ * @param fcr is cutoff frequency/sampling rate (<0.5).  At this point, the energy
+ * should be 6dB less. (fcr is where the amplitude drops by half).  Use the
+ * firKaiserTbw() to calculate the transition bandwidth.  fcr is the midpoint
+ * between the stop band and the pass band (fstop+fpass)/2.
+ *
+ * @param atten is the attenuation (generally slightly less than 1).
+ */
+
+template <typename T>
+static inline void firKaiserGen(T* coef, int L, int halfNumCoef,
+        double stopBandAtten, double fcr, double atten) {
+    //
+    // Formula 3.2.5, 3.2.7, Vaidyanathan, _Multirate Systems and Filter Banks_, p. 48
+    // Formula 7.75, Oppenheim and Schafer, _Discrete-time Signal Processing, 3e_, p. 542
+    //
+    // See also: http://melodi.ee.washington.edu/courses/ee518/notes/lec17.pdf
+    //
+    // Kaiser window and beta parameter
+    //
+    //         | 0.1102*(A - 8.7)                         A > 50
+    //  beta = | 0.5842*(A - 21)^0.4 + 0.07886*(A - 21)   21 <= A <= 50
+    //         | 0.                                       A < 21
+    //
+    // with A is the desired stop-band attenuation in dBFS
+    //
+    //    30 dB    2.210
+    //    40 dB    3.384
+    //    50 dB    4.538
+    //    60 dB    5.658
+    //    70 dB    6.764
+    //    80 dB    7.865
+    //    90 dB    8.960
+    //   100 dB   10.056
+
+    const int N = L * halfNumCoef; // non-negative half
+    const double beta = 0.1102 * (stopBandAtten - 8.7); // >= 50dB always
+    const double xstep = (2. * M_PI) * fcr / L; // sine argument increment per tap index
+    const double xfrac = 1. / N; // scales tap index ix so that ix == N maps to 1.0
+    const double yscale = atten * L / (I0(beta) * M_PI);
+
+    // We use sine generators, which computes sines on regular step intervals.
+    // This speeds up overall computation about 40% from computing the sine directly.
+
+    SineGenGen sgg(0., xstep, L*xstep); // generates sine generators (one per polyphase)
+
+    for (int i=0 ; i<=L ; ++i) { // generate an extra set of coefs for interpolation
+
+        // computation for a single polyphase of the overall filter.
+        SineGen sg = sgg.valueAdvance(); // current sine generator for "j" inner loop.
+        double err = 0; // for noise shaping on int16_t coefficients (over each polyphase)
+
+        for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) {
+            double y;
+            if (CC_LIKELY(ix)) {
+                double x = static_cast<double>(ix);
+
+                // sine generator: sg.valueAdvance() returns sin(ix*xstep);
+                y = I0(beta * sqrt(1.0 - sqr(x * xfrac))) * yscale * sg.valueAdvance() / x;
+            } else {
+                y = 2. * atten * fcr; // center of filter, sinc(0) = 1.
+                sg.advance(); // keep the generator in step even though its value is unused
+            }
+
+            // (caution!) float version does not need rounding, yet every T goes through toint()
+            if (is_same<T, int16_t>::value) { // int16_t needs noise shaping
+                *coef++ = static_cast<T>(toint(y, 1ULL<<(sizeof(T)*8-1), err));
+            } else {
+                *coef++ = static_cast<T>(toint(y, 1ULL<<(sizeof(T)*8-1)));
+            }
+        }
+    }
+}
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_GEN_H*/
diff --git a/services/audioflinger/AudioResamplerFirOps.h b/services/audioflinger/AudioResamplerFirOps.h
new file mode 100644
index 0000000..bf2163f
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirOps.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_OPS_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_OPS_H
+
+#if defined(__arm__) && !defined(__thumb__)
+#define USE_INLINE_ASSEMBLY (true)  // ARM (non-Thumb) build: use the asm paths below
+#else
+#define USE_INLINE_ASSEMBLY (false) // everything else: use the portable C++ paths
+#endif
+
+#if USE_INLINE_ASSEMBLY && defined(__ARM_NEON__)
+#define USE_NEON (true)
+#include <arm_neon.h> // included before the namespace opens so it stays at global scope
+#else
+#define USE_NEON (false)
+#endif
+
+namespace android {
+
+template<typename T, typename U>
+struct is_same // primary template: chosen when the two types differ
+{
+    static const bool value = false;
+};
+
+template<typename T>
+struct is_same<T, T>  // partial specialization: chosen when both types are identical
+{
+    static const bool value = true;
+};
+
+static inline
+int32_t mulRL(int left, int32_t in, uint32_t vRL) // multiplies in by one 16-bit half of vRL
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    if (left) { // left != 0: low halfword of vRL; otherwise the high halfword
+        asm( "smultb %[out], %[in], %[vRL] \n"
+             : [out]"=r"(out)
+             : [in]"%r"(in), [vRL]"r"(vRL)
+             : );
+    } else {
+        asm( "smultt %[out], %[in], %[vRL] \n"
+             : [out]"=r"(out)
+             : [in]"%r"(in), [vRL]"r"(vRL)
+             : );
+    }
+    return out;
+#else // NOTE(review): smultb/smultt read only the top halfword of in; this path uses all 32 bits
+    int16_t v = left ? static_cast<int16_t>(vRL) : static_cast<int16_t>(vRL>>16);
+    return static_cast<int32_t>((static_cast<int64_t>(in) * v) >> 16); // so low bits may differ
+#endif
+}
+
+static inline
+int32_t mulAdd(int16_t in, int16_t v, int32_t a) // a + v*in (16-bit x 16-bit product)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    asm( "smlabb %[out], %[v], %[in], %[a] \n"
+         : [out]"=r"(out)
+         : [in]"%r"(in), [v]"r"(v), [a]"r"(a)
+         : );
+    return out;
+#else
+    return a + v * in; // operands promote to int before the multiply
+#endif
+}
+
+static inline
+int32_t mulAdd(int16_t in, int32_t v, int32_t a) // a + ((v*in) >> 16), 32-bit x 16-bit
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    asm( "smlawb %[out], %[v], %[in], %[a] \n"
+         : [out]"=r"(out)
+         : [in]"%r"(in), [v]"r"(v), [a]"r"(a)
+         : );
+    return out;
+#else
+    return a + static_cast<int32_t>((static_cast<int64_t>(v) * in) >> 16); // 64-bit product
+#endif
+}
+
+static inline
+int32_t mulAdd(int32_t in, int32_t v, int32_t a) // a + high 32 bits of the 64-bit v*in
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    asm( "smmla %[out], %[v], %[in], %[a] \n"
+         : [out]"=r"(out)
+         : [in]"%r"(in), [v]"r"(v), [a]"r"(a)
+         : );
+    return out;
+#else
+    return a + static_cast<int32_t>((static_cast<int64_t>(v) * in) >> 32); // truncating shift
+#endif
+}
+
+static inline
+int32_t mulAddRL(int left, uint32_t inRL, int16_t v, int32_t a) // a + v * (one half of inRL)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    if (left) { // left != 0: low halfword of inRL; otherwise the high halfword
+        asm( "smlabb %[out], %[v], %[inRL], %[a] \n"
+             : [out]"=r"(out)
+             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+             : );
+    } else {
+        asm( "smlabt %[out], %[v], %[inRL], %[a] \n"
+             : [out]"=r"(out)
+             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+             : );
+    }
+    return out;
+#else
+    int16_t s = left ? static_cast<int16_t>(inRL) : static_cast<int16_t>(inRL>>16);
+    return a + v * s; // operands promote to int before the multiply
+#endif
+}
+
+static inline
+int32_t mulAddRL(int left, uint32_t inRL, int32_t v, int32_t a) // a + ((v * half of inRL) >> 16)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    if (left) { // left != 0: low halfword of inRL; otherwise the high halfword
+        asm( "smlawb %[out], %[v], %[inRL], %[a] \n"
+             : [out]"=r"(out)
+             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+             : );
+    } else {
+        asm( "smlawt %[out], %[v], %[inRL], %[a] \n"
+             : [out]"=r"(out)
+             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+             : );
+    }
+    return out;
+#else
+    int16_t s = left ? static_cast<int16_t>(inRL) : static_cast<int16_t>(inRL>>16);
+    return a + static_cast<int32_t>((static_cast<int64_t>(v) * s) >> 16); // 64-bit product
+#endif
+}
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_OPS_H*/
diff --git a/services/audioflinger/AudioResamplerFirProcess.h b/services/audioflinger/AudioResamplerFirProcess.h
new file mode 100644
index 0000000..38e387c
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirProcess.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H
+
+namespace android {
+
+// depends on AudioResamplerFirOps.h
+
+template<int CHANNELS, typename TC> // one FIR tap: accumulate coef * sample into l and r
+static inline
+void mac(
+        int32_t& l, int32_t& r,
+        const TC coef,
+        const int16_t* samples)
+{
+    if (CHANNELS == 2) {
+        uint32_t rl = *reinterpret_cast<const uint32_t*>(samples); // two int16_t read as one word; NOTE(review): type-punned, possibly unaligned -- confirm OK on all targets
+        l = mulAddRL(1, rl, coef, l); // left=1 uses the low 16 bits of rl
+        r = mulAddRL(0, rl, coef, r); // left=0 uses the high 16 bits of rl
+    } else {
+        r = l = mulAdd(samples[0], coef, l); // mono: the same accumulated value goes to both l and r
+    }
+}
+
+template<int CHANNELS, typename TC> // one FIR tap with a coefficient interpolated between coef_0 and coef_1
+static inline
+void interpolate(
+        int32_t& l, int32_t& r,
+        const TC coef_0, const TC coef_1,
+        const int16_t lerp, const int16_t* samples)
+{
+    TC sinc; // coef_0 + lerp * (coef_1 - coef_0), lerp scaled so that (x * 2) >> 16 applies the fraction
+
+    if (is_same<TC, int16_t>::value) {
+        sinc = ((lerp * ((coef_1 - coef_0) * 2)) >> 16) + coef_0; // "* 2" rather than "<< 1": the difference may be negative
+    } else {
+        sinc = mulAdd(lerp, (coef_1 - coef_0) * 2, coef_0); // "* 2" rather than "<< 1": the difference may be negative
+    }
+    if (CHANNELS == 2) {
+        uint32_t rl = *reinterpret_cast<const uint32_t*>(samples); // two int16_t read as one word
+        l = mulAddRL(1, rl, sinc, l); // left=1 uses the low 16 bits of rl
+        r = mulAddRL(0, rl, sinc, r); // left=0 uses the high 16 bits of rl
+    } else {
+        r = l = mulAdd(samples[0], sinc, l); // mono: the same accumulated value goes to both l and r
+    }
+}
+
+/*
+ * Calculates a single output frame (two samples: left and right).
+ *
+ * This function computes both the positive half FIR dot product and
+ * the negative half FIR dot product, accumulates, and then applies the volume.
+ *
+ * This is a locked phase filter (it does not compute the interpolation).
+ *
+ * Use fir() to compute the proper coefficient pointers for a polyphase
+ * filter bank.
+ */
+
+template <int CHANNELS, int STRIDE, typename TC> // STRIDE is not used by this generic version
+static inline
+void ProcessL(int32_t* const out,
+        int count,
+        const TC* coefsP,
+        const TC* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    int32_t l = 0; // left accumulator
+    int32_t r = 0; // right accumulator
+    do { // body runs at least once, even if count <= 0
+        mac<CHANNELS>(l, r, *coefsP++, sP);
+        sP -= CHANNELS; // positive half walks backward through the samples
+        mac<CHANNELS>(l, r, *coefsN++, sN);
+        sN += CHANNELS; // negative half walks forward through the samples
+    } while (--count > 0);
+    out[0] += 2 * mulRL(0, l, volumeLR[0]); // Note: only use top 16b
+    out[1] += 2 * mulRL(0, r, volumeLR[1]); // Note: only use top 16b
+}
+
+/*
+ * Calculates a single output frame (two samples: left and right), interpolating phase.
+ *
+ * This function computes both the positive half FIR dot product and
+ * the negative half FIR dot product, accumulates, and then applies the volume.
+ *
+ * This is an interpolated phase filter.
+ *
+ * Use fir() to compute the proper coefficient pointers for a polyphase
+ * filter bank.
+ */
+
+template <int CHANNELS, int STRIDE, typename TC> // STRIDE is not used by this generic version
+static inline
+void Process(int32_t* const out,
+        int count,
+        const TC* coefsP,
+        const TC* coefsN,
+        const TC* coefsP1,
+        const TC* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    (void) coefsP1; // suppress unused parameter warning
+    (void) coefsN1; // (the loop below indexes coefsP[count] / coefsN[count] instead)
+    if (sizeof(*coefsP)==4) {
+        lerpP >>= 16;   // ensure lerpP is 16b
+    }
+    int32_t l = 0; // left accumulator
+    int32_t r = 0; // right accumulator
+    for (int i = 0; i < count; ++i) { // signed index: a non-positive count runs zero iterations
+        interpolate<CHANNELS>(l, r, coefsP[0], coefsP[count], lerpP, sP); // blend from coefsP[0] toward coefsP[count]
+        coefsP++;
+        sP -= CHANNELS; // positive half walks backward through the samples
+        interpolate<CHANNELS>(l, r, coefsN[count], coefsN[0], lerpP, sN); // blend from coefsN[count] toward coefsN[0]
+        coefsN++;
+        sN += CHANNELS; // negative half walks forward through the samples
+    }
+    out[0] += 2 * mulRL(0, l, volumeLR[0]); // Note: only use top 16b
+    out[1] += 2 * mulRL(0, r, volumeLR[1]); // Note: only use top 16b
+}
+
+/*
+ * Calculates a single output frame (two samples: left and right) from input sample pointer.
+ *
+ * This sets up the params for the accelerated Process() and ProcessL()
+ * functions to do the appropriate dot products.
+ *
+ * @param out should point to the output buffer with space for at least one output frame (2 samples).
+ *
+ * @param phase is the fractional distance between input samples for interpolation:
+ * phase >= 0  && phase < phaseWrapLimit.  It can be thought of as a rational fraction
+ * of phase/phaseWrapLimit.
+ *
+ * @param phaseWrapLimit is #polyphases<<coefShift, where #polyphases is the number of polyphases
+ * in the polyphase filter. Likewise, #polyphases can be obtained as (phaseWrapLimit>>coefShift).
+ *
+ * @param coefShift gives the bit alignment of the polyphase index in the phase parameter.
+ *
+ * @param halfNumCoefs is half the number of coefficients per polyphase filter. Since the
+ * overall filterbank is odd-length symmetric, only halfNumCoefs need be stored.
+ *
+ * @param coefs is the polyphase filter bank, starting from polyphase index 0, and ranging to
+ * and including the #polyphases.  Each polyphase of the filter has half-length halfNumCoefs
+ * (due to symmetry).  The total size of the filter bank in coefficients is
+ * (#polyphases+1)*halfNumCoefs.
+ *
+ * The filter bank coefs should be aligned to a minimum of 16 bytes (preferably to cache line).
+ *
+ * The coefs should be attenuated (to compensate for passband ripple)
+ * if storing back into the native format.
+ *
+ * @param samples are unaligned input samples.  The position is in the "middle" of the
+ * sample array with respect to the FIR filter:
+ * the negative half of the filter is dot product from samples+1 to samples+halfNumCoefs;
+ * the positive half of the filter is dot product from samples to samples-halfNumCoefs+1.
+ *
+ * @param volumeLR is a pointer to an array of two 32 bit volume values, one per stereo channel,
+ * expressed as a S32 integer.  A negative value inverts the channel 180 degrees.
+ * The pointer volumeLR should be aligned to a minimum of 8 bytes.
+ * A typical value for volume is 0x1000 to align to a unity gain output of 20.12.
+ *
+ * In between calls to filterCoefficient, the phase is incremented by phaseIncrement, where
+ * phaseIncrement is calculated as inputSampling * phaseWrapLimit / outputSampling.
+ *
+ * The filter polyphase index is given by indexP = phase >> coefShift. Due to
+ * odd length symmetric filter, the polyphase index of the negative half depends on
+ * whether interpolation is used.
+ *
+ * The fractional siting between the polyphase indices is given by the bits below coefShift:
+ *
+ * lerpP = phase << 32 - coefShift >> 1;  // for 32 bit unsigned phase multiply
+ * lerpP = phase << 32 - coefShift >> 17; // for 16 bit unsigned phase multiply
+ *
+ * For integer types, this is expressed as:
+ *
+ * lerpP = phase << sizeof(phase)*8 - coefShift
+ *              >> (sizeof(phase)-sizeof(*coefs))*8 + 1;
+ *
+ */
+
+template<int CHANNELS, bool LOCKED, int STRIDE, typename TC>
+static inline
+void fir(int32_t* const out,
+        const uint32_t phase, const uint32_t phaseWrapLimit,
+        const int coefShift, const int halfNumCoefs, const TC* const coefs,
+        const int16_t* const samples, const int32_t* const volumeLR)
+{
+    // NOTE: be very careful when modifying the code here. register
+    // pressure is very high and a small change might cause the compiler
+    // to generate far less efficient code.
+    // Always sanity check the result with objdump or test-resample.
+
+    if (LOCKED) {
+        // locked polyphase (no interpolation)
+        // Compute the polyphase filter index on the positive and negative side.
+        uint32_t indexP = phase >> coefShift;
+        uint32_t indexN = (phaseWrapLimit - phase) >> coefShift;
+        const TC* coefsP = coefs + indexP*halfNumCoefs;
+        const TC* coefsN = coefs + indexN*halfNumCoefs;
+        const int16_t* sP = samples;            // positive half starts at samples
+        const int16_t* sN = samples + CHANNELS; // negative half starts one frame later
+
+        // dot product filter.
+        ProcessL<CHANNELS, STRIDE>(out,
+                halfNumCoefs, coefsP, coefsN, sP, sN, volumeLR);
+    } else {
+        // interpolated polyphase
+        // Compute the polyphase filter index on the positive and negative side.
+        uint32_t indexP = phase >> coefShift;
+        uint32_t indexN = (phaseWrapLimit - phase - 1) >> coefShift; // one's complement.
+        const TC* coefsP = coefs + indexP*halfNumCoefs;
+        const TC* coefsN = coefs + indexN*halfNumCoefs;
+        const TC* coefsP1 = coefsP + halfNumCoefs; // next polyphase after indexP
+        const TC* coefsN1 = coefsN + halfNumCoefs; // next polyphase after indexN
+        const int16_t* sP = samples;            // positive half starts at samples
+        const int16_t* sN = samples + CHANNELS; // negative half starts one frame later
+
+        // Interpolation fraction lerpP derived by shifting all the way up and down
+        // to clear the appropriate bits and align to the appropriate level
+        // for the integer multiply.  The constants should resolve at compile time.
+        //
+        // The interpolated filter coefficient is derived as follows for the pos/neg half:
+        //
+        // interpolated[P] = index[P]*(1-lerpP) + index[P+1]*lerpP
+        // interpolated[N] = index[N+1]*(1-lerpP) + index[N]*lerpP
+        uint32_t lerpP = phase << (sizeof(phase)*8 - coefShift)
+                >> ((sizeof(phase)-sizeof(*coefs))*8 + 1);
+
+        // on-the-fly interpolated dot product filter
+        Process<CHANNELS, STRIDE>(out,
+                halfNumCoefs, coefsP, coefsN, coefsP1, coefsN1, sP, sN, lerpP, volumeLR);
+    }
+}
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H*/
diff --git a/services/audioflinger/AudioResamplerFirProcessNeon.h b/services/audioflinger/AudioResamplerFirProcessNeon.h
new file mode 100644
index 0000000..f311cef
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirProcessNeon.h
@@ -0,0 +1,1149 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
+
+namespace android {
+
+// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
+
+#if USE_NEON
+//
+// NEON specializations are enabled for Process() and ProcessL()
+//
+// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
+// and looping stride 16 (or vice versa). This has some polyphase coef data alignment
+// issues with S16 coefs. Consider this later.
+
+// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
+#define ASSEMBLY_ACCUMULATE_MONO \
+        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load both volumes (64-bit aligned access) */\
+        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
+        "vpadd.s32      d0, d0, d1               \n"/* (1) pairwise add: 4 partial sums in q0 to 2 */\
+        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) add the last pair; total lands in both lanes (L/R) */\
+        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
+        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
+        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */
+
+#define ASSEMBLY_ACCUMULATE_STEREO \
+        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load both volumes (64-bit aligned access) */\
+        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
+        "vpadd.s32      d0, d0, d1               \n"/* (1) pairwise add: 4 partial sums in q0 to 2 */\
+        "vpadd.s32      d8, d8, d9               \n"/* (1) pairwise add: 4 partial sums in q4 to 2 */\
+        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R (q0 total, q4 total) */\
+        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
+        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
+        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */
+
+template <> // NEON: mono samples, int16_t coefs, no coefficient interpolation
+inline void ProcessL<1, 16>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 frames so each 8-frame load ends at the current position
+    asm (
+        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
+
+        "1:                                      \n"
+
+        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
+        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
+        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
+
+        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
+        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed)samples by coef
+        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed)samples by coef
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
+        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
+
+        // moving these ARM instructions before neon above seems to be slower
+        "subs           %[count], %[count], #8   \n"// (1) update loop counter (count must be a nonzero multiple of 8)
+        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+         ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]), // NOTE(review): asm also reads %[out]; "+Uv" may describe it better -- confirm
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q10"
+    );
+}
+
+template <> // NEON: stereo samples, int16_t coefs, no coefficient interpolation
+inline void ProcessL<2, 16>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 frames so each 8-frame load ends at the current position
+    asm (
+        "veor           q0, q0, q0               \n"// (1) acc_L = 0
+        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
+
+        "1:                                      \n"
+
+        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
+        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
+        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
+        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse right positive
+
+        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
+        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
+        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
+        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
+        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
+        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
+        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
+        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
+
+        // moving these ARM before neon seems to be slower
+        "subs           %[count], %[count], #8   \n"// (1) update loop counter (count must be a nonzero multiple of 8)
+        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out] "=Uv" (out[0]), // NOTE(review): asm also reads %[out]; "+Uv" may describe it better -- confirm
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q10"
+     );
+}
+
+template <> // NEON: mono samples, int16_t coefs, coefficients interpolated by lerpP
+inline void Process<1, 16>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* coefsP1,
+        const int16_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 frames so each 8-frame load ends at the current position
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
+        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
+
+        "1:                                      \n"
+
+        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
+        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
+        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
+        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
+
+        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
+        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs
+
+        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
+        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
+
+        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
+        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
+
+        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
+        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
+        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
+        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
+
+        // moving these ARM instructions before neon above seems to be slower
+        "subs           %[count], %[count], #8   \n"// (1) update loop counter (count must be a nonzero multiple of 8)
+        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]), // NOTE(review): asm also reads %[out]; "+Uv" may describe it better -- confirm
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11"
+    );
+}
+
+template <> // NEON: stereo samples, int16_t coefs, coefficients interpolated by lerpP
+inline void Process<2, 16>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* coefsP1,
+        const int16_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 frames so each 8-frame load ends at the current position
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
+        "veor           q0, q0, q0               \n"// (1) acc_L = 0
+        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
+
+        "1:                                      \n"
+
+        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
+        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
+        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
+        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
+
+        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
+        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs
+
+        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
+        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
+        "vrev64.16      q3, q3                   \n"// (1) reverse 8 frames of the right positive
+
+        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
+        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
+
+        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
+        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
+        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
+        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
+        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
+        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
+        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
+        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
+
+        // moving these ARM before neon seems to be slower
+        "subs           %[count], %[count], #8   \n"// (1) update loop counter (count must be a nonzero multiple of 8)
+        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out] "=Uv" (out[0]), // NOTE(review): asm also reads %[out]; "+Uv" may describe it better -- confirm
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q9", "q10", "q11"
+    );
+}
+
+template <> // NEON: mono samples, int32_t coefs, no coefficient interpolation
+inline void ProcessL<1, 16>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 frames so each 8-frame load ends at the current position
+    asm (
+        "veor           q0, q0, q0                    \n"// result, initialize to 0
+
+        "1:                                           \n"
+
+        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
+        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
+        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
+        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
+
+        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
+
+        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
+        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits
+
+        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
+        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef
+
+        "vadd.s32       q0, q0, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q0, q0, q15                   \n"// accumulate result
+        "vadd.s32       q0, q0, q13                   \n"// accumulate result
+
+        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
+        "subs           %[count], %[count], #8        \n"// update loop counter (count must be a nonzero multiple of 8)
+
+        "bne            1b                            \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]), // NOTE(review): asm also reads %[out]; "+Uv" may describe it better -- confirm
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <> // NEON: stereo samples, int32_t coefs, no coefficient interpolation
+inline void ProcessL<2, 16>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 frames so each 8-frame load ends at the current position
+    asm (
+        "veor           q0, q0, q0                    \n"// left result, initialize to 0
+        "veor           q4, q4, q4                    \n"// right result, initialize to 0
+
+        "1:                                           \n"
+
+        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo samples
+        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo samples
+        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
+        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
+
+        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
+        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side
+
+        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef
+
+        "vadd.s32       q0, q0, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
+        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result
+
+        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef
+
+        "vadd.s32       q4, q4, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
+        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result
+
+        "subs           %[count], %[count], #8        \n"// update loop counter (count must be a nonzero multiple of 8)
+        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples
+
+        "bne            1b                            \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out]     "=Uv" (out[0]), // NOTE(review): asm also reads %[out]; "+Uv" may describe it better -- confirm
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <> // mono, int32 coefs interpolated between two coef sets, 8 samples per side per pass
+inline void Process<1, 16>(int32_t* const out, // asm operand %[out]
+        int count,              // loop counter: minus 8 per pass, loop exits only at exactly 0
+        const int32_t* coefsP,  // positive side: interpolation base (':128' = 16-byte aligned)
+        const int32_t* coefsN,  // negative side: interpolation target (16-byte aligned)
+        const int32_t* coefsP1, // positive side: interpolation target (16-byte aligned)
+        const int32_t* coefsN1, // negative side: interpolation base (16-byte aligned)
+        const int16_t* sP,      // positive-side samples, consumed toward lower addresses
+        const int16_t* sN,      // negative-side samples, consumed toward higher addresses
+        uint32_t lerpP,         // interpolation factor, scalar operand of vqrdmulh.s32
+        const int32_t* const volumeLR) // asm operand %[vLR]
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 samples: the first 8-sample load ends at incoming sP
+    asm (
+        "vmov.32        d2[0], %[lerpP]               \n"// lerpP into lane 0 of d2 (interpolation factor)
+        "veor           q0, q0, q0                    \n"// result, initialize to 0
+
+        "1:                                           \n"
+
+        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples (no writeback)
+        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples, advance sN
+        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs (positive base)
+        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs (positive target)
+        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs (negative base)
+        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs (negative target)
+
+        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1): target - base
+        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
+        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
+        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
+
+        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2): scale difference by lerpP
+        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
+
+        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3): base + scaled difference
+        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
+        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
+        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
+
+        "vrev64.16      q2, q2                        \n"// reverse the 4 samples inside each half of q2
+
+        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// d4 half pairs with coefs 4..7: halves are
+        "vqrdmulh.s32   q13, q13, q8                  \n"// crossed to finish the 8-sample reversal
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q0, q0, q15                   \n"// accumulate result
+        "vadd.s32       q0, q0, q13                   \n"// accumulate result
+
+        "sub            %[sP], %[sP], #16             \n"// move pointer back 8 samples (16 bytes)
+        "subs           %[count], %[count], #8        \n"// update loop counter (sets flags for bne)
+
+        "bne            1b                            \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO // defined outside this chunk; presumably consumes q0, %[out], %[vLR]
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <> // stereo, int32 coefs interpolated between two coef sets, 8 frames per side per pass
+inline void Process<2, 16>(int32_t* const out, // asm operand %[out]
+        int count,              // loop counter: minus 8 per pass, loop exits only at exactly 0
+        const int32_t* coefsP,  // positive side: interpolation base (':128' = 16-byte aligned)
+        const int32_t* coefsN,  // negative side: interpolation target (16-byte aligned)
+        const int32_t* coefsP1, // positive side: interpolation target (16-byte aligned)
+        const int32_t* coefsN1, // negative side: interpolation base (16-byte aligned)
+        const int16_t* sP,      // positive-side interleaved frames, consumed toward lower addresses
+        const int16_t* sN,      // negative-side interleaved frames, consumed toward higher addresses
+        uint32_t lerpP,         // interpolation factor, scalar operand of vqrdmulh.s32
+        const int32_t* const volumeLR) // asm operand %[vLR]
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 frames: the first 8-frame load ends at incoming sP
+    asm (
+        "vmov.32        d2[0], %[lerpP]               \n"// lerpP into lane 0 of d2 (interpolation factor)
+        "veor           q0, q0, q0                    \n"// channel 0 result, initialize to 0
+        "veor           q4, q4, q4                    \n"// channel 1 result, initialize to 0
+
+        "1:                                           \n"
+
+        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 stereo frames: q2 = ch 0, q3 = ch 1
+        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 stereo frames: q5 = ch 0, q6 = ch 1
+        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs (positive base)
+        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs (positive target)
+        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs (negative base)
+        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs (negative target)
+
+        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1): target - base
+        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
+        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
+        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
+
+        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2): scale difference by lerpP
+        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
+
+        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3): base + scaled difference
+        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
+        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
+        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
+
+        "vrev64.16      q2, q2                        \n"// ch 0: reverse the 4 samples inside each half
+        "vrev64.16      q3, q3                        \n"// ch 1: reverse the 4 samples inside each half
+
+        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// d4 half pairs with coefs 4..7: halves are
+        "vqrdmulh.s32   q13, q13, q8                  \n"// crossed to finish the 8-frame reversal
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
+        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result
+
+        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// same crossed pairing as above, for ch 1
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q4, q4, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
+        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result
+
+        "subs           %[count], %[count], #8        \n"// update loop counter (sets flags for bne)
+        "sub            %[sP], %[sP], #32             \n"// move pointer back 8 frames (32 bytes)
+
+        "bne            1b                            \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO // defined outside this chunk; presumably consumes q0/q4, %[out], %[vLR]
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <> // mono, int16 coefs used as-is (no interpolation), 4 samples per side per pass
+inline void ProcessL<1, 8>(int32_t* const out, // asm operand %[out]
+        int count,             // loop counter: minus 4 per pass, loop exits only at exactly 0
+        const int16_t* coefsP, // positive-side coefs, read forward (':64' = 8-byte aligned)
+        const int16_t* coefsN, // negative-side coefs, read forward (8-byte aligned)
+        const int16_t* sP,     // positive-side samples, consumed toward lower addresses
+        const int16_t* sN,     // negative-side samples, consumed toward higher addresses
+        const int32_t* const volumeLR) // asm operand %[vLR]
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 samples: the first 4-sample load ends at incoming sP
+    asm (
+        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
+
+        "1:                                      \n"
+
+        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
+        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
+        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
+        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs
+
+        "vrev64.16      d4, d4                   \n"// (1) reverse the 4 samples: s3, s2, s1, s0
+
+        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed)samples by coef
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
+
+        // moving these ARM instructions before neon above seems to be slower
+        "subs           %[count], %[count], #4   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #8         \n"// (0) move pointer back 4 samples (8 bytes)
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO // defined outside this chunk; presumably consumes q0, %[out], %[vLR]
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q10"
+    );
+}
+
+template <> // stereo, int16 coefs used as-is (no interpolation), 4 frames per side per pass
+inline void ProcessL<2, 8>(int32_t* const out, // asm operand %[out]
+        int count,             // loop counter: minus 4 per pass, loop exits only at exactly 0
+        const int16_t* coefsP, // positive-side coefs, read forward (':64' = 8-byte aligned)
+        const int16_t* coefsN, // negative-side coefs, read forward (8-byte aligned)
+        const int16_t* sP,     // positive-side interleaved frames, consumed toward lower addresses
+        const int16_t* sN,     // negative-side interleaved frames, consumed toward higher addresses
+        const int32_t* const volumeLR) // asm operand %[vLR]
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 frames: the first 4-frame load ends at incoming sP
+    asm (
+        "veor           q0, q0, q0               \n"// (1) acc_L = 0
+        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
+
+        "1:                                      \n"
+
+        "vld2.16        {d4, d5}, [%[sP]]        \n"// (2+0d) load 4 stereo frames: d4 = left, d5 = right
+        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (2) load 4 stereo frames: d6 = left, d7 = right
+        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
+        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse the 4 positive frames, both channels
+
+        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
+        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
+        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right
+
+        // moving these ARM before neon seems to be slower
+        "subs           %[count], %[count], #4   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #16        \n"// (0) move pointer back 4 frames (16 bytes)
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO // defined outside this chunk; presumably consumes q0/q4, %[out], %[vLR]
+
+        : [out] "=Uv" (out[0]),
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q10"
+     );
+}
+
+template <> // mono, int16 coefs interpolated between two coef sets, 4 samples per side per pass
+inline void Process<1, 8>(int32_t* const out, // asm operand %[out]
+        int count,              // loop counter: minus 4 per pass, loop exits only at exactly 0
+        const int16_t* coefsP,  // positive side: interpolation base (':64' = 8-byte aligned)
+        const int16_t* coefsN,  // negative side: interpolation target (8-byte aligned)
+        const int16_t* coefsP1, // positive side: interpolation target (8-byte aligned)
+        const int16_t* coefsN1, // negative side: interpolation base (8-byte aligned)
+        const int16_t* sP,      // positive-side samples, consumed toward lower addresses
+        const int16_t* sN,      // negative-side samples, consumed toward higher addresses
+        uint32_t lerpP,         // interpolation factor; only its low 16 bits (s16 lane 0) are used
+        const int32_t* const volumeLR) // asm operand %[vLR]
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 samples: the first 4-sample load ends at incoming sP
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
+        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
+
+        "1:                                      \n"
+
+        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
+        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
+        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs (positive base)
+        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
+        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs (negative base)
+        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
+
+        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
+        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coefs
+
+        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
+        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
+
+        "vrev64.16      d4, d4                   \n"// (1) reverse the 4 samples: s3, s2, s1, s0
+
+        "vadd.s16       d16, d16, d17            \n"// (1+2d) interpolate (step3) 1st set
+        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set
+
+        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+        "vmlal.s16      q0, d4, d16              \n"// (1+0d) multiply (reversed)by coef
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
+
+        // moving these ARM instructions before neon above seems to be slower
+        "subs           %[count], %[count], #4   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #8        \n"// move pointer back 4 samples (8 bytes)
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO // defined outside this chunk; presumably consumes q0, %[out], %[vLR]
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11"
+    );
+}
+
+template <> // stereo, int16 coefs interpolated between two coef sets, 4 frames per side per pass
+inline void Process<2, 8>(int32_t* const out, // asm operand %[out]
+        int count,              // loop counter: minus 4 per pass, loop exits only at exactly 0
+        const int16_t* coefsP,  // positive side: interpolation base (':64' = 8-byte aligned)
+        const int16_t* coefsN,  // negative side: interpolation target (8-byte aligned)
+        const int16_t* coefsP1, // positive side: interpolation target (8-byte aligned)
+        const int16_t* coefsN1, // negative side: interpolation base (8-byte aligned)
+        const int16_t* sP,      // positive-side interleaved frames, consumed toward lower addresses
+        const int16_t* sN,      // negative-side interleaved frames, consumed toward higher addresses
+        uint32_t lerpP,         // interpolation factor; only its low 16 bits (s16 lane 0) are used
+        const int32_t* const volumeLR) // asm operand %[vLR]
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 frames: the first 4-frame load ends at incoming sP
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
+        "veor           q0, q0, q0               \n"// (1) acc_L = 0
+        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
+
+        "1:                                      \n"
+
+        "vld2.16        {d4, d5}, [%[sP]]        \n"// (3+0d) load 4 stereo frames: d4 = left, d5 = right
+        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (3) load 4 stereo frames: d6 = left, d7 = right
+        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs (positive base)
+        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
+        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs (negative base)
+        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
+
+        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
+        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coefs
+
+        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
+        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse the 4 positive frames, both channels
+
+        "vadd.s16       d16, d16, d17            \n"// (1+1d) interpolate (step3) 1st set
+        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set
+
+        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
+        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
+        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right
+
+        // moving these ARM before neon seems to be slower
+        "subs           %[count], %[count], #4   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #16        \n"// move pointer back 4 frames (16 bytes)
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO // defined outside this chunk; presumably consumes q0/q4, %[out], %[vLR]
+
+        : [out] "=Uv" (out[0]),
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q9", "q10", "q11"
+    );
+}
+
+template <> // mono, int32 coefs used as-is (no interpolation), 4 samples per side per pass
+inline void ProcessL<1, 8>(int32_t* const out, // asm operand %[out]
+        int count,             // loop counter: minus 4 per pass, loop exits only at exactly 0
+        const int32_t* coefsP, // positive-side coefs, read forward (':128' = 16-byte aligned)
+        const int32_t* coefsN, // negative-side coefs, read forward (16-byte aligned)
+        const int16_t* sP,     // positive-side samples, consumed toward lower addresses
+        const int16_t* sN,     // negative-side samples, consumed toward higher addresses
+        const int32_t* const volumeLR) // asm operand %[vLR]
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 samples: the first 4-sample load ends at incoming sP
+    asm (
+        "veor           q0, q0, q0               \n"// result, initialize to 0
+
+        "1:                                      \n"
+
+        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
+        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
+        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
+        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
+
+        "vrev64.16      d4, d4                   \n"// reverse the 4 samples of the positive side
+
+        "vshll.s16      q12, d4, #15             \n"// (stall) extend samples to 31 bits
+        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
+        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
+
+        "vadd.s32       q0, q0, q12              \n"// accumulate result
+        "vadd.s32       q0, q0, q14              \n"// (stall) accumulate result
+
+        "subs           %[count], %[count], #4   \n"// update loop counter
+        "sub            %[sP], %[sP], #8         \n"// move pointer back 4 samples (8 bytes)
+
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO // defined outside this chunk; presumably consumes q0, %[out], %[vLR]
+
+        : [out] "=Uv" (out[0]),
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11",
+          "q12", "q14"
+    );
+}
+
+template <> // stereo, int32 coefs used as-is (no interpolation), 4 frames per side per pass
+inline void ProcessL<2, 8>(int32_t* const out, // asm operand %[out]
+        int count,             // loop counter: minus 4 per pass, loop exits only at exactly 0
+        const int32_t* coefsP, // positive-side coefs, read forward (':128' = 16-byte aligned)
+        const int32_t* coefsN, // negative-side coefs, read forward (16-byte aligned)
+        const int16_t* sP,     // positive-side interleaved frames, consumed toward lower addresses
+        const int16_t* sN,     // negative-side interleaved frames, consumed toward higher addresses
+        const int32_t* const volumeLR) // asm operand %[vLR]
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 frames: the first 4-frame load ends at incoming sP
+    asm (
+        "veor           q0, q0, q0               \n"// channel 0 result, initialize to 0
+        "veor           q4, q4, q4               \n"// channel 1 result, initialize to 0
+
+        "1:                                      \n"
+
+        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 stereo frames: d4 = ch 0, d5 = ch 1
+        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 stereo frames: d6 = ch 0, d7 = ch 1
+        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
+        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
+
+        "vrev64.16      q2, q2                   \n"// reverse the 4 positive frames, both channels
+
+        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
+        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits
+
+        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
+        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
+        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by coef
+        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
+        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by coef
+
+        "vadd.s32       q0, q0, q12              \n"// accumulate result
+        "vadd.s32       q4, q4, q13              \n"// accumulate result
+        "vadd.s32       q0, q0, q14              \n"// accumulate result
+        "vadd.s32       q4, q4, q15              \n"// accumulate result
+
+        "subs           %[count], %[count], #4   \n"// update loop counter
+        "sub            %[sP], %[sP], #16        \n"// move pointer back 4 frames (16 bytes)
+
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO // defined outside this chunk; presumably consumes q0/q4, %[out], %[vLR]
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3", "q4",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <> // mono, int32 coefs interpolated between two coef sets, 4 samples per side per pass
+inline void Process<1, 8>(int32_t* const out, // asm operand %[out]
+        int count,              // loop counter: minus 4 per pass, loop exits only at exactly 0
+        const int32_t* coefsP,  // positive side: interpolation base (':128' = 16-byte aligned)
+        const int32_t* coefsN,  // negative side: interpolation target (16-byte aligned)
+        const int32_t* coefsP1, // positive side: interpolation target (16-byte aligned)
+        const int32_t* coefsN1, // negative side: interpolation base (16-byte aligned)
+        const int16_t* sP,      // positive-side samples, consumed toward lower addresses
+        const int16_t* sN,      // negative-side samples, consumed toward higher addresses
+        uint32_t lerpP,         // interpolation factor, scalar operand of vqrdmulh.s32
+        const int32_t* const volumeLR) // asm operand %[vLR]
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 samples: the first 4-sample load ends at incoming sP
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// lerpP into lane 0 of d2 (interpolation factor)
+        "veor           q0, q0, q0               \n"// result, initialize to 0
+
+        "1:                                      \n"
+
+        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
+        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
+        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs (positive base)
+        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
+        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs (negative base)
+        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
+
+        "vrev64.16      d4, d4                   \n"// reverse the 4 samples of the positive side
+
+        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
+        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coefs
+        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
+        "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set of coefs
+        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
+
+        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
+        "vadd.s32       q10, q10, q11            \n"// interpolate (step3) 2nd set
+
+        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12              \n"// accumulate result
+        "vadd.s32       q0, q0, q14              \n"// accumulate result
+
+        "subs           %[count], %[count], #4   \n"// update loop counter
+        "sub            %[sP], %[sP], #8         \n"// move pointer back 4 samples (8 bytes)
+
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO // defined outside this chunk; presumably consumes q0, %[out], %[vLR]
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN0] "+r" (coefsN),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11",
+          "q12", "q14"
+    );
+}
+
+template <> // stereo, int32 coefs interpolated between two coef sets, 4 frames per side per pass
+inline
+void Process<2, 8>(int32_t* const out, // asm operand %[out]
+        int count,              // loop counter: minus 4 per pass, loop exits only at exactly 0
+        const int32_t* coefsP,  // positive side: interpolation base (':128' = 16-byte aligned)
+        const int32_t* coefsN,  // negative side: interpolation target (16-byte aligned)
+        const int32_t* coefsP1, // positive side: interpolation target (16-byte aligned)
+        const int32_t* coefsN1, // negative side: interpolation base (16-byte aligned)
+        const int16_t* sP,      // positive-side interleaved frames, consumed toward lower addresses
+        const int16_t* sN,      // negative-side interleaved frames, consumed toward higher addresses
+        uint32_t lerpP,         // interpolation factor, scalar operand of vqrdmulh.s32
+        const int32_t* const volumeLR) // asm operand %[vLR]
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 frames: the first 4-frame load ends at incoming sP
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// lerpP into lane 0 of d2 (lane 1 is NOT written)
+        "veor           q0, q0, q0               \n"// channel 0 result, initialize to 0
+        "veor           q4, q4, q4               \n"// channel 1 result, initialize to 0
+
+        "1:                                      \n"
+        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 stereo frames: d4 = ch 0, d5 = ch 1
+        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 stereo frames: d6 = ch 0, d7 = ch 1
+        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs (positive base)
+        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
+        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs (negative base)
+        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
+
+        "vrev64.16      q2, q2                   \n"// reverse the 4 positive frames, both channels
+
+        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
+        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coefs
+        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
+        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
+        "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set; must use lane 0 too
+        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
+        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits
+
+        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
+        "vadd.s32       q10, q10, q11            \n"// interpolate (step3) 2nd set
+
+        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12              \n"// accumulate result
+        "vadd.s32       q4, q4, q13              \n"// accumulate result
+        "vadd.s32       q0, q0, q14              \n"// accumulate result
+        "vadd.s32       q4, q4, q15              \n"// accumulate result
+
+        "subs           %[count], %[count], #4   \n"// update loop counter
+        "sub            %[sP], %[sP], #16        \n"// move pointer back 4 frames (16 bytes)
+
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO // defined outside this chunk; presumably consumes q0/q4, %[out], %[vLR]
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN0] "+r" (coefsN),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3", "q4",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+#endif //USE_NEON
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/
diff --git a/services/audioflinger/Threads.cpp b/services/audioflinger/Threads.cpp
index 35b8575..01b90a8 100644
--- a/services/audioflinger/Threads.cpp
+++ b/services/audioflinger/Threads.cpp
@@ -3047,15 +3047,8 @@ AudioFlinger::PlaybackThread::mixer_state AudioFlinger::MixerThread::prepareTrac
                 (mMixerStatusIgnoringFastTracks == MIXER_TRACKS_READY)) {
             minFrames = desiredFrames;
         }
-        // It's not safe to call framesReady() for a static buffer track, so assume it's ready
-        size_t framesReady;
-        if (track->sharedBuffer() == 0) {
-            framesReady = track->framesReady();
-        } else if (track->isStopped()) {
-            framesReady = 0;
-        } else {
-            framesReady = 1;
-        }
+
+        size_t framesReady = track->framesReady();
         if ((framesReady >= minFrames) && track->isReady() &&
                 !track->isPaused() && !track->isTerminated())
         {
diff --git a/services/audioflinger/Tracks.cpp b/services/audioflinger/Tracks.cpp
index 53196c8..a5b9ac5 100644
--- a/services/audioflinger/Tracks.cpp
+++ b/services/audioflinger/Tracks.cpp
@@ -564,7 +564,7 @@ size_t AudioFlinger::PlaybackThread::Track::framesReleased() const
 
 // Don't call for fast tracks; the framesReady() could result in priority inversion
 bool AudioFlinger::PlaybackThread::Track::isReady() const {
-    if (mFillingUpStatus != FS_FILLING || isStopped() || isPausing()) {
+    if (mFillingUpStatus != FS_FILLING || isStopped() || isPausing() || isStopping()) {
         return true;
     }
 
diff --git a/services/audioflinger/test-resample.cpp b/services/audioflinger/test-resample.cpp
index 0d00a0f..66fcd90 100644
--- a/services/audioflinger/test-resample.cpp
+++ b/services/audioflinger/test-resample.cpp
@@ -33,8 +33,9 @@ using namespace android;
 bool gVerbose = false;
 
 static int usage(const char* name) {
-    fprintf(stderr,"Usage: %s [-p] [-h] [-v] [-s] [-q {dq|lq|mq|hq|vhq}] [-i input-sample-rate] "
-                   "[-o output-sample-rate] [<input-file>] <output-file>\n", name);
+    fprintf(stderr,"Usage: %s [-p] [-h] [-v] [-s] [-q {dq|lq|mq|hq|vhq|dlq|dmq|dhq}]"
+                   " [-i input-sample-rate] [-o output-sample-rate] [<input-file>]"
+                   " <output-file>\n", name);
     fprintf(stderr,"    -p    enable profiling\n");
     fprintf(stderr,"    -h    create wav file\n");
     fprintf(stderr,"    -v    verbose : log buffer provider calls\n");
@@ -45,6 +46,9 @@ static int usage(const char* name) {
     fprintf(stderr,"              mq  : medium quality\n");
     fprintf(stderr,"              hq  : high quality\n");
     fprintf(stderr,"              vhq : very high quality\n");
+    fprintf(stderr,"              dlq : dynamic low quality\n");
+    fprintf(stderr,"              dmq : dynamic medium quality\n");
+    fprintf(stderr,"              dhq : dynamic high quality\n");
     fprintf(stderr,"    -i    input file sample rate (ignored if input file is specified)\n");
     fprintf(stderr,"    -o    output file sample rate\n");
     return -1;
@@ -53,7 +57,8 @@ static int usage(const char* name) {
 int main(int argc, char* argv[]) {
 
     const char* const progname = argv[0];
-    bool profiling = false;
+    bool profileResample = false;
+    bool profileFilter = false;
     bool writeHeader = false;
     int channels = 1;
     int input_freq = 0;
@@ -61,10 +66,13 @@ int main(int argc, char* argv[]) {
     AudioResampler::src_quality quality = AudioResampler::DEFAULT_QUALITY;
 
     int ch;
-    while ((ch = getopt(argc, argv, "phvsq:i:o:")) != -1) {
+    while ((ch = getopt(argc, argv, "pfhvsq:i:o:")) != -1) {
         switch (ch) {
         case 'p':
-            profiling = true;
+            profileResample = true;
+            break;
+        case 'f':
+            profileFilter = true;
             break;
         case 'h':
             writeHeader = true;
@@ -86,6 +94,12 @@ int main(int argc, char* argv[]) {
                 quality = AudioResampler::HIGH_QUALITY;
             else if (!strcmp(optarg, "vhq"))
                 quality = AudioResampler::VERY_HIGH_QUALITY;
+            else if (!strcmp(optarg, "dlq"))
+                quality = AudioResampler::DYN_LOW_QUALITY;
+            else if (!strcmp(optarg, "dmq"))
+                quality = AudioResampler::DYN_MED_QUALITY;
+            else if (!strcmp(optarg, "dhq"))
+                quality = AudioResampler::DYN_HIGH_QUALITY;
             else {
                 usage(progname);
                 return -1;
@@ -137,6 +151,8 @@ int main(int argc, char* argv[]) {
         channels = info.channels;
         input_freq = info.samplerate;
     } else {
+        // data for testing is exactly (input sampling rate/1000)/2 seconds
+        // so 44.1khz input is 22.05 seconds
         double k = 1000; // Hz / s
         double time = (input_freq / 2) / k;
         size_t input_frames = size_t(input_freq * time);
@@ -148,7 +164,7 @@ int main(int argc, char* argv[]) {
             double y = sin(M_PI * k * t * t);
             int16_t yi = floor(y * 32767.0 + 0.5);
             for (size_t j=0 ; j<(size_t)channels ; j++) {
-                in[i*channels + j] = yi / (1+j);
+                in[i*channels + j] = yi / (1+j); // right ch. 1/2 left ch.
             }
         }
     }
@@ -170,6 +186,7 @@ int main(int argc, char* argv[]) {
         }
         virtual status_t getNextBuffer(Buffer* buffer,
                 int64_t pts = kInvalidPTS) {
+            (void)pts; // suppress warning
             size_t requestedFrames = buffer->frameCount;
             if (requestedFrames > mNumFrames - mNextFrame) {
                 buffer->frameCount = mNumFrames - mNextFrame;
@@ -202,6 +219,11 @@ int main(int argc, char* argv[]) {
                 mNextFrame += buffer->frameCount;
                 mUnrel -= buffer->frameCount;
             }
+            buffer->frameCount = 0;
+            buffer->i16 = NULL;
+        }
+        void reset() {
+            mNextFrame = 0;
         }
     } provider(input_vaddr, input_size, channels);
 
@@ -212,37 +234,110 @@ int main(int argc, char* argv[]) {
     size_t output_size = 2 * 4 * ((int64_t) input_frames * output_freq) / input_freq;
     output_size &= ~7; // always stereo, 32-bits
 
-    void* output_vaddr = malloc(output_size);
-
-    if (profiling) {
+    if (profileFilter) {
+        // Measure the speed of sample rate changes that force a filter recalculation.
+        // Each change must yield a downsampling ratio and must differ from the
+        // previous rate by more than 10%.
+        //
+        // On fast devices, generating a filter should take between 0.1ms and 1ms
+        // (single threaded).
         AudioResampler* resampler = AudioResampler::create(16, channels,
-                output_freq, quality);
-
-        size_t out_frames = output_size/8;
-        resampler->setSampleRate(input_freq);
-        resampler->setVolume(0x1000, 0x1000);
-
-        memset(output_vaddr, 0, output_size);
+                8000, quality);
+        int looplimit = 100;
         timespec start, end;
         clock_gettime(CLOCK_MONOTONIC, &start);
-        resampler->resample((int*) output_vaddr, out_frames, &provider);
-        resampler->resample((int*) output_vaddr, out_frames, &provider);
-        resampler->resample((int*) output_vaddr, out_frames, &provider);
-        resampler->resample((int*) output_vaddr, out_frames, &provider);
+        for (int i = 0; i < looplimit; ++i) {
+            resampler->setSampleRate(9000);
+            resampler->setSampleRate(12000);
+            resampler->setSampleRate(20000);
+            resampler->setSampleRate(30000);
+        }
         clock_gettime(CLOCK_MONOTONIC, &end);
         int64_t start_ns = start.tv_sec * 1000000000LL + start.tv_nsec;
         int64_t end_ns = end.tv_sec * 1000000000LL + end.tv_nsec;
-        int64_t time = (end_ns - start_ns)/4;
-        printf("%f Mspl/s\n", out_frames/(time/1e9)/1e6);
+        int64_t time = end_ns - start_ns;
+        printf("%.2f sample rate changes with filter calculation/sec\n",
+                looplimit * 4 / (time / 1e9));
 
+        // Check how fast sample rate changes are without filter changes.
+        // This should be very fast, probably 0.1us - 1us per sample rate
+        // change.
+        resampler->setSampleRate(1000);
+        looplimit = 1000;
+        clock_gettime(CLOCK_MONOTONIC, &start);
+        for (int i = 0; i < looplimit; ++i) {
+            resampler->setSampleRate(1000+i);
+        }
+        clock_gettime(CLOCK_MONOTONIC, &end);
+        start_ns = start.tv_sec * 1000000000LL + start.tv_nsec;
+        end_ns = end.tv_sec * 1000000000LL + end.tv_nsec;
+        time = end_ns - start_ns;
+        printf("%.2f sample rate changes without filter calculation/sec\n",
+                looplimit / (time / 1e9));
+        resampler->reset();
         delete resampler;
     }
 
+    void* output_vaddr = malloc(output_size);
     AudioResampler* resampler = AudioResampler::create(16, channels,
             output_freq, quality);
     size_t out_frames = output_size/8;
+
+    /* set volume precision to 12 bits, so the volume scale is 1<<12.
+     * This means the "integer" part fits in the Q19.12 precision
+     * representation of output int32_t.
+     *
+     * Generally 0 < volumePrecision <= 14 (due to the limits of
+     * int16_t values for Volume). volumePrecision cannot be 0 due
+     * to rounding and shifts.
+     */
+    const int volumePrecision = 12; // in bits
+
     resampler->setSampleRate(input_freq);
-    resampler->setVolume(0x1000, 0x1000);
+    resampler->setVolume(1 << volumePrecision, 1 << volumePrecision);
+
+    if (profileResample) {
+        /*
+         * For profiling on mobile devices, upon experimentation
+         * it is better to run a few trials with a shorter loop limit,
+         * and take the minimum time.
+         *
+         * Long tests can cause CPU temperature to build up and thermal throttling
+         * to reduce CPU frequency.
+         *
+         * For frequency checks (index=0, or 1, etc.):
+         * "cat /sys/devices/system/cpu/cpu${index}/cpufreq/scaling_*_freq"
+         *
+         * For temperature checks (index=0, or 1, etc.):
+         * "cat /sys/class/thermal/thermal_zone${index}/temp"
+         *
+         * Another way to avoid thermal throttling is to fix the CPU frequency
+         * at a lower level which prevents excessive temperatures.
+         */
+        const int trials = 4;
+        const int looplimit = 4;
+        timespec start, end;
+        int64_t time;
+
+        for (int n = 0; n < trials; ++n) {
+            clock_gettime(CLOCK_MONOTONIC, &start);
+            for (int i = 0; i < looplimit; ++i) {
+                resampler->resample((int*) output_vaddr, out_frames, &provider);
+                provider.reset(); //  during benchmarking reset only the provider
+            }
+            clock_gettime(CLOCK_MONOTONIC, &end);
+            int64_t start_ns = start.tv_sec * 1000000000LL + start.tv_nsec;
+            int64_t end_ns = end.tv_sec * 1000000000LL + end.tv_nsec;
+            int64_t diff_ns = end_ns - start_ns;
+            if (n == 0 || diff_ns < time) {
+                time = diff_ns;   // save the best out of our trials.
+            }
+        }
+        // Mfrms/s = millions of output frames/sec; cast keeps %lld valid where int64_t is long.
+        printf("quality: %d  channels: %d  msec: %lld  Mfrms/s: %.2lf\n", quality, channels,
+                (long long)(time / 1000000), out_frames * looplimit / (time / 1e9) / 1e6);
+        resampler->reset();
+    }
 
     memset(output_vaddr, 0, output_size);
     if (gVerbose) {
@@ -256,15 +351,31 @@ int main(int argc, char* argv[]) {
     if (gVerbose) {
         printf("reset() complete\n");
     }
+    delete resampler;
+    resampler = NULL;
 
-    // down-mix (we just truncate and keep the left channel)
+    // mono takes left channel only
+    // stereo right channel is half amplitude of stereo left channel (due to input creation)
     int32_t* out = (int32_t*) output_vaddr;
     int16_t* convert = (int16_t*) malloc(out_frames * channels * sizeof(int16_t));
+
+    // round to nearest, ties towards zero, and saturate at int16 (non-dithered)
+    const int roundVal = (1<<(volumePrecision-1)) - 1; // volumePrecision > 0
+
     for (size_t i = 0; i < out_frames; i++) {
-        for (int j=0 ; j<channels ; j++) {
-            int32_t s = out[i * 2 + j] >> 12;
-            if (s > 32767)       s =  32767;
-            else if (s < -32768) s = -32768;
+        for (int j = 0; j < channels; j++) {
+            int32_t s = out[i * 2 + j] + roundVal; // add offset here
+            if (s < 0) {
+                s = (s + 1) >> volumePrecision; // round to 0
+                if (s < -32768) {
+                    s = -32768;
+                }
+            } else {
+                s = s >> volumePrecision;
+                if (s > 32767) {
+                    s = 32767;
+                }
+            }
             convert[i * channels + j] = int16_t(s);
         }
     }
diff --git a/services/camera/libcameraservice/api2/CameraDeviceClient.cpp b/services/camera/libcameraservice/api2/CameraDeviceClient.cpp
index 1cdf8dc..187220e 100644
--- a/services/camera/libcameraservice/api2/CameraDeviceClient.cpp
+++ b/services/camera/libcameraservice/api2/CameraDeviceClient.cpp
@@ -635,26 +635,56 @@ status_t CameraDeviceClient::getRotationTransformLocked(int32_t* transform) {
         return INVALID_OPERATION;
     }
 
+    camera_metadata_ro_entry_t entryFacing = staticInfo.find(ANDROID_LENS_FACING);
+    if (entryFacing.count == 0) { // check the facing entry itself; its data.u8[0] is read below
+        ALOGE("%s: Camera %d: Can't find android.lens.facing in "
+                "static metadata!", __FUNCTION__, mCameraId);
+        return INVALID_OPERATION;
+    }
+
     int32_t& flags = *transform;
 
+    bool mirror = (entryFacing.data.u8[0] == ANDROID_LENS_FACING_FRONT);
     int orientation = entry.data.i32[0];
-    switch (orientation) {
-        case 0:
-            flags = 0;
-            break;
-        case 90:
-            flags = NATIVE_WINDOW_TRANSFORM_ROT_90;
-            break;
-        case 180:
-            flags = NATIVE_WINDOW_TRANSFORM_ROT_180;
-            break;
-        case 270:
-            flags = NATIVE_WINDOW_TRANSFORM_ROT_270;
-            break;
-        default:
-            ALOGE("%s: Invalid HAL android.sensor.orientation value: %d",
-                  __FUNCTION__, orientation);
-            return INVALID_OPERATION;
+    if (!mirror) {
+        switch (orientation) {
+            case 0:
+                flags = 0;
+                break;
+            case 90:
+                flags = NATIVE_WINDOW_TRANSFORM_ROT_90;
+                break;
+            case 180:
+                flags = NATIVE_WINDOW_TRANSFORM_ROT_180;
+                break;
+            case 270:
+                flags = NATIVE_WINDOW_TRANSFORM_ROT_270;
+                break;
+            default:
+                ALOGE("%s: Invalid HAL android.sensor.orientation value: %d",
+                      __FUNCTION__, orientation);
+                return INVALID_OPERATION;
+        }
+    } else {
+        switch (orientation) {
+            case 0:
+                flags = HAL_TRANSFORM_FLIP_H;
+                break;
+            case 90:
+                flags = HAL_TRANSFORM_FLIP_H | HAL_TRANSFORM_ROT_90;
+                break;
+            case 180:
+                flags = HAL_TRANSFORM_FLIP_V;
+                break;
+            case 270:
+                flags = HAL_TRANSFORM_FLIP_V | HAL_TRANSFORM_ROT_90;
+                break;
+            default:
+                ALOGE("%s: Invalid HAL android.sensor.orientation value: %d",
+                      __FUNCTION__, orientation);
+                return INVALID_OPERATION;
+        }
+
     }
 
     /**
diff --git a/tools/resampler_tools/fir.cpp b/tools/resampler_tools/fir.cpp
index cc3d509..3d6a74d 100644
--- a/tools/resampler_tools/fir.cpp
+++ b/tools/resampler_tools/fir.cpp
@@ -20,15 +20,25 @@
 #include <stdlib.h>
 #include <string.h>
 
-static double sinc(double x) {
+static inline double sinc(double x) {
     if (fabs(x) == 0.0f) return 1.0f;
     return sin(x) / x;
 }
 
-static double sqr(double x) {
+static inline double sqr(double x) {
     return x*x;
 }
 
+static inline int64_t toint(double x, int64_t maxval) {
+    int64_t v;
+    // scale x by maxval and round to nearest via floor(. + 0.5); only the upper bound is clamped
+    v = static_cast<int64_t>(floor(x * maxval + 0.5));
+    if (v >= maxval) {
+        return maxval - 1; // saturate: maxval itself is out of range, largest result is maxval-1
+    }
+    return v;
+}
+
 static double I0(double x) {
     // from the Numerical Recipes in C p. 237
     double ax,ans,y;
@@ -54,11 +64,12 @@ static double kaiser(int k, int N, double beta) {
     return I0(beta * sqrt(1.0 - sqr((2.0*k)/N - 1.0))) / I0(beta);
 }
 
-
 static void usage(char* name) {
     fprintf(stderr,
-            "usage: %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings] [-f {float|fixed}] [-b beta] [-v dBFS] [-l lerp]\n"
-            "       %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings] [-f {float|fixed}] [-b beta] [-v dBFS] -p M/N\n"
+            "usage: %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings]"
+            " [-f {float|fixed|fixed16}] [-b beta] [-v dBFS] [-l lerp]\n"
+            "       %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings]"
+            " [-f {float|fixed|fixed16}] [-b beta] [-v dBFS] -p M/N\n"
             "    -h    this help message\n"
             "    -d    debug, print comma-separated coefficient table\n"
             "    -p    generate poly-phase filter coefficients, with sample increment M/N\n"
@@ -66,6 +77,7 @@ static void usage(char* name) {
             "    -c    cut-off frequency (20478)\n"
             "    -n    number of zero-crossings on one side (8)\n"
             "    -l    number of lerping bits (4)\n"
+            "    -m    number of polyphases (related to -l, default 16)\n"
             "    -f    output format, can be fixed-point or floating-point (fixed)\n"
             "    -b    kaiser window parameter beta (7.865 [-80dB])\n"
             "    -v    attenuation in dBFS (0)\n",
@@ -77,8 +89,7 @@ static void usage(char* name) {
 int main(int argc, char** argv)
 {
     // nc is the number of bits to store the coefficients
-    const int nc = 32;
-
+    int nc = 32;
     bool polyphase = false;
     unsigned int polyM = 160;
     unsigned int polyN = 147;
@@ -88,7 +99,6 @@ int main(int argc, char** argv)
     double atten = 1;
     int format = 0;
 
-
     // in order to keep the errors associated with the linear
     // interpolation of the coefficients below the quantization error
     // we must satisfy:
@@ -104,7 +114,6 @@ int main(int argc, char** argv)
     // Smith, J.O. Digital Audio Resampling Home Page
     // https://ccrma.stanford.edu/~jos/resample/, 2011-03-29
     //
-    int nz = 4;
 
     //         | 0.1102*(A - 8.7)                         A > 50
     //  beta = | 0.5842*(A - 21)^0.4 + 0.07886*(A - 21)   21 <= A <= 50
@@ -123,7 +132,6 @@ int main(int argc, char** argv)
     //   100 dB   10.056
     double beta = 7.865;
 
-
     // 2*nzc = (A - 8) / (2.285 * dw)
     //      with dw the transition width = 2*pi*dF/Fs
     //
@@ -148,8 +156,9 @@ int main(int argc, char** argv)
     // nzc  = 20
     //
 
+    int M = 1 << 4; // number of phases for interpolation
     int ch;
-    while ((ch = getopt(argc, argv, ":hds:c:n:f:l:b:p:v:")) != -1) {
+    while ((ch = getopt(argc, argv, ":hds:c:n:f:l:m:b:p:v:z:")) != -1) {
         switch (ch) {
             case 'd':
                 debug = true;
@@ -169,13 +178,26 @@ int main(int argc, char** argv)
             case 'n':
                 nzc = atoi(optarg);
                 break;
+            case 'm':
+                M = atoi(optarg);
+                break;
             case 'l':
-                nz = atoi(optarg);
+                M = 1 << atoi(optarg);
                 break;
             case 'f':
-                if (!strcmp(optarg,"fixed")) format = 0;
-                else if (!strcmp(optarg,"float")) format = 1;
-                else usage(argv[0]);
+                if (!strcmp(optarg, "fixed")) {
+                    format = 0;
+                }
+                else if (!strcmp(optarg, "fixed16")) {
+                    format = 0;
+                    nc = 16;
+                }
+                else if (!strcmp(optarg, "float")) {
+                    format = 1;
+                }
+                else {
+                    usage(argv[0]);
+                }
                 break;
             case 'b':
                 beta = atof(optarg);
@@ -193,11 +215,14 @@ int main(int argc, char** argv)
     // cut off frequency ratio Fc/Fs
     double Fcr = Fc / Fs;
 
-
     // total number of coefficients (one side)
-    const int M = (1 << nz);
+
     const int N = M * nzc;
 
+    // lerp (which is most useful if M is a power of 2)
+
+    int nz = 0; // bits needed to represent M-1 (log2(M) when M is a power of 2)
+    for (unsigned int i = M-1 ; i; i>>=1, nz++); // unsigned: terminates even if M <= 0
     // generate the right half of the filter
     if (!debug) {
         printf("// cmd-line: ");
@@ -207,7 +232,7 @@ int main(int argc, char** argv)
         printf("\n");
         if (!polyphase) {
             printf("const int32_t RESAMPLE_FIR_SIZE           = %d;\n", N);
-            printf("const int32_t RESAMPLE_FIR_LERP_INT_BITS  = %d;\n", nz);
+            printf("const int32_t RESAMPLE_FIR_INT_PHASES     = %d;\n", M);
             printf("const int32_t RESAMPLE_FIR_NUM_COEF       = %d;\n", nzc);
         } else {
             printf("const int32_t RESAMPLE_FIR_SIZE           = %d;\n", 2*nzc*polyN);
@@ -224,7 +249,7 @@ int main(int argc, char** argv)
         for (int i=0 ; i<=M ; i++) { // an extra set of coefs for interpolation
             for (int j=0 ; j<nzc ; j++) {
                 int ix = j*M + i;
-                double x = (2.0 * M_PI * ix * Fcr) / (1 << nz);
+                double x = (2.0 * M_PI * ix * Fcr) / M;
                 double y = kaiser(ix+N, 2*N, beta) * sinc(x) * 2.0 * Fcr;
                 y *= atten;
 
@@ -232,11 +257,13 @@ int main(int argc, char** argv)
                     if (j == 0)
                         printf("\n    ");
                 }
-
                 if (!format) {
-                    int64_t yi = floor(y * ((1ULL<<(nc-1))) + 0.5);
-                    if (yi >= (1LL<<(nc-1))) yi = (1LL<<(nc-1))-1;
-                    printf("0x%08x, ", int32_t(yi));
+                    int64_t yi = toint(y, 1ULL<<(nc-1));
+                    if (nc > 16) {
+                        printf("0x%08x, ", int32_t(yi));
+                    } else {
+                        printf("0x%04x, ", int32_t(yi)&0xffff);
+                    }
                 } else {
                     printf("%.9g%s ", y, debug ? "," : "f,");
                 }
@@ -254,9 +281,12 @@ int main(int argc, char** argv)
                 double y = kaiser(i+N, 2*N, beta) * sinc(x) * 2.0 * Fcr;;
                 y *= atten;
                 if (!format) {
-                    int64_t yi = floor(y * ((1ULL<<(nc-1))) + 0.5);
-                    if (yi >= (1LL<<(nc-1))) yi = (1LL<<(nc-1))-1;
-                    printf("0x%08x", int32_t(yi));
+                    int64_t yi = toint(y, 1ULL<<(nc-1));
+                    if (nc > 16) { // no ", " here: the replaced code and the float branch emit none
+                        printf("0x%08x", int32_t(yi));
+                    } else {
+                        printf("0x%04x", int32_t(yi)&0xffff);
+                    }
                 } else {
                     printf("%.9g%s", y, debug ? "" : "f");
                 }
@@ -277,5 +307,3 @@ int main(int argc, char** argv)
 }
 
 // http://www.csee.umbc.edu/help/sound/AFsp-V2R1/html/audio/ResampAudio.html
-
-