summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndy Hung <hunga@google.com>2013-12-09 12:12:46 -0800
committerAndy Hung <hunga@google.com>2013-12-27 14:34:36 -0800
commit86eae0e5931103e040ac2cdd023ef5db252e09f6 (patch)
tree2764bafecfc0157792f880daa4fb535e74286bfe
parente6144d7a558c74e508a5c103cdc462c3cd7cf508 (diff)
downloadframeworks_av-86eae0e5931103e040ac2cdd023ef5db252e09f6.zip
frameworks_av-86eae0e5931103e040ac2cdd023ef5db252e09f6.tar.gz
frameworks_av-86eae0e5931103e040ac2cdd023ef5db252e09f6.tar.bz2
Audio resampler update to add S16 filters
This does not affect the existing resamplers. New resampler accessed through additional quality settings: DYN_LOW_QUALITY = 5 DYN_MED_QUALITY = 6 DYN_HIGH_QUALITY = 7 Change-Id: Iebbd31871e808a4a6dee3f3abfd7e9dcf77c48e1 Signed-off-by: Andy Hung <hunga@google.com>
-rw-r--r--services/audioflinger/Android.mk10
-rw-r--r--services/audioflinger/AudioResampler.cpp27
-rw-r--r--services/audioflinger/AudioResampler.h3
-rw-r--r--services/audioflinger/AudioResamplerDyn.cpp530
-rw-r--r--services/audioflinger/AudioResamplerDyn.h123
-rw-r--r--services/audioflinger/AudioResamplerFirGen.h307
-rw-r--r--services/audioflinger/AudioResamplerFirOps.h163
-rw-r--r--services/audioflinger/AudioResamplerFirProcess.h256
-rw-r--r--services/audioflinger/AudioResamplerFirProcessNeon.h1149
-rw-r--r--services/audioflinger/test-resample.cpp70
-rw-r--r--tools/resampler_tools/fir.cpp84
11 files changed, 2660 insertions, 62 deletions
diff --git a/services/audioflinger/Android.mk b/services/audioflinger/Android.mk
index 3ec9285..4524d3c 100644
--- a/services/audioflinger/Android.mk
+++ b/services/audioflinger/Android.mk
@@ -23,7 +23,8 @@ LOCAL_SRC_FILES:= \
AudioPolicyService.cpp \
ServiceUtilities.cpp \
AudioResamplerCubic.cpp.arm \
- AudioResamplerSinc.cpp.arm
+ AudioResamplerSinc.cpp.arm \
+ AudioResamplerDyn.cpp.arm
LOCAL_SRC_FILES += StateQueue.cpp
@@ -74,10 +75,11 @@ include $(BUILD_SHARED_LIBRARY)
include $(CLEAR_VARS)
LOCAL_SRC_FILES:= \
- test-resample.cpp \
+ test-resample.cpp \
AudioResampler.cpp.arm \
- AudioResamplerCubic.cpp.arm \
- AudioResamplerSinc.cpp.arm
+ AudioResamplerCubic.cpp.arm \
+ AudioResamplerSinc.cpp.arm \
+ AudioResamplerDyn.cpp.arm
LOCAL_C_INCLUDES := \
$(call include-path-for, audio-utils)
diff --git a/services/audioflinger/AudioResampler.cpp b/services/audioflinger/AudioResampler.cpp
index 323f1a4..3b5a8c1 100644
--- a/services/audioflinger/AudioResampler.cpp
+++ b/services/audioflinger/AudioResampler.cpp
@@ -25,6 +25,7 @@
#include "AudioResampler.h"
#include "AudioResamplerSinc.h"
#include "AudioResamplerCubic.h"
+#include "AudioResamplerDyn.h"
#ifdef __arm__
#include <machine/cpu-features.h>
@@ -85,6 +86,9 @@ bool AudioResampler::qualityIsSupported(src_quality quality)
case MED_QUALITY:
case HIGH_QUALITY:
case VERY_HIGH_QUALITY:
+ case DYN_LOW_QUALITY:
+ case DYN_MED_QUALITY:
+ case DYN_HIGH_QUALITY:
return true;
default:
return false;
@@ -105,7 +109,7 @@ void AudioResampler::init_routine()
if (*endptr == '\0') {
defaultQuality = (src_quality) l;
ALOGD("forcing AudioResampler quality to %d", defaultQuality);
- if (defaultQuality < DEFAULT_QUALITY || defaultQuality > VERY_HIGH_QUALITY) {
+ if (defaultQuality < DEFAULT_QUALITY || defaultQuality > DYN_HIGH_QUALITY) {
defaultQuality = DEFAULT_QUALITY;
}
}
@@ -125,6 +129,12 @@ uint32_t AudioResampler::qualityMHz(src_quality quality)
return 20;
case VERY_HIGH_QUALITY:
return 34;
+ case DYN_LOW_QUALITY:
+ return 4;
+ case DYN_MED_QUALITY:
+ return 6;
+ case DYN_HIGH_QUALITY:
+ return 12;
}
}
@@ -175,6 +185,15 @@ AudioResampler* AudioResampler::create(int bitDepth, int inChannelCount,
case VERY_HIGH_QUALITY:
quality = HIGH_QUALITY;
break;
+ case DYN_LOW_QUALITY:
+ atFinalQuality = true;
+ break;
+ case DYN_MED_QUALITY:
+ quality = DYN_LOW_QUALITY;
+ break;
+ case DYN_HIGH_QUALITY:
+ quality = DYN_MED_QUALITY;
+ break;
}
}
pthread_mutex_unlock(&mutex);
@@ -200,6 +219,12 @@ AudioResampler* AudioResampler::create(int bitDepth, int inChannelCount,
ALOGV("Create VERY_HIGH_QUALITY sinc Resampler = %d", quality);
resampler = new AudioResamplerSinc(bitDepth, inChannelCount, sampleRate, quality);
break;
+ case DYN_LOW_QUALITY:
+ case DYN_MED_QUALITY:
+ case DYN_HIGH_QUALITY:
+ ALOGV("Create dynamic Resampler = %d", quality);
+ resampler = new AudioResamplerDyn(bitDepth, inChannelCount, sampleRate, quality);
+ break;
}
// initialize resampler
diff --git a/services/audioflinger/AudioResampler.h b/services/audioflinger/AudioResampler.h
index 33e64ce..c341325 100644
--- a/services/audioflinger/AudioResampler.h
+++ b/services/audioflinger/AudioResampler.h
@@ -41,6 +41,9 @@ public:
MED_QUALITY=2,
HIGH_QUALITY=3,
VERY_HIGH_QUALITY=4,
+ DYN_LOW_QUALITY=5,
+ DYN_MED_QUALITY=6,
+ DYN_HIGH_QUALITY=7,
};
static AudioResampler* create(int bitDepth, int inChannelCount,
diff --git a/services/audioflinger/AudioResamplerDyn.cpp b/services/audioflinger/AudioResamplerDyn.cpp
new file mode 100644
index 0000000..1e38d3e
--- /dev/null
+++ b/services/audioflinger/AudioResamplerDyn.cpp
@@ -0,0 +1,530 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define LOG_TAG "AudioResamplerDyn"
+//#define LOG_NDEBUG 0
+
+#include <malloc.h>
+#include <string.h>
+#include <stdlib.h>
+#include <dlfcn.h>
+#include <math.h>
+
+#include <cutils/compiler.h>
+#include <cutils/properties.h>
+#include <utils/Log.h>
+
+#include "AudioResamplerFirOps.h" // USE_NEON and USE_INLINE_ASSEMBLY defined here
+#include "AudioResamplerFirProcess.h"
+#include "AudioResamplerFirProcessNeon.h"
+#include "AudioResamplerFirGen.h" // requires math.h
+#include "AudioResamplerDyn.h"
+
+//#define DEBUG_RESAMPLER
+
+namespace android {
+
+// generate a unique resample type compile-time constant (constexpr)
+#define RESAMPLETYPE(CHANNELS, LOCKED, STRIDE, COEFTYPE) \
+ ((((CHANNELS)-1)&1) | !!(LOCKED)<<1 | (COEFTYPE)<<2 \
+ | ((STRIDE)==8 ? 1 : (STRIDE)==16 ? 2 : 0)<<3)
+
+/*
+ * InBuffer is a type agnostic input buffer.
+ *
+ * Layout of the state buffer for halfNumCoefs=8.
+ *
+ * [rrrrrrppppppppnnnnnnnnrrrrrrrrrrrrrrrrrrr.... rrrrrrr]
+ * S I R
+ *
+ * S = mState
+ * I = mImpulse
+ * R = mRingFull
+ * p = past samples, convoluted with the (p)ositive side of sinc()
+ * n = future samples, convoluted with the (n)egative side of sinc()
+ * r = extra space for implementing the ring buffer
+ */
+
+template<typename TI>
+AudioResamplerDyn::InBuffer<TI>::InBuffer()
+ : mState(NULL), mImpulse(NULL), mRingFull(NULL), mStateSize(0) {
+}
+
+template<typename TI>
+AudioResamplerDyn::InBuffer<TI>::~InBuffer() {
+ init();
+}
+
+template<typename TI>
+void AudioResamplerDyn::InBuffer<TI>::init() {
+ free(mState);
+ mState = NULL;
+ mImpulse = NULL;
+ mRingFull = NULL;
+ mStateSize = 0;
+}
+
+// resizes the state buffer to accommodate the appropriate filter length
+template<typename TI>
+void AudioResamplerDyn::InBuffer<TI>::resize(int CHANNELS, int halfNumCoefs) {
+ // calculate desired state size
+ int stateSize = halfNumCoefs * CHANNELS * 2
+ * kStateSizeMultipleOfFilterLength;
+
+ // check if buffer needs resizing
+ if (mState
+ && stateSize == mStateSize
+ && mRingFull-mState == mStateSize-halfNumCoefs*CHANNELS) {
+ return;
+ }
+
+ // create new buffer
+ TI* state = (int16_t*)memalign(32, stateSize*sizeof(*state));
+ memset(state, 0, stateSize*sizeof(*state));
+
+ // attempt to preserve state
+ if (mState) {
+ TI* srcLo = mImpulse - halfNumCoefs*CHANNELS;
+ TI* srcHi = mImpulse + halfNumCoefs*CHANNELS;
+ TI* dst = state;
+
+ if (srcLo < mState) {
+ dst += mState-srcLo;
+ srcLo = mState;
+ }
+ if (srcHi > mState + mStateSize) {
+ srcHi = mState + mStateSize;
+ }
+ memcpy(dst, srcLo, (srcHi - srcLo) * sizeof(*srcLo));
+ free(mState);
+ }
+
+ // set class member vars
+ mState = state;
+ mStateSize = stateSize;
+ mImpulse = mState + halfNumCoefs*CHANNELS; // actually one sample greater than needed
+ mRingFull = mState + mStateSize - halfNumCoefs*CHANNELS;
+}
+
+// copy in the input data into the head (impulse+halfNumCoefs) of the buffer.
+template<typename TI>
+template<int CHANNELS>
+void AudioResamplerDyn::InBuffer<TI>::readAgain(TI*& impulse, const int halfNumCoefs,
+ const TI* const in, const size_t inputIndex) {
+ int16_t* head = impulse + halfNumCoefs*CHANNELS;
+ for (size_t i=0 ; i<CHANNELS ; i++) {
+ head[i] = in[inputIndex*CHANNELS + i];
+ }
+}
+
+// advance the impulse pointer, and load in data into the head (impulse+halfNumCoefs)
+template<typename TI>
+template<int CHANNELS>
+void AudioResamplerDyn::InBuffer<TI>::readAdvance(TI*& impulse, const int halfNumCoefs,
+ const TI* const in, const size_t inputIndex) {
+ impulse += CHANNELS;
+
+ if (CC_UNLIKELY(impulse >= mRingFull)) {
+ const size_t shiftDown = mRingFull - mState - halfNumCoefs*CHANNELS;
+ memcpy(mState, mState+shiftDown, halfNumCoefs*CHANNELS*2*sizeof(TI));
+ impulse -= shiftDown;
+ }
+ readAgain<CHANNELS>(impulse, halfNumCoefs, in, inputIndex);
+}
+
+void AudioResamplerDyn::Constants::set(
+ int L, int halfNumCoefs, int inSampleRate, int outSampleRate)
+{
+ int bits = 0;
+ int lscale = inSampleRate/outSampleRate < 2 ? L - 1 :
+ static_cast<int>(static_cast<uint64_t>(L)*inSampleRate/outSampleRate);
+ for (int i=lscale; i; ++bits, i>>=1)
+ ;
+ mL = L;
+ mShift = kNumPhaseBits - bits;
+ mHalfNumCoefs = halfNumCoefs;
+}
+
+AudioResamplerDyn::AudioResamplerDyn(int bitDepth,
+ int inChannelCount, int32_t sampleRate, src_quality quality)
+ : AudioResampler(bitDepth, inChannelCount, sampleRate, quality),
+ mResampleType(0), mFilterSampleRate(0), mCoefBuffer(NULL)
+{
+ mVolumeSimd[0] = mVolumeSimd[1] = 0;
+ mConstants.set(128, 8, mSampleRate, mSampleRate); // TODO: set better
+}
+
+AudioResamplerDyn::~AudioResamplerDyn() {
+ free(mCoefBuffer);
+}
+
+void AudioResamplerDyn::init() {
+ mFilterSampleRate = 0; // always trigger new filter generation
+ mInBuffer.init();
+}
+
+void AudioResamplerDyn::setVolume(int16_t left, int16_t right) {
+ AudioResampler::setVolume(left, right);
+ mVolumeSimd[0] = static_cast<int32_t>(left)<<16;
+ mVolumeSimd[1] = static_cast<int32_t>(right)<<16;
+}
+
+template <typename T> T max(T a, T b) {return a > b ? a : b;}
+
+template <typename T> T absdiff(T a, T b) {return a > b ? a - b : b - a;}
+
+template<typename T>
+void AudioResamplerDyn::createKaiserFir(Constants &c, double stopBandAtten,
+ int inSampleRate, int outSampleRate, double tbwCheat) {
+ T* buf = reinterpret_cast<T*>(memalign(32, (c.mL+1)*c.mHalfNumCoefs*sizeof(T)));
+ static const double atten = 0.9998; // to avoid ripple overflow
+ double fcr;
+ double tbw = firKaiserTbw(c.mHalfNumCoefs, stopBandAtten);
+
+ if (inSampleRate < outSampleRate) { // upsample
+ fcr = max(0.5*tbwCheat - tbw/2, tbw/2);
+ } else { // downsample
+ fcr = max(0.5*tbwCheat*outSampleRate/inSampleRate - tbw/2, tbw/2);
+ }
+ // create and set filter
+ firKaiserGen(buf, c.mL, c.mHalfNumCoefs, stopBandAtten, fcr, atten);
+ c.setBuf(buf);
+ if (mCoefBuffer) {
+ free(mCoefBuffer);
+ }
+ mCoefBuffer = buf;
+#ifdef DEBUG_RESAMPLER
+ // print basic filter stats
+ printf("L:%d hnc:%d stopBandAtten:%lf fcr:%lf atten:%lf tbw:%lf\n",
+ c.mL, c.mHalfNumCoefs, stopBandAtten, fcr, atten, tbw);
+ // test the filter and report results
+ double fp = (fcr - tbw/2)/c.mL;
+ double fs = (fcr + tbw/2)/c.mL;
+ double fmin, fmax;
+ testFir(buf, c.mL, c.mHalfNumCoefs, 0., fp, 100, fmin, fmax);
+ double d1 = (fmax - fmin)/2.;
+ double ap = -20.*log10(1. - d1); // passband ripple
+ printf("passband(%lf, %lf): %.8lf %.8lf %.8lf\n", 0., fp, (fmax + fmin)/2., d1, ap);
+ testFir(buf, c.mL, c.mHalfNumCoefs, fs, 0.5, 100, fmin, fmax);
+ double d2 = fmax;
+ double as = -20.*log10(d2); // stopband attenuation
+ printf("stopband(%lf, %lf): %.8lf %.8lf %.3lf\n", fs, 0.5, (fmax + fmin)/2., d2, as);
+#endif
+}
+
+// recursive gcd (TODO: verify tail recursion elimination should make this iterate)
+static int gcd(int n, int m) {
+ if (m == 0) {
+ return n;
+ }
+ return gcd(m, n % m);
+}
+
+static bool isClose(int32_t newSampleRate, int32_t prevSampleRate, int32_t filterSampleRate) {
+ int pdiff = absdiff(newSampleRate, prevSampleRate);
+ int adiff = absdiff(newSampleRate, filterSampleRate);
+
+ // allow up to 6% relative change increments.
+ // allow up to 12% absolute change increments (from filter design)
+ return pdiff < prevSampleRate>>4 && adiff < filterSampleRate>>3;
+}
+
+void AudioResamplerDyn::setSampleRate(int32_t inSampleRate) {
+ if (mInSampleRate == inSampleRate) {
+ return;
+ }
+ int32_t oldSampleRate = mInSampleRate;
+ int32_t oldHalfNumCoefs = mConstants.mHalfNumCoefs;
+ uint32_t oldPhaseWrapLimit = mConstants.mL << mConstants.mShift;
+ bool useS32 = false;
+
+ mInSampleRate = inSampleRate;
+
+ // TODO: Add precalculated Equiripple filters
+
+ if (!isClose(inSampleRate, oldSampleRate, mFilterSampleRate)) {
+ mFilterSampleRate = inSampleRate;
+
+ // Begin Kaiser Filter computation
+ //
+ // The quantization floor for S16 is about 96db - 10*log_10(#length) + 3dB.
+ // Keep the stop band attenuation no greater than 84-85dB for 32 length S16 filters
+ //
+ // For s32 we keep the stop band attenuation at the same as 16b resolution, about
+ // 96-98dB
+ //
+
+ double stopBandAtten;
+ double tbwCheat = 1.; // how much we "cheat" into aliasing
+ int halfLength;
+ if (getQuality() == DYN_HIGH_QUALITY) {
+ // 32b coefficients, 64 length
+ useS32 = true;
+ stopBandAtten = 98.;
+ halfLength = 32;
+ } else if (getQuality() == DYN_LOW_QUALITY) {
+ // 16b coefficients, 16-32 length
+ useS32 = false;
+ stopBandAtten = 80.;
+ if (mSampleRate >= inSampleRate * 2) {
+ halfLength = 16;
+ } else {
+ halfLength = 8;
+ }
+ if (mSampleRate >= inSampleRate) {
+ tbwCheat = 1.05;
+ } else {
+ tbwCheat = 1.03;
+ }
+ } else { // medium quality
+ // 16b coefficients, 32-64 length
+ useS32 = false;
+ stopBandAtten = 84.;
+ if (mSampleRate >= inSampleRate * 4) {
+ halfLength = 32;
+ } else if (mSampleRate >= inSampleRate * 2) {
+ halfLength = 24;
+ } else {
+ halfLength = 16;
+ }
+ if (mSampleRate >= inSampleRate) {
+ tbwCheat = 1.03;
+ } else {
+ tbwCheat = 1.01;
+ }
+ }
+
+ // determine the number of polyphases in the filterbank.
+ // for 16b, it is desirable to have 2^(16/2) = 256 phases.
+ // https://ccrma.stanford.edu/~jos/resample/Relation_Interpolation_Error_Quantization.html
+ //
+ // We are a bit more lax on this.
+
+ int phases = mSampleRate / gcd(mSampleRate, inSampleRate);
+
+ while (phases<63) { // too few phases, allow room for interpolation
+ phases *= 2; // this code only needed to support dynamic rate changes
+ }
+ if (phases>=256) { // too many phases, always interpolate
+ phases = 127;
+ }
+
+ // create the filter
+ mConstants.set(phases, halfLength, inSampleRate, mSampleRate);
+ if (useS32) {
+ createKaiserFir<int32_t>(mConstants, stopBandAtten,
+ inSampleRate, mSampleRate, tbwCheat);
+ } else {
+ createKaiserFir<int16_t>(mConstants, stopBandAtten,
+ inSampleRate, mSampleRate, tbwCheat);
+ }
+ } // End Kaiser filter
+
+ // update phase and state based on the new filter.
+ const Constants& c(mConstants);
+ mInBuffer.resize(mChannelCount, c.mHalfNumCoefs);
+ const uint32_t phaseWrapLimit = c.mL << c.mShift;
+ // try to preserve as much of the phase fraction as possible for on-the-fly changes
+ mPhaseFraction = static_cast<unsigned long long>(mPhaseFraction)
+ * phaseWrapLimit / oldPhaseWrapLimit;
+ mPhaseFraction %= phaseWrapLimit; // should not do anything, but just in case.
+ mPhaseIncrement = static_cast<uint32_t>(static_cast<double>(phaseWrapLimit)
+ * inSampleRate / mSampleRate);
+
+ // determine which resampler to use
+ // check if locked phase (works only if mPhaseIncrement has no "fractional phase bits")
+ int locked = (mPhaseIncrement << (sizeof(mPhaseIncrement)*8 - c.mShift)) == 0;
+ int stride = (c.mHalfNumCoefs&7)==0 ? 16 : (c.mHalfNumCoefs&3)==0 ? 8 : 2;
+ if (locked) {
+ mPhaseFraction = mPhaseFraction >> c.mShift << c.mShift; // remove fractional phase
+ }
+ if (!USE_NEON) {
+ stride = 2; // C version only
+ }
+ // TODO: Remove this for testing
+ //stride = 2;
+ mResampleType = RESAMPLETYPE(mChannelCount, locked, stride, !!useS32);
+#ifdef DEBUG_RESAMPLER
+ printf("channels:%d %s stride:%d %s coef:%d shift:%d\n",
+ mChannelCount, locked ? "locked" : "interpolated",
+ stride, useS32 ? "S32" : "S16", 2*c.mHalfNumCoefs, c.mShift);
+#endif
+}
+
+void AudioResamplerDyn::resample(int32_t* out, size_t outFrameCount,
+ AudioBufferProvider* provider)
+{
+ // TODO:
+ // 24 cases - this perhaps can be reduced later, as testing might take too long
+ switch (mResampleType) {
+
+ // stride 16 (stride 2 for machines that do not support NEON)
+ case RESAMPLETYPE(1, true, 16, 0):
+ return resample<1, true, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+ case RESAMPLETYPE(2, true, 16, 0):
+ return resample<2, true, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+ case RESAMPLETYPE(1, false, 16, 0):
+ return resample<1, false, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+ case RESAMPLETYPE(2, false, 16, 0):
+ return resample<2, false, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+ case RESAMPLETYPE(1, true, 16, 1):
+ return resample<1, true, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+ case RESAMPLETYPE(2, true, 16, 1):
+ return resample<2, true, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+ case RESAMPLETYPE(1, false, 16, 1):
+ return resample<1, false, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+ case RESAMPLETYPE(2, false, 16, 1):
+ return resample<2, false, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+#if 0
+ // TODO: Remove these?
+ // stride 8
+ case RESAMPLETYPE(1, true, 8, 0):
+ return resample<1, true, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+ case RESAMPLETYPE(2, true, 8, 0):
+ return resample<2, true, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+ case RESAMPLETYPE(1, false, 8, 0):
+ return resample<1, false, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+ case RESAMPLETYPE(2, false, 8, 0):
+ return resample<2, false, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+ case RESAMPLETYPE(1, true, 8, 1):
+ return resample<1, true, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+ case RESAMPLETYPE(2, true, 8, 1):
+ return resample<2, true, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+ case RESAMPLETYPE(1, false, 8, 1):
+ return resample<1, false, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+ case RESAMPLETYPE(2, false, 8, 1):
+ return resample<2, false, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+ // stride 2 (can handle any filter length)
+ case RESAMPLETYPE(1, true, 2, 0):
+ return resample<1, true, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+ case RESAMPLETYPE(2, true, 2, 0):
+ return resample<2, true, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+ case RESAMPLETYPE(1, false, 2, 0):
+ return resample<1, false, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+ case RESAMPLETYPE(2, false, 2, 0):
+ return resample<2, false, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+ case RESAMPLETYPE(1, true, 2, 1):
+ return resample<1, true, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+ case RESAMPLETYPE(2, true, 2, 1):
+ return resample<2, true, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+ case RESAMPLETYPE(1, false, 2, 1):
+ return resample<1, false, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+ case RESAMPLETYPE(2, false, 2, 1):
+ return resample<2, false, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+#endif
+ default:
+ ; // error
+ }
+}
+
+template<int CHANNELS, bool LOCKED, int STRIDE, typename TC>
+void AudioResamplerDyn::resample(int32_t* out, size_t outFrameCount,
+ const TC* const coefs, AudioBufferProvider* provider)
+{
+ const Constants& c(mConstants);
+ int16_t* impulse = mInBuffer.getImpulse();
+ size_t inputIndex = mInputIndex;
+ uint32_t phaseFraction = mPhaseFraction;
+ const uint32_t phaseIncrement = mPhaseIncrement;
+ size_t outputIndex = 0;
+ size_t outputSampleCount = outFrameCount * 2; // stereo output
+ size_t inFrameCount = (outFrameCount*mInSampleRate)/mSampleRate;
+ const uint32_t phaseWrapLimit = c.mL << c.mShift;
+
+ // NOTE: be very careful when modifying the code here. register
+ // pressure is very high and a small change might cause the compiler
+ // to generate far less efficient code.
+ // Always sanity check the result with objdump or test-resample.
+
+ // the following logic is a bit convoluted to keep the main processing loop
+ // as tight as possible with register allocation.
+ while (outputIndex < outputSampleCount) {
+ // buffer is empty, fetch a new one
+ while (mBuffer.frameCount == 0) {
+ mBuffer.frameCount = inFrameCount;
+ provider->getNextBuffer(&mBuffer,
+ calculateOutputPTS(outputIndex / 2));
+ if (mBuffer.raw == NULL) {
+ goto resample_exit;
+ }
+ if (phaseFraction >= phaseWrapLimit) { // read in data
+ mInBuffer.readAdvance<CHANNELS>(
+ impulse, c.mHalfNumCoefs, mBuffer.i16, inputIndex);
+ phaseFraction -= phaseWrapLimit;
+ while (phaseFraction >= phaseWrapLimit) {
+ inputIndex++;
+ if (inputIndex >= mBuffer.frameCount) {
+ inputIndex -= mBuffer.frameCount;
+ provider->releaseBuffer(&mBuffer);
+ break;
+ }
+ mInBuffer.readAdvance<CHANNELS>(
+ impulse, c.mHalfNumCoefs, mBuffer.i16, inputIndex);
+ phaseFraction -= phaseWrapLimit;
+ }
+ }
+ }
+ const int16_t* const in = mBuffer.i16;
+ const size_t frameCount = mBuffer.frameCount;
+ const int coefShift = c.mShift;
+ const int halfNumCoefs = c.mHalfNumCoefs;
+ const int32_t* const volumeSimd = mVolumeSimd;
+
+ // reread the last input in.
+ mInBuffer.readAgain<CHANNELS>(impulse, halfNumCoefs, in, inputIndex);
+
+ // main processing loop
+ while (CC_LIKELY(outputIndex < outputSampleCount)) {
+ // caution: fir() is inlined and may be large.
+ // output will be loaded with the appropriate values
+ //
+ // from the input samples in impulse[-halfNumCoefs+1]... impulse[halfNumCoefs]
+ // from the polyphase filter of (phaseFraction / phaseWrapLimit) in coefs.
+ //
+ fir<CHANNELS, LOCKED, STRIDE>(
+ &out[outputIndex],
+ phaseFraction, phaseWrapLimit,
+ coefShift, halfNumCoefs, coefs,
+ impulse, volumeSimd);
+ outputIndex += 2;
+
+ phaseFraction += phaseIncrement;
+ while (phaseFraction >= phaseWrapLimit) {
+ inputIndex++;
+ if (inputIndex >= frameCount) {
+ goto done; // need a new buffer
+ }
+ mInBuffer.readAdvance<CHANNELS>(impulse, halfNumCoefs, in, inputIndex);
+ phaseFraction -= phaseWrapLimit;
+ }
+ }
+done:
+ // often arrives here when input buffer runs out
+ if (inputIndex >= frameCount) {
+ inputIndex -= frameCount;
+ provider->releaseBuffer(&mBuffer);
+ // mBuffer.frameCount MUST be zero here.
+ }
+ }
+
+resample_exit:
+ mInBuffer.setImpulse(impulse);
+ mInputIndex = inputIndex;
+ mPhaseFraction = phaseFraction;
+}
+
+// ----------------------------------------------------------------------------
+}; // namespace android
diff --git a/services/audioflinger/AudioResamplerDyn.h b/services/audioflinger/AudioResamplerDyn.h
new file mode 100644
index 0000000..85a01ab
--- /dev/null
+++ b/services/audioflinger/AudioResamplerDyn.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_DYN_H
+#define ANDROID_AUDIO_RESAMPLER_DYN_H
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <cutils/log.h>
+
+#include "AudioResampler.h"
+
+namespace android {
+
+class AudioResamplerDyn: public AudioResampler {
+public:
+ AudioResamplerDyn(int bitDepth, int inChannelCount, int32_t sampleRate,
+ src_quality quality);
+
+ virtual ~AudioResamplerDyn();
+
+ virtual void init();
+
+ virtual void setSampleRate(int32_t inSampleRate);
+
+ virtual void setVolume(int16_t left, int16_t right);
+
+ virtual void resample(int32_t* out, size_t outFrameCount,
+ AudioBufferProvider* provider);
+
+private:
+
+ class Constants { // stores the filter constants.
+ public:
+ Constants() :
+ mL(0), mShift(0), mHalfNumCoefs(0), mFirCoefsS16(NULL)
+ {}
+ void set(int L, int halfNumCoefs,
+ int inSampleRate, int outSampleRate);
+ inline void setBuf(int16_t* buf) {
+ mFirCoefsS16 = buf;
+ }
+ inline void setBuf(int32_t* buf) {
+ mFirCoefsS32 = buf;
+ }
+
+ int mL; // interpolation phases in the filter.
+ int mShift; // right shift to get polyphase index
+ unsigned int mHalfNumCoefs; // filter half #coefs
+ union { // polyphase filter bank
+ const int16_t* mFirCoefsS16;
+ const int32_t* mFirCoefsS32;
+ };
+ };
+
+ // Input buffer management for a given input type TI, now (int16_t)
+ // Is agnostic of the actual type, can work with int32_t and float.
+ template<typename TI>
+ class InBuffer {
+ public:
+ InBuffer();
+ ~InBuffer();
+ void init();
+ void resize(int CHANNELS, int halfNumCoefs);
+
+ // used for direct management of the mImpulse pointer
+ inline TI* getImpulse() {
+ return mImpulse;
+ }
+ inline void setImpulse(TI *impulse) {
+ mImpulse = impulse;
+ }
+ template<int CHANNELS>
+ inline void readAgain(TI*& impulse, const int halfNumCoefs,
+ const TI* const in, const size_t inputIndex);
+ template<int CHANNELS>
+ inline void readAdvance(TI*& impulse, const int halfNumCoefs,
+ const TI* const in, const size_t inputIndex);
+
+ private:
+ // tuning parameter guidelines: 2 <= multiple <= 8
+ static const int kStateSizeMultipleOfFilterLength = 4;
+
+ TI* mState; // base pointer for the input buffer storage
+ TI* mImpulse; // current location of the impulse response (centered)
+ TI* mRingFull; // mState <= mImpulse < mRingFull
+ // in general, mRingFull = mState + mStateSize - halfNumCoefs*CHANNELS.
+ size_t mStateSize; // in units of TI.
+ };
+
+ template<int CHANNELS, bool LOCKED, int STRIDE, typename TC>
+ void resample(int32_t* out, size_t outFrameCount,
+ const TC* const coefs, AudioBufferProvider* provider);
+
+ template<typename T>
+ void createKaiserFir(Constants &c, double stopBandAtten,
+ int inSampleRate, int outSampleRate, double tbwCheat);
+
+ InBuffer<int16_t> mInBuffer;
+ Constants mConstants; // current set of coefficient parameters
+ int32_t __attribute__ ((aligned (8))) mVolumeSimd[2];
+ int32_t mResampleType; // contains the resample type.
+ int32_t mFilterSampleRate; // designed sample rate for the filter
+ void* mCoefBuffer; // if a filter is created, this is not null
+};
+
+// ----------------------------------------------------------------------------
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_DYN_H*/
diff --git a/services/audioflinger/AudioResamplerFirGen.h b/services/audioflinger/AudioResamplerFirGen.h
new file mode 100644
index 0000000..fa6ee3e
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirGen.h
@@ -0,0 +1,307 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_GEN_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_GEN_H
+
+namespace android {
+
+/*
+ * Sinc function is the traditional variant.
+ *
+ * TODO: Investigate optimizations (regular sampling grid, NEON vector accelerations)
+ * TODO: Remove comparison at 0 and trap at a higher level.
+ *
+ */
+
+static inline double sinc(double x) {
+ if (fabs(x) < FLT_MIN) {
+ return 1.;
+ }
+ return sin(x) / x;
+}
+
+static inline double sqr(double x) {
+ return x * x;
+}
+
+/*
+ * rounds a double to the nearest integer for FIR coefficients.
+ *
+ * One variant uses noise shaping, which must keep error history
+ * to work (the err parameter, initialized to 0).
+ * The other variant is a non-noise shaped version for
+ * S32 coefficients (noise shaping doesn't gain much).
+ *
+ * Caution: No bounds saturation is applied, but isn't needed in
+ * this case.
+ *
+ * @param x is the value to round.
+ *
+ * @param maxval is the maximum integer scale factor expressed as an int64 (for headroom).
+ * Typically this may be the maximum positive integer+1 (using the fact that double precision
+ * FIR coefficients generated here are never that close to 1.0 to pose an overflow condition).
+ *
+ * @param err is the previous error (actual - rounded) for the previous rounding op.
+ *
+ */
+
+static inline int64_t toint(double x, int64_t maxval, double& err) {
+ double val = x * maxval;
+ double ival = floor(val + 0.5 + err*0.17);
+ err = val - ival;
+ return static_cast<int64_t>(ival);
+}
+
+static inline int64_t toint(double x, int64_t maxval) {
+ return static_cast<int64_t>(floor(x * maxval + 0.5));
+}
+
+/*
+ * Modified Bessel function of the first kind
+ * http://en.wikipedia.org/wiki/Bessel_function
+ *
+ * The formulas are taken from Abramowitz and Stegun:
+ *
+ * http://people.math.sfu.ca/~cbm/aands/page_375.htm
+ * http://people.math.sfu.ca/~cbm/aands/page_378.htm
+ *
+ * http://dlmf.nist.gov/10.25
+ * http://dlmf.nist.gov/10.40
+ *
+ * Note we assume x is nonnegative (the function is symmetric,
+ * pass in the absolute value as needed).
+ *
+ * Constants are compile time derived with templates I0Term<> and
+ * I0ATerm<> to the precision of the compiler. The series can be expanded
+ * to any precision needed, but currently set around 24b precision.
+ *
+ * We use a bit of template math here, constexpr would probably be
+ * more appropriate for a C++11 compiler.
+ *
+ */
+
+template <int N>
+struct I0Term {
+ static const double value = I0Term<N-1>::value/ (4. * N * N);
+};
+
+template <>
+struct I0Term<0> {
+ static const double value = 1.;
+};
+
+template <int N>
+struct I0ATerm {
+ static const double value = I0ATerm<N-1>::value * (2.*N-1.) * (2.*N-1.) / (8. * N);
+};
+
+template <>
+struct I0ATerm<0> { // 1/sqrt(2*PI);
+ static const double value = 0.398942280401432677939946059934381868475858631164934657665925;
+};
+
+static inline double I0(double x) {
+ if (x < 3.75) { // TODO: Estrin's method instead of Horner's method?
+ x *= x;
+ return I0Term<0>::value + x*(
+ I0Term<1>::value + x*(
+ I0Term<2>::value + x*(
+ I0Term<3>::value + x*(
+ I0Term<4>::value + x*(
+ I0Term<5>::value + x*(
+ I0Term<6>::value)))))); // e < 1.6e-7
+ }
+ // a bit ugly here - perhaps we expand the top series
+ // to permit computation to x < 20 (a reasonable range)
+ double y = 1./x;
+ return exp(x) * sqrt(y) * (
+ // note: reciprocal squareroot may be easier!
+ // http://en.wikipedia.org/wiki/Fast_inverse_square_root
+ I0ATerm<0>::value + y*(
+ I0ATerm<1>::value + y*(
+ I0ATerm<2>::value + y*(
+ I0ATerm<3>::value + y*(
+ I0ATerm<4>::value + y*(
+ I0ATerm<5>::value + y*(
+ I0ATerm<6>::value + y*(
+ I0ATerm<7>::value + y*(
+ I0ATerm<8>::value))))))))); // (... e) < 1.9e-7
+}
+
+/*
+ * calculates the transition bandwidth for a Kaiser filter
+ *
+ * Formula 3.2.8, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48
+ *
+ * @param halfNumCoef is half the number of coefficients per filter phase.
+ * @param stopBandAtten is the stop band attenuation desired.
+ * @return the transition bandwidth in normalized frequency (0 <= f <= 0.5)
+ */
+static inline double firKaiserTbw(int halfNumCoef, double stopBandAtten) {
+ return (stopBandAtten - 7.95)/(2.*14.36*halfNumCoef);
+}
+
+/*
+ * calculates the fir transfer response.
+ *
+ * calculates the transfer coefficient H(w) for 0 <= w <= PI.
+ * Be careful be careful to consider the fact that this is an interpolated filter
+ * of length L, so normalizing H(w)/L is probably what you expect.
+ */
+template <typename T>
+static inline double firTransfer(const T* coef, int L, int halfNumCoef, double w) {
+ double accum = static_cast<double>(coef[0])*0.5;
+ coef += halfNumCoef; // skip first row.
+ for (int i=1 ; i<=L ; ++i) {
+ for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) {
+ accum += cos(ix*w)*static_cast<double>(*coef++);
+ }
+ }
+ return accum*2.;
+}
+
+/*
+ * returns the minimum and maximum |H(f)| bounds
+ *
+ * @param coef is the designed polyphase filter banks
+ *
+ * @param L is the number of phases (for interpolation)
+ *
+ * @param halfNumCoef should be half the number of coefficients for a single
+ * polyphase.
+ *
+ * @param fstart is the normalized frequency start.
+ *
+ * @param fend is the normalized frequency end.
+ *
+ * @param steps is the number of steps to take (sampling) between frequency start and end
+ *
+ * @param firMin returns the minimum transfer |H(f)| found
+ *
+ * @param firMax returns the maximum transfer |H(f)| found
+ *
+ * 0 <= f <= 0.5.
+ * This is used to test passband and stopband performance.
+ */
+template <typename T>
+static void testFir(const T* coef, int L, int halfNumCoef,
+ double fstart, double fend, int steps, double &firMin, double &firMax) {
+ double wstart = fstart*(2.*M_PI);
+ double wend = fend*(2.*M_PI);
+ double wstep = (wend - wstart)/steps;
+ double fmax, fmin;
+ double trf = firTransfer(coef, L, halfNumCoef, wstart);
+ if (trf<0) {
+ trf = -trf;
+ }
+ fmin = fmax = trf;
+ wstart += wstep;
+ for (int i=1; i<steps; ++i) {
+ trf = firTransfer(coef, L, halfNumCoef, wstart);
+ if (trf<0) {
+ trf = -trf;
+ }
+ if (trf>fmax) {
+ fmax = trf;
+ }
+ else if (trf<fmin) {
+ fmin = trf;
+ }
+ wstart += wstep;
+ }
+ // renormalize - this is only needed for integer filter types
+ double norm = 1./((1ULL<<(sizeof(T)*8-1))*L);
+
+ firMin = fmin * norm;
+ firMax = fmax * norm;
+}
+
+/*
+ * Calculates the polyphase filter banks based on a windowed sinc function.
+ *
+ * The windowed sinc is an odd length symmetric filter of exactly L*halfNumCoef*2+1
+ * taps for the entire kernel. This is then decomposed into L+1 polyphase filterbanks.
+ * The last filterbank is used for interpolation purposes (and is mostly composed
+ * of the first bank shifted by one sample), and is unnecessary if one does
+ * not do interpolation.
+ *
+ * @param coef is the caller allocated space for coefficients. This should be
+ * exactly (L+1)*halfNumCoef in size.
+ *
+ * @param L is the number of phases (for interpolation)
+ *
+ * @param halfNumCoef should be half the number of coefficients for a single
+ * polyphase.
+ *
+ * @param stopBandAtten is the stopband value, should be >50dB.
+ *
+ * @param fcr is cutoff frequency/sampling rate (<0.5). At this point, the energy
+ * should be 6dB less. (fcr is where the amplitude drops by half). Use the
+ * firKaiserTbw() to calculate the transition bandwidth. fcr is the midpoint
+ * between the stop band and the pass band (fstop+fpass)/2.
+ *
+ * @param atten is the attenuation (generally slightly less than 1).
+ */
+
+template <typename T>
+static inline void firKaiserGen(T* coef, int L, int halfNumCoef,
+ double stopBandAtten, double fcr, double atten) {
+ //
+ // Formula 3.2.5, 3.2.7, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48
+ //
+ // See also: http://melodi.ee.washington.edu/courses/ee518/notes/lec17.pdf
+ //
+ // Kaiser window and beta parameter
+ //
+ // | 0.1102*(A - 8.7) A > 50
+ // beta = | 0.5842*(A - 21)^0.4 + 0.07886*(A - 21) 21 <= A <= 50
+ // | 0. A < 21
+ //
+ // with A is the desired stop-band attenuation in dBFS
+ //
+ // 30 dB 2.210
+ // 40 dB 3.384
+ // 50 dB 4.538
+ // 60 dB 5.658
+ // 70 dB 6.764
+ // 80 dB 7.865
+ // 90 dB 8.960
+ // 100 dB 10.056
+
+ const int N = L * halfNumCoef; // non-negative half
+ const double beta = 0.1102 * (stopBandAtten - 8.7); // >= 50dB always
+ const double yscale = 2. * atten * fcr / I0(beta);
+ const double xstep = 2. * M_PI * fcr / L;
+ const double xfrac = 1. / N;
+ double err = 0; // for noise shaping on int16_t coefficients
+ for (int i=0 ; i<=L ; ++i) { // generate an extra set of coefs for interpolation
+ for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) {
+ double y = I0(beta * sqrt(1.0 - sqr(ix * xfrac))) * sinc(ix * xstep) * yscale;
+
+ // (caution!) float version does not need rounding
+ if (is_same<T, int16_t>::value) { // int16_t needs noise shaping
+ *coef++ = static_cast<T>(toint(y, 1ULL<<(sizeof(T)*8-1), err));
+ } else {
+ *coef++ = static_cast<T>(toint(y, 1ULL<<(sizeof(T)*8-1)));
+ }
+ }
+ }
+}
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_GEN_H*/
diff --git a/services/audioflinger/AudioResamplerFirOps.h b/services/audioflinger/AudioResamplerFirOps.h
new file mode 100644
index 0000000..bf2163f
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirOps.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_OPS_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_OPS_H
+
+namespace android {
+
+#if defined(__arm__) && !defined(__thumb__)
+#define USE_INLINE_ASSEMBLY (true)
+#else
+#define USE_INLINE_ASSEMBLY (false)
+#endif
+
+#if USE_INLINE_ASSEMBLY && defined(__ARM_NEON__)
+#define USE_NEON (true)
+#include <arm_neon.h>
+#else
+#define USE_NEON (false)
+#endif
+
+template<typename T, typename U>
+struct is_same
+{
+ static const bool value = false;
+};
+
+template<typename T>
+struct is_same<T, T> // partial specialization
+{
+ static const bool value = true;
+};
+
+static inline
+int32_t mulRL(int left, int32_t in, uint32_t vRL)
+{
+#if USE_INLINE_ASSEMBLY
+ int32_t out;
+ if (left) {
+ asm( "smultb %[out], %[in], %[vRL] \n"
+ : [out]"=r"(out)
+ : [in]"%r"(in), [vRL]"r"(vRL)
+ : );
+ } else {
+ asm( "smultt %[out], %[in], %[vRL] \n"
+ : [out]"=r"(out)
+ : [in]"%r"(in), [vRL]"r"(vRL)
+ : );
+ }
+ return out;
+#else
+ int16_t v = left ? static_cast<int16_t>(vRL) : static_cast<int16_t>(vRL>>16);
+ return static_cast<int32_t>((static_cast<int64_t>(in) * v) >> 16);
+#endif
+}
+
+static inline
+int32_t mulAdd(int16_t in, int16_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+ int32_t out;
+ asm( "smlabb %[out], %[v], %[in], %[a] \n"
+ : [out]"=r"(out)
+ : [in]"%r"(in), [v]"r"(v), [a]"r"(a)
+ : );
+ return out;
+#else
+ return a + v * in;
+#endif
+}
+
+static inline
+int32_t mulAdd(int16_t in, int32_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+ int32_t out;
+ asm( "smlawb %[out], %[v], %[in], %[a] \n"
+ : [out]"=r"(out)
+ : [in]"%r"(in), [v]"r"(v), [a]"r"(a)
+ : );
+ return out;
+#else
+ return a + static_cast<int32_t>((static_cast<int64_t>(v) * in) >> 16);
+#endif
+}
+
+static inline
+int32_t mulAdd(int32_t in, int32_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+ int32_t out;
+ asm( "smmla %[out], %[v], %[in], %[a] \n"
+ : [out]"=r"(out)
+ : [in]"%r"(in), [v]"r"(v), [a]"r"(a)
+ : );
+ return out;
+#else
+ return a + static_cast<int32_t>((static_cast<int64_t>(v) * in) >> 32);
+#endif
+}
+
+static inline
+int32_t mulAddRL(int left, uint32_t inRL, int16_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+ int32_t out;
+ if (left) {
+ asm( "smlabb %[out], %[v], %[inRL], %[a] \n"
+ : [out]"=r"(out)
+ : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+ : );
+ } else {
+ asm( "smlabt %[out], %[v], %[inRL], %[a] \n"
+ : [out]"=r"(out)
+ : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+ : );
+ }
+ return out;
+#else
+ int16_t s = left ? static_cast<int16_t>(inRL) : static_cast<int16_t>(inRL>>16);
+ return a + v * s;
+#endif
+}
+
+static inline
+int32_t mulAddRL(int left, uint32_t inRL, int32_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+ int32_t out;
+ if (left) {
+ asm( "smlawb %[out], %[v], %[inRL], %[a] \n"
+ : [out]"=r"(out)
+ : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+ : );
+ } else {
+ asm( "smlawt %[out], %[v], %[inRL], %[a] \n"
+ : [out]"=r"(out)
+ : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+ : );
+ }
+ return out;
+#else
+ int16_t s = left ? static_cast<int16_t>(inRL) : static_cast<int16_t>(inRL>>16);
+ return a + static_cast<int32_t>((static_cast<int64_t>(v) * s) >> 16);
+#endif
+}
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_OPS_H*/
diff --git a/services/audioflinger/AudioResamplerFirProcess.h b/services/audioflinger/AudioResamplerFirProcess.h
new file mode 100644
index 0000000..38e387c
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirProcess.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H
+
+namespace android {
+
+// depends on AudioResamplerFirOps.h
+
+template<int CHANNELS, typename TC>
+static inline
+void mac(
+ int32_t& l, int32_t& r,
+ const TC coef,
+ const int16_t* samples)
+{
+ if (CHANNELS == 2) {
+ uint32_t rl = *reinterpret_cast<const uint32_t*>(samples);
+ l = mulAddRL(1, rl, coef, l);
+ r = mulAddRL(0, rl, coef, r);
+ } else {
+ r = l = mulAdd(samples[0], coef, l);
+ }
+}
+
+template<int CHANNELS, typename TC>
+static inline
+void interpolate(
+ int32_t& l, int32_t& r,
+ const TC coef_0, const TC coef_1,
+ const int16_t lerp, const int16_t* samples)
+{
+ TC sinc;
+
+ if (is_same<TC, int16_t>::value) {
+ sinc = (lerp * ((coef_1-coef_0)<<1)>>16) + coef_0;
+ } else {
+ sinc = mulAdd(lerp, (coef_1-coef_0)<<1, coef_0);
+ }
+ if (CHANNELS == 2) {
+ uint32_t rl = *reinterpret_cast<const uint32_t*>(samples);
+ l = mulAddRL(1, rl, sinc, l);
+ r = mulAddRL(0, rl, sinc, r);
+ } else {
+ r = l = mulAdd(samples[0], sinc, l);
+ }
+}
+
+/*
+ * Calculates a single output sample (two stereo frames).
+ *
+ * This function computes both the positive half FIR dot product and
+ * the negative half FIR dot product, accumulates, and then applies the volume.
+ *
+ * This is a locked phase filter (it does not compute the interpolation).
+ *
+ * Use fir() to compute the proper coefficient pointers for a polyphase
+ * filter bank.
+ */
+
+template <int CHANNELS, int STRIDE, typename TC>
+static inline
+void ProcessL(int32_t* const out,
+ int count,
+ const TC* coefsP,
+ const TC* coefsN,
+ const int16_t* sP,
+ const int16_t* sN,
+ const int32_t* const volumeLR)
+{
+ int32_t l = 0;
+ int32_t r = 0;
+ do {
+ mac<CHANNELS>(l, r, *coefsP++, sP);
+ sP -= CHANNELS;
+ mac<CHANNELS>(l, r, *coefsN++, sN);
+ sN += CHANNELS;
+ } while (--count > 0);
+ out[0] += 2 * mulRL(0, l, volumeLR[0]); // Note: only use top 16b
+ out[1] += 2 * mulRL(0, r, volumeLR[1]); // Note: only use top 16b
+}
+
+/*
+ * Calculates a single output sample (two stereo frames) interpolating phase.
+ *
+ * This function computes both the positive half FIR dot product and
+ * the negative half FIR dot product, accumulates, and then applies the volume.
+ *
+ * This is an interpolated phase filter.
+ *
+ * Use fir() to compute the proper coefficient pointers for a polyphase
+ * filter bank.
+ */
+
+template <int CHANNELS, int STRIDE, typename TC>
+static inline
+void Process(int32_t* const out,
+ int count,
+ const TC* coefsP,
+ const TC* coefsN,
+ const TC* coefsP1,
+ const TC* coefsN1,
+ const int16_t* sP,
+ const int16_t* sN,
+ uint32_t lerpP,
+ const int32_t* const volumeLR)
+{
+ (void) coefsP1; // suppress unused parameter warning
+ (void) coefsN1;
+ if (sizeof(*coefsP)==4) {
+ lerpP >>= 16; // ensure lerpP is 16b
+ }
+ int32_t l = 0;
+ int32_t r = 0;
+ for (size_t i = 0; i < count; ++i) {
+ interpolate<CHANNELS>(l, r, coefsP[0], coefsP[count], lerpP, sP);
+ coefsP++;
+ sP -= CHANNELS;
+ interpolate<CHANNELS>(l, r, coefsN[count], coefsN[0], lerpP, sN);
+ coefsN++;
+ sN += CHANNELS;
+ }
+ out[0] += 2 * mulRL(0, l, volumeLR[0]); // Note: only use top 16b
+ out[1] += 2 * mulRL(0, r, volumeLR[1]); // Note: only use top 16b
+}
+
+/*
+ * Calculates a single output sample (two stereo frames) from input sample pointer.
+ *
+ * This sets up the params for the accelerated Process() and ProcessL()
+ * functions to do the appropriate dot products.
+ *
+ * @param out should point to the output buffer with at least enough space for 2 output frames.
+ *
+ * @param phase is the fractional distance between input samples for interpolation:
+ * phase >= 0 && phase < phaseWrapLimit. It can be thought of as a rational fraction
+ * of phase/phaseWrapLimit.
+ *
+ * @param phaseWrapLimit is #polyphases<<coefShift, where #polyphases is the number of polyphases
+ * in the polyphase filter. Likewise, #polyphases can be obtained as (phaseWrapLimit>>coefShift).
+ *
+ * @param coefShift gives the bit alignment of the polyphase index in the phase parameter.
+ *
+ * @param halfNumCoefs is the half the number of coefficients per polyphase filter. Since the
+ * overall filterbank is odd-length symmetric, only halfNumCoefs need be stored.
+ *
+ * @param coefs is the polyphase filter bank, starting at from polyphase index 0, and ranging to
+ * and including the #polyphases. Each polyphase of the filter has half-length halfNumCoefs
+ * (due to symmetry). The total size of the filter bank in coefficients is
+ * (#polyphases+1)*halfNumCoefs.
+ *
+ * The filter bank coefs should be aligned to a minimum of 16 bytes (preferrably to cache line).
+ *
+ * The coefs should be attenuated (to compensate for passband ripple)
+ * if storing back into the native format.
+ *
+ * @param samples are unaligned input samples. The position is in the "middle" of the
+ * sample array with respect to the FIR filter:
+ * the negative half of the filter is dot product from samples+1 to samples+halfNumCoefs;
+ * the positive half of the filter is dot product from samples to samples-halfNumCoefs+1.
+ *
+ * @param volumeLR is a pointer to an array of two 32 bit volume values, one per stereo channel,
+ * expressed as a S32 integer. A negative value inverts the channel 180 degrees.
+ * The pointer volumeLR should be aligned to a minimum of 8 bytes.
+ * A typical value for volume is 0x1000 to align to a unity gain output of 20.12.
+ *
+ * In between calls to filterCoefficient, the phase is incremented by phaseIncrement, where
+ * phaseIncrement is calculated as inputSampling * phaseWrapLimit / outputSampling.
+ *
+ * The filter polyphase index is given by indexP = phase >> coefShift. Due to
+ * odd length symmetric filter, the polyphase index of the negative half depends on
+ * whether interpolation is used.
+ *
+ * The fractional siting between the polyphase indices is given by the bits below coefShift:
+ *
+ * lerpP = phase << 32 - coefShift >> 1; // for 32 bit unsigned phase multiply
+ * lerpP = phase << 32 - coefShift >> 17; // for 16 bit unsigned phase multiply
+ *
+ * For integer types, this is expressed as:
+ *
+ * lerpP = phase << sizeof(phase)*8 - coefShift
+ * >> (sizeof(phase)-sizeof(*coefs))*8 + 1;
+ *
+ */
+
+template<int CHANNELS, bool LOCKED, int STRIDE, typename TC>
+static inline
+void fir(int32_t* const out,
+ const uint32_t phase, const uint32_t phaseWrapLimit,
+ const int coefShift, const int halfNumCoefs, const TC* const coefs,
+ const int16_t* const samples, const int32_t* const volumeLR)
+{
+ // NOTE: be very careful when modifying the code here. register
+ // pressure is very high and a small change might cause the compiler
+ // to generate far less efficient code.
+ // Always sanity check the result with objdump or test-resample.
+
+ if (LOCKED) {
+ // locked polyphase (no interpolation)
+ // Compute the polyphase filter index on the positive and negative side.
+ uint32_t indexP = phase >> coefShift;
+ uint32_t indexN = (phaseWrapLimit - phase) >> coefShift;
+ const TC* coefsP = coefs + indexP*halfNumCoefs;
+ const TC* coefsN = coefs + indexN*halfNumCoefs;
+ const int16_t* sP = samples;
+ const int16_t* sN = samples + CHANNELS;
+
+ // dot product filter.
+ ProcessL<CHANNELS, STRIDE>(out,
+ halfNumCoefs, coefsP, coefsN, sP, sN, volumeLR);
+ } else {
+ // interpolated polyphase
+ // Compute the polyphase filter index on the positive and negative side.
+ uint32_t indexP = phase >> coefShift;
+ uint32_t indexN = (phaseWrapLimit - phase - 1) >> coefShift; // one's complement.
+ const TC* coefsP = coefs + indexP*halfNumCoefs;
+ const TC* coefsN = coefs + indexN*halfNumCoefs;
+ const TC* coefsP1 = coefsP + halfNumCoefs;
+ const TC* coefsN1 = coefsN + halfNumCoefs;
+ const int16_t* sP = samples;
+ const int16_t* sN = samples + CHANNELS;
+
+ // Interpolation fraction lerpP derived by shifting all the way up and down
+ // to clear the appropriate bits and align to the appropriate level
+ // for the integer multiply. The constants should resolve in compile time.
+ //
+ // The interpolated filter coefficient is derived as follows for the pos/neg half:
+ //
+ // interpolated[P] = index[P]*lerpP + index[P+1]*(1-lerpP)
+ // interpolated[N] = index[N+1]*lerpP + index[N]*(1-lerpP)
+ uint32_t lerpP = phase << (sizeof(phase)*8 - coefShift)
+ >> ((sizeof(phase)-sizeof(*coefs))*8 + 1);
+
+ // on-the-fly interpolated dot product filter
+ Process<CHANNELS, STRIDE>(out,
+ halfNumCoefs, coefsP, coefsN, coefsP1, coefsN1, sP, sN, lerpP, volumeLR);
+ }
+}
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H*/
diff --git a/services/audioflinger/AudioResamplerFirProcessNeon.h b/services/audioflinger/AudioResamplerFirProcessNeon.h
new file mode 100644
index 0000000..f311cef
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirProcessNeon.h
@@ -0,0 +1,1149 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
+
+namespace android {
+
+// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
+
+#if USE_NEON
+//
+// NEON specializations are enabled for Process() and ProcessL()
+//
+// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
+// and looping stride 16 (or vice versa). This has some polyphase coef data alignment
+// issues with S16 coefs. Consider this later.
+
+// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
+#define ASSEMBLY_ACCUMULATE_MONO \
+ "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes */\
+ "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output */\
+ "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums */\
+ "vpadd.s32 d0, d0, d0 \n"/* (1+4d) and replicate L/R */\
+ "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume */\
+ "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating) */\
+ "vst1.s32 {d3}, %[out] \n"/* (2+2d) store result */
+
+#define ASSEMBLY_ACCUMULATE_STEREO \
+ "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes*/\
+ "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output*/\
+ "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums from q0*/\
+ "vpadd.s32 d8, d8, d9 \n"/* (1) add all 4 partial sums from q4*/\
+ "vpadd.s32 d0, d0, d8 \n"/* (1+4d) combine into L/R*/\
+ "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume*/\
+ "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating)*/\
+ "vst1.s32 {d3}, %[out] \n"/* (2+2d)store result*/
+
+template <>
+inline void ProcessL<1, 16>(int32_t* const out,
+ int count,
+ const int16_t* coefsP,
+ const int16_t* coefsN,
+ const int16_t* sP,
+ const int16_t* sN,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 1; // template specialization does not preserve params
+ const int STRIDE = 16;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
+
+ "1: \n"
+
+ "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
+ "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
+ "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
+ "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
+
+ "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
+
+ // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+ "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply (reversed)samples by coef
+ "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed)samples by coef
+ "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
+ "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples
+
+ // moving these ARM instructions before neon above seems to be slower
+ "subs %[count], %[count], #8 \n"// (1) update loop counter
+ "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
+
+ // sP used after branch (warning)
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_MONO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q8", "q10"
+ );
+}
+
+template <>
+inline void ProcessL<2, 16>(int32_t* const out,
+ int count,
+ const int16_t* coefsP,
+ const int16_t* coefsN,
+ const int16_t* sP,
+ const int16_t* sN,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 2; // template specialization does not preserve params
+ const int STRIDE = 16;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "veor q0, q0, q0 \n"// (1) acc_L = 0
+ "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
+
+ "1: \n"
+
+ "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples
+ "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples
+ "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
+ "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
+
+ "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
+ "vrev64.16 q3, q3 \n"// (0 combines+) reverse right positive
+
+ "vmlal.s16 q0, d4, d17 \n"// (1) multiply (reversed) samples left
+ "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples left
+ "vmlal.s16 q4, d6, d17 \n"// (1) multiply (reversed) samples right
+ "vmlal.s16 q4, d7, d16 \n"// (1) multiply (reversed) samples right
+ "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
+ "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
+ "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
+ "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right
+
+ // moving these ARM before neon seems to be slower
+ "subs %[count], %[count], #8 \n"// (1) update loop counter
+ "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples
+
+ // sP used after branch (warning)
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_STEREO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q4", "q5", "q6",
+ "q8", "q10"
+ );
+}
+
+template <>
+inline void Process<1, 16>(int32_t* const out,
+ int count,
+ const int16_t* coefsP,
+ const int16_t* coefsN,
+ const int16_t* coefsP1,
+ const int16_t* coefsN1,
+ const int16_t* sP,
+ const int16_t* sN,
+ uint32_t lerpP,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 1; // template specialization does not preserve params
+ const int STRIDE = 16;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15
+ "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
+
+ "1: \n"
+
+ "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
+ "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
+ "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
+ "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
+ "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
+ "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
+
+ "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
+ "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets
+
+ "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
+ "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
+
+ "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
+
+ "vadd.s16 q8, q8, q9 \n"// (1+2d) interpolate (step3) 1st set
+ "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set
+
+ // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+ "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply reversed samples by coef
+ "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples by coef
+ "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
+ "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples
+
+ // moving these ARM instructions before neon above seems to be slower
+ "subs %[count], %[count], #8 \n"// (1) update loop counter
+ "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
+
+ // sP used after branch (warning)
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_MONO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [coefsP1] "+r" (coefsP1),
+ [coefsN1] "+r" (coefsN1),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [lerpP] "r" (lerpP),
+ [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11"
+ );
+}
+
+template <>
+inline void Process<2, 16>(int32_t* const out,
+ int count,
+ const int16_t* coefsP,
+ const int16_t* coefsN,
+ const int16_t* coefsP1,
+ const int16_t* coefsN1,
+ const int16_t* sP,
+ const int16_t* sN,
+ uint32_t lerpP,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 2; // template specialization does not preserve params
+ const int STRIDE = 16;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
+ "veor q0, q0, q0 \n"// (1) acc_L = 0
+ "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
+
+ "1: \n"
+
+ "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples
+ "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples
+ "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
+ "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
+ "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
+ "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
+
+ "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
+ "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets
+
+ "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
+ "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
+
+ "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
+ "vrev64.16 q3, q3 \n"// (1) reverse 8 frames of the right positive
+
+ "vadd.s16 q8, q8, q9 \n"// (1+1d) interpolate (step3) 1st set
+ "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set
+
+ "vmlal.s16 q0, d4, d17 \n"// (1) multiply reversed samples left
+ "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples left
+ "vmlal.s16 q4, d6, d17 \n"// (1) multiply reversed samples right
+ "vmlal.s16 q4, d7, d16 \n"// (1) multiply reversed samples right
+ "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
+ "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
+ "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
+ "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right
+
+ // moving these ARM before neon seems to be slower
+ "subs %[count], %[count], #8 \n"// (1) update loop counter
+ "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples
+
+ // sP used after branch (warning)
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_STEREO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [coefsP1] "+r" (coefsP1),
+ [coefsN1] "+r" (coefsN1),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [lerpP] "r" (lerpP),
+ [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q4", "q5", "q6",
+ "q8", "q9", "q10", "q11"
+ );
+}
+
+template <>
+inline void ProcessL<1, 16>(int32_t* const out,
+ int count,
+ const int32_t* coefsP,
+ const int32_t* coefsN,
+ const int16_t* sP,
+ const int16_t* sN,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 1; // template specialization does not preserve params
+ const int STRIDE = 16;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "veor q0, q0, q0 \n"// result, initialize to 0
+
+ "1: \n"
+
+ "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
+ "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
+ "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
+ "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
+
+ "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
+
+ "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
+
+ "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
+
+ "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
+
+ "vadd.s32 q0, q0, q12 \n"// accumulate result
+ "vadd.s32 q13, q13, q14 \n"// accumulate result
+ "vadd.s32 q0, q0, q15 \n"// accumulate result
+ "vadd.s32 q0, q0, q13 \n"// accumulate result
+
+ "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
+ "subs %[count], %[count], #8 \n"// update loop counter
+
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_MONO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15"
+ );
+}
+
+template <>
+inline void ProcessL<2, 16>(int32_t* const out,
+ int count,
+ const int32_t* coefsP,
+ const int32_t* coefsN,
+ const int16_t* sP,
+ const int16_t* sN,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 2; // template specialization does not preserve params
+ const int STRIDE = 16;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "veor q0, q0, q0 \n"// result, initialize to 0
+ "veor q4, q4, q4 \n"// result, initialize to 0
+
+ "1: \n"
+
+ "vld2.16 {q2, q3}, [%[sP]] \n"// load 4 16-bits stereo samples
+ "vld2.16 {q5, q6}, [%[sN]]! \n"// load 4 16-bits stereo samples
+ "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
+ "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
+
+ "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
+ "vrev64.16 q3, q3 \n"// reverse 8 frames of the positive side
+
+ "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
+
+ "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits
+
+ "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
+
+ "vadd.s32 q0, q0, q12 \n"// accumulate result
+ "vadd.s32 q13, q13, q14 \n"// accumulate result
+ "vadd.s32 q0, q0, q15 \n"// (+1) accumulate result
+ "vadd.s32 q0, q0, q13 \n"// (+1) accumulate result
+
+ "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits
+
+ "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits
+
+ "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
+
+ "vadd.s32 q4, q4, q12 \n"// accumulate result
+ "vadd.s32 q13, q13, q14 \n"// accumulate result
+ "vadd.s32 q4, q4, q15 \n"// (+1) accumulate result
+ "vadd.s32 q4, q4, q13 \n"// (+1) accumulate result
+
+ "subs %[count], %[count], #8 \n"// update loop counter
+ "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
+
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_STEREO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q4", "q5", "q6",
+ "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15"
+ );
+}
+
+template <>
+inline void Process<1, 16>(int32_t* const out,
+ int count,
+ const int32_t* coefsP,
+ const int32_t* coefsN,
+ const int32_t* coefsP1,
+ const int32_t* coefsN1,
+ const int16_t* sP,
+ const int16_t* sN,
+ uint32_t lerpP,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 1; // template specialization does not preserve params
+ const int STRIDE = 16;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
+ "veor q0, q0, q0 \n"// result, initialize to 0
+
+ "1: \n"
+
+ "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
+ "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
+ "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
+ "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
+ "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
+ "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
+
+ "vsub.s32 q12, q12, q8 \n"// interpolate (step1)
+ "vsub.s32 q13, q13, q9 \n"// interpolate (step1)
+ "vsub.s32 q14, q14, q10 \n"// interpolate (step1)
+ "vsub.s32 q15, q15, q11 \n"// interpolate (step1)
+
+ "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
+ "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
+ "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
+ "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)
+
+ "vadd.s32 q8, q8, q12 \n"// interpolate (step3)
+ "vadd.s32 q9, q9, q13 \n"// interpolate (step3)
+ "vadd.s32 q10, q10, q14 \n"// interpolate (step3)
+ "vadd.s32 q11, q11, q15 \n"// interpolate (step3)
+
+ "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
+
+ "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
+
+ "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
+
+ "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
+
+ "vadd.s32 q0, q0, q12 \n"// accumulate result
+ "vadd.s32 q13, q13, q14 \n"// accumulate result
+ "vadd.s32 q0, q0, q15 \n"// accumulate result
+ "vadd.s32 q0, q0, q13 \n"// accumulate result
+
+ "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
+ "subs %[count], %[count], #8 \n"// update loop counter
+
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_MONO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [coefsP1] "+r" (coefsP1),
+ [coefsN1] "+r" (coefsN1),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [lerpP] "r" (lerpP),
+ [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15"
+ );
+}
+
+template <>
+inline void Process<2, 16>(int32_t* const out,
+ int count,
+ const int32_t* coefsP,
+ const int32_t* coefsN,
+ const int32_t* coefsP1,
+ const int32_t* coefsN1,
+ const int16_t* sP,
+ const int16_t* sN,
+ uint32_t lerpP,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 2; // template specialization does not preserve params
+ const int STRIDE = 16;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
+ "veor q0, q0, q0 \n"// result, initialize to 0
+ "veor q4, q4, q4 \n"// result, initialize to 0
+
+ "1: \n"
+
+ "vld2.16 {q2, q3}, [%[sP]] \n"// load 4 16-bits stereo samples
+ "vld2.16 {q5, q6}, [%[sN]]! \n"// load 4 16-bits stereo samples
+ "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
+ "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
+ "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
+ "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
+
+ "vsub.s32 q12, q12, q8 \n"// interpolate (step1)
+ "vsub.s32 q13, q13, q9 \n"// interpolate (step1)
+ "vsub.s32 q14, q14, q10 \n"// interpolate (step1)
+ "vsub.s32 q15, q15, q11 \n"// interpolate (step1)
+
+ "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
+ "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
+ "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
+ "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)
+
+ "vadd.s32 q8, q8, q12 \n"// interpolate (step3)
+ "vadd.s32 q9, q9, q13 \n"// interpolate (step3)
+ "vadd.s32 q10, q10, q14 \n"// interpolate (step3)
+ "vadd.s32 q11, q11, q15 \n"// interpolate (step3)
+
+ "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
+ "vrev64.16 q3, q3 \n"// reverse 8 frames of the positive side
+
+ "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
+
+ "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits
+
+ "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
+
+ "vadd.s32 q0, q0, q12 \n"// accumulate result
+ "vadd.s32 q13, q13, q14 \n"// accumulate result
+ "vadd.s32 q0, q0, q15 \n"// (+1) accumulate result
+ "vadd.s32 q0, q0, q13 \n"// (+1) accumulate result
+
+ "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits
+
+ "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits
+
+ "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
+
+ "vadd.s32 q4, q4, q12 \n"// accumulate result
+ "vadd.s32 q13, q13, q14 \n"// accumulate result
+ "vadd.s32 q4, q4, q15 \n"// (+1) accumulate result
+ "vadd.s32 q4, q4, q13 \n"// (+1) accumulate result
+
+ "subs %[count], %[count], #8 \n"// update loop counter
+ "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
+
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_STEREO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [coefsP1] "+r" (coefsP1),
+ [coefsN1] "+r" (coefsN1),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [lerpP] "r" (lerpP),
+ [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q4", "q5", "q6",
+ "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15"
+ );
+}
+
+template <>
+inline void ProcessL<1, 8>(int32_t* const out,
+ int count,
+ const int16_t* coefsP,
+ const int16_t* coefsN,
+ const int16_t* sP,
+ const int16_t* sN,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 1; // template specialization does not preserve params
+ const int STRIDE = 8;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
+
+ "1: \n"
+
+ "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples
+ "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples
+ "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs
+ "vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs
+
+ "vrev64.16 d4, d4 \n"// (1) reversed s3, s2, s1, s0, s7, s6, s5, s4
+
+ // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+ "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed)samples by coef
+ "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
+
+ // moving these ARM instructions before neon above seems to be slower
+ "subs %[count], %[count], #4 \n"// (1) update loop counter
+ "sub %[sP], %[sP], #8 \n"// (0) move pointer to next set of samples
+
+ // sP used after branch (warning)
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_MONO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q8", "q10"
+ );
+}
+
+template <>
+inline void ProcessL<2, 8>(int32_t* const out,
+ int count,
+ const int16_t* coefsP,
+ const int16_t* coefsN,
+ const int16_t* sP,
+ const int16_t* sN,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 2; // template specialization does not preserve params
+ const int STRIDE = 8;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "veor q0, q0, q0 \n"// (1) acc_L = 0
+ "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
+
+ "1: \n"
+
+ "vld2.16 {d4, d5}, [%[sP]] \n"// (2+0d) load 8 16-bits stereo samples
+ "vld2.16 {d6, d7}, [%[sN]]! \n"// (2) load 8 16-bits stereo samples
+ "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs
+ "vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 8 16-bits coefs
+
+ "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
+
+ "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left
+ "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right
+ "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left
+ "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right
+
+ // moving these ARM before neon seems to be slower
+ "subs %[count], %[count], #4 \n"// (1) update loop counter
+ "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
+
+ // sP used after branch (warning)
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_STEREO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q4", "q5", "q6",
+ "q8", "q10"
+ );
+}
+
+template <>
+inline void Process<1, 8>(int32_t* const out,
+ int count,
+ const int16_t* coefsP,
+ const int16_t* coefsN,
+ const int16_t* coefsP1,
+ const int16_t* coefsN1,
+ const int16_t* sP,
+ const int16_t* sN,
+ uint32_t lerpP,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 1; // template specialization does not preserve params
+ const int STRIDE = 8;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15
+ "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
+
+ "1: \n"
+
+ "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples
+ "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples
+ "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs
+ "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 4 16-bits coefs for interpolation
+ "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 4 16-bits coefs
+ "vld1.16 {d21}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs for interpolation
+
+ "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs
+ "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets
+
+ "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
+ "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
+
+ "vrev64.16 d4, d4 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
+
+ "vadd.s16 d16, d16, d17 \n"// (1+2d) interpolate (step3) 1st set
+ "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set
+
+ // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+ "vmlal.s16 q0, d4, d16 \n"// (1+0d) multiply (reversed)by coef
+ "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
+
+ // moving these ARM instructions before neon above seems to be slower
+ "subs %[count], %[count], #4 \n"// (1) update loop counter
+ "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples
+
+ // sP used after branch (warning)
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_MONO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [coefsP1] "+r" (coefsP1),
+ [coefsN1] "+r" (coefsN1),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [lerpP] "r" (lerpP),
+ [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11"
+ );
+}
+
+template <>
+inline void Process<2, 8>(int32_t* const out,
+ int count,
+ const int16_t* coefsP,
+ const int16_t* coefsN,
+ const int16_t* coefsP1,
+ const int16_t* coefsN1,
+ const int16_t* sP,
+ const int16_t* sN,
+ uint32_t lerpP,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 2; // template specialization does not preserve params
+ const int STRIDE = 8;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
+ "veor q0, q0, q0 \n"// (1) acc_L = 0
+ "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
+
+ "1: \n"
+
+ "vld2.16 {d4, d5}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples
+ "vld2.16 {d6, d7}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples
+ "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs
+ "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 8 16-bits coefs for interpolation
+ "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 8 16-bits coefs
+ "vld1.16 {d21}, [%[coefsN0]:64]! \n"// (1) load 8 16-bits coefs for interpolation
+
+ "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs
+ "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets
+
+ "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
+ "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
+
+ "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
+
+ "vadd.s16 d16, d16, d17 \n"// (1+1d) interpolate (step3) 1st set
+ "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set
+
+ "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left
+ "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right
+ "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left
+ "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right
+
+ // moving these ARM before neon seems to be slower
+ "subs %[count], %[count], #4 \n"// (1) update loop counter
+ "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
+
+ // sP used after branch (warning)
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_STEREO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [coefsP1] "+r" (coefsP1),
+ [coefsN1] "+r" (coefsN1),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [lerpP] "r" (lerpP),
+ [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q4", "q5", "q6",
+ "q8", "q9", "q10", "q11"
+ );
+}
+
+template <>
+inline void ProcessL<1, 8>(int32_t* const out,
+ int count,
+ const int32_t* coefsP,
+ const int32_t* coefsN,
+ const int16_t* sP,
+ const int16_t* sN,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 1; // template specialization does not preserve params
+ const int STRIDE = 8;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "veor q0, q0, q0 \n"// result, initialize to 0
+
+ "1: \n"
+
+ "vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples
+ "vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples
+ "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
+ "vld1.32 {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
+
+ "vrev64.16 d4, d4 \n"// reverse 2 frames of the positive side
+
+ "vshll.s16 q12, d4, #15 \n"// (stall) extend samples to 31 bits
+ "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
+
+ "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
+
+ "vadd.s32 q0, q0, q12 \n"// accumulate result
+ "vadd.s32 q0, q0, q14 \n"// (stall) accumulate result
+
+ "subs %[count], %[count], #4 \n"// update loop counter
+ "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples
+
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_MONO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11",
+ "q12", "q14"
+ );
+}
+
+template <>
+inline void ProcessL<2, 8>(int32_t* const out,
+ int count,
+ const int32_t* coefsP,
+ const int32_t* coefsN,
+ const int16_t* sP,
+ const int16_t* sN,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 2; // template specialization does not preserve params
+ const int STRIDE = 8;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "veor q0, q0, q0 \n"// result, initialize to 0
+ "veor q4, q4, q4 \n"// result, initialize to 0
+
+ "1: \n"
+
+ "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples
+ "vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 16-bits stereo samples
+ "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
+ "vld1.32 {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
+
+ "vrev64.16 q2, q2 \n"// reverse 2 frames of the positive side
+
+ "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
+
+ "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
+
+ "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by coef
+ "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
+ "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
+ "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by coef
+
+ "vadd.s32 q0, q0, q12 \n"// accumulate result
+ "vadd.s32 q4, q4, q13 \n"// accumulate result
+ "vadd.s32 q0, q0, q14 \n"// accumulate result
+ "vadd.s32 q4, q4, q15 \n"// accumulate result
+
+ "subs %[count], %[count], #4 \n"// update loop counter
+ "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
+
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_STEREO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsN0] "+r" (coefsN),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15"
+ );
+}
+
+template <>
+inline void Process<1, 8>(int32_t* const out,
+ int count,
+ const int32_t* coefsP,
+ const int32_t* coefsN,
+ const int32_t* coefsP1,
+ const int32_t* coefsN1,
+ const int16_t* sP,
+ const int16_t* sN,
+ uint32_t lerpP,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 1; // template specialization does not preserve params
+ const int STRIDE = 8;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
+ "veor q0, q0, q0 \n"// result, initialize to 0
+
+ "1: \n"
+
+ "vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples
+ "vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples
+ "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
+ "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation
+ "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
+ "vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
+
+ "vrev64.16 d4, d4 \n"// reverse 2 frames of the positive side
+
+ "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs
+ "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets
+ "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
+
+ "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs
+ "vqrdmulh.s32 q11, q11, d2[0] \n"// interpolate (step2) 2nd set of coefs
+ "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
+
+ "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set
+ "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set
+
+ "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
+
+ "vadd.s32 q0, q0, q12 \n"// accumulate result
+ "vadd.s32 q0, q0, q14 \n"// accumulate result
+
+ "subs %[count], %[count], #4 \n"// update loop counter
+ "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples
+
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_MONO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsP1] "+r" (coefsP1),
+ [coefsN0] "+r" (coefsN),
+ [coefsN1] "+r" (coefsN1),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [lerpP] "r" (lerpP),
+ [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11",
+ "q12", "q14"
+ );
+}
+
+template <>
+inline
+void Process<2, 8>(int32_t* const out,
+ int count,
+ const int32_t* coefsP,
+ const int32_t* coefsN,
+ const int32_t* coefsP1,
+ const int32_t* coefsN1,
+ const int16_t* sP,
+ const int16_t* sN,
+ uint32_t lerpP,
+ const int32_t* const volumeLR)
+{
+ const int CHANNELS = 2; // template specialization does not preserve params
+ const int STRIDE = 8;
+ sP -= CHANNELS*((STRIDE>>1)-1);
+ asm (
+ "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
+ "veor q0, q0, q0 \n"// result, initialize to 0
+ "veor q4, q4, q4 \n"// result, initialize to 0
+
+ "1: \n"
+ "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples
+ "vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 16-bits stereo samples
+ "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
+ "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation
+ "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
+ "vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
+
+ "vrev64.16 q2, q2 \n"// (reversed) 2 frames of the positive side
+
+ "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs
+ "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets
+ "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
+
+ "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs
+ "vqrdmulh.s32 q11, q11, d2[1] \n"// interpolate (step3) 2nd set of coefs
+ "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
+ "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
+
+ "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set
+ "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set
+
+ "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
+ "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by interpolated coef
+
+ "vadd.s32 q0, q0, q12 \n"// accumulate result
+ "vadd.s32 q4, q4, q13 \n"// accumulate result
+ "vadd.s32 q0, q0, q14 \n"// accumulate result
+ "vadd.s32 q4, q4, q15 \n"// accumulate result
+
+ "subs %[count], %[count], #4 \n"// update loop counter
+ "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
+
+ "bne 1b \n"// loop
+
+ ASSEMBLY_ACCUMULATE_STEREO
+
+ : [out] "=Uv" (out[0]),
+ [count] "+r" (count),
+ [coefsP0] "+r" (coefsP),
+ [coefsP1] "+r" (coefsP1),
+ [coefsN0] "+r" (coefsN),
+ [coefsN1] "+r" (coefsN1),
+ [sP] "+r" (sP),
+ [sN] "+r" (sN)
+ : [lerpP] "r" (lerpP),
+ [vLR] "r" (volumeLR)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15"
+ );
+}
+
+#endif //USE_NEON
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/
diff --git a/services/audioflinger/test-resample.cpp b/services/audioflinger/test-resample.cpp
index 403bb6d..3f2ce55 100644
--- a/services/audioflinger/test-resample.cpp
+++ b/services/audioflinger/test-resample.cpp
@@ -33,8 +33,9 @@ using namespace android;
bool gVerbose = false;
static int usage(const char* name) {
- fprintf(stderr,"Usage: %s [-p] [-h] [-v] [-s] [-q {dq|lq|mq|hq|vhq}] [-i input-sample-rate] "
- "[-o output-sample-rate] [<input-file>] <output-file>\n", name);
+ fprintf(stderr,"Usage: %s [-p] [-h] [-v] [-s] [-q {dq|lq|mq|hq|vhq|dlq|dmq|dhq}]"
+ " [-i input-sample-rate] [-o output-sample-rate] [<input-file>]"
+ " <output-file>\n", name);
fprintf(stderr," -p enable profiling\n");
fprintf(stderr," -h create wav file\n");
fprintf(stderr," -v verbose : log buffer provider calls\n");
@@ -45,6 +46,9 @@ static int usage(const char* name) {
fprintf(stderr," mq : medium quality\n");
fprintf(stderr," hq : high quality\n");
fprintf(stderr," vhq : very high quality\n");
+ fprintf(stderr," dlq : dynamic low quality\n");
+ fprintf(stderr," dmq : dynamic medium quality\n");
+ fprintf(stderr," dhq : dynamic high quality\n");
fprintf(stderr," -i input file sample rate (ignored if input file is specified)\n");
fprintf(stderr," -o output file sample rate\n");
return -1;
@@ -86,6 +90,12 @@ int main(int argc, char* argv[]) {
quality = AudioResampler::HIGH_QUALITY;
else if (!strcmp(optarg, "vhq"))
quality = AudioResampler::VERY_HIGH_QUALITY;
+ else if (!strcmp(optarg, "dlq"))
+ quality = AudioResampler::DYN_LOW_QUALITY;
+ else if (!strcmp(optarg, "dmq"))
+ quality = AudioResampler::DYN_MED_QUALITY;
+ else if (!strcmp(optarg, "dhq"))
+ quality = AudioResampler::DYN_HIGH_QUALITY;
else {
usage(progname);
return -1;
@@ -137,6 +147,8 @@ int main(int argc, char* argv[]) {
channels = info.channels;
input_freq = info.samplerate;
} else {
+ // data for testing is exactly (input sampling rate/1000)/2 seconds
+ // so 44.1khz input is 22.05 seconds
double k = 1000; // Hz / s
double time = (input_freq / 2) / k;
size_t input_frames = size_t(input_freq * time);
@@ -148,7 +160,7 @@ int main(int argc, char* argv[]) {
double y = sin(M_PI * k * t * t);
int16_t yi = floor(y * 32767.0 + 0.5);
for (size_t j=0 ; j<(size_t)channels ; j++) {
- in[i*channels + j] = yi / (1+j);
+ in[i*channels + j] = yi / (1+j); // right ch. 1/2 left ch.
}
}
}
@@ -170,6 +182,7 @@ int main(int argc, char* argv[]) {
}
virtual status_t getNextBuffer(Buffer* buffer,
int64_t pts = kInvalidPTS) {
+ (void)pts; // suppress warning
size_t requestedFrames = buffer->frameCount;
if (requestedFrames > mNumFrames - mNextFrame) {
buffer->frameCount = mNumFrames - mNextFrame;
@@ -205,6 +218,9 @@ int main(int argc, char* argv[]) {
buffer->frameCount = 0;
buffer->i16 = NULL;
}
+ void reset() {
+ mNextFrame = 0;
+ }
} provider(input_vaddr, input_size, channels);
size_t input_frames = input_size / (channels * sizeof(int16_t));
@@ -215,37 +231,29 @@ int main(int argc, char* argv[]) {
output_size &= ~7; // always stereo, 32-bits
void* output_vaddr = malloc(output_size);
+ AudioResampler* resampler = AudioResampler::create(16, channels,
+ output_freq, quality);
+ size_t out_frames = output_size/8;
+ resampler->setSampleRate(input_freq);
+ resampler->setVolume(0x1000, 0x1000);
if (profiling) {
- AudioResampler* resampler = AudioResampler::create(16, channels,
- output_freq, quality);
-
- size_t out_frames = output_size/8;
- resampler->setSampleRate(input_freq);
- resampler->setVolume(0x1000, 0x1000);
-
- memset(output_vaddr, 0, output_size);
+ const int looplimit = 100;
timespec start, end;
clock_gettime(CLOCK_MONOTONIC, &start);
- resampler->resample((int*) output_vaddr, out_frames, &provider);
- resampler->resample((int*) output_vaddr, out_frames, &provider);
- resampler->resample((int*) output_vaddr, out_frames, &provider);
- resampler->resample((int*) output_vaddr, out_frames, &provider);
+ for (int i = 0; i < looplimit; ++i) {
+ resampler->resample((int*) output_vaddr, out_frames, &provider);
+ provider.reset(); // reset only provider as benchmarking
+ }
clock_gettime(CLOCK_MONOTONIC, &end);
int64_t start_ns = start.tv_sec * 1000000000LL + start.tv_nsec;
int64_t end_ns = end.tv_sec * 1000000000LL + end.tv_nsec;
- int64_t time = (end_ns - start_ns)/4;
- printf("%f Mspl/s\n", out_frames/(time/1e9)/1e6);
-
- delete resampler;
+ int64_t time = end_ns - start_ns;
+ printf("time(ns):%lld channels:%d quality:%d\n", time, channels, quality);
+ printf("%f Mspl/s\n", out_frames * looplimit / (time / 1e9) / 1e6);
+ resampler->reset();
}
- AudioResampler* resampler = AudioResampler::create(16, channels,
- output_freq, quality);
- size_t out_frames = output_size/8;
- resampler->setSampleRate(input_freq);
- resampler->setVolume(0x1000, 0x1000);
-
memset(output_vaddr, 0, output_size);
if (gVerbose) {
printf("resample() %u output frames\n", out_frames);
@@ -259,14 +267,18 @@ int main(int argc, char* argv[]) {
printf("reset() complete\n");
}
- // down-mix (we just truncate and keep the left channel)
+ // mono takes left channel only
+ // stereo right channel is half amplitude of stereo left channel (due to input creation)
int32_t* out = (int32_t*) output_vaddr;
int16_t* convert = (int16_t*) malloc(out_frames * channels * sizeof(int16_t));
+
for (size_t i = 0; i < out_frames; i++) {
- for (int j=0 ; j<channels ; j++) {
+ for (int j = 0; j < channels; j++) {
int32_t s = out[i * 2 + j] >> 12;
- if (s > 32767) s = 32767;
- else if (s < -32768) s = -32768;
+ if (s > 32767)
+ s = 32767;
+ else if (s < -32768)
+ s = -32768;
convert[i * channels + j] = int16_t(s);
}
}
diff --git a/tools/resampler_tools/fir.cpp b/tools/resampler_tools/fir.cpp
index cc3d509..27a9b05 100644
--- a/tools/resampler_tools/fir.cpp
+++ b/tools/resampler_tools/fir.cpp
@@ -20,15 +20,25 @@
#include <stdlib.h>
#include <string.h>
-static double sinc(double x) {
+static inline double sinc(double x) {
if (fabs(x) == 0.0f) return 1.0f;
return sin(x) / x;
}
-static double sqr(double x) {
+static inline double sqr(double x) {
return x*x;
}
+static inline int64_t toint(double x, int64_t maxval) {
+ int64_t v;
+
+ v = static_cast<int64_t>(floor(y * maxval + 0.5));
+ if (v >= maxval) {
+ return maxval - 1; // error!
+ }
+ return v;
+}
+
static double I0(double x) {
// from the Numerical Recipes in C p. 237
double ax,ans,y;
@@ -54,11 +64,12 @@ static double kaiser(int k, int N, double beta) {
return I0(beta * sqrt(1.0 - sqr((2.0*k)/N - 1.0))) / I0(beta);
}
-
static void usage(char* name) {
fprintf(stderr,
- "usage: %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings] [-f {float|fixed}] [-b beta] [-v dBFS] [-l lerp]\n"
- " %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings] [-f {float|fixed}] [-b beta] [-v dBFS] -p M/N\n"
+ "usage: %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings]"
+ " [-f {float|fixed|fixed16}] [-b beta] [-v dBFS] [-l lerp]\n"
+ " %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings]"
+ " [-f {float|fixed|fixed16}] [-b beta] [-v dBFS] -p M/N\n"
" -h this help message\n"
" -d debug, print comma-separated coefficient table\n"
" -p generate poly-phase filter coefficients, with sample increment M/N\n"
@@ -66,6 +77,7 @@ static void usage(char* name) {
" -c cut-off frequency (20478)\n"
" -n number of zero-crossings on one side (8)\n"
" -l number of lerping bits (4)\n"
+ " -m number of polyphases (related to -l, default 16)\n"
" -f output format, can be fixed-point or floating-point (fixed)\n"
" -b kaiser window parameter beta (7.865 [-80dB])\n"
" -v attenuation in dBFS (0)\n",
@@ -77,8 +89,7 @@ static void usage(char* name) {
int main(int argc, char** argv)
{
// nc is the number of bits to store the coefficients
- const int nc = 32;
-
+ int nc = 32;
bool polyphase = false;
unsigned int polyM = 160;
unsigned int polyN = 147;
@@ -88,7 +99,6 @@ int main(int argc, char** argv)
double atten = 1;
int format = 0;
-
// in order to keep the errors associated with the linear
// interpolation of the coefficients below the quantization error
// we must satisfy:
@@ -104,7 +114,6 @@ int main(int argc, char** argv)
// Smith, J.O. Digital Audio Resampling Home Page
// https://ccrma.stanford.edu/~jos/resample/, 2011-03-29
//
- int nz = 4;
// | 0.1102*(A - 8.7) A > 50
// beta = | 0.5842*(A - 21)^0.4 + 0.07886*(A - 21) 21 <= A <= 50
@@ -123,7 +132,6 @@ int main(int argc, char** argv)
// 100 dB 10.056
double beta = 7.865;
-
// 2*nzc = (A - 8) / (2.285 * dw)
// with dw the transition width = 2*pi*dF/Fs
//
@@ -148,8 +156,9 @@ int main(int argc, char** argv)
// nzc = 20
//
+ int M = 1 << 4; // number of phases for interpolation
int ch;
- while ((ch = getopt(argc, argv, ":hds:c:n:f:l:b:p:v:")) != -1) {
+ while ((ch = getopt(argc, argv, ":hds:c:n:f:l:m:b:p:v:z:")) != -1) {
switch (ch) {
case 'd':
debug = true;
@@ -169,13 +178,26 @@ int main(int argc, char** argv)
case 'n':
nzc = atoi(optarg);
break;
+ case 'm':
+ M = atoi(optarg);
+ break;
case 'l':
- nz = atoi(optarg);
+ M = 1 << atoi(optarg);
break;
case 'f':
- if (!strcmp(optarg,"fixed")) format = 0;
- else if (!strcmp(optarg,"float")) format = 1;
- else usage(argv[0]);
+ if (!strcmp(optarg, "fixed")) {
+ format = 0;
+ }
+ else if (!strcmp(optarg, "fixed16")) {
+ format = 0;
+ nc = 16;
+ }
+ else if (!strcmp(optarg, "float")) {
+ format = 1;
+ }
+ else {
+ usage(argv[0]);
+ }
break;
case 'b':
beta = atof(optarg);
@@ -193,11 +215,14 @@ int main(int argc, char** argv)
// cut off frequency ratio Fc/Fs
double Fcr = Fc / Fs;
-
// total number of coefficients (one side)
- const int M = (1 << nz);
+
const int N = M * nzc;
+ // lerp (which is most useful if M is a power of 2)
+
+ int nz = 0; // recalculate nz as the bits needed to represent M
+ for (int i = M-1 ; i; i>>=1, nz++);
// generate the right half of the filter
if (!debug) {
printf("// cmd-line: ");
@@ -207,7 +232,7 @@ int main(int argc, char** argv)
printf("\n");
if (!polyphase) {
printf("const int32_t RESAMPLE_FIR_SIZE = %d;\n", N);
- printf("const int32_t RESAMPLE_FIR_LERP_INT_BITS = %d;\n", nz);
+ printf("const int32_t RESAMPLE_FIR_INT_PHASES = %d;\n", M);
printf("const int32_t RESAMPLE_FIR_NUM_COEF = %d;\n", nzc);
} else {
printf("const int32_t RESAMPLE_FIR_SIZE = %d;\n", 2*nzc*polyN);
@@ -224,7 +249,7 @@ int main(int argc, char** argv)
for (int i=0 ; i<=M ; i++) { // an extra set of coefs for interpolation
for (int j=0 ; j<nzc ; j++) {
int ix = j*M + i;
- double x = (2.0 * M_PI * ix * Fcr) / (1 << nz);
+ double x = (2.0 * M_PI * ix * Fcr) / M;
double y = kaiser(ix+N, 2*N, beta) * sinc(x) * 2.0 * Fcr;
y *= atten;
@@ -232,11 +257,13 @@ int main(int argc, char** argv)
if (j == 0)
printf("\n ");
}
-
if (!format) {
- int64_t yi = floor(y * ((1ULL<<(nc-1))) + 0.5);
- if (yi >= (1LL<<(nc-1))) yi = (1LL<<(nc-1))-1;
- printf("0x%08x, ", int32_t(yi));
+ int64_t yi = toint(y, 1ULL<<(nc-1));
+ if (nc > 16) {
+ printf("0x%08x, ", int32_t(yi));
+ } else {
+ printf("0x%04x, ", int32_t(yi)&0xffff);
+ }
} else {
printf("%.9g%s ", y, debug ? "," : "f,");
}
@@ -254,9 +281,12 @@ int main(int argc, char** argv)
double y = kaiser(i+N, 2*N, beta) * sinc(x) * 2.0 * Fcr;;
y *= atten;
if (!format) {
- int64_t yi = floor(y * ((1ULL<<(nc-1))) + 0.5);
- if (yi >= (1LL<<(nc-1))) yi = (1LL<<(nc-1))-1;
- printf("0x%08x", int32_t(yi));
+ int64_t yi = toint(y, 1ULL<<(nc-1));
+ if (nc > 16) {
+ printf("0x%08x, ", int32_t(yi));
+ } else {
+ printf("0x%04x, ", int32_t(yi)&0xffff);
+ }
} else {
printf("%.9g%s", y, debug ? "" : "f");
}
@@ -277,5 +307,3 @@ int main(int argc, char** argv)
}
// http://www.csee.umbc.edu/help/sound/AFsp-V2R1/html/audio/ResampAudio.html
-
-