From 86eae0e5931103e040ac2cdd023ef5db252e09f6 Mon Sep 17 00:00:00 2001 From: Andy Hung Date: Mon, 9 Dec 2013 12:12:46 -0800 Subject: Audio resampler update to add S16 filters This does not affect the existing resamplers. New resampler accessed through additional quality settings: DYN_LOW_QUALITY = 5 DYN_MED_QUALITY = 6 DYN_HIGH_QUALITY = 7 Change-Id: Iebbd31871e808a4a6dee3f3abfd7e9dcf77c48e1 Signed-off-by: Andy Hung --- services/audioflinger/Android.mk | 10 +- services/audioflinger/AudioResampler.cpp | 27 +- services/audioflinger/AudioResampler.h | 3 + services/audioflinger/AudioResamplerDyn.cpp | 530 +++++++++ services/audioflinger/AudioResamplerDyn.h | 123 +++ services/audioflinger/AudioResamplerFirGen.h | 307 ++++++ services/audioflinger/AudioResamplerFirOps.h | 163 +++ services/audioflinger/AudioResamplerFirProcess.h | 256 +++++ .../audioflinger/AudioResamplerFirProcessNeon.h | 1149 ++++++++++++++++++++ services/audioflinger/test-resample.cpp | 70 +- tools/resampler_tools/fir.cpp | 84 +- 11 files changed, 2660 insertions(+), 62 deletions(-) create mode 100644 services/audioflinger/AudioResamplerDyn.cpp create mode 100644 services/audioflinger/AudioResamplerDyn.h create mode 100644 services/audioflinger/AudioResamplerFirGen.h create mode 100644 services/audioflinger/AudioResamplerFirOps.h create mode 100644 services/audioflinger/AudioResamplerFirProcess.h create mode 100644 services/audioflinger/AudioResamplerFirProcessNeon.h diff --git a/services/audioflinger/Android.mk b/services/audioflinger/Android.mk index 3ec9285..4524d3c 100644 --- a/services/audioflinger/Android.mk +++ b/services/audioflinger/Android.mk @@ -23,7 +23,8 @@ LOCAL_SRC_FILES:= \ AudioPolicyService.cpp \ ServiceUtilities.cpp \ AudioResamplerCubic.cpp.arm \ - AudioResamplerSinc.cpp.arm + AudioResamplerSinc.cpp.arm \ + AudioResamplerDyn.cpp.arm LOCAL_SRC_FILES += StateQueue.cpp @@ -74,10 +75,11 @@ include $(BUILD_SHARED_LIBRARY) include $(CLEAR_VARS) LOCAL_SRC_FILES:= \ - test-resample.cpp \ + test-resample.cpp \ AudioResampler.cpp.arm \ - AudioResamplerCubic.cpp.arm \ - AudioResamplerSinc.cpp.arm + AudioResamplerCubic.cpp.arm \ + AudioResamplerSinc.cpp.arm \ + AudioResamplerDyn.cpp.arm LOCAL_C_INCLUDES := \ $(call include-path-for, audio-utils) diff --git a/services/audioflinger/AudioResampler.cpp b/services/audioflinger/AudioResampler.cpp index 323f1a4..3b5a8c1 100644 --- a/services/audioflinger/AudioResampler.cpp +++ b/services/audioflinger/AudioResampler.cpp @@ -25,6 +25,7 @@ #include "AudioResampler.h" #include "AudioResamplerSinc.h" #include "AudioResamplerCubic.h" +#include "AudioResamplerDyn.h" #ifdef __arm__ #include @@ -85,6 +86,9 @@ bool AudioResampler::qualityIsSupported(src_quality quality) case MED_QUALITY: case HIGH_QUALITY: case VERY_HIGH_QUALITY: + case DYN_LOW_QUALITY: + case DYN_MED_QUALITY: + case DYN_HIGH_QUALITY: return true; default: return false; @@ -105,7 +109,7 @@ void AudioResampler::init_routine() if (*endptr == '\0') { defaultQuality = (src_quality) l; ALOGD("forcing AudioResampler quality to %d", defaultQuality); - if (defaultQuality < DEFAULT_QUALITY || defaultQuality > VERY_HIGH_QUALITY) { + if (defaultQuality < DEFAULT_QUALITY || defaultQuality > DYN_HIGH_QUALITY) { defaultQuality = DEFAULT_QUALITY; } } @@ -125,6 +129,12 @@ uint32_t AudioResampler::qualityMHz(src_quality quality) return 20; case VERY_HIGH_QUALITY: return 34; + case DYN_LOW_QUALITY: + return 4; + case DYN_MED_QUALITY: + return 6; + case DYN_HIGH_QUALITY: + return 12; } } @@ -175,6 +185,15 @@ AudioResampler* 
AudioResampler::create(int bitDepth, int inChannelCount, case VERY_HIGH_QUALITY: quality = HIGH_QUALITY; break; + case DYN_LOW_QUALITY: + atFinalQuality = true; + break; + case DYN_MED_QUALITY: + quality = DYN_LOW_QUALITY; + break; + case DYN_HIGH_QUALITY: + quality = DYN_MED_QUALITY; + break; } } pthread_mutex_unlock(&mutex); @@ -200,6 +219,12 @@ AudioResampler* AudioResampler::create(int bitDepth, int inChannelCount, ALOGV("Create VERY_HIGH_QUALITY sinc Resampler = %d", quality); resampler = new AudioResamplerSinc(bitDepth, inChannelCount, sampleRate, quality); break; + case DYN_LOW_QUALITY: + case DYN_MED_QUALITY: + case DYN_HIGH_QUALITY: + ALOGV("Create dynamic Resampler = %d", quality); + resampler = new AudioResamplerDyn(bitDepth, inChannelCount, sampleRate, quality); + break; } // initialize resampler diff --git a/services/audioflinger/AudioResampler.h b/services/audioflinger/AudioResampler.h index 33e64ce..c341325 100644 --- a/services/audioflinger/AudioResampler.h +++ b/services/audioflinger/AudioResampler.h @@ -41,6 +41,9 @@ public: MED_QUALITY=2, HIGH_QUALITY=3, VERY_HIGH_QUALITY=4, + DYN_LOW_QUALITY=5, + DYN_MED_QUALITY=6, + DYN_HIGH_QUALITY=7, }; static AudioResampler* create(int bitDepth, int inChannelCount, diff --git a/services/audioflinger/AudioResamplerDyn.cpp b/services/audioflinger/AudioResamplerDyn.cpp new file mode 100644 index 0000000..1e38d3e --- /dev/null +++ b/services/audioflinger/AudioResamplerDyn.cpp @@ -0,0 +1,530 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "AudioResamplerDyn" +//#define LOG_NDEBUG 0 + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "AudioResamplerFirOps.h" // USE_NEON and USE_INLINE_ASSEMBLY defined here +#include "AudioResamplerFirProcess.h" +#include "AudioResamplerFirProcessNeon.h" +#include "AudioResamplerFirGen.h" // requires math.h +#include "AudioResamplerDyn.h" + +//#define DEBUG_RESAMPLER + +namespace android { + +// generate a unique resample type compile-time constant (constexpr) +#define RESAMPLETYPE(CHANNELS, LOCKED, STRIDE, COEFTYPE) \ + ((((CHANNELS)-1)&1) | !!(LOCKED)<<1 | (COEFTYPE)<<2 \ + | ((STRIDE)==8 ? 1 : (STRIDE)==16 ? 2 : 0)<<3) + +/* + * InBuffer is a type agnostic input buffer. + * + * Layout of the state buffer for halfNumCoefs=8. + * + * [rrrrrrppppppppnnnnnnnnrrrrrrrrrrrrrrrrrrr.... 
rrrrrrr] + * S I R + * + * S = mState + * I = mImpulse + * R = mRingFull + * p = past samples, convoluted with the (p)ositive side of sinc() + * n = future samples, convoluted with the (n)egative side of sinc() + * r = extra space for implementing the ring buffer + */ + +template +AudioResamplerDyn::InBuffer::InBuffer() + : mState(NULL), mImpulse(NULL), mRingFull(NULL), mStateSize(0) { +} + +template +AudioResamplerDyn::InBuffer::~InBuffer() { + init(); +} + +template +void AudioResamplerDyn::InBuffer::init() { + free(mState); + mState = NULL; + mImpulse = NULL; + mRingFull = NULL; + mStateSize = 0; +} + +// resizes the state buffer to accommodate the appropriate filter length +template +void AudioResamplerDyn::InBuffer::resize(int CHANNELS, int halfNumCoefs) { + // calculate desired state size + int stateSize = halfNumCoefs * CHANNELS * 2 + * kStateSizeMultipleOfFilterLength; + + // check if buffer needs resizing + if (mState + && stateSize == mStateSize + && mRingFull-mState == mStateSize-halfNumCoefs*CHANNELS) { + return; + } + + // create new buffer + TI* state = (int16_t*)memalign(32, stateSize*sizeof(*state)); + memset(state, 0, stateSize*sizeof(*state)); + + // attempt to preserve state + if (mState) { + TI* srcLo = mImpulse - halfNumCoefs*CHANNELS; + TI* srcHi = mImpulse + halfNumCoefs*CHANNELS; + TI* dst = state; + + if (srcLo < mState) { + dst += mState-srcLo; + srcLo = mState; + } + if (srcHi > mState + mStateSize) { + srcHi = mState + mStateSize; + } + memcpy(dst, srcLo, (srcHi - srcLo) * sizeof(*srcLo)); + free(mState); + } + + // set class member vars + mState = state; + mStateSize = stateSize; + mImpulse = mState + halfNumCoefs*CHANNELS; // actually one sample greater than needed + mRingFull = mState + mStateSize - halfNumCoefs*CHANNELS; +} + +// copy in the input data into the head (impulse+halfNumCoefs) of the buffer. +template +template +void AudioResamplerDyn::InBuffer::readAgain(TI*& impulse, const int halfNumCoefs, + const TI* const in, const size_t inputIndex) { + int16_t* head = impulse + halfNumCoefs*CHANNELS; + for (size_t i=0 ; i +template +void AudioResamplerDyn::InBuffer::readAdvance(TI*& impulse, const int halfNumCoefs, + const TI* const in, const size_t inputIndex) { + impulse += CHANNELS; + + if (CC_UNLIKELY(impulse >= mRingFull)) { + const size_t shiftDown = mRingFull - mState - halfNumCoefs*CHANNELS; + memcpy(mState, mState+shiftDown, halfNumCoefs*CHANNELS*2*sizeof(TI)); + impulse -= shiftDown; + } + readAgain(impulse, halfNumCoefs, in, inputIndex); +} + +void AudioResamplerDyn::Constants::set( + int L, int halfNumCoefs, int inSampleRate, int outSampleRate) +{ + int bits = 0; + int lscale = inSampleRate/outSampleRate < 2 ? 
L - 1 : + static_cast(static_cast(L)*inSampleRate/outSampleRate); + for (int i=lscale; i; ++bits, i>>=1) + ; + mL = L; + mShift = kNumPhaseBits - bits; + mHalfNumCoefs = halfNumCoefs; +} + +AudioResamplerDyn::AudioResamplerDyn(int bitDepth, + int inChannelCount, int32_t sampleRate, src_quality quality) + : AudioResampler(bitDepth, inChannelCount, sampleRate, quality), + mResampleType(0), mFilterSampleRate(0), mCoefBuffer(NULL) +{ + mVolumeSimd[0] = mVolumeSimd[1] = 0; + mConstants.set(128, 8, mSampleRate, mSampleRate); // TODO: set better +} + +AudioResamplerDyn::~AudioResamplerDyn() { + free(mCoefBuffer); +} + +void AudioResamplerDyn::init() { + mFilterSampleRate = 0; // always trigger new filter generation + mInBuffer.init(); +} + +void AudioResamplerDyn::setVolume(int16_t left, int16_t right) { + AudioResampler::setVolume(left, right); + mVolumeSimd[0] = static_cast(left)<<16; + mVolumeSimd[1] = static_cast(right)<<16; +} + +template T max(T a, T b) {return a > b ? a : b;} + +template T absdiff(T a, T b) {return a > b ? a - b : b - a;} + +template +void AudioResamplerDyn::createKaiserFir(Constants &c, double stopBandAtten, + int inSampleRate, int outSampleRate, double tbwCheat) { + T* buf = reinterpret_cast(memalign(32, (c.mL+1)*c.mHalfNumCoefs*sizeof(T))); + static const double atten = 0.9998; // to avoid ripple overflow + double fcr; + double tbw = firKaiserTbw(c.mHalfNumCoefs, stopBandAtten); + + if (inSampleRate < outSampleRate) { // upsample + fcr = max(0.5*tbwCheat - tbw/2, tbw/2); + } else { // downsample + fcr = max(0.5*tbwCheat*outSampleRate/inSampleRate - tbw/2, tbw/2); + } + // create and set filter + firKaiserGen(buf, c.mL, c.mHalfNumCoefs, stopBandAtten, fcr, atten); + c.setBuf(buf); + if (mCoefBuffer) { + free(mCoefBuffer); + } + mCoefBuffer = buf; +#ifdef DEBUG_RESAMPLER + // print basic filter stats + printf("L:%d hnc:%d stopBandAtten:%lf fcr:%lf atten:%lf tbw:%lf\n", + c.mL, c.mHalfNumCoefs, stopBandAtten, fcr, atten, tbw); + // test the filter and report results + double fp = (fcr - tbw/2)/c.mL; + double fs = (fcr + tbw/2)/c.mL; + double fmin, fmax; + testFir(buf, c.mL, c.mHalfNumCoefs, 0., fp, 100, fmin, fmax); + double d1 = (fmax - fmin)/2.; + double ap = -20.*log10(1. - d1); // passband ripple + printf("passband(%lf, %lf): %.8lf %.8lf %.8lf\n", 0., fp, (fmax + fmin)/2., d1, ap); + testFir(buf, c.mL, c.mHalfNumCoefs, fs, 0.5, 100, fmin, fmax); + double d2 = fmax; + double as = -20.*log10(d2); // stopband attenuation + printf("stopband(%lf, %lf): %.8lf %.8lf %.3lf\n", fs, 0.5, (fmax + fmin)/2., d2, as); +#endif +} + +// recursive gcd (TODO: verify tail recursion elimination should make this iterate) +static int gcd(int n, int m) { + if (m == 0) { + return n; + } + return gcd(m, n % m); +} + +static bool isClose(int32_t newSampleRate, int32_t prevSampleRate, int32_t filterSampleRate) { + int pdiff = absdiff(newSampleRate, prevSampleRate); + int adiff = absdiff(newSampleRate, filterSampleRate); + + // allow up to 6% relative change increments. 
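// Editor's note (illustrative arithmetic, not part of the change): the return
// expression below uses prevSampleRate>>4 (~6.25%) and filterSampleRate>>3 (12.5%)
// as the windows. For a filter designed at 44100 Hz:
//   44100 -> 48000: pdiff = 3900 >= 44100>>4 (2756)         -> not close, redesign the filter
//   44100 -> 45000: pdiff = 900 < 2756, adiff = 900 < 5512  -> close, keep the current filter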
+ // allow up to 12% absolute change increments (from filter design) + return pdiff < prevSampleRate>>4 && adiff < filterSampleRate>>3; +} + +void AudioResamplerDyn::setSampleRate(int32_t inSampleRate) { + if (mInSampleRate == inSampleRate) { + return; + } + int32_t oldSampleRate = mInSampleRate; + int32_t oldHalfNumCoefs = mConstants.mHalfNumCoefs; + uint32_t oldPhaseWrapLimit = mConstants.mL << mConstants.mShift; + bool useS32 = false; + + mInSampleRate = inSampleRate; + + // TODO: Add precalculated Equiripple filters + + if (!isClose(inSampleRate, oldSampleRate, mFilterSampleRate)) { + mFilterSampleRate = inSampleRate; + + // Begin Kaiser Filter computation + // + // The quantization floor for S16 is about 96db - 10*log_10(#length) + 3dB. + // Keep the stop band attenuation no greater than 84-85dB for 32 length S16 filters + // + // For s32 we keep the stop band attenuation at the same as 16b resolution, about + // 96-98dB + // + + double stopBandAtten; + double tbwCheat = 1.; // how much we "cheat" into aliasing + int halfLength; + if (getQuality() == DYN_HIGH_QUALITY) { + // 32b coefficients, 64 length + useS32 = true; + stopBandAtten = 98.; + halfLength = 32; + } else if (getQuality() == DYN_LOW_QUALITY) { + // 16b coefficients, 16-32 length + useS32 = false; + stopBandAtten = 80.; + if (mSampleRate >= inSampleRate * 2) { + halfLength = 16; + } else { + halfLength = 8; + } + if (mSampleRate >= inSampleRate) { + tbwCheat = 1.05; + } else { + tbwCheat = 1.03; + } + } else { // medium quality + // 16b coefficients, 32-64 length + useS32 = false; + stopBandAtten = 84.; + if (mSampleRate >= inSampleRate * 4) { + halfLength = 32; + } else if (mSampleRate >= inSampleRate * 2) { + halfLength = 24; + } else { + halfLength = 16; + } + if (mSampleRate >= inSampleRate) { + tbwCheat = 1.03; + } else { + tbwCheat = 1.01; + } + } + + // determine the number of polyphases in the filterbank. + // for 16b, it is desirable to have 2^(16/2) = 256 phases. + // https://ccrma.stanford.edu/~jos/resample/Relation_Interpolation_Error_Quantization.html + // + // We are a bit more lax on this. + + int phases = mSampleRate / gcd(mSampleRate, inSampleRate); + + while (phases<63) { // too few phases, allow room for interpolation + phases *= 2; // this code only needed to support dynamic rate changes + } + if (phases>=256) { // too many phases, always interpolate + phases = 127; + } + + // create the filter + mConstants.set(phases, halfLength, inSampleRate, mSampleRate); + if (useS32) { + createKaiserFir(mConstants, stopBandAtten, + inSampleRate, mSampleRate, tbwCheat); + } else { + createKaiserFir(mConstants, stopBandAtten, + inSampleRate, mSampleRate, tbwCheat); + } + } // End Kaiser filter + + // update phase and state based on the new filter. + const Constants& c(mConstants); + mInBuffer.resize(mChannelCount, c.mHalfNumCoefs); + const uint32_t phaseWrapLimit = c.mL << c.mShift; + // try to preserve as much of the phase fraction as possible for on-the-fly changes + mPhaseFraction = static_cast(mPhaseFraction) + * phaseWrapLimit / oldPhaseWrapLimit; + mPhaseFraction %= phaseWrapLimit; // should not do anything, but just in case. + mPhaseIncrement = static_cast(static_cast(phaseWrapLimit) + * inSampleRate / mSampleRate); + + // determine which resampler to use + // check if locked phase (works only if mPhaseIncrement has no "fractional phase bits") + int locked = (mPhaseIncrement << (sizeof(mPhaseIncrement)*8 - c.mShift)) == 0; + int stride = (c.mHalfNumCoefs&7)==0 ? 16 : (c.mHalfNumCoefs&3)==0 ? 
8 : 2; + if (locked) { + mPhaseFraction = mPhaseFraction >> c.mShift << c.mShift; // remove fractional phase + } + if (!USE_NEON) { + stride = 2; // C version only + } + // TODO: Remove this for testing + //stride = 2; + mResampleType = RESAMPLETYPE(mChannelCount, locked, stride, !!useS32); +#ifdef DEBUG_RESAMPLER + printf("channels:%d %s stride:%d %s coef:%d shift:%d\n", + mChannelCount, locked ? "locked" : "interpolated", + stride, useS32 ? "S32" : "S16", 2*c.mHalfNumCoefs, c.mShift); +#endif +} + +void AudioResamplerDyn::resample(int32_t* out, size_t outFrameCount, + AudioBufferProvider* provider) +{ + // TODO: + // 24 cases - this perhaps can be reduced later, as testing might take too long + switch (mResampleType) { + + // stride 16 (stride 2 for machines that do not support NEON) + case RESAMPLETYPE(1, true, 16, 0): + return resample<1, true, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider); + case RESAMPLETYPE(2, true, 16, 0): + return resample<2, true, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider); + case RESAMPLETYPE(1, false, 16, 0): + return resample<1, false, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider); + case RESAMPLETYPE(2, false, 16, 0): + return resample<2, false, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider); + case RESAMPLETYPE(1, true, 16, 1): + return resample<1, true, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider); + case RESAMPLETYPE(2, true, 16, 1): + return resample<2, true, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider); + case RESAMPLETYPE(1, false, 16, 1): + return resample<1, false, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider); + case RESAMPLETYPE(2, false, 16, 1): + return resample<2, false, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +#if 0 + // TODO: Remove these? 
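// Editor's note: the case labels above come from the RESAMPLETYPE() macro near the top
// of this file, which packs, from the LSB:
//   bit 0   : CHANNELS-1  (0 = mono, 1 = stereo)
//   bit 1   : LOCKED      (1 = locked phase, 0 = interpolated)
//   bit 2   : COEFTYPE    (0 = int16_t coefs, 1 = int32_t coefs)
//   bits 3+ : stride code (1 for stride 8, 2 for stride 16, 0 for stride 2)
// e.g. RESAMPLETYPE(2, true, 16, 0) == 1 | 1<<1 | 0<<2 | 2<<3 == 0x13.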
+ // stride 8 + case RESAMPLETYPE(1, true, 8, 0): + return resample<1, true, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider); + case RESAMPLETYPE(2, true, 8, 0): + return resample<2, true, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider); + case RESAMPLETYPE(1, false, 8, 0): + return resample<1, false, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider); + case RESAMPLETYPE(2, false, 8, 0): + return resample<2, false, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider); + case RESAMPLETYPE(1, true, 8, 1): + return resample<1, true, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider); + case RESAMPLETYPE(2, true, 8, 1): + return resample<2, true, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider); + case RESAMPLETYPE(1, false, 8, 1): + return resample<1, false, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider); + case RESAMPLETYPE(2, false, 8, 1): + return resample<2, false, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider); + // stride 2 (can handle any filter length) + case RESAMPLETYPE(1, true, 2, 0): + return resample<1, true, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider); + case RESAMPLETYPE(2, true, 2, 0): + return resample<2, true, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider); + case RESAMPLETYPE(1, false, 2, 0): + return resample<1, false, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider); + case RESAMPLETYPE(2, false, 2, 0): + return resample<2, false, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider); + case RESAMPLETYPE(1, true, 2, 1): + return resample<1, true, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider); + case RESAMPLETYPE(2, true, 2, 1): + return resample<2, true, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider); + case RESAMPLETYPE(1, false, 2, 1): + return resample<1, false, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider); + case RESAMPLETYPE(2, false, 2, 1): + return resample<2, false, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +#endif + default: + ; // error + } +} + +template +void AudioResamplerDyn::resample(int32_t* out, size_t outFrameCount, + const TC* const coefs, AudioBufferProvider* provider) +{ + const Constants& c(mConstants); + int16_t* impulse = mInBuffer.getImpulse(); + size_t inputIndex = mInputIndex; + uint32_t phaseFraction = mPhaseFraction; + const uint32_t phaseIncrement = mPhaseIncrement; + size_t outputIndex = 0; + size_t outputSampleCount = outFrameCount * 2; // stereo output + size_t inFrameCount = (outFrameCount*mInSampleRate)/mSampleRate; + const uint32_t phaseWrapLimit = c.mL << c.mShift; + + // NOTE: be very careful when modifying the code here. register + // pressure is very high and a small change might cause the compiler + // to generate far less efficient code. + // Always sanity check the result with objdump or test-resample. + + // the following logic is a bit convoluted to keep the main processing loop + // as tight as possible with register allocation. 
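// Editor's summary of the loop below (descriptive only): the outer loop runs until
// outputSampleCount samples (two per output frame) are produced, or until the provider
// runs dry (getNextBuffer() returning NULL exits via resample_exit). The inner loop
// calls fir() once per output frame, advances phaseFraction by phaseIncrement, and
// every wrap past phaseWrapLimit consumes one input frame through readAdvance(),
// breaking out when the current input buffer is drained.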
+ while (outputIndex < outputSampleCount) { + // buffer is empty, fetch a new one + while (mBuffer.frameCount == 0) { + mBuffer.frameCount = inFrameCount; + provider->getNextBuffer(&mBuffer, + calculateOutputPTS(outputIndex / 2)); + if (mBuffer.raw == NULL) { + goto resample_exit; + } + if (phaseFraction >= phaseWrapLimit) { // read in data + mInBuffer.readAdvance( + impulse, c.mHalfNumCoefs, mBuffer.i16, inputIndex); + phaseFraction -= phaseWrapLimit; + while (phaseFraction >= phaseWrapLimit) { + inputIndex++; + if (inputIndex >= mBuffer.frameCount) { + inputIndex -= mBuffer.frameCount; + provider->releaseBuffer(&mBuffer); + break; + } + mInBuffer.readAdvance( + impulse, c.mHalfNumCoefs, mBuffer.i16, inputIndex); + phaseFraction -= phaseWrapLimit; + } + } + } + const int16_t* const in = mBuffer.i16; + const size_t frameCount = mBuffer.frameCount; + const int coefShift = c.mShift; + const int halfNumCoefs = c.mHalfNumCoefs; + const int32_t* const volumeSimd = mVolumeSimd; + + // reread the last input in. + mInBuffer.readAgain(impulse, halfNumCoefs, in, inputIndex); + + // main processing loop + while (CC_LIKELY(outputIndex < outputSampleCount)) { + // caution: fir() is inlined and may be large. + // output will be loaded with the appropriate values + // + // from the input samples in impulse[-halfNumCoefs+1]... impulse[halfNumCoefs] + // from the polyphase filter of (phaseFraction / phaseWrapLimit) in coefs. + // + fir( + &out[outputIndex], + phaseFraction, phaseWrapLimit, + coefShift, halfNumCoefs, coefs, + impulse, volumeSimd); + outputIndex += 2; + + phaseFraction += phaseIncrement; + while (phaseFraction >= phaseWrapLimit) { + inputIndex++; + if (inputIndex >= frameCount) { + goto done; // need a new buffer + } + mInBuffer.readAdvance(impulse, halfNumCoefs, in, inputIndex); + phaseFraction -= phaseWrapLimit; + } + } +done: + // often arrives here when input buffer runs out + if (inputIndex >= frameCount) { + inputIndex -= frameCount; + provider->releaseBuffer(&mBuffer); + // mBuffer.frameCount MUST be zero here. + } + } + +resample_exit: + mInBuffer.setImpulse(impulse); + mInputIndex = inputIndex; + mPhaseFraction = phaseFraction; +} + +// ---------------------------------------------------------------------------- +}; // namespace android diff --git a/services/audioflinger/AudioResamplerDyn.h b/services/audioflinger/AudioResamplerDyn.h new file mode 100644 index 0000000..85a01ab --- /dev/null +++ b/services/audioflinger/AudioResamplerDyn.h @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ANDROID_AUDIO_RESAMPLER_DYN_H +#define ANDROID_AUDIO_RESAMPLER_DYN_H + +#include +#include +#include + +#include "AudioResampler.h" + +namespace android { + +class AudioResamplerDyn: public AudioResampler { +public: + AudioResamplerDyn(int bitDepth, int inChannelCount, int32_t sampleRate, + src_quality quality); + + virtual ~AudioResamplerDyn(); + + virtual void init(); + + virtual void setSampleRate(int32_t inSampleRate); + + virtual void setVolume(int16_t left, int16_t right); + + virtual void resample(int32_t* out, size_t outFrameCount, + AudioBufferProvider* provider); + +private: + + class Constants { // stores the filter constants. + public: + Constants() : + mL(0), mShift(0), mHalfNumCoefs(0), mFirCoefsS16(NULL) + {} + void set(int L, int halfNumCoefs, + int inSampleRate, int outSampleRate); + inline void setBuf(int16_t* buf) { + mFirCoefsS16 = buf; + } + inline void setBuf(int32_t* buf) { + mFirCoefsS32 = buf; + } + + int mL; // interpolation phases in the filter. + int mShift; // right shift to get polyphase index + unsigned int mHalfNumCoefs; // filter half #coefs + union { // polyphase filter bank + const int16_t* mFirCoefsS16; + const int32_t* mFirCoefsS32; + }; + }; + + // Input buffer management for a given input type TI, now (int16_t) + // Is agnostic of the actual type, can work with int32_t and float. + template + class InBuffer { + public: + InBuffer(); + ~InBuffer(); + void init(); + void resize(int CHANNELS, int halfNumCoefs); + + // used for direct management of the mImpulse pointer + inline TI* getImpulse() { + return mImpulse; + } + inline void setImpulse(TI *impulse) { + mImpulse = impulse; + } + template + inline void readAgain(TI*& impulse, const int halfNumCoefs, + const TI* const in, const size_t inputIndex); + template + inline void readAdvance(TI*& impulse, const int halfNumCoefs, + const TI* const in, const size_t inputIndex); + + private: + // tuning parameter guidelines: 2 <= multiple <= 8 + static const int kStateSizeMultipleOfFilterLength = 4; + + TI* mState; // base pointer for the input buffer storage + TI* mImpulse; // current location of the impulse response (centered) + TI* mRingFull; // mState <= mImpulse < mRingFull + // in general, mRingFull = mState + mStateSize - halfNumCoefs*CHANNELS. + size_t mStateSize; // in units of TI. + }; + + template + void resample(int32_t* out, size_t outFrameCount, + const TC* const coefs, AudioBufferProvider* provider); + + template + void createKaiserFir(Constants &c, double stopBandAtten, + int inSampleRate, int outSampleRate, double tbwCheat); + + InBuffer mInBuffer; + Constants mConstants; // current set of coefficient parameters + int32_t __attribute__ ((aligned (8))) mVolumeSimd[2]; + int32_t mResampleType; // contains the resample type. + int32_t mFilterSampleRate; // designed sample rate for the filter + void* mCoefBuffer; // if a filter is created, this is not null +}; + +// ---------------------------------------------------------------------------- +}; // namespace android + +#endif /*ANDROID_AUDIO_RESAMPLER_DYN_H*/ diff --git a/services/audioflinger/AudioResamplerFirGen.h b/services/audioflinger/AudioResamplerFirGen.h new file mode 100644 index 0000000..fa6ee3e --- /dev/null +++ b/services/audioflinger/AudioResamplerFirGen.h @@ -0,0 +1,307 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ANDROID_AUDIO_RESAMPLER_FIR_GEN_H +#define ANDROID_AUDIO_RESAMPLER_FIR_GEN_H + +namespace android { + +/* + * Sinc function is the traditional variant. + * + * TODO: Investigate optimizations (regular sampling grid, NEON vector accelerations) + * TODO: Remove comparison at 0 and trap at a higher level. + * + */ + +static inline double sinc(double x) { + if (fabs(x) < FLT_MIN) { + return 1.; + } + return sin(x) / x; +} + +static inline double sqr(double x) { + return x * x; +} + +/* + * rounds a double to the nearest integer for FIR coefficients. + * + * One variant uses noise shaping, which must keep error history + * to work (the err parameter, initialized to 0). + * The other variant is a non-noise shaped version for + * S32 coefficients (noise shaping doesn't gain much). + * + * Caution: No bounds saturation is applied, but isn't needed in + * this case. + * + * @param x is the value to round. + * + * @param maxval is the maximum integer scale factor expressed as an int64 (for headroom). + * Typically this may be the maximum positive integer+1 (using the fact that double precision + * FIR coefficients generated here are never that close to 1.0 to pose an overflow condition). + * + * @param err is the previous error (actual - rounded) for the previous rounding op. + * + */ + +static inline int64_t toint(double x, int64_t maxval, double& err) { + double val = x * maxval; + double ival = floor(val + 0.5 + err*0.17); + err = val - ival; + return static_cast(ival); +} + +static inline int64_t toint(double x, int64_t maxval) { + return static_cast(floor(x * maxval + 0.5)); +} + +/* + * Modified Bessel function of the first kind + * http://en.wikipedia.org/wiki/Bessel_function + * + * The formulas are taken from Abramowitz and Stegun: + * + * http://people.math.sfu.ca/~cbm/aands/page_375.htm + * http://people.math.sfu.ca/~cbm/aands/page_378.htm + * + * http://dlmf.nist.gov/10.25 + * http://dlmf.nist.gov/10.40 + * + * Note we assume x is nonnegative (the function is symmetric, + * pass in the absolute value as needed). + * + * Constants are compile time derived with templates I0Term<> and + * I0ATerm<> to the precision of the compiler. The series can be expanded + * to any precision needed, but currently set around 24b precision. + * + * We use a bit of template math here, constexpr would probably be + * more appropriate for a C++11 compiler. + * + */ + +template +struct I0Term { + static const double value = I0Term::value/ (4. * N * N); +}; + +template <> +struct I0Term<0> { + static const double value = 1.; +}; + +template +struct I0ATerm { + static const double value = I0ATerm::value * (2.*N-1.) * (2.*N-1.) / (8. * N); +}; + +template <> +struct I0ATerm<0> { // 1/sqrt(2*PI); + static const double value = 0.398942280401432677939946059934381868475858631164934657665925; +}; + +static inline double I0(double x) { + if (x < 3.75) { // TODO: Estrin's method instead of Horner's method? 
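// Editor's note (worked series terms): with t = x*x, the Horner polynomial below sums
//   I0(x) = sum_k (x/2)^(2k) / (k!)^2 = sum_k t^k / (4^k * (k!)^2),
// so I0Term<N>::value is 1/(4^N * (N!)^2), e.g. I0Term<1> = 1/4, I0Term<2> = 1/64,
// I0Term<3> = 1/2304.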
+ x *= x; + return I0Term<0>::value + x*( + I0Term<1>::value + x*( + I0Term<2>::value + x*( + I0Term<3>::value + x*( + I0Term<4>::value + x*( + I0Term<5>::value + x*( + I0Term<6>::value)))))); // e < 1.6e-7 + } + // a bit ugly here - perhaps we expand the top series + // to permit computation to x < 20 (a reasonable range) + double y = 1./x; + return exp(x) * sqrt(y) * ( + // note: reciprocal squareroot may be easier! + // http://en.wikipedia.org/wiki/Fast_inverse_square_root + I0ATerm<0>::value + y*( + I0ATerm<1>::value + y*( + I0ATerm<2>::value + y*( + I0ATerm<3>::value + y*( + I0ATerm<4>::value + y*( + I0ATerm<5>::value + y*( + I0ATerm<6>::value + y*( + I0ATerm<7>::value + y*( + I0ATerm<8>::value))))))))); // (... e) < 1.9e-7 +} + +/* + * calculates the transition bandwidth for a Kaiser filter + * + * Formula 3.2.8, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48 + * + * @param halfNumCoef is half the number of coefficients per filter phase. + * @param stopBandAtten is the stop band attenuation desired. + * @return the transition bandwidth in normalized frequency (0 <= f <= 0.5) + */ +static inline double firKaiserTbw(int halfNumCoef, double stopBandAtten) { + return (stopBandAtten - 7.95)/(2.*14.36*halfNumCoef); +} + +/* + * calculates the fir transfer response. + * + * calculates the transfer coefficient H(w) for 0 <= w <= PI. + * Be careful be careful to consider the fact that this is an interpolated filter + * of length L, so normalizing H(w)/L is probably what you expect. + */ +template +static inline double firTransfer(const T* coef, int L, int halfNumCoef, double w) { + double accum = static_cast(coef[0])*0.5; + coef += halfNumCoef; // skip first row. + for (int i=1 ; i<=L ; ++i) { + for (int j=0, ix=i ; j(*coef++); + } + } + return accum*2.; +} + +/* + * returns the minimum and maximum |H(f)| bounds + * + * @param coef is the designed polyphase filter banks + * + * @param L is the number of phases (for interpolation) + * + * @param halfNumCoef should be half the number of coefficients for a single + * polyphase. + * + * @param fstart is the normalized frequency start. + * + * @param fend is the normalized frequency end. + * + * @param steps is the number of steps to take (sampling) between frequency start and end + * + * @param firMin returns the minimum transfer |H(f)| found + * + * @param firMax returns the maximum transfer |H(f)| found + * + * 0 <= f <= 0.5. + * This is used to test passband and stopband performance. + */ +template +static void testFir(const T* coef, int L, int halfNumCoef, + double fstart, double fend, int steps, double &firMin, double &firMax) { + double wstart = fstart*(2.*M_PI); + double wend = fend*(2.*M_PI); + double wstep = (wend - wstart)/steps; + double fmax, fmin; + double trf = firTransfer(coef, L, halfNumCoef, wstart); + if (trf<0) { + trf = -trf; + } + fmin = fmax = trf; + wstart += wstep; + for (int i=1; ifmax) { + fmax = trf; + } + else if (trf50dB. + * + * @param fcr is cutoff frequency/sampling rate (<0.5). At this point, the energy + * should be 6dB less. (fcr is where the amplitude drops by half). Use the + * firKaiserTbw() to calculate the transition bandwidth. fcr is the midpoint + * between the stop band and the pass band (fstop+fpass)/2. + * + * @param atten is the attenuation (generally slightly less than 1). 
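 *
 * Editor's example (values follow the DYN_MED_QUALITY settings in setSampleRate();
 * illustrative only): for stopBandAtten = 84 dB and halfNumCoef = 16,
 *   beta = 0.1102*(84 - 8.7)            ~= 8.30
 *   tbw  = (84 - 7.95)/(2*14.36*16)     ~= 0.166   (from firKaiserTbw())
 * and for upsampling 44.1 kHz -> 48 kHz with tbwCheat = 1.03,
 *   fcr  = max(0.5*1.03 - tbw/2, tbw/2) ~= 0.432.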
+ */ + +template +static inline void firKaiserGen(T* coef, int L, int halfNumCoef, + double stopBandAtten, double fcr, double atten) { + // + // Formula 3.2.5, 3.2.7, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48 + // + // See also: http://melodi.ee.washington.edu/courses/ee518/notes/lec17.pdf + // + // Kaiser window and beta parameter + // + // | 0.1102*(A - 8.7) A > 50 + // beta = | 0.5842*(A - 21)^0.4 + 0.07886*(A - 21) 21 <= A <= 50 + // | 0. A < 21 + // + // with A is the desired stop-band attenuation in dBFS + // + // 30 dB 2.210 + // 40 dB 3.384 + // 50 dB 4.538 + // 60 dB 5.658 + // 70 dB 6.764 + // 80 dB 7.865 + // 90 dB 8.960 + // 100 dB 10.056 + + const int N = L * halfNumCoef; // non-negative half + const double beta = 0.1102 * (stopBandAtten - 8.7); // >= 50dB always + const double yscale = 2. * atten * fcr / I0(beta); + const double xstep = 2. * M_PI * fcr / L; + const double xfrac = 1. / N; + double err = 0; // for noise shaping on int16_t coefficients + for (int i=0 ; i<=L ; ++i) { // generate an extra set of coefs for interpolation + for (int j=0, ix=i ; j::value) { // int16_t needs noise shaping + *coef++ = static_cast(toint(y, 1ULL<<(sizeof(T)*8-1), err)); + } else { + *coef++ = static_cast(toint(y, 1ULL<<(sizeof(T)*8-1))); + } + } + } +} + +}; // namespace android + +#endif /*ANDROID_AUDIO_RESAMPLER_FIR_GEN_H*/ diff --git a/services/audioflinger/AudioResamplerFirOps.h b/services/audioflinger/AudioResamplerFirOps.h new file mode 100644 index 0000000..bf2163f --- /dev/null +++ b/services/audioflinger/AudioResamplerFirOps.h @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ANDROID_AUDIO_RESAMPLER_FIR_OPS_H +#define ANDROID_AUDIO_RESAMPLER_FIR_OPS_H + +namespace android { + +#if defined(__arm__) && !defined(__thumb__) +#define USE_INLINE_ASSEMBLY (true) +#else +#define USE_INLINE_ASSEMBLY (false) +#endif + +#if USE_INLINE_ASSEMBLY && defined(__ARM_NEON__) +#define USE_NEON (true) +#include +#else +#define USE_NEON (false) +#endif + +template +struct is_same +{ + static const bool value = false; +}; + +template +struct is_same // partial specialization +{ + static const bool value = true; +}; + +static inline +int32_t mulRL(int left, int32_t in, uint32_t vRL) +{ +#if USE_INLINE_ASSEMBLY + int32_t out; + if (left) { + asm( "smultb %[out], %[in], %[vRL] \n" + : [out]"=r"(out) + : [in]"%r"(in), [vRL]"r"(vRL) + : ); + } else { + asm( "smultt %[out], %[in], %[vRL] \n" + : [out]"=r"(out) + : [in]"%r"(in), [vRL]"r"(vRL) + : ); + } + return out; +#else + int16_t v = left ? 
static_cast(vRL) : static_cast(vRL>>16); + return static_cast((static_cast(in) * v) >> 16); +#endif +} + +static inline +int32_t mulAdd(int16_t in, int16_t v, int32_t a) +{ +#if USE_INLINE_ASSEMBLY + int32_t out; + asm( "smlabb %[out], %[v], %[in], %[a] \n" + : [out]"=r"(out) + : [in]"%r"(in), [v]"r"(v), [a]"r"(a) + : ); + return out; +#else + return a + v * in; +#endif +} + +static inline +int32_t mulAdd(int16_t in, int32_t v, int32_t a) +{ +#if USE_INLINE_ASSEMBLY + int32_t out; + asm( "smlawb %[out], %[v], %[in], %[a] \n" + : [out]"=r"(out) + : [in]"%r"(in), [v]"r"(v), [a]"r"(a) + : ); + return out; +#else + return a + static_cast((static_cast(v) * in) >> 16); +#endif +} + +static inline +int32_t mulAdd(int32_t in, int32_t v, int32_t a) +{ +#if USE_INLINE_ASSEMBLY + int32_t out; + asm( "smmla %[out], %[v], %[in], %[a] \n" + : [out]"=r"(out) + : [in]"%r"(in), [v]"r"(v), [a]"r"(a) + : ); + return out; +#else + return a + static_cast((static_cast(v) * in) >> 32); +#endif +} + +static inline +int32_t mulAddRL(int left, uint32_t inRL, int16_t v, int32_t a) +{ +#if USE_INLINE_ASSEMBLY + int32_t out; + if (left) { + asm( "smlabb %[out], %[v], %[inRL], %[a] \n" + : [out]"=r"(out) + : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a) + : ); + } else { + asm( "smlabt %[out], %[v], %[inRL], %[a] \n" + : [out]"=r"(out) + : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a) + : ); + } + return out; +#else + int16_t s = left ? static_cast(inRL) : static_cast(inRL>>16); + return a + v * s; +#endif +} + +static inline +int32_t mulAddRL(int left, uint32_t inRL, int32_t v, int32_t a) +{ +#if USE_INLINE_ASSEMBLY + int32_t out; + if (left) { + asm( "smlawb %[out], %[v], %[inRL], %[a] \n" + : [out]"=r"(out) + : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a) + : ); + } else { + asm( "smlawt %[out], %[v], %[inRL], %[a] \n" + : [out]"=r"(out) + : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a) + : ); + } + return out; +#else + int16_t s = left ? static_cast(inRL) : static_cast(inRL>>16); + return a + static_cast((static_cast(v) * s) >> 16); +#endif +} + +}; // namespace android + +#endif /*ANDROID_AUDIO_RESAMPLER_FIR_OPS_H*/ diff --git a/services/audioflinger/AudioResamplerFirProcess.h b/services/audioflinger/AudioResamplerFirProcess.h new file mode 100644 index 0000000..38e387c --- /dev/null +++ b/services/audioflinger/AudioResamplerFirProcess.h @@ -0,0 +1,256 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H +#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H + +namespace android { + +// depends on AudioResamplerFirOps.h + +template +static inline +void mac( + int32_t& l, int32_t& r, + const TC coef, + const int16_t* samples) +{ + if (CHANNELS == 2) { + uint32_t rl = *reinterpret_cast(samples); + l = mulAddRL(1, rl, coef, l); + r = mulAddRL(0, rl, coef, r); + } else { + r = l = mulAdd(samples[0], coef, l); + } +} + +template +static inline +void interpolate( + int32_t& l, int32_t& r, + const TC coef_0, const TC coef_1, + const int16_t lerp, const int16_t* samples) +{ + TC sinc; + + if (is_same::value) { + sinc = (lerp * ((coef_1-coef_0)<<1)>>16) + coef_0; + } else { + sinc = mulAdd(lerp, (coef_1-coef_0)<<1, coef_0); + } + if (CHANNELS == 2) { + uint32_t rl = *reinterpret_cast(samples); + l = mulAddRL(1, rl, sinc, l); + r = mulAddRL(0, rl, sinc, r); + } else { + r = l = mulAdd(samples[0], sinc, l); + } +} + +/* + * Calculates a single output sample (two stereo frames). + * + * This function computes both the positive half FIR dot product and + * the negative half FIR dot product, accumulates, and then applies the volume. + * + * This is a locked phase filter (it does not compute the interpolation). + * + * Use fir() to compute the proper coefficient pointers for a polyphase + * filter bank. + */ + +template +static inline +void ProcessL(int32_t* const out, + int count, + const TC* coefsP, + const TC* coefsN, + const int16_t* sP, + const int16_t* sN, + const int32_t* const volumeLR) +{ + int32_t l = 0; + int32_t r = 0; + do { + mac(l, r, *coefsP++, sP); + sP -= CHANNELS; + mac(l, r, *coefsN++, sN); + sN += CHANNELS; + } while (--count > 0); + out[0] += 2 * mulRL(0, l, volumeLR[0]); // Note: only use top 16b + out[1] += 2 * mulRL(0, r, volumeLR[1]); // Note: only use top 16b +} + +/* + * Calculates a single output sample (two stereo frames) interpolating phase. + * + * This function computes both the positive half FIR dot product and + * the negative half FIR dot product, accumulates, and then applies the volume. + * + * This is an interpolated phase filter. + * + * Use fir() to compute the proper coefficient pointers for a polyphase + * filter bank. + */ + +template +static inline +void Process(int32_t* const out, + int count, + const TC* coefsP, + const TC* coefsN, + const TC* coefsP1, + const TC* coefsN1, + const int16_t* sP, + const int16_t* sN, + uint32_t lerpP, + const int32_t* const volumeLR) +{ + (void) coefsP1; // suppress unused parameter warning + (void) coefsN1; + if (sizeof(*coefsP)==4) { + lerpP >>= 16; // ensure lerpP is 16b + } + int32_t l = 0; + int32_t r = 0; + for (size_t i = 0; i < count; ++i) { + interpolate(l, r, coefsP[0], coefsP[count], lerpP, sP); + coefsP++; + sP -= CHANNELS; + interpolate(l, r, coefsN[count], coefsN[0], lerpP, sN); + coefsN++; + sN += CHANNELS; + } + out[0] += 2 * mulRL(0, l, volumeLR[0]); // Note: only use top 16b + out[1] += 2 * mulRL(0, r, volumeLR[1]); // Note: only use top 16b +} + +/* + * Calculates a single output sample (two stereo frames) from input sample pointer. + * + * This sets up the params for the accelerated Process() and ProcessL() + * functions to do the appropriate dot products. + * + * @param out should point to the output buffer with at least enough space for 2 output frames. + * + * @param phase is the fractional distance between input samples for interpolation: + * phase >= 0 && phase < phaseWrapLimit. It can be thought of as a rational fraction + * of phase/phaseWrapLimit. 
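 *
 * Editor's example (illustrative figures): with 128 polyphases and coefShift = 23,
 * phaseWrapLimit = 128 << 23 = 1 << 30; the top 7 bits of phase then select the
 * polyphase (phase >> coefShift) and the low 23 bits are the interpolation fraction,
 * as detailed in the parameter descriptions below.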
+ * + * @param phaseWrapLimit is #polyphases<>coefShift). + * + * @param coefShift gives the bit alignment of the polyphase index in the phase parameter. + * + * @param halfNumCoefs is the half the number of coefficients per polyphase filter. Since the + * overall filterbank is odd-length symmetric, only halfNumCoefs need be stored. + * + * @param coefs is the polyphase filter bank, starting at from polyphase index 0, and ranging to + * and including the #polyphases. Each polyphase of the filter has half-length halfNumCoefs + * (due to symmetry). The total size of the filter bank in coefficients is + * (#polyphases+1)*halfNumCoefs. + * + * The filter bank coefs should be aligned to a minimum of 16 bytes (preferrably to cache line). + * + * The coefs should be attenuated (to compensate for passband ripple) + * if storing back into the native format. + * + * @param samples are unaligned input samples. The position is in the "middle" of the + * sample array with respect to the FIR filter: + * the negative half of the filter is dot product from samples+1 to samples+halfNumCoefs; + * the positive half of the filter is dot product from samples to samples-halfNumCoefs+1. + * + * @param volumeLR is a pointer to an array of two 32 bit volume values, one per stereo channel, + * expressed as a S32 integer. A negative value inverts the channel 180 degrees. + * The pointer volumeLR should be aligned to a minimum of 8 bytes. + * A typical value for volume is 0x1000 to align to a unity gain output of 20.12. + * + * In between calls to filterCoefficient, the phase is incremented by phaseIncrement, where + * phaseIncrement is calculated as inputSampling * phaseWrapLimit / outputSampling. + * + * The filter polyphase index is given by indexP = phase >> coefShift. Due to + * odd length symmetric filter, the polyphase index of the negative half depends on + * whether interpolation is used. + * + * The fractional siting between the polyphase indices is given by the bits below coefShift: + * + * lerpP = phase << 32 - coefShift >> 1; // for 32 bit unsigned phase multiply + * lerpP = phase << 32 - coefShift >> 17; // for 16 bit unsigned phase multiply + * + * For integer types, this is expressed as: + * + * lerpP = phase << sizeof(phase)*8 - coefShift + * >> (sizeof(phase)-sizeof(*coefs))*8 + 1; + * + */ + +template +static inline +void fir(int32_t* const out, + const uint32_t phase, const uint32_t phaseWrapLimit, + const int coefShift, const int halfNumCoefs, const TC* const coefs, + const int16_t* const samples, const int32_t* const volumeLR) +{ + // NOTE: be very careful when modifying the code here. register + // pressure is very high and a small change might cause the compiler + // to generate far less efficient code. + // Always sanity check the result with objdump or test-resample. + + if (LOCKED) { + // locked polyphase (no interpolation) + // Compute the polyphase filter index on the positive and negative side. + uint32_t indexP = phase >> coefShift; + uint32_t indexN = (phaseWrapLimit - phase) >> coefShift; + const TC* coefsP = coefs + indexP*halfNumCoefs; + const TC* coefsN = coefs + indexN*halfNumCoefs; + const int16_t* sP = samples; + const int16_t* sN = samples + CHANNELS; + + // dot product filter. + ProcessL(out, + halfNumCoefs, coefsP, coefsN, sP, sN, volumeLR); + } else { + // interpolated polyphase + // Compute the polyphase filter index on the positive and negative side. + uint32_t indexP = phase >> coefShift; + uint32_t indexN = (phaseWrapLimit - phase - 1) >> coefShift; // one's complement. 
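// Editor's note on the lerpP computation a few lines below (coefShift == 23 is assumed
// here purely for concreteness): with a 32-bit phase and int16_t coefficients,
//   lerpP = phase << (32 - 23) >> ((4 - 2)*8 + 1) == phase << 9 >> 17,
// the up-shift discards the polyphase index bits and the down-shift keeps the top 15
// fraction bits as a positive Q15 value; with int32_t coefficients the down-shift is
// only 1 bit, leaving a 31-bit fraction (the generic Process() then shifts it back
// down to 16 bits).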
+ const TC* coefsP = coefs + indexP*halfNumCoefs; + const TC* coefsN = coefs + indexN*halfNumCoefs; + const TC* coefsP1 = coefsP + halfNumCoefs; + const TC* coefsN1 = coefsN + halfNumCoefs; + const int16_t* sP = samples; + const int16_t* sN = samples + CHANNELS; + + // Interpolation fraction lerpP derived by shifting all the way up and down + // to clear the appropriate bits and align to the appropriate level + // for the integer multiply. The constants should resolve in compile time. + // + // The interpolated filter coefficient is derived as follows for the pos/neg half: + // + // interpolated[P] = index[P]*lerpP + index[P+1]*(1-lerpP) + // interpolated[N] = index[N+1]*lerpP + index[N]*(1-lerpP) + uint32_t lerpP = phase << (sizeof(phase)*8 - coefShift) + >> ((sizeof(phase)-sizeof(*coefs))*8 + 1); + + // on-the-fly interpolated dot product filter + Process(out, + halfNumCoefs, coefsP, coefsN, coefsP1, coefsN1, sP, sN, lerpP, volumeLR); + } +} + +}; // namespace android + +#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H*/ diff --git a/services/audioflinger/AudioResamplerFirProcessNeon.h b/services/audioflinger/AudioResamplerFirProcessNeon.h new file mode 100644 index 0000000..f311cef --- /dev/null +++ b/services/audioflinger/AudioResamplerFirProcessNeon.h @@ -0,0 +1,1149 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H +#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H + +namespace android { + +// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h + +#if USE_NEON +// +// NEON specializations are enabled for Process() and ProcessL() +// +// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary) +// and looping stride 16 (or vice versa). This has some polyphase coef data alignment +// issues with S16 coefs. Consider this later. + +// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out. 
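As an editorial aid, here is a rough scalar C++ sketch of what the MONO accumulate macro
computes; the helper name is hypothetical and the NEON wrap/saturation corner cases of
vpadd/vqrdmulh are glossed over.

#include <stdint.h>

// Illustrative only: scalar model of ASSEMBLY_ACCUMULATE_MONO (not a drop-in replacement).
static inline void accumulateMonoSketch(int32_t* out /* L, R */,
                                        const int32_t partial[4], // the four lanes of q0
                                        const int32_t* volumeLR)
{
    int64_t sum = (int64_t)partial[0] + partial[1] + partial[2] + partial[3]; // vpadd folds q0
    for (int ch = 0; ch < 2; ++ch) {          // the mono sum is replicated to both channels
        int64_t scaled = (sum * volumeLR[ch] + (1LL << 30)) >> 31;            // vqrdmulh.s32
        int64_t acc = (int64_t)out[ch] + scaled;                              // vqadd.s32
        if (acc > INT32_MAX) acc = INT32_MAX;                                 // saturate
        if (acc < INT32_MIN) acc = INT32_MIN;
        out[ch] = (int32_t)acc;
    }
}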
+#define ASSEMBLY_ACCUMULATE_MONO \ + "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes */\ + "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output */\ + "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums */\ + "vpadd.s32 d0, d0, d0 \n"/* (1+4d) and replicate L/R */\ + "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume */\ + "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating) */\ + "vst1.s32 {d3}, %[out] \n"/* (2+2d) store result */ + +#define ASSEMBLY_ACCUMULATE_STEREO \ + "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes*/\ + "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output*/\ + "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums from q0*/\ + "vpadd.s32 d8, d8, d9 \n"/* (1) add all 4 partial sums from q4*/\ + "vpadd.s32 d0, d0, d8 \n"/* (1+4d) combine into L/R*/\ + "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume*/\ + "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating)*/\ + "vst1.s32 {d3}, %[out] \n"/* (2+2d)store result*/ + +template <> +inline void ProcessL<1, 16>(int32_t* const out, + int count, + const int16_t* coefsP, + const int16_t* coefsN, + const int16_t* sP, + const int16_t* sN, + const int32_t* const volumeLR) +{ + const int CHANNELS = 1; // template specialization does not preserve params + const int STRIDE = 16; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 + + "1: \n" + + "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples + "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples + "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs + "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs + + "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 + + // reordering the vmal to do d6, d7 before d4, d5 is slower(?) + "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply (reversed)samples by coef + "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed)samples by coef + "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples + "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples + + // moving these ARM instructions before neon above seems to be slower + "subs %[count], %[count], #8 \n"// (1) update loop counter + "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples + + // sP used after branch (warning) + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_MONO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q8", "q10" + ); +} + +template <> +inline void ProcessL<2, 16>(int32_t* const out, + int count, + const int16_t* coefsP, + const int16_t* coefsN, + const int16_t* sP, + const int16_t* sN, + const int32_t* const volumeLR) +{ + const int CHANNELS = 2; // template specialization does not preserve params + const int STRIDE = 16; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "veor q0, q0, q0 \n"// (1) acc_L = 0 + "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0 + + "1: \n" + + "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples + "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples + "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs + "vld1.16 {q10}, [%[coefsN0]:128]! 
\n"// (1) load 8 16-bits coefs + + "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive + "vrev64.16 q3, q3 \n"// (0 combines+) reverse right positive + + "vmlal.s16 q0, d4, d17 \n"// (1) multiply (reversed) samples left + "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples left + "vmlal.s16 q4, d6, d17 \n"// (1) multiply (reversed) samples right + "vmlal.s16 q4, d7, d16 \n"// (1) multiply (reversed) samples right + "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left + "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left + "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right + "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right + + // moving these ARM before neon seems to be slower + "subs %[count], %[count], #8 \n"// (1) update loop counter + "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples + + // sP used after branch (warning) + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_STEREO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q4", "q5", "q6", + "q8", "q10" + ); +} + +template <> +inline void Process<1, 16>(int32_t* const out, + int count, + const int16_t* coefsP, + const int16_t* coefsN, + const int16_t* coefsP1, + const int16_t* coefsN1, + const int16_t* sP, + const int16_t* sN, + uint32_t lerpP, + const int32_t* const volumeLR) +{ + const int CHANNELS = 1; // template specialization does not preserve params + const int STRIDE = 16; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15 + "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 + + "1: \n" + + "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples + "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples + "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs + "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation + "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs + "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation + + "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs + "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets + + "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs + "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs + + "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 + + "vadd.s16 q8, q8, q9 \n"// (1+2d) interpolate (step3) 1st set + "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set + + // reordering the vmal to do d6, d7 before d4, d5 is slower(?) 
+ "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply reversed samples by coef + "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples by coef + "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples + "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples + + // moving these ARM instructions before neon above seems to be slower + "subs %[count], %[count], #8 \n"// (1) update loop counter + "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples + + // sP used after branch (warning) + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_MONO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [coefsP1] "+r" (coefsP1), + [coefsN1] "+r" (coefsN1), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [lerpP] "r" (lerpP), + [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11" + ); +} + +template <> +inline void Process<2, 16>(int32_t* const out, + int count, + const int16_t* coefsP, + const int16_t* coefsN, + const int16_t* coefsP1, + const int16_t* coefsN1, + const int16_t* sP, + const int16_t* sN, + uint32_t lerpP, + const int32_t* const volumeLR) +{ + const int CHANNELS = 2; // template specialization does not preserve params + const int STRIDE = 16; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "vmov.32 d2[0], %[lerpP] \n"// load the positive phase + "veor q0, q0, q0 \n"// (1) acc_L = 0 + "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0 + + "1: \n" + + "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples + "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples + "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs + "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation + "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs + "vld1.16 {q11}, [%[coefsN0]:128]! 
\n"// (1) load 8 16-bits coefs for interpolation + + "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs + "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets + + "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs + "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs + + "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive + "vrev64.16 q3, q3 \n"// (1) reverse 8 frames of the right positive + + "vadd.s16 q8, q8, q9 \n"// (1+1d) interpolate (step3) 1st set + "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set + + "vmlal.s16 q0, d4, d17 \n"// (1) multiply reversed samples left + "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples left + "vmlal.s16 q4, d6, d17 \n"// (1) multiply reversed samples right + "vmlal.s16 q4, d7, d16 \n"// (1) multiply reversed samples right + "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left + "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left + "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right + "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right + + // moving these ARM before neon seems to be slower + "subs %[count], %[count], #8 \n"// (1) update loop counter + "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples + + // sP used after branch (warning) + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_STEREO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [coefsP1] "+r" (coefsP1), + [coefsN1] "+r" (coefsN1), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [lerpP] "r" (lerpP), + [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q4", "q5", "q6", + "q8", "q9", "q10", "q11" + ); +} + +template <> +inline void ProcessL<1, 16>(int32_t* const out, + int count, + const int32_t* coefsP, + const int32_t* coefsN, + const int16_t* sP, + const int16_t* sN, + const int32_t* const volumeLR) +{ + const int CHANNELS = 1; // template specialization does not preserve params + const int STRIDE = 16; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "veor q0, q0, q0 \n"// result, initialize to 0 + + "1: \n" + + "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples + "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples + "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs + "vld1.32 {q10, q11}, [%[coefsN0]:128]! 
\n"// load 8 32-bits coefs + + "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side + + "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits + "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits + + "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits + "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits + + "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef + + "vadd.s32 q0, q0, q12 \n"// accumulate result + "vadd.s32 q13, q13, q14 \n"// accumulate result + "vadd.s32 q0, q0, q15 \n"// accumulate result + "vadd.s32 q0, q0, q13 \n"// accumulate result + + "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples + "subs %[count], %[count], #8 \n"// update loop counter + + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_MONO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15" + ); +} + +template <> +inline void ProcessL<2, 16>(int32_t* const out, + int count, + const int32_t* coefsP, + const int32_t* coefsN, + const int16_t* sP, + const int16_t* sN, + const int32_t* const volumeLR) +{ + const int CHANNELS = 2; // template specialization does not preserve params + const int STRIDE = 16; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "veor q0, q0, q0 \n"// result, initialize to 0 + "veor q4, q4, q4 \n"// result, initialize to 0 + + "1: \n" + + "vld2.16 {q2, q3}, [%[sP]] \n"// load 4 16-bits stereo samples + "vld2.16 {q5, q6}, [%[sN]]! \n"// load 4 16-bits stereo samples + "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs + "vld1.32 {q10, q11}, [%[coefsN0]:128]! 
\n"// load 4 32-bits coefs + + "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side + "vrev64.16 q3, q3 \n"// reverse 8 frames of the positive side + + "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits + "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits + + "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits + "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits + + "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef + + "vadd.s32 q0, q0, q12 \n"// accumulate result + "vadd.s32 q13, q13, q14 \n"// accumulate result + "vadd.s32 q0, q0, q15 \n"// (+1) accumulate result + "vadd.s32 q0, q0, q13 \n"// (+1) accumulate result + + "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits + "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits + + "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits + "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits + + "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef + + "vadd.s32 q4, q4, q12 \n"// accumulate result + "vadd.s32 q13, q13, q14 \n"// accumulate result + "vadd.s32 q4, q4, q15 \n"// (+1) accumulate result + "vadd.s32 q4, q4, q13 \n"// (+1) accumulate result + + "subs %[count], %[count], #8 \n"// update loop counter + "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples + + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_STEREO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q4", "q5", "q6", + "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15" + ); +} + +template <> +inline void Process<1, 16>(int32_t* const out, + int count, + const int32_t* coefsP, + const int32_t* coefsN, + const int32_t* coefsP1, + const int32_t* coefsN1, + const int16_t* sP, + const int16_t* sN, + uint32_t lerpP, + const int32_t* const volumeLR) +{ + const int CHANNELS = 1; // template specialization does not preserve params + const int STRIDE = 16; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "vmov.32 d2[0], %[lerpP] \n"// load the positive phase + "veor q0, q0, q0 \n"// result, initialize to 0 + + "1: \n" + + "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples + "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples + "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs + "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs + "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs + "vld1.32 {q14, q15}, [%[coefsN0]:128]! 
\n"// load 8 32-bits coefs + + "vsub.s32 q12, q12, q8 \n"// interpolate (step1) + "vsub.s32 q13, q13, q9 \n"// interpolate (step1) + "vsub.s32 q14, q14, q10 \n"// interpolate (step1) + "vsub.s32 q15, q15, q11 \n"// interpolate (step1) + + "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2) + "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2) + "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2) + "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2) + + "vadd.s32 q8, q8, q12 \n"// interpolate (step3) + "vadd.s32 q9, q9, q13 \n"// interpolate (step3) + "vadd.s32 q10, q10, q14 \n"// interpolate (step3) + "vadd.s32 q11, q11, q15 \n"// interpolate (step3) + + "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side + + "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits + "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits + + "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits + "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits + + "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef + + "vadd.s32 q0, q0, q12 \n"// accumulate result + "vadd.s32 q13, q13, q14 \n"// accumulate result + "vadd.s32 q0, q0, q15 \n"// accumulate result + "vadd.s32 q0, q0, q13 \n"// accumulate result + + "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples + "subs %[count], %[count], #8 \n"// update loop counter + + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_MONO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [coefsP1] "+r" (coefsP1), + [coefsN1] "+r" (coefsN1), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [lerpP] "r" (lerpP), + [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15" + ); +} + +template <> +inline void Process<2, 16>(int32_t* const out, + int count, + const int32_t* coefsP, + const int32_t* coefsN, + const int32_t* coefsP1, + const int32_t* coefsN1, + const int16_t* sP, + const int16_t* sN, + uint32_t lerpP, + const int32_t* const volumeLR) +{ + const int CHANNELS = 2; // template specialization does not preserve params + const int STRIDE = 16; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "vmov.32 d2[0], %[lerpP] \n"// load the positive phase + "veor q0, q0, q0 \n"// result, initialize to 0 + "veor q4, q4, q4 \n"// result, initialize to 0 + + "1: \n" + + "vld2.16 {q2, q3}, [%[sP]] \n"// load 4 16-bits stereo samples + "vld2.16 {q5, q6}, [%[sN]]! \n"// load 4 16-bits stereo samples + "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs + "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs + "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs + "vld1.32 {q14, q15}, [%[coefsN0]:128]! 
\n"// load 8 32-bits coefs + + "vsub.s32 q12, q12, q8 \n"// interpolate (step1) + "vsub.s32 q13, q13, q9 \n"// interpolate (step1) + "vsub.s32 q14, q14, q10 \n"// interpolate (step1) + "vsub.s32 q15, q15, q11 \n"// interpolate (step1) + + "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2) + "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2) + "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2) + "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2) + + "vadd.s32 q8, q8, q12 \n"// interpolate (step3) + "vadd.s32 q9, q9, q13 \n"// interpolate (step3) + "vadd.s32 q10, q10, q14 \n"// interpolate (step3) + "vadd.s32 q11, q11, q15 \n"// interpolate (step3) + + "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side + "vrev64.16 q3, q3 \n"// reverse 8 frames of the positive side + + "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits + "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits + + "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits + "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits + + "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef + + "vadd.s32 q0, q0, q12 \n"// accumulate result + "vadd.s32 q13, q13, q14 \n"// accumulate result + "vadd.s32 q0, q0, q15 \n"// (+1) accumulate result + "vadd.s32 q0, q0, q13 \n"// (+1) accumulate result + + "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits + "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits + + "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits + "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits + + "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef + + "vadd.s32 q4, q4, q12 \n"// accumulate result + "vadd.s32 q13, q13, q14 \n"// accumulate result + "vadd.s32 q4, q4, q15 \n"// (+1) accumulate result + "vadd.s32 q4, q4, q13 \n"// (+1) accumulate result + + "subs %[count], %[count], #8 \n"// update loop counter + "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples + + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_STEREO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [coefsP1] "+r" (coefsP1), + [coefsN1] "+r" (coefsN1), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [lerpP] "r" (lerpP), + [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q4", "q5", "q6", + "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15" + ); +} + +template <> +inline void ProcessL<1, 8>(int32_t* const out, + int count, + const int16_t* coefsP, + const int16_t* coefsN, + const int16_t* sP, + const int16_t* sN, + const int32_t* const volumeLR) +{ + const int CHANNELS = 1; // template specialization does not preserve params + const int STRIDE = 8; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 + + "1: \n" + + "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples + "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples + "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs + "vld1.16 {d20}, [%[coefsN0]:64]! 
\n"// (1) load 4 16-bits coefs + + "vrev64.16 d4, d4 \n"// (1) reversed s3, s2, s1, s0, s7, s6, s5, s4 + + // reordering the vmal to do d6, d7 before d4, d5 is slower(?) + "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed)samples by coef + "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples + + // moving these ARM instructions before neon above seems to be slower + "subs %[count], %[count], #4 \n"// (1) update loop counter + "sub %[sP], %[sP], #8 \n"// (0) move pointer to next set of samples + + // sP used after branch (warning) + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_MONO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q8", "q10" + ); +} + +template <> +inline void ProcessL<2, 8>(int32_t* const out, + int count, + const int16_t* coefsP, + const int16_t* coefsN, + const int16_t* sP, + const int16_t* sN, + const int32_t* const volumeLR) +{ + const int CHANNELS = 2; // template specialization does not preserve params + const int STRIDE = 8; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "veor q0, q0, q0 \n"// (1) acc_L = 0 + "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0 + + "1: \n" + + "vld2.16 {d4, d5}, [%[sP]] \n"// (2+0d) load 8 16-bits stereo samples + "vld2.16 {d6, d7}, [%[sN]]! \n"// (2) load 8 16-bits stereo samples + "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs + "vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 8 16-bits coefs + + "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive + + "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left + "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right + "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left + "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right + + // moving these ARM before neon seems to be slower + "subs %[count], %[count], #4 \n"// (1) update loop counter + "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples + + // sP used after branch (warning) + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_STEREO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q4", "q5", "q6", + "q8", "q10" + ); +} + +template <> +inline void Process<1, 8>(int32_t* const out, + int count, + const int16_t* coefsP, + const int16_t* coefsN, + const int16_t* coefsP1, + const int16_t* coefsN1, + const int16_t* sP, + const int16_t* sN, + uint32_t lerpP, + const int32_t* const volumeLR) +{ + const int CHANNELS = 1; // template specialization does not preserve params + const int STRIDE = 8; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15 + "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 + + "1: \n" + + "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples + "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples + "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs + "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 4 16-bits coefs for interpolation + "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 4 16-bits coefs + "vld1.16 {d21}, [%[coefsN0]:64]! 
\n"// (1) load 4 16-bits coefs for interpolation + + "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs + "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets + + "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs + "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs + + "vrev64.16 d4, d4 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 + + "vadd.s16 d16, d16, d17 \n"// (1+2d) interpolate (step3) 1st set + "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set + + // reordering the vmal to do d6, d7 before d4, d5 is slower(?) + "vmlal.s16 q0, d4, d16 \n"// (1+0d) multiply (reversed)by coef + "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples + + // moving these ARM instructions before neon above seems to be slower + "subs %[count], %[count], #4 \n"// (1) update loop counter + "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples + + // sP used after branch (warning) + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_MONO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [coefsP1] "+r" (coefsP1), + [coefsN1] "+r" (coefsN1), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [lerpP] "r" (lerpP), + [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11" + ); +} + +template <> +inline void Process<2, 8>(int32_t* const out, + int count, + const int16_t* coefsP, + const int16_t* coefsN, + const int16_t* coefsP1, + const int16_t* coefsN1, + const int16_t* sP, + const int16_t* sN, + uint32_t lerpP, + const int32_t* const volumeLR) +{ + const int CHANNELS = 2; // template specialization does not preserve params + const int STRIDE = 8; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "vmov.32 d2[0], %[lerpP] \n"// load the positive phase + "veor q0, q0, q0 \n"// (1) acc_L = 0 + "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0 + + "1: \n" + + "vld2.16 {d4, d5}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples + "vld2.16 {d6, d7}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples + "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs + "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 8 16-bits coefs for interpolation + "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 8 16-bits coefs + "vld1.16 {d21}, [%[coefsN0]:64]! 
\n"// (1) load 8 16-bits coefs for interpolation + + "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs + "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets + + "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs + "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs + + "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive + + "vadd.s16 d16, d16, d17 \n"// (1+1d) interpolate (step3) 1st set + "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set + + "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left + "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right + "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left + "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right + + // moving these ARM before neon seems to be slower + "subs %[count], %[count], #4 \n"// (1) update loop counter + "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples + + // sP used after branch (warning) + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_STEREO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [coefsP1] "+r" (coefsP1), + [coefsN1] "+r" (coefsN1), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [lerpP] "r" (lerpP), + [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q4", "q5", "q6", + "q8", "q9", "q10", "q11" + ); +} + +template <> +inline void ProcessL<1, 8>(int32_t* const out, + int count, + const int32_t* coefsP, + const int32_t* coefsN, + const int16_t* sP, + const int16_t* sN, + const int32_t* const volumeLR) +{ + const int CHANNELS = 1; // template specialization does not preserve params + const int STRIDE = 8; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "veor q0, q0, q0 \n"// result, initialize to 0 + + "1: \n" + + "vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples + "vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples + "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs + "vld1.32 {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs + + "vrev64.16 d4, d4 \n"// reverse 2 frames of the positive side + + "vshll.s16 q12, d4, #15 \n"// (stall) extend samples to 31 bits + "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits + + "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef + + "vadd.s32 q0, q0, q12 \n"// accumulate result + "vadd.s32 q0, q0, q14 \n"// (stall) accumulate result + + "subs %[count], %[count], #4 \n"// update loop counter + "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples + + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_MONO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", + "q12", "q14" + ); +} + +template <> +inline void ProcessL<2, 8>(int32_t* const out, + int count, + const int32_t* coefsP, + const int32_t* coefsN, + const int16_t* sP, + const int16_t* sN, + const int32_t* const volumeLR) +{ + const int CHANNELS = 2; // template specialization does not preserve params + const int STRIDE = 8; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "veor q0, q0, q0 \n"// result, initialize to 0 + "veor q4, q4, q4 \n"// result, initialize to 0 + + "1: \n" + + "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples + "vld2.16 {d6, d7}, [%[sN]]! 
\n"// load 4 16-bits stereo samples + "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs + "vld1.32 {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs + + "vrev64.16 q2, q2 \n"// reverse 2 frames of the positive side + + "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits + "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits + + "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits + "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits + + "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by coef + "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef + "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef + "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by coef + + "vadd.s32 q0, q0, q12 \n"// accumulate result + "vadd.s32 q4, q4, q13 \n"// accumulate result + "vadd.s32 q0, q0, q14 \n"// accumulate result + "vadd.s32 q4, q4, q15 \n"// accumulate result + + "subs %[count], %[count], #4 \n"// update loop counter + "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples + + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_STEREO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsN0] "+r" (coefsN), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15" + ); +} + +template <> +inline void Process<1, 8>(int32_t* const out, + int count, + const int32_t* coefsP, + const int32_t* coefsN, + const int32_t* coefsP1, + const int32_t* coefsN1, + const int16_t* sP, + const int16_t* sN, + uint32_t lerpP, + const int32_t* const volumeLR) +{ + const int CHANNELS = 1; // template specialization does not preserve params + const int STRIDE = 8; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "vmov.32 d2[0], %[lerpP] \n"// load the positive phase + "veor q0, q0, q0 \n"// result, initialize to 0 + + "1: \n" + + "vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples + "vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples + "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs + "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation + "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs + "vld1.32 {q11}, [%[coefsN0]:128]! 
\n"// load 4 32-bits coefs for interpolation + + "vrev64.16 d4, d4 \n"// reverse 2 frames of the positive side + + "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs + "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets + "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits + + "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs + "vqrdmulh.s32 q11, q11, d2[0] \n"// interpolate (step2) 2nd set of coefs + "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits + + "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set + "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set + + "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef + + "vadd.s32 q0, q0, q12 \n"// accumulate result + "vadd.s32 q0, q0, q14 \n"// accumulate result + + "subs %[count], %[count], #4 \n"// update loop counter + "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples + + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_MONO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsP1] "+r" (coefsP1), + [coefsN0] "+r" (coefsN), + [coefsN1] "+r" (coefsN1), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [lerpP] "r" (lerpP), + [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", + "q12", "q14" + ); +} + +template <> +inline +void Process<2, 8>(int32_t* const out, + int count, + const int32_t* coefsP, + const int32_t* coefsN, + const int32_t* coefsP1, + const int32_t* coefsN1, + const int16_t* sP, + const int16_t* sN, + uint32_t lerpP, + const int32_t* const volumeLR) +{ + const int CHANNELS = 2; // template specialization does not preserve params + const int STRIDE = 8; + sP -= CHANNELS*((STRIDE>>1)-1); + asm ( + "vmov.32 d2[0], %[lerpP] \n"// load the positive phase + "veor q0, q0, q0 \n"// result, initialize to 0 + "veor q4, q4, q4 \n"// result, initialize to 0 + + "1: \n" + "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples + "vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 16-bits stereo samples + "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs + "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation + "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs + "vld1.32 {q11}, [%[coefsN0]:128]! 
\n"// load 4 32-bits coefs for interpolation + + "vrev64.16 q2, q2 \n"// (reversed) 2 frames of the positive side + + "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs + "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets + "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits + "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits + + "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs + "vqrdmulh.s32 q11, q11, d2[1] \n"// interpolate (step3) 2nd set of coefs + "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits + "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits + + "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set + "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set + + "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef + "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by interpolated coef + + "vadd.s32 q0, q0, q12 \n"// accumulate result + "vadd.s32 q4, q4, q13 \n"// accumulate result + "vadd.s32 q0, q0, q14 \n"// accumulate result + "vadd.s32 q4, q4, q15 \n"// accumulate result + + "subs %[count], %[count], #4 \n"// update loop counter + "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples + + "bne 1b \n"// loop + + ASSEMBLY_ACCUMULATE_STEREO + + : [out] "=Uv" (out[0]), + [count] "+r" (count), + [coefsP0] "+r" (coefsP), + [coefsP1] "+r" (coefsP1), + [coefsN0] "+r" (coefsN), + [coefsN1] "+r" (coefsN1), + [sP] "+r" (sP), + [sN] "+r" (sN) + : [lerpP] "r" (lerpP), + [vLR] "r" (volumeLR) + : "cc", "memory", + "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15" + ); +} + +#endif //USE_NEON + +}; // namespace android + +#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/ diff --git a/services/audioflinger/test-resample.cpp b/services/audioflinger/test-resample.cpp index 403bb6d..3f2ce55 100644 --- a/services/audioflinger/test-resample.cpp +++ b/services/audioflinger/test-resample.cpp @@ -33,8 +33,9 @@ using namespace android; bool gVerbose = false; static int usage(const char* name) { - fprintf(stderr,"Usage: %s [-p] [-h] [-v] [-s] [-q {dq|lq|mq|hq|vhq}] [-i input-sample-rate] " - "[-o output-sample-rate] [] \n", name); + fprintf(stderr,"Usage: %s [-p] [-h] [-v] [-s] [-q {dq|lq|mq|hq|vhq|dlq|dmq|dhq}]" + " [-i input-sample-rate] [-o output-sample-rate] []" + " \n", name); fprintf(stderr," -p enable profiling\n"); fprintf(stderr," -h create wav file\n"); fprintf(stderr," -v verbose : log buffer provider calls\n"); @@ -45,6 +46,9 @@ static int usage(const char* name) { fprintf(stderr," mq : medium quality\n"); fprintf(stderr," hq : high quality\n"); fprintf(stderr," vhq : very high quality\n"); + fprintf(stderr," dlq : dynamic low quality\n"); + fprintf(stderr," dmq : dynamic medium quality\n"); + fprintf(stderr," dhq : dynamic high quality\n"); fprintf(stderr," -i input file sample rate (ignored if input file is specified)\n"); fprintf(stderr," -o output file sample rate\n"); return -1; @@ -86,6 +90,12 @@ int main(int argc, char* argv[]) { quality = AudioResampler::HIGH_QUALITY; else if (!strcmp(optarg, "vhq")) quality = AudioResampler::VERY_HIGH_QUALITY; + else if (!strcmp(optarg, "dlq")) + quality = AudioResampler::DYN_LOW_QUALITY; + else if (!strcmp(optarg, "dmq")) + quality = AudioResampler::DYN_MED_QUALITY; + else if (!strcmp(optarg, "dhq")) + quality = AudioResampler::DYN_HIGH_QUALITY; else { 
usage(progname); return -1; @@ -137,6 +147,8 @@ int main(int argc, char* argv[]) { channels = info.channels; input_freq = info.samplerate; } else { + // data for testing is exactly (input sampling rate/1000)/2 seconds + // so 44.1khz input is 22.05 seconds double k = 1000; // Hz / s double time = (input_freq / 2) / k; size_t input_frames = size_t(input_freq * time); @@ -148,7 +160,7 @@ int main(int argc, char* argv[]) { double y = sin(M_PI * k * t * t); int16_t yi = floor(y * 32767.0 + 0.5); for (size_t j=0 ; j<(size_t)channels ; j++) { - in[i*channels + j] = yi / (1+j); + in[i*channels + j] = yi / (1+j); // right ch. 1/2 left ch. } } } @@ -170,6 +182,7 @@ int main(int argc, char* argv[]) { } virtual status_t getNextBuffer(Buffer* buffer, int64_t pts = kInvalidPTS) { + (void)pts; // suppress warning size_t requestedFrames = buffer->frameCount; if (requestedFrames > mNumFrames - mNextFrame) { buffer->frameCount = mNumFrames - mNextFrame; @@ -205,6 +218,9 @@ int main(int argc, char* argv[]) { buffer->frameCount = 0; buffer->i16 = NULL; } + void reset() { + mNextFrame = 0; + } } provider(input_vaddr, input_size, channels); size_t input_frames = input_size / (channels * sizeof(int16_t)); @@ -215,37 +231,29 @@ int main(int argc, char* argv[]) { output_size &= ~7; // always stereo, 32-bits void* output_vaddr = malloc(output_size); + AudioResampler* resampler = AudioResampler::create(16, channels, + output_freq, quality); + size_t out_frames = output_size/8; + resampler->setSampleRate(input_freq); + resampler->setVolume(0x1000, 0x1000); if (profiling) { - AudioResampler* resampler = AudioResampler::create(16, channels, - output_freq, quality); - - size_t out_frames = output_size/8; - resampler->setSampleRate(input_freq); - resampler->setVolume(0x1000, 0x1000); - - memset(output_vaddr, 0, output_size); + const int looplimit = 100; timespec start, end; clock_gettime(CLOCK_MONOTONIC, &start); - resampler->resample((int*) output_vaddr, out_frames, &provider); - resampler->resample((int*) output_vaddr, out_frames, &provider); - resampler->resample((int*) output_vaddr, out_frames, &provider); - resampler->resample((int*) output_vaddr, out_frames, &provider); + for (int i = 0; i < looplimit; ++i) { + resampler->resample((int*) output_vaddr, out_frames, &provider); + provider.reset(); // reset only provider as benchmarking + } clock_gettime(CLOCK_MONOTONIC, &end); int64_t start_ns = start.tv_sec * 1000000000LL + start.tv_nsec; int64_t end_ns = end.tv_sec * 1000000000LL + end.tv_nsec; - int64_t time = (end_ns - start_ns)/4; - printf("%f Mspl/s\n", out_frames/(time/1e9)/1e6); - - delete resampler; + int64_t time = end_ns - start_ns; + printf("time(ns):%lld channels:%d quality:%d\n", time, channels, quality); + printf("%f Mspl/s\n", out_frames * looplimit / (time / 1e9) / 1e6); + resampler->reset(); } - AudioResampler* resampler = AudioResampler::create(16, channels, - output_freq, quality); - size_t out_frames = output_size/8; - resampler->setSampleRate(input_freq); - resampler->setVolume(0x1000, 0x1000); - memset(output_vaddr, 0, output_size); if (gVerbose) { printf("resample() %u output frames\n", out_frames); @@ -259,14 +267,18 @@ int main(int argc, char* argv[]) { printf("reset() complete\n"); } - // down-mix (we just truncate and keep the left channel) + // mono takes left channel only + // stereo right channel is half amplitude of stereo left channel (due to input creation) int32_t* out = (int32_t*) output_vaddr; int16_t* convert = (int16_t*) malloc(out_frames * channels * sizeof(int16_t)); + for 
(size_t i = 0; i < out_frames; i++) { - for (int j=0 ; j> 12; - if (s > 32767) s = 32767; - else if (s < -32768) s = -32768; + if (s > 32767) + s = 32767; + else if (s < -32768) + s = -32768; convert[i * channels + j] = int16_t(s); } } diff --git a/tools/resampler_tools/fir.cpp b/tools/resampler_tools/fir.cpp index cc3d509..27a9b05 100644 --- a/tools/resampler_tools/fir.cpp +++ b/tools/resampler_tools/fir.cpp @@ -20,15 +20,25 @@ #include #include -static double sinc(double x) { +static inline double sinc(double x) { if (fabs(x) == 0.0f) return 1.0f; return sin(x) / x; } -static double sqr(double x) { +static inline double sqr(double x) { return x*x; } +static inline int64_t toint(double x, int64_t maxval) { + int64_t v; + + v = static_cast<int64_t>(floor(x * maxval + 0.5)); + if (v >= maxval) { + return maxval - 1; // error! + } + return v; +} + static double I0(double x) { // from the Numerical Recipes in C p. 237 double ax,ans,y; @@ -54,11 +64,12 @@ static double kaiser(int k, int N, double beta) { return I0(beta * sqrt(1.0 - sqr((2.0*k)/N - 1.0))) / I0(beta); } - static void usage(char* name) { fprintf(stderr, - "usage: %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings] [-f {float|fixed}] [-b beta] [-v dBFS] [-l lerp]\n" - " %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings] [-f {float|fixed}] [-b beta] [-v dBFS] -p M/N\n" + "usage: %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings]" + " [-f {float|fixed|fixed16}] [-b beta] [-v dBFS] [-l lerp]\n" + " %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings]" + " [-f {float|fixed|fixed16}] [-b beta] [-v dBFS] -p M/N\n" " -h    this help message\n" " -d    debug, print comma-separated coefficient table\n" " -p    generate poly-phase filter coefficients, with sample increment M/N\n" @@ -66,6 +77,7 @@ static void usage(char* name) { " -c    cut-off frequency (20478)\n" " -n    number of zero-crossings on one side (8)\n" " -l    number of lerping bits (4)\n" + " -m    number of polyphases (related to -l, default 16)\n" " -f    output format, can be fixed-point or floating-point (fixed)\n" " -b    kaiser window parameter beta (7.865 [-80dB])\n" " -v    attenuation in dBFS (0)\n", @@ -77,8 +89,7 @@ static void usage(char* name) { int main(int argc, char** argv) { // nc is the number of bits to store the coefficients - const int nc = 32; - + int nc = 32; bool polyphase = false; unsigned int polyM = 160; unsigned int polyN = 147; @@ -88,7 +99,6 @@ int main(int argc, char** argv) double atten = 1; int format = 0; - // in order to keep the errors associated with the linear // interpolation of the coefficients below the quantization error // we must satisfy: @@ -104,7 +114,6 @@ int main(int argc, char** argv) // Smith, J.O.
Digital Audio Resampling Home Page // https://ccrma.stanford.edu/~jos/resample/, 2011-03-29 // - int nz = 4; // | 0.1102*(A - 8.7) A > 50 // beta = | 0.5842*(A - 21)^0.4 + 0.07886*(A - 21) 21 <= A <= 50 @@ -123,7 +132,6 @@ int main(int argc, char** argv) // 100 dB 10.056 double beta = 7.865; - // 2*nzc = (A - 8) / (2.285 * dw) // with dw the transition width = 2*pi*dF/Fs // @@ -148,8 +156,9 @@ int main(int argc, char** argv) // nzc = 20 // + int M = 1 << 4; // number of phases for interpolation int ch; - while ((ch = getopt(argc, argv, ":hds:c:n:f:l:b:p:v:")) != -1) { + while ((ch = getopt(argc, argv, ":hds:c:n:f:l:m:b:p:v:z:")) != -1) { switch (ch) { case 'd': debug = true; @@ -169,13 +178,26 @@ int main(int argc, char** argv) case 'n': nzc = atoi(optarg); break; + case 'm': + M = atoi(optarg); + break; case 'l': - nz = atoi(optarg); + M = 1 << atoi(optarg); break; case 'f': - if (!strcmp(optarg,"fixed")) format = 0; - else if (!strcmp(optarg,"float")) format = 1; - else usage(argv[0]); + if (!strcmp(optarg, "fixed")) { + format = 0; + } + else if (!strcmp(optarg, "fixed16")) { + format = 0; + nc = 16; + } + else if (!strcmp(optarg, "float")) { + format = 1; + } + else { + usage(argv[0]); + } break; case 'b': beta = atof(optarg); @@ -193,11 +215,14 @@ int main(int argc, char** argv) // cut off frequency ratio Fc/Fs double Fcr = Fc / Fs; - // total number of coefficients (one side) - const int M = (1 << nz); + const int N = M * nzc; + // lerp (which is most useful if M is a power of 2) + + int nz = 0; // recalculate nz as the bits needed to represent M + for (int i = M-1 ; i; i>>=1, nz++); // generate the right half of the filter if (!debug) { printf("// cmd-line: "); @@ -207,7 +232,7 @@ int main(int argc, char** argv) printf("\n"); if (!polyphase) { printf("const int32_t RESAMPLE_FIR_SIZE = %d;\n", N); - printf("const int32_t RESAMPLE_FIR_LERP_INT_BITS = %d;\n", nz); + printf("const int32_t RESAMPLE_FIR_INT_PHASES = %d;\n", M); printf("const int32_t RESAMPLE_FIR_NUM_COEF = %d;\n", nzc); } else { printf("const int32_t RESAMPLE_FIR_SIZE = %d;\n", 2*nzc*polyN); @@ -224,7 +249,7 @@ int main(int argc, char** argv) for (int i=0 ; i<=M ; i++) { // an extra set of coefs for interpolation for (int j=0 ; j= (1LL<<(nc-1))) yi = (1LL<<(nc-1))-1; - printf("0x%08x, ", int32_t(yi)); + int64_t yi = toint(y, 1ULL<<(nc-1)); + if (nc > 16) { + printf("0x%08x, ", int32_t(yi)); + } else { + printf("0x%04x, ", int32_t(yi)&0xffff); + } } else { printf("%.9g%s ", y, debug ? "," : "f,"); } @@ -254,9 +281,12 @@ int main(int argc, char** argv) double y = kaiser(i+N, 2*N, beta) * sinc(x) * 2.0 * Fcr;; y *= atten; if (!format) { - int64_t yi = floor(y * ((1ULL<<(nc-1))) + 0.5); - if (yi >= (1LL<<(nc-1))) yi = (1LL<<(nc-1))-1; - printf("0x%08x", int32_t(yi)); + int64_t yi = toint(y, 1ULL<<(nc-1)); + if (nc > 16) { + printf("0x%08x, ", int32_t(yi)); + } else { + printf("0x%04x, ", int32_t(yi)&0xffff); + } } else { printf("%.9g%s", y, debug ? "" : "f"); } @@ -277,5 +307,3 @@ int main(int argc, char** argv) } // http://www.csee.umbc.edu/help/sound/AFsp-V2R1/html/audio/ResampAudio.html - - -- cgit v1.1
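For reference, the inner product that the Process<CHANNELS, STRIDE> NEON specializations above compute can be written as a scalar loop: each tap's coefficient is linearly interpolated between two adjacent polyphase tables by the fractional phase lerpP (Q15 in the 16-bit coefficient variants), then multiplied against one sample from the time-reversed positive half and one from the negative half of the filter state. The sketch below is illustrative only and is not part of the patch; the function name, the exact indexing, and the omitted volume/output-accumulation step (ASSEMBLY_ACCUMULATE_*) are assumptions.

#include <stdint.h>

// Rough scalar model of one mono output sample with 16-bit coefficients.
// sP points at the newest sample of the positive half (walked backwards),
// sN at the first sample of the negative half (walked forwards).
static int32_t firRefMono(const int16_t* sP, const int16_t* sN,
        const int16_t* coefsP0, const int16_t* coefsP1,   // positive side, adjacent phases
        const int16_t* coefsN1, const int16_t* coefsN0,   // negative side, adjacent phases
        int halfNumCoefs, int32_t lerpP /* Q15 fractional phase */)
{
    int32_t accum = 0;
    for (int i = 0; i < halfNumCoefs; ++i) {
        // coefficient interpolation, matching the vsub/vqrdmulh/vadd sequence
        int32_t cP = coefsP0[i]
                + (int32_t)(((int64_t)(coefsP1[i] - coefsP0[i]) * lerpP) >> 15);
        int32_t cN = coefsN1[i]
                + (int32_t)(((int64_t)(coefsN0[i] - coefsN1[i]) * lerpP) >> 15);
        accum += sP[-i] * cP;   // positive half is traversed in reverse
        accum += sN[ i] * cN;   // negative half is traversed forward
    }
    return accum;               // volume scaling and accumulation into out[] omitted
}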