diff options
| -rw-r--r-- | services/audioflinger/Android.mk | 10 | ||||
| -rw-r--r-- | services/audioflinger/AudioResampler.cpp | 27 | ||||
| -rw-r--r-- | services/audioflinger/AudioResampler.h | 3 | ||||
| -rw-r--r-- | services/audioflinger/AudioResamplerDyn.cpp | 530 | ||||
| -rw-r--r-- | services/audioflinger/AudioResamplerDyn.h | 123 | ||||
| -rw-r--r-- | services/audioflinger/AudioResamplerFirGen.h | 307 | ||||
| -rw-r--r-- | services/audioflinger/AudioResamplerFirOps.h | 163 | ||||
| -rw-r--r-- | services/audioflinger/AudioResamplerFirProcess.h | 256 | ||||
| -rw-r--r-- | services/audioflinger/AudioResamplerFirProcessNeon.h | 1149 | ||||
| -rw-r--r-- | services/audioflinger/test-resample.cpp | 70 | ||||
| -rw-r--r-- | tools/resampler_tools/fir.cpp | 84 | 
11 files changed, 2660 insertions, 62 deletions
diff --git a/services/audioflinger/Android.mk b/services/audioflinger/Android.mk index 3ec9285..4524d3c 100644 --- a/services/audioflinger/Android.mk +++ b/services/audioflinger/Android.mk @@ -23,7 +23,8 @@ LOCAL_SRC_FILES:=               \      AudioPolicyService.cpp      \      ServiceUtilities.cpp        \      AudioResamplerCubic.cpp.arm \ -    AudioResamplerSinc.cpp.arm +    AudioResamplerSinc.cpp.arm  \ +    AudioResamplerDyn.cpp.arm  LOCAL_SRC_FILES += StateQueue.cpp @@ -74,10 +75,11 @@ include $(BUILD_SHARED_LIBRARY)  include $(CLEAR_VARS)  LOCAL_SRC_FILES:=               \ -	test-resample.cpp 			\ +    test-resample.cpp           \      AudioResampler.cpp.arm      \ -	AudioResamplerCubic.cpp.arm \ -    AudioResamplerSinc.cpp.arm +    AudioResamplerCubic.cpp.arm \ +    AudioResamplerSinc.cpp.arm  \ +    AudioResamplerDyn.cpp.arm  LOCAL_C_INCLUDES := \      $(call include-path-for, audio-utils) diff --git a/services/audioflinger/AudioResampler.cpp b/services/audioflinger/AudioResampler.cpp index 323f1a4..3b5a8c1 100644 --- a/services/audioflinger/AudioResampler.cpp +++ b/services/audioflinger/AudioResampler.cpp @@ -25,6 +25,7 @@  #include "AudioResampler.h"  #include "AudioResamplerSinc.h"  #include "AudioResamplerCubic.h" +#include "AudioResamplerDyn.h"  #ifdef __arm__  #include <machine/cpu-features.h> @@ -85,6 +86,9 @@ bool AudioResampler::qualityIsSupported(src_quality quality)      case MED_QUALITY:      case HIGH_QUALITY:      case VERY_HIGH_QUALITY: +    case DYN_LOW_QUALITY: +    case DYN_MED_QUALITY: +    case DYN_HIGH_QUALITY:          return true;      default:          return false; @@ -105,7 +109,7 @@ void AudioResampler::init_routine()          if (*endptr == '\0') {              defaultQuality = (src_quality) l;              ALOGD("forcing AudioResampler quality to %d", defaultQuality); -            if (defaultQuality < DEFAULT_QUALITY || defaultQuality > VERY_HIGH_QUALITY) { +            if (defaultQuality < DEFAULT_QUALITY || defaultQuality > DYN_HIGH_QUALITY) {                  defaultQuality = DEFAULT_QUALITY;              }          } @@ -125,6 +129,12 @@ uint32_t AudioResampler::qualityMHz(src_quality quality)          return 20;      case VERY_HIGH_QUALITY:          return 34; +    case DYN_LOW_QUALITY: +        return 4; +    case DYN_MED_QUALITY: +        return 6; +    case DYN_HIGH_QUALITY: +        return 12;      }  } @@ -175,6 +185,15 @@ AudioResampler* AudioResampler::create(int bitDepth, int inChannelCount,          case VERY_HIGH_QUALITY:              quality = HIGH_QUALITY;              break; +        case DYN_LOW_QUALITY: +            atFinalQuality = true; +            break; +        case DYN_MED_QUALITY: +            quality = DYN_LOW_QUALITY; +            break; +        case DYN_HIGH_QUALITY: +            quality = DYN_MED_QUALITY; +            break;          }      }      pthread_mutex_unlock(&mutex); @@ -200,6 +219,12 @@ AudioResampler* AudioResampler::create(int bitDepth, int inChannelCount,          ALOGV("Create VERY_HIGH_QUALITY sinc Resampler = %d", quality);          resampler = new AudioResamplerSinc(bitDepth, inChannelCount, sampleRate, quality);          break; +    case DYN_LOW_QUALITY: +    case DYN_MED_QUALITY: +    case DYN_HIGH_QUALITY: +        ALOGV("Create dynamic Resampler = %d", quality); +        resampler = new AudioResamplerDyn(bitDepth, inChannelCount, sampleRate, quality); +        break;      }      // initialize resampler diff --git a/services/audioflinger/AudioResampler.h b/services/audioflinger/AudioResampler.h index 33e64ce..c341325 100644 --- a/services/audioflinger/AudioResampler.h +++ b/services/audioflinger/AudioResampler.h @@ -41,6 +41,9 @@ public:          MED_QUALITY=2,          HIGH_QUALITY=3,          VERY_HIGH_QUALITY=4, +        DYN_LOW_QUALITY=5, +        DYN_MED_QUALITY=6, +        DYN_HIGH_QUALITY=7,      };      static AudioResampler* create(int bitDepth, int inChannelCount, diff --git a/services/audioflinger/AudioResamplerDyn.cpp b/services/audioflinger/AudioResamplerDyn.cpp new file mode 100644 index 0000000..1e38d3e --- /dev/null +++ b/services/audioflinger/AudioResamplerDyn.cpp @@ -0,0 +1,530 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *      http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "AudioResamplerDyn" +//#define LOG_NDEBUG 0 + +#include <malloc.h> +#include <string.h> +#include <stdlib.h> +#include <dlfcn.h> +#include <math.h> + +#include <cutils/compiler.h> +#include <cutils/properties.h> +#include <utils/Log.h> + +#include "AudioResamplerFirOps.h" // USE_NEON and USE_INLINE_ASSEMBLY defined here +#include "AudioResamplerFirProcess.h" +#include "AudioResamplerFirProcessNeon.h" +#include "AudioResamplerFirGen.h" // requires math.h +#include "AudioResamplerDyn.h" + +//#define DEBUG_RESAMPLER + +namespace android { + +// generate a unique resample type compile-time constant (constexpr) +#define RESAMPLETYPE(CHANNELS, LOCKED, STRIDE, COEFTYPE) \ +    ((((CHANNELS)-1)&1) | !!(LOCKED)<<1 | (COEFTYPE)<<2 \ +    | ((STRIDE)==8 ? 1 : (STRIDE)==16 ? 2 : 0)<<3) + +/* + * InBuffer is a type agnostic input buffer. + * + * Layout of the state buffer for halfNumCoefs=8. + * + * [rrrrrrppppppppnnnnnnnnrrrrrrrrrrrrrrrrrrr.... rrrrrrr] + *  S            I                                R + * + * S = mState + * I = mImpulse + * R = mRingFull + * p = past samples, convoluted with the (p)ositive side of sinc() + * n = future samples, convoluted with the (n)egative side of sinc() + * r = extra space for implementing the ring buffer + */ + +template<typename TI> +AudioResamplerDyn::InBuffer<TI>::InBuffer() +    : mState(NULL), mImpulse(NULL), mRingFull(NULL), mStateSize(0) { +} + +template<typename TI> +AudioResamplerDyn::InBuffer<TI>::~InBuffer() { +    init(); +} + +template<typename TI> +void AudioResamplerDyn::InBuffer<TI>::init() { +    free(mState); +    mState = NULL; +    mImpulse = NULL; +    mRingFull = NULL; +    mStateSize = 0; +} + +// resizes the state buffer to accommodate the appropriate filter length +template<typename TI> +void AudioResamplerDyn::InBuffer<TI>::resize(int CHANNELS, int halfNumCoefs) { +    // calculate desired state size +    int stateSize = halfNumCoefs * CHANNELS * 2 +            * kStateSizeMultipleOfFilterLength; + +    // check if buffer needs resizing +    if (mState +            && stateSize == mStateSize +            && mRingFull-mState == mStateSize-halfNumCoefs*CHANNELS) { +        return; +    } + +    // create new buffer +    TI* state = (int16_t*)memalign(32, stateSize*sizeof(*state)); +    memset(state, 0, stateSize*sizeof(*state)); + +    // attempt to preserve state +    if (mState) { +        TI* srcLo = mImpulse - halfNumCoefs*CHANNELS; +        TI* srcHi = mImpulse + halfNumCoefs*CHANNELS; +        TI* dst = state; + +        if (srcLo < mState) { +            dst += mState-srcLo; +            srcLo = mState; +        } +        if (srcHi > mState + mStateSize) { +            srcHi = mState + mStateSize; +        } +        memcpy(dst, srcLo, (srcHi - srcLo) * sizeof(*srcLo)); +        free(mState); +    } + +    // set class member vars +    mState = state; +    mStateSize = stateSize; +    mImpulse = mState + halfNumCoefs*CHANNELS; // actually one sample greater than needed +    mRingFull = mState + mStateSize - halfNumCoefs*CHANNELS; +} + +// copy in the input data into the head (impulse+halfNumCoefs) of the buffer. +template<typename TI> +template<int CHANNELS> +void AudioResamplerDyn::InBuffer<TI>::readAgain(TI*& impulse, const int halfNumCoefs, +        const TI* const in, const size_t inputIndex) { +    int16_t* head = impulse + halfNumCoefs*CHANNELS; +    for (size_t i=0 ; i<CHANNELS ; i++) { +        head[i] = in[inputIndex*CHANNELS + i]; +    } +} + +// advance the impulse pointer, and load in data into the head (impulse+halfNumCoefs) +template<typename TI> +template<int CHANNELS> +void AudioResamplerDyn::InBuffer<TI>::readAdvance(TI*& impulse, const int halfNumCoefs, +        const TI* const in, const size_t inputIndex) { +    impulse += CHANNELS; + +    if (CC_UNLIKELY(impulse >= mRingFull)) { +        const size_t shiftDown = mRingFull - mState - halfNumCoefs*CHANNELS; +        memcpy(mState, mState+shiftDown, halfNumCoefs*CHANNELS*2*sizeof(TI)); +        impulse -= shiftDown; +    } +    readAgain<CHANNELS>(impulse, halfNumCoefs, in, inputIndex); +} + +void AudioResamplerDyn::Constants::set( +        int L, int halfNumCoefs, int inSampleRate, int outSampleRate) +{ +    int bits = 0; +    int lscale = inSampleRate/outSampleRate < 2 ? L - 1 : +            static_cast<int>(static_cast<uint64_t>(L)*inSampleRate/outSampleRate); +    for (int i=lscale; i; ++bits, i>>=1) +        ; +    mL = L; +    mShift = kNumPhaseBits - bits; +    mHalfNumCoefs = halfNumCoefs; +} + +AudioResamplerDyn::AudioResamplerDyn(int bitDepth, +        int inChannelCount, int32_t sampleRate, src_quality quality) +    : AudioResampler(bitDepth, inChannelCount, sampleRate, quality), +    mResampleType(0), mFilterSampleRate(0), mCoefBuffer(NULL) +{ +    mVolumeSimd[0] = mVolumeSimd[1] = 0; +    mConstants.set(128, 8, mSampleRate, mSampleRate); // TODO: set better +} + +AudioResamplerDyn::~AudioResamplerDyn() { +    free(mCoefBuffer); +} + +void AudioResamplerDyn::init() { +    mFilterSampleRate = 0; // always trigger new filter generation +    mInBuffer.init(); +} + +void AudioResamplerDyn::setVolume(int16_t left, int16_t right) { +    AudioResampler::setVolume(left, right); +    mVolumeSimd[0] = static_cast<int32_t>(left)<<16; +    mVolumeSimd[1] = static_cast<int32_t>(right)<<16; +} + +template <typename T> T max(T a, T b) {return a > b ? a : b;} + +template <typename T> T absdiff(T a, T b) {return a > b ? a - b : b - a;} + +template<typename T> +void AudioResamplerDyn::createKaiserFir(Constants &c, double stopBandAtten, +        int inSampleRate, int outSampleRate, double tbwCheat) { +    T* buf = reinterpret_cast<T*>(memalign(32, (c.mL+1)*c.mHalfNumCoefs*sizeof(T))); +    static const double atten = 0.9998;   // to avoid ripple overflow +    double fcr; +    double tbw = firKaiserTbw(c.mHalfNumCoefs, stopBandAtten); + +    if (inSampleRate < outSampleRate) { // upsample +        fcr = max(0.5*tbwCheat - tbw/2, tbw/2); +    } else { // downsample +        fcr = max(0.5*tbwCheat*outSampleRate/inSampleRate - tbw/2, tbw/2); +    } +    // create and set filter +    firKaiserGen(buf, c.mL, c.mHalfNumCoefs, stopBandAtten, fcr, atten); +    c.setBuf(buf); +    if (mCoefBuffer) { +        free(mCoefBuffer); +    } +    mCoefBuffer = buf; +#ifdef DEBUG_RESAMPLER +    // print basic filter stats +    printf("L:%d  hnc:%d  stopBandAtten:%lf  fcr:%lf  atten:%lf  tbw:%lf\n", +            c.mL, c.mHalfNumCoefs, stopBandAtten, fcr, atten, tbw); +    // test the filter and report results +    double fp = (fcr - tbw/2)/c.mL; +    double fs = (fcr + tbw/2)/c.mL; +    double fmin, fmax; +    testFir(buf, c.mL, c.mHalfNumCoefs, 0., fp, 100, fmin, fmax); +    double d1 = (fmax - fmin)/2.; +    double ap = -20.*log10(1. - d1); // passband ripple +    printf("passband(%lf, %lf): %.8lf %.8lf %.8lf\n", 0., fp, (fmax + fmin)/2., d1, ap); +    testFir(buf, c.mL, c.mHalfNumCoefs, fs, 0.5, 100, fmin, fmax); +    double d2 = fmax; +    double as = -20.*log10(d2); // stopband attenuation +    printf("stopband(%lf, %lf): %.8lf %.8lf %.3lf\n", fs, 0.5, (fmax + fmin)/2., d2, as); +#endif +} + +// recursive gcd (TODO: verify tail recursion elimination should make this iterate) +static int gcd(int n, int m) { +    if (m == 0) { +        return n; +    } +    return gcd(m, n % m); +} + +static bool isClose(int32_t newSampleRate, int32_t prevSampleRate, int32_t filterSampleRate) { +    int pdiff = absdiff(newSampleRate, prevSampleRate); +    int adiff = absdiff(newSampleRate, filterSampleRate); + +    // allow up to 6% relative change increments. +    // allow up to 12% absolute change increments (from filter design) +    return pdiff < prevSampleRate>>4 && adiff < filterSampleRate>>3; +} + +void AudioResamplerDyn::setSampleRate(int32_t inSampleRate) { +    if (mInSampleRate == inSampleRate) { +        return; +    } +    int32_t oldSampleRate = mInSampleRate; +    int32_t oldHalfNumCoefs = mConstants.mHalfNumCoefs; +    uint32_t oldPhaseWrapLimit = mConstants.mL << mConstants.mShift; +    bool useS32 = false; + +    mInSampleRate = inSampleRate; + +    // TODO: Add precalculated Equiripple filters + +    if (!isClose(inSampleRate, oldSampleRate, mFilterSampleRate)) { +        mFilterSampleRate = inSampleRate; + +        // Begin Kaiser Filter computation +        // +        // The quantization floor for S16 is about 96db - 10*log_10(#length) + 3dB. +        // Keep the stop band attenuation no greater than 84-85dB for 32 length S16 filters +        // +        // For s32 we keep the stop band attenuation at the same as 16b resolution, about +        // 96-98dB +        // + +        double stopBandAtten; +        double tbwCheat = 1.; // how much we "cheat" into aliasing +        int halfLength; +        if (getQuality() == DYN_HIGH_QUALITY) { +            // 32b coefficients, 64 length +            useS32 = true; +            stopBandAtten = 98.; +            halfLength = 32; +        } else if (getQuality() == DYN_LOW_QUALITY) { +            // 16b coefficients, 16-32 length +            useS32 = false; +            stopBandAtten = 80.; +            if (mSampleRate >= inSampleRate * 2) { +                halfLength = 16; +            } else { +                halfLength = 8; +            } +            if (mSampleRate >= inSampleRate) { +                tbwCheat = 1.05; +            } else { +                tbwCheat = 1.03; +            } +        } else { // medium quality +            // 16b coefficients, 32-64 length +            useS32 = false; +            stopBandAtten = 84.; +            if (mSampleRate >= inSampleRate * 4) { +                halfLength = 32; +            } else if (mSampleRate >= inSampleRate * 2) { +                halfLength = 24; +            } else { +                halfLength = 16; +            } +            if (mSampleRate >= inSampleRate) { +                tbwCheat = 1.03; +            } else { +                tbwCheat = 1.01; +            } +        } + +        // determine the number of polyphases in the filterbank. +        // for 16b, it is desirable to have 2^(16/2) = 256 phases. +        // https://ccrma.stanford.edu/~jos/resample/Relation_Interpolation_Error_Quantization.html +        // +        // We are a bit more lax on this. + +        int phases = mSampleRate / gcd(mSampleRate, inSampleRate); + +        while (phases<63) { // too few phases, allow room for interpolation +            phases *= 2; // this code only needed to support dynamic rate changes +        } +        if (phases>=256) {  // too many phases, always interpolate +            phases = 127; +        } + +        // create the filter +        mConstants.set(phases, halfLength, inSampleRate, mSampleRate); +        if (useS32) { +            createKaiserFir<int32_t>(mConstants, stopBandAtten, +                    inSampleRate, mSampleRate, tbwCheat); +        } else { +            createKaiserFir<int16_t>(mConstants, stopBandAtten, +                    inSampleRate, mSampleRate, tbwCheat); +        } +    } // End Kaiser filter + +    // update phase and state based on the new filter. +    const Constants& c(mConstants); +    mInBuffer.resize(mChannelCount, c.mHalfNumCoefs); +    const uint32_t phaseWrapLimit = c.mL << c.mShift; +    // try to preserve as much of the phase fraction as possible for on-the-fly changes +    mPhaseFraction = static_cast<unsigned long long>(mPhaseFraction) +            * phaseWrapLimit / oldPhaseWrapLimit; +    mPhaseFraction %= phaseWrapLimit; // should not do anything, but just in case. +    mPhaseIncrement = static_cast<uint32_t>(static_cast<double>(phaseWrapLimit) +            * inSampleRate / mSampleRate); + +    // determine which resampler to use +    // check if locked phase (works only if mPhaseIncrement has no "fractional phase bits") +    int locked = (mPhaseIncrement << (sizeof(mPhaseIncrement)*8 - c.mShift)) == 0; +    int stride = (c.mHalfNumCoefs&7)==0 ? 16 : (c.mHalfNumCoefs&3)==0 ? 8 : 2; +    if (locked) { +        mPhaseFraction = mPhaseFraction >> c.mShift << c.mShift; // remove fractional phase +    } +    if (!USE_NEON) { +        stride = 2; // C version only +    } +    // TODO: Remove this for testing +    //stride = 2; +    mResampleType = RESAMPLETYPE(mChannelCount, locked, stride, !!useS32); +#ifdef DEBUG_RESAMPLER +    printf("channels:%d  %s  stride:%d  %s  coef:%d  shift:%d\n", +            mChannelCount, locked ? "locked" : "interpolated", +            stride, useS32 ? "S32" : "S16", 2*c.mHalfNumCoefs, c.mShift); +#endif +} + +void AudioResamplerDyn::resample(int32_t* out, size_t outFrameCount, +            AudioBufferProvider* provider) +{ +    // TODO: +    // 24 cases - this perhaps can be reduced later, as testing might take too long +    switch (mResampleType) { + +    // stride 16 (stride 2 for machines that do not support NEON) +    case RESAMPLETYPE(1, true, 16, 0): +        return resample<1, true, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider); +    case RESAMPLETYPE(2, true, 16, 0): +        return resample<2, true, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider); +    case RESAMPLETYPE(1, false, 16, 0): +        return resample<1, false, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider); +    case RESAMPLETYPE(2, false, 16, 0): +        return resample<2, false, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider); +    case RESAMPLETYPE(1, true, 16, 1): +        return resample<1, true, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +    case RESAMPLETYPE(2, true, 16, 1): +        return resample<2, true, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +    case RESAMPLETYPE(1, false, 16, 1): +        return resample<1, false, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +    case RESAMPLETYPE(2, false, 16, 1): +        return resample<2, false, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +#if 0 +    // TODO: Remove these? +    // stride 8 +    case RESAMPLETYPE(1, true, 8, 0): +        return resample<1, true, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider); +    case RESAMPLETYPE(2, true, 8, 0): +        return resample<2, true, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider); +    case RESAMPLETYPE(1, false, 8, 0): +        return resample<1, false, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider); +    case RESAMPLETYPE(2, false, 8, 0): +        return resample<2, false, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider); +    case RESAMPLETYPE(1, true, 8, 1): +        return resample<1, true, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +    case RESAMPLETYPE(2, true, 8, 1): +        return resample<2, true, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +    case RESAMPLETYPE(1, false, 8, 1): +        return resample<1, false, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +    case RESAMPLETYPE(2, false, 8, 1): +        return resample<2, false, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +    // stride 2 (can handle any filter length) +    case RESAMPLETYPE(1, true, 2, 0): +        return resample<1, true, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider); +    case RESAMPLETYPE(2, true, 2, 0): +        return resample<2, true, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider); +    case RESAMPLETYPE(1, false, 2, 0): +        return resample<1, false, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider); +    case RESAMPLETYPE(2, false, 2, 0): +        return resample<2, false, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider); +    case RESAMPLETYPE(1, true, 2, 1): +        return resample<1, true, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +    case RESAMPLETYPE(2, true, 2, 1): +        return resample<2, true, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +    case RESAMPLETYPE(1, false, 2, 1): +        return resample<1, false, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +    case RESAMPLETYPE(2, false, 2, 1): +        return resample<2, false, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider); +#endif +    default: +        ; // error +    } +} + +template<int CHANNELS, bool LOCKED, int STRIDE, typename TC> +void AudioResamplerDyn::resample(int32_t* out, size_t outFrameCount, +        const TC* const coefs,  AudioBufferProvider* provider) +{ +    const Constants& c(mConstants); +    int16_t* impulse = mInBuffer.getImpulse(); +    size_t inputIndex = mInputIndex; +    uint32_t phaseFraction = mPhaseFraction; +    const uint32_t phaseIncrement = mPhaseIncrement; +    size_t outputIndex = 0; +    size_t outputSampleCount = outFrameCount * 2;   // stereo output +    size_t inFrameCount = (outFrameCount*mInSampleRate)/mSampleRate; +    const uint32_t phaseWrapLimit = c.mL << c.mShift; + +    // NOTE: be very careful when modifying the code here. register +    // pressure is very high and a small change might cause the compiler +    // to generate far less efficient code. +    // Always sanity check the result with objdump or test-resample. + +    // the following logic is a bit convoluted to keep the main processing loop +    // as tight as possible with register allocation. +    while (outputIndex < outputSampleCount) { +        // buffer is empty, fetch a new one +        while (mBuffer.frameCount == 0) { +            mBuffer.frameCount = inFrameCount; +            provider->getNextBuffer(&mBuffer, +                    calculateOutputPTS(outputIndex / 2)); +            if (mBuffer.raw == NULL) { +                goto resample_exit; +            } +            if (phaseFraction >= phaseWrapLimit) { // read in data +                mInBuffer.readAdvance<CHANNELS>( +                        impulse, c.mHalfNumCoefs, mBuffer.i16, inputIndex); +                phaseFraction -= phaseWrapLimit; +                while (phaseFraction >= phaseWrapLimit) { +                    inputIndex++; +                    if (inputIndex >= mBuffer.frameCount) { +                        inputIndex -= mBuffer.frameCount; +                        provider->releaseBuffer(&mBuffer); +                        break; +                    } +                    mInBuffer.readAdvance<CHANNELS>( +                            impulse, c.mHalfNumCoefs, mBuffer.i16, inputIndex); +                    phaseFraction -= phaseWrapLimit; +                } +            } +        } +        const int16_t* const in = mBuffer.i16; +        const size_t frameCount = mBuffer.frameCount; +        const int coefShift = c.mShift; +        const int halfNumCoefs = c.mHalfNumCoefs; +        const int32_t* const volumeSimd = mVolumeSimd; + +        // reread the last input in. +        mInBuffer.readAgain<CHANNELS>(impulse, halfNumCoefs, in, inputIndex); + +        // main processing loop +        while (CC_LIKELY(outputIndex < outputSampleCount)) { +            // caution: fir() is inlined and may be large. +            // output will be loaded with the appropriate values +            // +            // from the input samples in impulse[-halfNumCoefs+1]... impulse[halfNumCoefs] +            // from the polyphase filter of (phaseFraction / phaseWrapLimit) in coefs. +            // +            fir<CHANNELS, LOCKED, STRIDE>( +                    &out[outputIndex], +                    phaseFraction, phaseWrapLimit, +                    coefShift, halfNumCoefs, coefs, +                    impulse, volumeSimd); +            outputIndex += 2; + +            phaseFraction += phaseIncrement; +            while (phaseFraction >= phaseWrapLimit) { +                inputIndex++; +                if (inputIndex >= frameCount) { +                    goto done;  // need a new buffer +                } +                mInBuffer.readAdvance<CHANNELS>(impulse, halfNumCoefs, in, inputIndex); +                phaseFraction -= phaseWrapLimit; +            } +        } +done: +        // often arrives here when input buffer runs out +        if (inputIndex >= frameCount) { +            inputIndex -= frameCount; +            provider->releaseBuffer(&mBuffer); +            // mBuffer.frameCount MUST be zero here. +        } +    } + +resample_exit: +    mInBuffer.setImpulse(impulse); +    mInputIndex = inputIndex; +    mPhaseFraction = phaseFraction; +} + +// ---------------------------------------------------------------------------- +}; // namespace android diff --git a/services/audioflinger/AudioResamplerDyn.h b/services/audioflinger/AudioResamplerDyn.h new file mode 100644 index 0000000..85a01ab --- /dev/null +++ b/services/audioflinger/AudioResamplerDyn.h @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *      http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ANDROID_AUDIO_RESAMPLER_DYN_H +#define ANDROID_AUDIO_RESAMPLER_DYN_H + +#include <stdint.h> +#include <sys/types.h> +#include <cutils/log.h> + +#include "AudioResampler.h" + +namespace android { + +class AudioResamplerDyn: public AudioResampler { +public: +    AudioResamplerDyn(int bitDepth, int inChannelCount, int32_t sampleRate, +            src_quality quality); + +    virtual ~AudioResamplerDyn(); + +    virtual void init(); + +    virtual void setSampleRate(int32_t inSampleRate); + +    virtual void setVolume(int16_t left, int16_t right); + +    virtual void resample(int32_t* out, size_t outFrameCount, +            AudioBufferProvider* provider); + +private: + +    class Constants { // stores the filter constants. +    public: +        Constants() : +            mL(0), mShift(0), mHalfNumCoefs(0), mFirCoefsS16(NULL) +        {} +        void set(int L, int halfNumCoefs, +                int inSampleRate, int outSampleRate); +        inline void setBuf(int16_t* buf) { +            mFirCoefsS16 = buf; +        } +        inline void setBuf(int32_t* buf) { +            mFirCoefsS32 = buf; +        } + +        int mL;       // interpolation phases in the filter. +        int mShift;   // right shift to get polyphase index +        unsigned int mHalfNumCoefs; // filter half #coefs +        union {       // polyphase filter bank +            const int16_t* mFirCoefsS16; +            const int32_t* mFirCoefsS32; +        }; +    }; + +    // Input buffer management for a given input type TI, now (int16_t) +    // Is agnostic of the actual type, can work with int32_t and float. +    template<typename TI> +    class InBuffer { +    public: +        InBuffer(); +        ~InBuffer(); +        void init(); +        void resize(int CHANNELS, int halfNumCoefs); + +        // used for direct management of the mImpulse pointer +        inline TI* getImpulse() { +            return mImpulse; +        } +        inline void setImpulse(TI *impulse) { +            mImpulse = impulse; +        } +        template<int CHANNELS> +        inline void readAgain(TI*& impulse, const int halfNumCoefs, +                const TI* const in, const size_t inputIndex); +        template<int CHANNELS> +        inline void readAdvance(TI*& impulse, const int halfNumCoefs, +                const TI* const in, const size_t inputIndex); + +    private: +        // tuning parameter guidelines: 2 <= multiple <= 8 +        static const int kStateSizeMultipleOfFilterLength = 4; + +        TI* mState;    // base pointer for the input buffer storage +        TI* mImpulse;  // current location of the impulse response (centered) +        TI* mRingFull; // mState <= mImpulse < mRingFull +        // in general, mRingFull = mState + mStateSize - halfNumCoefs*CHANNELS. +        size_t mStateSize; // in units of TI. +    }; + +    template<int CHANNELS, bool LOCKED, int STRIDE, typename TC> +    void resample(int32_t* out, size_t outFrameCount, +            const TC* const coefs, AudioBufferProvider* provider); + +    template<typename T> +    void createKaiserFir(Constants &c, double stopBandAtten, +            int inSampleRate, int outSampleRate, double tbwCheat); + +    InBuffer<int16_t> mInBuffer; +    Constants mConstants;  // current set of coefficient parameters +    int32_t __attribute__ ((aligned (8))) mVolumeSimd[2]; +    int32_t mResampleType; // contains the resample type. +    int32_t mFilterSampleRate; // designed sample rate for the filter +    void* mCoefBuffer; // if a filter is created, this is not null +}; + +// ---------------------------------------------------------------------------- +}; // namespace android + +#endif /*ANDROID_AUDIO_RESAMPLER_DYN_H*/ diff --git a/services/audioflinger/AudioResamplerFirGen.h b/services/audioflinger/AudioResamplerFirGen.h new file mode 100644 index 0000000..fa6ee3e --- /dev/null +++ b/services/audioflinger/AudioResamplerFirGen.h @@ -0,0 +1,307 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *      http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ANDROID_AUDIO_RESAMPLER_FIR_GEN_H +#define ANDROID_AUDIO_RESAMPLER_FIR_GEN_H + +namespace android { + +/* + * Sinc function is the traditional variant. + * + * TODO: Investigate optimizations (regular sampling grid, NEON vector accelerations) + * TODO: Remove comparison at 0 and trap at a higher level. + * + */ + +static inline double sinc(double x) { +    if (fabs(x) < FLT_MIN) { +        return 1.; +    } +    return sin(x) / x; +} + +static inline double sqr(double x) { +    return x * x; +} + +/* + * rounds a double to the nearest integer for FIR coefficients. + * + * One variant uses noise shaping, which must keep error history + * to work (the err parameter, initialized to 0). + * The other variant is a non-noise shaped version for + * S32 coefficients (noise shaping doesn't gain much). + * + * Caution: No bounds saturation is applied, but isn't needed in + * this case. + * + * @param x is the value to round. + * + * @param maxval is the maximum integer scale factor expressed as an int64 (for headroom). + * Typically this may be the maximum positive integer+1 (using the fact that double precision + * FIR coefficients generated here are never that close to 1.0 to pose an overflow condition). + * + * @param err is the previous error (actual - rounded) for the previous rounding op. + * + */ + +static inline int64_t toint(double x, int64_t maxval, double& err) { +    double val = x * maxval; +    double ival = floor(val + 0.5 + err*0.17); +    err = val - ival; +    return static_cast<int64_t>(ival); +} + +static inline int64_t toint(double x, int64_t maxval) { +    return static_cast<int64_t>(floor(x * maxval + 0.5)); +} + +/* + * Modified Bessel function of the first kind + * http://en.wikipedia.org/wiki/Bessel_function + * + * The formulas are taken from Abramowitz and Stegun: + * + * http://people.math.sfu.ca/~cbm/aands/page_375.htm + * http://people.math.sfu.ca/~cbm/aands/page_378.htm + * + * http://dlmf.nist.gov/10.25 + * http://dlmf.nist.gov/10.40 + * + * Note we assume x is nonnegative (the function is symmetric, + * pass in the absolute value as needed). + * + * Constants are compile time derived with templates I0Term<> and + * I0ATerm<> to the precision of the compiler.  The series can be expanded + * to any precision needed, but currently set around 24b precision. + * + * We use a bit of template math here, constexpr would probably be + * more appropriate for a C++11 compiler. + * + */ + +template <int N> +struct I0Term { +    static const double value = I0Term<N-1>::value/ (4. * N * N); +}; + +template <> +struct I0Term<0> { +    static const double value = 1.; +}; + +template <int N> +struct I0ATerm { +    static const double value = I0ATerm<N-1>::value * (2.*N-1.) * (2.*N-1.) / (8. * N); +}; + +template <> +struct I0ATerm<0> { // 1/sqrt(2*PI); +    static const double value = 0.398942280401432677939946059934381868475858631164934657665925; +}; + +static inline double I0(double x) { +    if (x < 3.75) { // TODO: Estrin's method instead of Horner's method? +        x *= x; +        return I0Term<0>::value + x*( +                I0Term<1>::value + x*( +                I0Term<2>::value + x*( +                I0Term<3>::value + x*( +                I0Term<4>::value + x*( +                I0Term<5>::value + x*( +                I0Term<6>::value)))))); // e < 1.6e-7 +    } +    // a bit ugly here - perhaps we expand the top series +    // to permit computation to x < 20 (a reasonable range) +    double y = 1./x; +    return exp(x) * sqrt(y) * ( +            // note: reciprocal squareroot may be easier! +            // http://en.wikipedia.org/wiki/Fast_inverse_square_root +            I0ATerm<0>::value + y*( +            I0ATerm<1>::value + y*( +            I0ATerm<2>::value + y*( +            I0ATerm<3>::value + y*( +            I0ATerm<4>::value + y*( +            I0ATerm<5>::value + y*( +            I0ATerm<6>::value + y*( +            I0ATerm<7>::value + y*( +            I0ATerm<8>::value))))))))); // (... e) < 1.9e-7 +} + +/* + * calculates the transition bandwidth for a Kaiser filter + * + * Formula 3.2.8, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48 + * + * @param halfNumCoef is half the number of coefficients per filter phase. + * @param stopBandAtten is the stop band attenuation desired. + * @return the transition bandwidth in normalized frequency (0 <= f <= 0.5) + */ +static inline double firKaiserTbw(int halfNumCoef, double stopBandAtten) { +    return (stopBandAtten - 7.95)/(2.*14.36*halfNumCoef); +} + +/* + * calculates the fir transfer response. + * + * calculates the transfer coefficient H(w) for 0 <= w <= PI. + * Be careful be careful to consider the fact that this is an interpolated filter + * of length L, so normalizing H(w)/L is probably what you expect. + */ +template <typename T> +static inline double firTransfer(const T* coef, int L, int halfNumCoef, double w) { +    double accum = static_cast<double>(coef[0])*0.5; +    coef += halfNumCoef;    // skip first row. +    for (int i=1 ; i<=L ; ++i) { +        for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) { +            accum += cos(ix*w)*static_cast<double>(*coef++); +        } +    } +    return accum*2.; +} + +/* + * returns the minimum and maximum |H(f)| bounds + * + * @param coef is the designed polyphase filter banks + * + * @param L is the number of phases (for interpolation) + * + * @param halfNumCoef should be half the number of coefficients for a single + * polyphase. + * + * @param fstart is the normalized frequency start. + * + * @param fend is the normalized frequency end. + * + * @param steps is the number of steps to take (sampling) between frequency start and end + * + * @param firMin returns the minimum transfer |H(f)| found + * + * @param firMax returns the maximum transfer |H(f)| found + * + * 0 <= f <= 0.5. + * This is used to test passband and stopband performance. + */ +template <typename T> +static void testFir(const T* coef, int L, int halfNumCoef, +        double fstart, double fend, int steps, double &firMin, double &firMax) { +    double wstart = fstart*(2.*M_PI); +    double wend = fend*(2.*M_PI); +    double wstep = (wend - wstart)/steps; +    double fmax, fmin; +    double trf = firTransfer(coef, L, halfNumCoef, wstart); +    if (trf<0) { +        trf = -trf; +    } +    fmin = fmax = trf; +    wstart += wstep; +    for (int i=1; i<steps; ++i) { +        trf = firTransfer(coef, L, halfNumCoef, wstart); +        if (trf<0) { +            trf = -trf; +        } +        if (trf>fmax) { +            fmax = trf; +        } +        else if (trf<fmin) { +            fmin = trf; +        } +        wstart += wstep; +    } +    // renormalize - this is only needed for integer filter types +    double norm = 1./((1ULL<<(sizeof(T)*8-1))*L); + +    firMin = fmin * norm; +    firMax = fmax * norm; +} + +/* + * Calculates the polyphase filter banks based on a windowed sinc function. + * + * The windowed sinc is an odd length symmetric filter of exactly L*halfNumCoef*2+1 + * taps for the entire kernel.  This is then decomposed into L+1 polyphase filterbanks. + * The last filterbank is used for interpolation purposes (and is mostly composed + * of the first bank shifted by one sample), and is unnecessary if one does + * not do interpolation. + * + * @param coef is the caller allocated space for coefficients.  This should be + * exactly (L+1)*halfNumCoef in size. + * + * @param L is the number of phases (for interpolation) + * + * @param halfNumCoef should be half the number of coefficients for a single + * polyphase. + * + * @param stopBandAtten is the stopband value, should be >50dB. + * + * @param fcr is cutoff frequency/sampling rate (<0.5).  At this point, the energy + * should be 6dB less. (fcr is where the amplitude drops by half).  Use the + * firKaiserTbw() to calculate the transition bandwidth.  fcr is the midpoint + * between the stop band and the pass band (fstop+fpass)/2. + * + * @param atten is the attenuation (generally slightly less than 1). + */ + +template <typename T> +static inline void firKaiserGen(T* coef, int L, int halfNumCoef, +        double stopBandAtten, double fcr, double atten) { +    // +    // Formula 3.2.5, 3.2.7, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48 +    // +    // See also: http://melodi.ee.washington.edu/courses/ee518/notes/lec17.pdf +    // +    // Kaiser window and beta parameter +    // +    //         | 0.1102*(A - 8.7)                         A > 50 +    //  beta = | 0.5842*(A - 21)^0.4 + 0.07886*(A - 21)   21 <= A <= 50 +    //         | 0.                                       A < 21 +    // +    // with A is the desired stop-band attenuation in dBFS +    // +    //    30 dB    2.210 +    //    40 dB    3.384 +    //    50 dB    4.538 +    //    60 dB    5.658 +    //    70 dB    6.764 +    //    80 dB    7.865 +    //    90 dB    8.960 +    //   100 dB   10.056 + +    const int N = L * halfNumCoef; // non-negative half +    const double beta = 0.1102 * (stopBandAtten - 8.7); // >= 50dB always +    const double yscale = 2. * atten * fcr / I0(beta); +    const double xstep = 2. * M_PI * fcr / L; +    const double xfrac = 1. / N; +    double err = 0; // for noise shaping on int16_t coefficients +    for (int i=0 ; i<=L ; ++i) { // generate an extra set of coefs for interpolation +        for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) { +            double y = I0(beta * sqrt(1.0 - sqr(ix * xfrac))) * sinc(ix * xstep) * yscale; + +            // (caution!) float version does not need rounding +            if (is_same<T, int16_t>::value) { // int16_t needs noise shaping +                *coef++ = static_cast<T>(toint(y, 1ULL<<(sizeof(T)*8-1), err)); +            } else { +                *coef++ = static_cast<T>(toint(y, 1ULL<<(sizeof(T)*8-1))); +            } +        } +    } +} + +}; // namespace android + +#endif /*ANDROID_AUDIO_RESAMPLER_FIR_GEN_H*/ diff --git a/services/audioflinger/AudioResamplerFirOps.h b/services/audioflinger/AudioResamplerFirOps.h new file mode 100644 index 0000000..bf2163f --- /dev/null +++ b/services/audioflinger/AudioResamplerFirOps.h @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *      http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ANDROID_AUDIO_RESAMPLER_FIR_OPS_H +#define ANDROID_AUDIO_RESAMPLER_FIR_OPS_H + +namespace android { + +#if defined(__arm__) && !defined(__thumb__) +#define USE_INLINE_ASSEMBLY (true) +#else +#define USE_INLINE_ASSEMBLY (false) +#endif + +#if USE_INLINE_ASSEMBLY && defined(__ARM_NEON__) +#define USE_NEON (true) +#include <arm_neon.h> +#else +#define USE_NEON (false) +#endif + +template<typename T, typename U> +struct is_same +{ +    static const bool value = false; +}; + +template<typename T> +struct is_same<T, T>  // partial specialization +{ +    static const bool value = true; +}; + +static inline +int32_t mulRL(int left, int32_t in, uint32_t vRL) +{ +#if USE_INLINE_ASSEMBLY +    int32_t out; +    if (left) { +        asm( "smultb %[out], %[in], %[vRL] \n" +             : [out]"=r"(out) +             : [in]"%r"(in), [vRL]"r"(vRL) +             : ); +    } else { +        asm( "smultt %[out], %[in], %[vRL] \n" +             : [out]"=r"(out) +             : [in]"%r"(in), [vRL]"r"(vRL) +             : ); +    } +    return out; +#else +    int16_t v = left ? static_cast<int16_t>(vRL) : static_cast<int16_t>(vRL>>16); +    return static_cast<int32_t>((static_cast<int64_t>(in) * v) >> 16); +#endif +} + +static inline +int32_t mulAdd(int16_t in, int16_t v, int32_t a) +{ +#if USE_INLINE_ASSEMBLY +    int32_t out; +    asm( "smlabb %[out], %[v], %[in], %[a] \n" +         : [out]"=r"(out) +         : [in]"%r"(in), [v]"r"(v), [a]"r"(a) +         : ); +    return out; +#else +    return a + v * in; +#endif +} + +static inline +int32_t mulAdd(int16_t in, int32_t v, int32_t a) +{ +#if USE_INLINE_ASSEMBLY +    int32_t out; +    asm( "smlawb %[out], %[v], %[in], %[a] \n" +         : [out]"=r"(out) +         : [in]"%r"(in), [v]"r"(v), [a]"r"(a) +         : ); +    return out; +#else +    return a + static_cast<int32_t>((static_cast<int64_t>(v) * in) >> 16); +#endif +} + +static inline +int32_t mulAdd(int32_t in, int32_t v, int32_t a) +{ +#if USE_INLINE_ASSEMBLY +    int32_t out; +    asm( "smmla %[out], %[v], %[in], %[a] \n" +         : [out]"=r"(out) +         : [in]"%r"(in), [v]"r"(v), [a]"r"(a) +         : ); +    return out; +#else +    return a + static_cast<int32_t>((static_cast<int64_t>(v) * in) >> 32); +#endif +} + +static inline +int32_t mulAddRL(int left, uint32_t inRL, int16_t v, int32_t a) +{ +#if USE_INLINE_ASSEMBLY +    int32_t out; +    if (left) { +        asm( "smlabb %[out], %[v], %[inRL], %[a] \n" +             : [out]"=r"(out) +             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a) +             : ); +    } else { +        asm( "smlabt %[out], %[v], %[inRL], %[a] \n" +             : [out]"=r"(out) +             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a) +             : ); +    } +    return out; +#else +    int16_t s = left ? static_cast<int16_t>(inRL) : static_cast<int16_t>(inRL>>16); +    return a + v * s; +#endif +} + +static inline +int32_t mulAddRL(int left, uint32_t inRL, int32_t v, int32_t a) +{ +#if USE_INLINE_ASSEMBLY +    int32_t out; +    if (left) { +        asm( "smlawb %[out], %[v], %[inRL], %[a] \n" +             : [out]"=r"(out) +             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a) +             : ); +    } else { +        asm( "smlawt %[out], %[v], %[inRL], %[a] \n" +             : [out]"=r"(out) +             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a) +             : ); +    } +    return out; +#else +    int16_t s = left ? static_cast<int16_t>(inRL) : static_cast<int16_t>(inRL>>16); +    return a + static_cast<int32_t>((static_cast<int64_t>(v) * s) >> 16); +#endif +} + +}; // namespace android + +#endif /*ANDROID_AUDIO_RESAMPLER_FIR_OPS_H*/ diff --git a/services/audioflinger/AudioResamplerFirProcess.h b/services/audioflinger/AudioResamplerFirProcess.h new file mode 100644 index 0000000..38e387c --- /dev/null +++ b/services/audioflinger/AudioResamplerFirProcess.h @@ -0,0 +1,256 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *      http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H +#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H + +namespace android { + +// depends on AudioResamplerFirOps.h + +template<int CHANNELS, typename TC> +static inline +void mac( +        int32_t& l, int32_t& r, +        const TC coef, +        const int16_t* samples) +{ +    if (CHANNELS == 2) { +        uint32_t rl = *reinterpret_cast<const uint32_t*>(samples); +        l = mulAddRL(1, rl, coef, l); +        r = mulAddRL(0, rl, coef, r); +    } else { +        r = l = mulAdd(samples[0], coef, l); +    } +} + +template<int CHANNELS, typename TC> +static inline +void interpolate( +        int32_t& l, int32_t& r, +        const TC coef_0, const TC coef_1, +        const int16_t lerp, const int16_t* samples) +{ +    TC sinc; + +    if (is_same<TC, int16_t>::value) { +        sinc = (lerp * ((coef_1-coef_0)<<1)>>16) + coef_0; +    } else { +        sinc = mulAdd(lerp, (coef_1-coef_0)<<1, coef_0); +    } +    if (CHANNELS == 2) { +        uint32_t rl = *reinterpret_cast<const uint32_t*>(samples); +        l = mulAddRL(1, rl, sinc, l); +        r = mulAddRL(0, rl, sinc, r); +    } else { +        r = l = mulAdd(samples[0], sinc, l); +    } +} + +/* + * Calculates a single output sample (two stereo frames). + * + * This function computes both the positive half FIR dot product and + * the negative half FIR dot product, accumulates, and then applies the volume. + * + * This is a locked phase filter (it does not compute the interpolation). + * + * Use fir() to compute the proper coefficient pointers for a polyphase + * filter bank. + */ + +template <int CHANNELS, int STRIDE, typename TC> +static inline +void ProcessL(int32_t* const out, +        int count, +        const TC* coefsP, +        const TC* coefsN, +        const int16_t* sP, +        const int16_t* sN, +        const int32_t* const volumeLR) +{ +    int32_t l = 0; +    int32_t r = 0; +    do { +        mac<CHANNELS>(l, r, *coefsP++, sP); +        sP -= CHANNELS; +        mac<CHANNELS>(l, r, *coefsN++, sN); +        sN += CHANNELS; +    } while (--count > 0); +    out[0] += 2 * mulRL(0, l, volumeLR[0]); // Note: only use top 16b +    out[1] += 2 * mulRL(0, r, volumeLR[1]); // Note: only use top 16b +} + +/* + * Calculates a single output sample (two stereo frames) interpolating phase. + * + * This function computes both the positive half FIR dot product and + * the negative half FIR dot product, accumulates, and then applies the volume. + * + * This is an interpolated phase filter. + * + * Use fir() to compute the proper coefficient pointers for a polyphase + * filter bank. + */ + +template <int CHANNELS, int STRIDE, typename TC> +static inline +void Process(int32_t* const out, +        int count, +        const TC* coefsP, +        const TC* coefsN, +        const TC* coefsP1, +        const TC* coefsN1, +        const int16_t* sP, +        const int16_t* sN, +        uint32_t lerpP, +        const int32_t* const volumeLR) +{ +    (void) coefsP1; // suppress unused parameter warning +    (void) coefsN1; +    if (sizeof(*coefsP)==4) { +        lerpP >>= 16;   // ensure lerpP is 16b +    } +    int32_t l = 0; +    int32_t r = 0; +    for (size_t i = 0; i < count; ++i) { +        interpolate<CHANNELS>(l, r, coefsP[0], coefsP[count], lerpP, sP); +        coefsP++; +        sP -= CHANNELS; +        interpolate<CHANNELS>(l, r, coefsN[count], coefsN[0], lerpP, sN); +        coefsN++; +        sN += CHANNELS; +    } +    out[0] += 2 * mulRL(0, l, volumeLR[0]); // Note: only use top 16b +    out[1] += 2 * mulRL(0, r, volumeLR[1]); // Note: only use top 16b +} + +/* + * Calculates a single output sample (two stereo frames) from input sample pointer. + * + * This sets up the params for the accelerated Process() and ProcessL() + * functions to do the appropriate dot products. + * + * @param out should point to the output buffer with at least enough space for 2 output frames. + * + * @param phase is the fractional distance between input samples for interpolation: + * phase >= 0  && phase < phaseWrapLimit.  It can be thought of as a rational fraction + * of phase/phaseWrapLimit. + * + * @param phaseWrapLimit is #polyphases<<coefShift, where #polyphases is the number of polyphases + * in the polyphase filter. Likewise, #polyphases can be obtained as (phaseWrapLimit>>coefShift). + * + * @param coefShift gives the bit alignment of the polyphase index in the phase parameter. + * + * @param halfNumCoefs is the half the number of coefficients per polyphase filter. Since the + * overall filterbank is odd-length symmetric, only halfNumCoefs need be stored. + * + * @param coefs is the polyphase filter bank, starting at from polyphase index 0, and ranging to + * and including the #polyphases.  Each polyphase of the filter has half-length halfNumCoefs + * (due to symmetry).  The total size of the filter bank in coefficients is + * (#polyphases+1)*halfNumCoefs. + * + * The filter bank coefs should be aligned to a minimum of 16 bytes (preferrably to cache line). + * + * The coefs should be attenuated (to compensate for passband ripple) + * if storing back into the native format. + * + * @param samples are unaligned input samples.  The position is in the "middle" of the + * sample array with respect to the FIR filter: + * the negative half of the filter is dot product from samples+1 to samples+halfNumCoefs; + * the positive half of the filter is dot product from samples to samples-halfNumCoefs+1. + * + * @param volumeLR is a pointer to an array of two 32 bit volume values, one per stereo channel, + * expressed as a S32 integer.  A negative value inverts the channel 180 degrees. + * The pointer volumeLR should be aligned to a minimum of 8 bytes. + * A typical value for volume is 0x1000 to align to a unity gain output of 20.12. + * + * In between calls to filterCoefficient, the phase is incremented by phaseIncrement, where + * phaseIncrement is calculated as inputSampling * phaseWrapLimit / outputSampling. + * + * The filter polyphase index is given by indexP = phase >> coefShift. Due to + * odd length symmetric filter, the polyphase index of the negative half depends on + * whether interpolation is used. + * + * The fractional siting between the polyphase indices is given by the bits below coefShift: + * + * lerpP = phase << 32 - coefShift >> 1;  // for 32 bit unsigned phase multiply + * lerpP = phase << 32 - coefShift >> 17; // for 16 bit unsigned phase multiply + * + * For integer types, this is expressed as: + * + * lerpP = phase << sizeof(phase)*8 - coefShift + *              >> (sizeof(phase)-sizeof(*coefs))*8 + 1; + * + */ + +template<int CHANNELS, bool LOCKED, int STRIDE, typename TC> +static inline +void fir(int32_t* const out, +        const uint32_t phase, const uint32_t phaseWrapLimit, +        const int coefShift, const int halfNumCoefs, const TC* const coefs, +        const int16_t* const samples, const int32_t* const volumeLR) +{ +    // NOTE: be very careful when modifying the code here. register +    // pressure is very high and a small change might cause the compiler +    // to generate far less efficient code. +    // Always sanity check the result with objdump or test-resample. + +    if (LOCKED) { +        // locked polyphase (no interpolation) +        // Compute the polyphase filter index on the positive and negative side. +        uint32_t indexP = phase >> coefShift; +        uint32_t indexN = (phaseWrapLimit - phase) >> coefShift; +        const TC* coefsP = coefs + indexP*halfNumCoefs; +        const TC* coefsN = coefs + indexN*halfNumCoefs; +        const int16_t* sP = samples; +        const int16_t* sN = samples + CHANNELS; + +        // dot product filter. +        ProcessL<CHANNELS, STRIDE>(out, +                halfNumCoefs, coefsP, coefsN, sP, sN, volumeLR); +    } else { +        // interpolated polyphase +        // Compute the polyphase filter index on the positive and negative side. +        uint32_t indexP = phase >> coefShift; +        uint32_t indexN = (phaseWrapLimit - phase - 1) >> coefShift; // one's complement. +        const TC* coefsP = coefs + indexP*halfNumCoefs; +        const TC* coefsN = coefs + indexN*halfNumCoefs; +        const TC* coefsP1 = coefsP + halfNumCoefs; +        const TC* coefsN1 = coefsN + halfNumCoefs; +        const int16_t* sP = samples; +        const int16_t* sN = samples + CHANNELS; + +        // Interpolation fraction lerpP derived by shifting all the way up and down +        // to clear the appropriate bits and align to the appropriate level +        // for the integer multiply.  The constants should resolve in compile time. +        // +        // The interpolated filter coefficient is derived as follows for the pos/neg half: +        // +        // interpolated[P] = index[P]*lerpP + index[P+1]*(1-lerpP) +        // interpolated[N] = index[N+1]*lerpP + index[N]*(1-lerpP) +        uint32_t lerpP = phase << (sizeof(phase)*8 - coefShift) +                >> ((sizeof(phase)-sizeof(*coefs))*8 + 1); + +        // on-the-fly interpolated dot product filter +        Process<CHANNELS, STRIDE>(out, +                halfNumCoefs, coefsP, coefsN, coefsP1, coefsN1, sP, sN, lerpP, volumeLR); +    } +} + +}; // namespace android + +#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H*/ diff --git a/services/audioflinger/AudioResamplerFirProcessNeon.h b/services/audioflinger/AudioResamplerFirProcessNeon.h new file mode 100644 index 0000000..f311cef --- /dev/null +++ b/services/audioflinger/AudioResamplerFirProcessNeon.h @@ -0,0 +1,1149 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *      http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H +#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H + +namespace android { + +// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h + +#if USE_NEON +// +// NEON specializations are enabled for Process() and ProcessL() +// +// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary) +// and looping stride 16 (or vice versa). This has some polyphase coef data alignment +// issues with S16 coefs. Consider this later. + +// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out. +#define ASSEMBLY_ACCUMULATE_MONO \ +        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\ +        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\ +        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\ +        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\ +        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\ +        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\ +        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */ + +#define ASSEMBLY_ACCUMULATE_STEREO \ +        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes*/\ +        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output*/\ +        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0*/\ +        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4*/\ +        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R*/\ +        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume*/\ +        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating)*/\ +        "vst1.s32       {d3}, %[out]             \n"/* (2+2d)store result*/ + +template <> +inline void ProcessL<1, 16>(int32_t* const out, +        int count, +        const int16_t* coefsP, +        const int16_t* coefsN, +        const int16_t* sP, +        const int16_t* sN, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 1; // template specialization does not preserve params +    const int STRIDE = 16; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0 + +        "1:                                      \n" + +        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples +        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples +        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs +        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs + +        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 + +        // reordering the vmal to do d6, d7 before d4, d5 is slower(?) +        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed)samples by coef +        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed)samples by coef +        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples +        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples + +        // moving these ARM instructions before neon above seems to be slower +        "subs           %[count], %[count], #8   \n"// (1) update loop counter +        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples + +        // sP used after branch (warning) +        "bne            1b                       \n"// loop + +         ASSEMBLY_ACCUMULATE_MONO + +        : [out]     "=Uv" (out[0]), +          [count]   "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [sP]      "+r" (sP), +          [sN]      "+r" (sN) +        : [vLR]     "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q8", "q10" +    ); +} + +template <> +inline void ProcessL<2, 16>(int32_t* const out, +        int count, +        const int16_t* coefsP, +        const int16_t* coefsN, +        const int16_t* sP, +        const int16_t* sN, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 2; // template specialization does not preserve params +    const int STRIDE = 16; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "veor           q0, q0, q0               \n"// (1) acc_L = 0 +        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0 + +        "1:                                      \n" + +        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples +        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples +        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs +        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs + +        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive +        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse right positive + +        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left +        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left +        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right +        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right +        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left +        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left +        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right +        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right + +        // moving these ARM before neon seems to be slower +        "subs           %[count], %[count], #8   \n"// (1) update loop counter +        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples + +        // sP used after branch (warning) +        "bne            1b                       \n"// loop + +        ASSEMBLY_ACCUMULATE_STEREO + +        : [out] "=Uv" (out[0]), +          [count] "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [sP] "+r" (sP), +          [sN] "+r" (sN) +        : [vLR] "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q4", "q5", "q6", +          "q8", "q10" +     ); +} + +template <> +inline void Process<1, 16>(int32_t* const out, +        int count, +        const int16_t* coefsP, +        const int16_t* coefsN, +        const int16_t* coefsP1, +        const int16_t* coefsN1, +        const int16_t* sP, +        const int16_t* sN, +        uint32_t lerpP, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 1; // template specialization does not preserve params +    const int STRIDE = 16; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15 +        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0 + +        "1:                                      \n" + +        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples +        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples +        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs +        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation +        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs +        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation + +        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs +        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coets + +        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs +        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs + +        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 + +        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set +        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set + +        // reordering the vmal to do d6, d7 before d4, d5 is slower(?) +        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef +        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef +        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples +        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples + +        // moving these ARM instructions before neon above seems to be slower +        "subs           %[count], %[count], #8   \n"// (1) update loop counter +        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples + +        // sP used after branch (warning) +        "bne            1b                       \n"// loop + +        ASSEMBLY_ACCUMULATE_MONO + +        : [out]     "=Uv" (out[0]), +          [count]   "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [coefsP1] "+r" (coefsP1), +          [coefsN1] "+r" (coefsN1), +          [sP]      "+r" (sP), +          [sN]      "+r" (sN) +        : [lerpP]   "r" (lerpP), +          [vLR]     "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q8", "q9", "q10", "q11" +    ); +} + +template <> +inline void Process<2, 16>(int32_t* const out, +        int count, +        const int16_t* coefsP, +        const int16_t* coefsN, +        const int16_t* coefsP1, +        const int16_t* coefsN1, +        const int16_t* sP, +        const int16_t* sN, +        uint32_t lerpP, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 2; // template specialization does not preserve params +    const int STRIDE = 16; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase +        "veor           q0, q0, q0               \n"// (1) acc_L = 0 +        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0 + +        "1:                                      \n" + +        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples +        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples +        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs +        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation +        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs +        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation + +        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs +        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coets + +        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs +        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs + +        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive +        "vrev64.16      q3, q3                   \n"// (1) reverse 8 frames of the right positive + +        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set +        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set + +        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left +        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left +        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right +        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right +        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left +        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left +        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right +        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right + +        // moving these ARM before neon seems to be slower +        "subs           %[count], %[count], #8   \n"// (1) update loop counter +        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples + +        // sP used after branch (warning) +        "bne            1b                       \n"// loop + +        ASSEMBLY_ACCUMULATE_STEREO + +        : [out] "=Uv" (out[0]), +          [count] "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [coefsP1] "+r" (coefsP1), +          [coefsN1] "+r" (coefsN1), +          [sP] "+r" (sP), +          [sN] "+r" (sN) +        : [lerpP]   "r" (lerpP), +          [vLR] "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q4", "q5", "q6", +          "q8", "q9", "q10", "q11" +    ); +} + +template <> +inline void ProcessL<1, 16>(int32_t* const out, +        int count, +        const int32_t* coefsP, +        const int32_t* coefsN, +        const int16_t* sP, +        const int16_t* sN, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 1; // template specialization does not preserve params +    const int STRIDE = 16; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "veor           q0, q0, q0                    \n"// result, initialize to 0 + +        "1:                                           \n" + +        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples +        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples +        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs +        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs + +        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side + +        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits +        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits + +        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits +        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits + +        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef + +        "vadd.s32       q0, q0, q12                   \n"// accumulate result +        "vadd.s32       q13, q13, q14                 \n"// accumulate result +        "vadd.s32       q0, q0, q15                   \n"// accumulate result +        "vadd.s32       q0, q0, q13                   \n"// accumulate result + +        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples +        "subs           %[count], %[count], #8        \n"// update loop counter + +        "bne            1b                            \n"// loop + +        ASSEMBLY_ACCUMULATE_MONO + +        : [out]     "=Uv" (out[0]), +          [count]   "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [sP]      "+r" (sP), +          [sN]      "+r" (sN) +        : [vLR]     "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q8", "q9", "q10", "q11", +          "q12", "q13", "q14", "q15" +    ); +} + +template <> +inline void ProcessL<2, 16>(int32_t* const out, +        int count, +        const int32_t* coefsP, +        const int32_t* coefsN, +        const int16_t* sP, +        const int16_t* sN, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 2; // template specialization does not preserve params +    const int STRIDE = 16; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "veor           q0, q0, q0                    \n"// result, initialize to 0 +        "veor           q4, q4, q4                    \n"// result, initialize to 0 + +        "1:                                           \n" + +        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 4 16-bits stereo samples +        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 4 16-bits stereo samples +        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 4 32-bits coefs +        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs + +        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side +        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side + +        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits +        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits + +        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits +        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits + +        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef + +        "vadd.s32       q0, q0, q12                   \n"// accumulate result +        "vadd.s32       q13, q13, q14                 \n"// accumulate result +        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result +        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result + +        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits +        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits + +        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits +        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits + +        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef + +        "vadd.s32       q4, q4, q12                   \n"// accumulate result +        "vadd.s32       q13, q13, q14                 \n"// accumulate result +        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result +        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result + +        "subs           %[count], %[count], #8        \n"// update loop counter +        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples + +        "bne            1b                            \n"// loop + +        ASSEMBLY_ACCUMULATE_STEREO + +        : [out]     "=Uv" (out[0]), +          [count]   "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [sP]      "+r" (sP), +          [sN]      "+r" (sN) +        : [vLR]     "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q4", "q5", "q6", +          "q8", "q9", "q10", "q11", +          "q12", "q13", "q14", "q15" +    ); +} + +template <> +inline void Process<1, 16>(int32_t* const out, +        int count, +        const int32_t* coefsP, +        const int32_t* coefsN, +        const int32_t* coefsP1, +        const int32_t* coefsN1, +        const int16_t* sP, +        const int16_t* sN, +        uint32_t lerpP, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 1; // template specialization does not preserve params +    const int STRIDE = 16; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase +        "veor           q0, q0, q0                    \n"// result, initialize to 0 + +        "1:                                           \n" + +        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples +        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples +        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs +        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs +        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs +        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs + +        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1) +        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1) +        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1) +        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1) + +        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2) +        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2) +        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2) +        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2) + +        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3) +        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3) +        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3) +        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3) + +        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side + +        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits +        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits + +        "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits +        "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits + +        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef + +        "vadd.s32       q0, q0, q12                   \n"// accumulate result +        "vadd.s32       q13, q13, q14                 \n"// accumulate result +        "vadd.s32       q0, q0, q15                   \n"// accumulate result +        "vadd.s32       q0, q0, q13                   \n"// accumulate result + +        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples +        "subs           %[count], %[count], #8        \n"// update loop counter + +        "bne            1b                            \n"// loop + +        ASSEMBLY_ACCUMULATE_MONO + +        : [out]     "=Uv" (out[0]), +          [count]   "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [coefsP1] "+r" (coefsP1), +          [coefsN1] "+r" (coefsN1), +          [sP]      "+r" (sP), +          [sN]      "+r" (sN) +        : [lerpP]   "r" (lerpP), +          [vLR]     "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q8", "q9", "q10", "q11", +          "q12", "q13", "q14", "q15" +    ); +} + +template <> +inline void Process<2, 16>(int32_t* const out, +        int count, +        const int32_t* coefsP, +        const int32_t* coefsN, +        const int32_t* coefsP1, +        const int32_t* coefsN1, +        const int16_t* sP, +        const int16_t* sN, +        uint32_t lerpP, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 2; // template specialization does not preserve params +    const int STRIDE = 16; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase +        "veor           q0, q0, q0                    \n"// result, initialize to 0 +        "veor           q4, q4, q4                    \n"// result, initialize to 0 + +        "1:                                           \n" + +        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 4 16-bits stereo samples +        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 4 16-bits stereo samples +        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs +        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs +        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs +        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs + +        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1) +        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1) +        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1) +        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1) + +        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2) +        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2) +        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2) +        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2) + +        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3) +        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3) +        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3) +        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3) + +        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side +        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side + +        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits +        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits + +        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits +        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits + +        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef + +        "vadd.s32       q0, q0, q12                   \n"// accumulate result +        "vadd.s32       q13, q13, q14                 \n"// accumulate result +        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result +        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result + +        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits +        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits + +        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits +        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits + +        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef + +        "vadd.s32       q4, q4, q12                   \n"// accumulate result +        "vadd.s32       q13, q13, q14                 \n"// accumulate result +        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result +        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result + +        "subs           %[count], %[count], #8        \n"// update loop counter +        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples + +        "bne            1b                            \n"// loop + +        ASSEMBLY_ACCUMULATE_STEREO + +        : [out]     "=Uv" (out[0]), +          [count]   "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [coefsP1] "+r" (coefsP1), +          [coefsN1] "+r" (coefsN1), +          [sP]      "+r" (sP), +          [sN]      "+r" (sN) +        : [lerpP]   "r" (lerpP), +          [vLR]     "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q4", "q5", "q6", +          "q8", "q9", "q10", "q11", +          "q12", "q13", "q14", "q15" +    ); +} + +template <> +inline void ProcessL<1, 8>(int32_t* const out, +        int count, +        const int16_t* coefsP, +        const int16_t* coefsN, +        const int16_t* sP, +        const int16_t* sN, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 1; // template specialization does not preserve params +    const int STRIDE = 8; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0 + +        "1:                                      \n" + +        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples +        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples +        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs +        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs + +        "vrev64.16      d4, d4                   \n"// (1) reversed s3, s2, s1, s0, s7, s6, s5, s4 + +        // reordering the vmal to do d6, d7 before d4, d5 is slower(?) +        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed)samples by coef +        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples + +        // moving these ARM instructions before neon above seems to be slower +        "subs           %[count], %[count], #4   \n"// (1) update loop counter +        "sub            %[sP], %[sP], #8         \n"// (0) move pointer to next set of samples + +        // sP used after branch (warning) +        "bne            1b                       \n"// loop + +        ASSEMBLY_ACCUMULATE_MONO + +        : [out]     "=Uv" (out[0]), +          [count]   "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [sP]      "+r" (sP), +          [sN]      "+r" (sN) +        : [vLR]     "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q8", "q10" +    ); +} + +template <> +inline void ProcessL<2, 8>(int32_t* const out, +        int count, +        const int16_t* coefsP, +        const int16_t* coefsN, +        const int16_t* sP, +        const int16_t* sN, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 2; // template specialization does not preserve params +    const int STRIDE = 8; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "veor           q0, q0, q0               \n"// (1) acc_L = 0 +        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0 + +        "1:                                      \n" + +        "vld2.16        {d4, d5}, [%[sP]]        \n"// (2+0d) load 8 16-bits stereo samples +        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (2) load 8 16-bits stereo samples +        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 8 16-bits coefs +        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 8 16-bits coefs + +        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive + +        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left +        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right +        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left +        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right + +        // moving these ARM before neon seems to be slower +        "subs           %[count], %[count], #4   \n"// (1) update loop counter +        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples + +        // sP used after branch (warning) +        "bne            1b                       \n"// loop + +        ASSEMBLY_ACCUMULATE_STEREO + +        : [out] "=Uv" (out[0]), +          [count] "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [sP] "+r" (sP), +          [sN] "+r" (sN) +        : [vLR] "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q4", "q5", "q6", +          "q8", "q10" +     ); +} + +template <> +inline void Process<1, 8>(int32_t* const out, +        int count, +        const int16_t* coefsP, +        const int16_t* coefsN, +        const int16_t* coefsP1, +        const int16_t* coefsN1, +        const int16_t* sP, +        const int16_t* sN, +        uint32_t lerpP, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 1; // template specialization does not preserve params +    const int STRIDE = 8; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15 +        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0 + +        "1:                                      \n" + +        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples +        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples +        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs +        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation +        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs +        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation + +        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs +        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coets + +        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs +        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs + +        "vrev64.16      d4, d4                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 + +        "vadd.s16       d16, d16, d17            \n"// (1+2d) interpolate (step3) 1st set +        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set + +        // reordering the vmal to do d6, d7 before d4, d5 is slower(?) +        "vmlal.s16      q0, d4, d16              \n"// (1+0d) multiply (reversed)by coef +        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples + +        // moving these ARM instructions before neon above seems to be slower +        "subs           %[count], %[count], #4   \n"// (1) update loop counter +        "sub            %[sP], %[sP], #8        \n"// move pointer to next set of samples + +        // sP used after branch (warning) +        "bne            1b                       \n"// loop + +        ASSEMBLY_ACCUMULATE_MONO + +        : [out]     "=Uv" (out[0]), +          [count]   "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [coefsP1] "+r" (coefsP1), +          [coefsN1] "+r" (coefsN1), +          [sP]      "+r" (sP), +          [sN]      "+r" (sN) +        : [lerpP]   "r" (lerpP), +          [vLR]     "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q8", "q9", "q10", "q11" +    ); +} + +template <> +inline void Process<2, 8>(int32_t* const out, +        int count, +        const int16_t* coefsP, +        const int16_t* coefsN, +        const int16_t* coefsP1, +        const int16_t* coefsN1, +        const int16_t* sP, +        const int16_t* sN, +        uint32_t lerpP, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 2; // template specialization does not preserve params +    const int STRIDE = 8; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase +        "veor           q0, q0, q0               \n"// (1) acc_L = 0 +        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0 + +        "1:                                      \n" + +        "vld2.16        {d4, d5}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples +        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples +        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 8 16-bits coefs +        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 8 16-bits coefs for interpolation +        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 8 16-bits coefs +        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 8 16-bits coefs for interpolation + +        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs +        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coets + +        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs +        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs + +        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive + +        "vadd.s16       d16, d16, d17            \n"// (1+1d) interpolate (step3) 1st set +        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set + +        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left +        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right +        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left +        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right + +        // moving these ARM before neon seems to be slower +        "subs           %[count], %[count], #4   \n"// (1) update loop counter +        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples + +        // sP used after branch (warning) +        "bne            1b                       \n"// loop + +        ASSEMBLY_ACCUMULATE_STEREO + +        : [out] "=Uv" (out[0]), +          [count] "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [coefsP1] "+r" (coefsP1), +          [coefsN1] "+r" (coefsN1), +          [sP] "+r" (sP), +          [sN] "+r" (sN) +        : [lerpP]   "r" (lerpP), +          [vLR] "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q4", "q5", "q6", +          "q8", "q9", "q10", "q11" +    ); +} + +template <> +inline void ProcessL<1, 8>(int32_t* const out, +        int count, +        const int32_t* coefsP, +        const int32_t* coefsN, +        const int16_t* sP, +        const int16_t* sN, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 1; // template specialization does not preserve params +    const int STRIDE = 8; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "veor           q0, q0, q0               \n"// result, initialize to 0 + +        "1:                                      \n" + +        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples +        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples +        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs +        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs + +        "vrev64.16      d4, d4                   \n"// reverse 2 frames of the positive side + +        "vshll.s16      q12, d4, #15             \n"// (stall) extend samples to 31 bits +        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits + +        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef + +        "vadd.s32       q0, q0, q12              \n"// accumulate result +        "vadd.s32       q0, q0, q14              \n"// (stall) accumulate result + +        "subs           %[count], %[count], #4   \n"// update loop counter +        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples + +        "bne            1b                       \n"// loop + +        ASSEMBLY_ACCUMULATE_MONO + +        : [out] "=Uv" (out[0]), +          [count] "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [sP] "+r" (sP), +          [sN] "+r" (sN) +        : [vLR] "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q8", "q9", "q10", "q11", +          "q12", "q14" +    ); +} + +template <> +inline void ProcessL<2, 8>(int32_t* const out, +        int count, +        const int32_t* coefsP, +        const int32_t* coefsN, +        const int16_t* sP, +        const int16_t* sN, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 2; // template specialization does not preserve params +    const int STRIDE = 8; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "veor           q0, q0, q0               \n"// result, initialize to 0 +        "veor           q4, q4, q4               \n"// result, initialize to 0 + +        "1:                                      \n" + +        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples +        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples +        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs +        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs + +        "vrev64.16      q2, q2                   \n"// reverse 2 frames of the positive side + +        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits +        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits + +        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits +        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits + +        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef +        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by coef +        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef +        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by coef + +        "vadd.s32       q0, q0, q12              \n"// accumulate result +        "vadd.s32       q4, q4, q13              \n"// accumulate result +        "vadd.s32       q0, q0, q14              \n"// accumulate result +        "vadd.s32       q4, q4, q15              \n"// accumulate result + +        "subs           %[count], %[count], #4   \n"// update loop counter +        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples + +        "bne            1b                       \n"// loop + +        ASSEMBLY_ACCUMULATE_STEREO + +        : [out]     "=Uv" (out[0]), +          [count]   "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsN0] "+r" (coefsN), +          [sP]      "+r" (sP), +          [sN]      "+r" (sN) +        : [vLR]     "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", "q4", +          "q8", "q9", "q10", "q11", +          "q12", "q13", "q14", "q15" +    ); +} + +template <> +inline void Process<1, 8>(int32_t* const out, +        int count, +        const int32_t* coefsP, +        const int32_t* coefsN, +        const int32_t* coefsP1, +        const int32_t* coefsN1, +        const int16_t* sP, +        const int16_t* sN, +        uint32_t lerpP, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 1; // template specialization does not preserve params +    const int STRIDE = 8; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase +        "veor           q0, q0, q0               \n"// result, initialize to 0 + +        "1:                                      \n" + +        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples +        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples +        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs +        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation +        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs +        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation + +        "vrev64.16      d4, d4                   \n"// reverse 2 frames of the positive side + +        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs +        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coets +        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits + +        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs +        "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set of coefs +        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits + +        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set +        "vadd.s32       q10, q10, q11            \n"// interpolate (step4) 2nd set + +        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef + +        "vadd.s32       q0, q0, q12              \n"// accumulate result +        "vadd.s32       q0, q0, q14              \n"// accumulate result + +        "subs           %[count], %[count], #4   \n"// update loop counter +        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples + +        "bne            1b                       \n"// loop + +        ASSEMBLY_ACCUMULATE_MONO + +        : [out]     "=Uv" (out[0]), +          [count]   "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsP1] "+r" (coefsP1), +          [coefsN0] "+r" (coefsN), +          [coefsN1] "+r" (coefsN1), +          [sP]      "+r" (sP), +          [sN]      "+r" (sN) +        : [lerpP]   "r" (lerpP), +          [vLR]     "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", +          "q8", "q9", "q10", "q11", +          "q12", "q14" +    ); +} + +template <> +inline +void Process<2, 8>(int32_t* const out, +        int count, +        const int32_t* coefsP, +        const int32_t* coefsN, +        const int32_t* coefsP1, +        const int32_t* coefsN1, +        const int16_t* sP, +        const int16_t* sN, +        uint32_t lerpP, +        const int32_t* const volumeLR) +{ +    const int CHANNELS = 2; // template specialization does not preserve params +    const int STRIDE = 8; +    sP -= CHANNELS*((STRIDE>>1)-1); +    asm ( +        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase +        "veor           q0, q0, q0               \n"// result, initialize to 0 +        "veor           q4, q4, q4               \n"// result, initialize to 0 + +        "1:                                      \n" +        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples +        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples +        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs +        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation +        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs +        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation + +        "vrev64.16      q2, q2                   \n"// (reversed) 2 frames of the positive side + +        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs +        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coets +        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits +        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits + +        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs +        "vqrdmulh.s32   q11, q11, d2[1]          \n"// interpolate (step3) 2nd set of coefs +        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits +        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits + +        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set +        "vadd.s32       q10, q10, q11            \n"// interpolate (step4) 2nd set + +        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef +        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by interpolated coef + +        "vadd.s32       q0, q0, q12              \n"// accumulate result +        "vadd.s32       q4, q4, q13              \n"// accumulate result +        "vadd.s32       q0, q0, q14              \n"// accumulate result +        "vadd.s32       q4, q4, q15              \n"// accumulate result + +        "subs           %[count], %[count], #4   \n"// update loop counter +        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples + +        "bne            1b                       \n"// loop + +        ASSEMBLY_ACCUMULATE_STEREO + +        : [out]     "=Uv" (out[0]), +          [count]   "+r" (count), +          [coefsP0] "+r" (coefsP), +          [coefsP1] "+r" (coefsP1), +          [coefsN0] "+r" (coefsN), +          [coefsN1] "+r" (coefsN1), +          [sP]      "+r" (sP), +          [sN]      "+r" (sN) +        : [lerpP]   "r" (lerpP), +          [vLR]     "r" (volumeLR) +        : "cc", "memory", +          "q0", "q1", "q2", "q3", "q4", +          "q8", "q9", "q10", "q11", +          "q12", "q13", "q14", "q15" +    ); +} + +#endif //USE_NEON + +}; // namespace android + +#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/ diff --git a/services/audioflinger/test-resample.cpp b/services/audioflinger/test-resample.cpp index 403bb6d..3f2ce55 100644 --- a/services/audioflinger/test-resample.cpp +++ b/services/audioflinger/test-resample.cpp @@ -33,8 +33,9 @@ using namespace android;  bool gVerbose = false;  static int usage(const char* name) { -    fprintf(stderr,"Usage: %s [-p] [-h] [-v] [-s] [-q {dq|lq|mq|hq|vhq}] [-i input-sample-rate] " -                   "[-o output-sample-rate] [<input-file>] <output-file>\n", name); +    fprintf(stderr,"Usage: %s [-p] [-h] [-v] [-s] [-q {dq|lq|mq|hq|vhq|dlq|dmq|dhq}]" +                   " [-i input-sample-rate] [-o output-sample-rate] [<input-file>]" +                   " <output-file>\n", name);      fprintf(stderr,"    -p    enable profiling\n");      fprintf(stderr,"    -h    create wav file\n");      fprintf(stderr,"    -v    verbose : log buffer provider calls\n"); @@ -45,6 +46,9 @@ static int usage(const char* name) {      fprintf(stderr,"              mq  : medium quality\n");      fprintf(stderr,"              hq  : high quality\n");      fprintf(stderr,"              vhq : very high quality\n"); +    fprintf(stderr,"              dlq : dynamic low quality\n"); +    fprintf(stderr,"              dmq : dynamic medium quality\n"); +    fprintf(stderr,"              dhq : dynamic high quality\n");      fprintf(stderr,"    -i    input file sample rate (ignored if input file is specified)\n");      fprintf(stderr,"    -o    output file sample rate\n");      return -1; @@ -86,6 +90,12 @@ int main(int argc, char* argv[]) {                  quality = AudioResampler::HIGH_QUALITY;              else if (!strcmp(optarg, "vhq"))                  quality = AudioResampler::VERY_HIGH_QUALITY; +            else if (!strcmp(optarg, "dlq")) +                quality = AudioResampler::DYN_LOW_QUALITY; +            else if (!strcmp(optarg, "dmq")) +                quality = AudioResampler::DYN_MED_QUALITY; +            else if (!strcmp(optarg, "dhq")) +                quality = AudioResampler::DYN_HIGH_QUALITY;              else {                  usage(progname);                  return -1; @@ -137,6 +147,8 @@ int main(int argc, char* argv[]) {          channels = info.channels;          input_freq = info.samplerate;      } else { +        // data for testing is exactly (input sampling rate/1000)/2 seconds +        // so 44.1khz input is 22.05 seconds          double k = 1000; // Hz / s          double time = (input_freq / 2) / k;          size_t input_frames = size_t(input_freq * time); @@ -148,7 +160,7 @@ int main(int argc, char* argv[]) {              double y = sin(M_PI * k * t * t);              int16_t yi = floor(y * 32767.0 + 0.5);              for (size_t j=0 ; j<(size_t)channels ; j++) { -                in[i*channels + j] = yi / (1+j); +                in[i*channels + j] = yi / (1+j); // right ch. 1/2 left ch.              }          }      } @@ -170,6 +182,7 @@ int main(int argc, char* argv[]) {          }          virtual status_t getNextBuffer(Buffer* buffer,                  int64_t pts = kInvalidPTS) { +            (void)pts; // suppress warning              size_t requestedFrames = buffer->frameCount;              if (requestedFrames > mNumFrames - mNextFrame) {                  buffer->frameCount = mNumFrames - mNextFrame; @@ -205,6 +218,9 @@ int main(int argc, char* argv[]) {              buffer->frameCount = 0;              buffer->i16 = NULL;          } +        void reset() { +            mNextFrame = 0; +        }      } provider(input_vaddr, input_size, channels);      size_t input_frames = input_size / (channels * sizeof(int16_t)); @@ -215,37 +231,29 @@ int main(int argc, char* argv[]) {      output_size &= ~7; // always stereo, 32-bits      void* output_vaddr = malloc(output_size); +    AudioResampler* resampler = AudioResampler::create(16, channels, +            output_freq, quality); +    size_t out_frames = output_size/8; +    resampler->setSampleRate(input_freq); +    resampler->setVolume(0x1000, 0x1000);      if (profiling) { -        AudioResampler* resampler = AudioResampler::create(16, channels, -                output_freq, quality); - -        size_t out_frames = output_size/8; -        resampler->setSampleRate(input_freq); -        resampler->setVolume(0x1000, 0x1000); - -        memset(output_vaddr, 0, output_size); +        const int looplimit = 100;          timespec start, end;          clock_gettime(CLOCK_MONOTONIC, &start); -        resampler->resample((int*) output_vaddr, out_frames, &provider); -        resampler->resample((int*) output_vaddr, out_frames, &provider); -        resampler->resample((int*) output_vaddr, out_frames, &provider); -        resampler->resample((int*) output_vaddr, out_frames, &provider); +        for (int i = 0; i < looplimit; ++i) { +            resampler->resample((int*) output_vaddr, out_frames, &provider); +            provider.reset(); //  reset only provider as benchmarking +        }          clock_gettime(CLOCK_MONOTONIC, &end);          int64_t start_ns = start.tv_sec * 1000000000LL + start.tv_nsec;          int64_t end_ns = end.tv_sec * 1000000000LL + end.tv_nsec; -        int64_t time = (end_ns - start_ns)/4; -        printf("%f Mspl/s\n", out_frames/(time/1e9)/1e6); - -        delete resampler; +        int64_t time = end_ns - start_ns; +        printf("time(ns):%lld  channels:%d  quality:%d\n", time, channels, quality); +        printf("%f Mspl/s\n", out_frames * looplimit / (time / 1e9) / 1e6); +        resampler->reset();      } -    AudioResampler* resampler = AudioResampler::create(16, channels, -            output_freq, quality); -    size_t out_frames = output_size/8; -    resampler->setSampleRate(input_freq); -    resampler->setVolume(0x1000, 0x1000); -      memset(output_vaddr, 0, output_size);      if (gVerbose) {          printf("resample() %u output frames\n", out_frames); @@ -259,14 +267,18 @@ int main(int argc, char* argv[]) {          printf("reset() complete\n");      } -    // down-mix (we just truncate and keep the left channel) +    // mono takes left channel only +    // stereo right channel is half amplitude of stereo left channel (due to input creation)      int32_t* out = (int32_t*) output_vaddr;      int16_t* convert = (int16_t*) malloc(out_frames * channels * sizeof(int16_t)); +      for (size_t i = 0; i < out_frames; i++) { -        for (int j=0 ; j<channels ; j++) { +        for (int j = 0; j < channels; j++) {              int32_t s = out[i * 2 + j] >> 12; -            if (s > 32767)       s =  32767; -            else if (s < -32768) s = -32768; +            if (s > 32767) +                s = 32767; +            else if (s < -32768) +                s = -32768;              convert[i * channels + j] = int16_t(s);          }      } diff --git a/tools/resampler_tools/fir.cpp b/tools/resampler_tools/fir.cpp index cc3d509..27a9b05 100644 --- a/tools/resampler_tools/fir.cpp +++ b/tools/resampler_tools/fir.cpp @@ -20,15 +20,25 @@  #include <stdlib.h>  #include <string.h> -static double sinc(double x) { +static inline double sinc(double x) {      if (fabs(x) == 0.0f) return 1.0f;      return sin(x) / x;  } -static double sqr(double x) { +static inline double sqr(double x) {      return x*x;  } +static inline int64_t toint(double x, int64_t maxval) { +    int64_t v; + +    v = static_cast<int64_t>(floor(y * maxval + 0.5)); +    if (v >= maxval) { +        return maxval - 1; // error! +    } +    return v; +} +  static double I0(double x) {      // from the Numerical Recipes in C p. 237      double ax,ans,y; @@ -54,11 +64,12 @@ static double kaiser(int k, int N, double beta) {      return I0(beta * sqrt(1.0 - sqr((2.0*k)/N - 1.0))) / I0(beta);  } -  static void usage(char* name) {      fprintf(stderr, -            "usage: %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings] [-f {float|fixed}] [-b beta] [-v dBFS] [-l lerp]\n" -            "       %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings] [-f {float|fixed}] [-b beta] [-v dBFS] -p M/N\n" +            "usage: %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings]" +            " [-f {float|fixed|fixed16}] [-b beta] [-v dBFS] [-l lerp]\n" +            "       %s [-h] [-d] [-s sample_rate] [-c cut-off_frequency] [-n half_zero_crossings]" +            " [-f {float|fixed|fixed16}] [-b beta] [-v dBFS] -p M/N\n"              "    -h    this help message\n"              "    -d    debug, print comma-separated coefficient table\n"              "    -p    generate poly-phase filter coefficients, with sample increment M/N\n" @@ -66,6 +77,7 @@ static void usage(char* name) {              "    -c    cut-off frequency (20478)\n"              "    -n    number of zero-crossings on one side (8)\n"              "    -l    number of lerping bits (4)\n" +            "    -m    number of polyphases (related to -l, default 16)\n"              "    -f    output format, can be fixed-point or floating-point (fixed)\n"              "    -b    kaiser window parameter beta (7.865 [-80dB])\n"              "    -v    attenuation in dBFS (0)\n", @@ -77,8 +89,7 @@ static void usage(char* name) {  int main(int argc, char** argv)  {      // nc is the number of bits to store the coefficients -    const int nc = 32; - +    int nc = 32;      bool polyphase = false;      unsigned int polyM = 160;      unsigned int polyN = 147; @@ -88,7 +99,6 @@ int main(int argc, char** argv)      double atten = 1;      int format = 0; -      // in order to keep the errors associated with the linear      // interpolation of the coefficients below the quantization error      // we must satisfy: @@ -104,7 +114,6 @@ int main(int argc, char** argv)      // Smith, J.O. Digital Audio Resampling Home Page      // https://ccrma.stanford.edu/~jos/resample/, 2011-03-29      // -    int nz = 4;      //         | 0.1102*(A - 8.7)                         A > 50      //  beta = | 0.5842*(A - 21)^0.4 + 0.07886*(A - 21)   21 <= A <= 50 @@ -123,7 +132,6 @@ int main(int argc, char** argv)      //   100 dB   10.056      double beta = 7.865; -      // 2*nzc = (A - 8) / (2.285 * dw)      //      with dw the transition width = 2*pi*dF/Fs      // @@ -148,8 +156,9 @@ int main(int argc, char** argv)      // nzc  = 20      // +    int M = 1 << 4; // number of phases for interpolation      int ch; -    while ((ch = getopt(argc, argv, ":hds:c:n:f:l:b:p:v:")) != -1) { +    while ((ch = getopt(argc, argv, ":hds:c:n:f:l:m:b:p:v:z:")) != -1) {          switch (ch) {              case 'd':                  debug = true; @@ -169,13 +178,26 @@ int main(int argc, char** argv)              case 'n':                  nzc = atoi(optarg);                  break; +            case 'm': +                M = atoi(optarg); +                break;              case 'l': -                nz = atoi(optarg); +                M = 1 << atoi(optarg);                  break;              case 'f': -                if (!strcmp(optarg,"fixed")) format = 0; -                else if (!strcmp(optarg,"float")) format = 1; -                else usage(argv[0]); +                if (!strcmp(optarg, "fixed")) { +                    format = 0; +                } +                else if (!strcmp(optarg, "fixed16")) { +                    format = 0; +                    nc = 16; +                } +                else if (!strcmp(optarg, "float")) { +                    format = 1; +                } +                else { +                    usage(argv[0]); +                }                  break;              case 'b':                  beta = atof(optarg); @@ -193,11 +215,14 @@ int main(int argc, char** argv)      // cut off frequency ratio Fc/Fs      double Fcr = Fc / Fs; -      // total number of coefficients (one side) -    const int M = (1 << nz); +      const int N = M * nzc; +    // lerp (which is most useful if M is a power of 2) + +    int nz = 0; // recalculate nz as the bits needed to represent M +    for (int i = M-1 ; i; i>>=1, nz++);      // generate the right half of the filter      if (!debug) {          printf("// cmd-line: "); @@ -207,7 +232,7 @@ int main(int argc, char** argv)          printf("\n");          if (!polyphase) {              printf("const int32_t RESAMPLE_FIR_SIZE           = %d;\n", N); -            printf("const int32_t RESAMPLE_FIR_LERP_INT_BITS  = %d;\n", nz); +            printf("const int32_t RESAMPLE_FIR_INT_PHASES     = %d;\n", M);              printf("const int32_t RESAMPLE_FIR_NUM_COEF       = %d;\n", nzc);          } else {              printf("const int32_t RESAMPLE_FIR_SIZE           = %d;\n", 2*nzc*polyN); @@ -224,7 +249,7 @@ int main(int argc, char** argv)          for (int i=0 ; i<=M ; i++) { // an extra set of coefs for interpolation              for (int j=0 ; j<nzc ; j++) {                  int ix = j*M + i; -                double x = (2.0 * M_PI * ix * Fcr) / (1 << nz); +                double x = (2.0 * M_PI * ix * Fcr) / M;                  double y = kaiser(ix+N, 2*N, beta) * sinc(x) * 2.0 * Fcr;                  y *= atten; @@ -232,11 +257,13 @@ int main(int argc, char** argv)                      if (j == 0)                          printf("\n    ");                  } -                  if (!format) { -                    int64_t yi = floor(y * ((1ULL<<(nc-1))) + 0.5); -                    if (yi >= (1LL<<(nc-1))) yi = (1LL<<(nc-1))-1; -                    printf("0x%08x, ", int32_t(yi)); +                    int64_t yi = toint(y, 1ULL<<(nc-1)); +                    if (nc > 16) { +                        printf("0x%08x, ", int32_t(yi)); +                    } else { +                        printf("0x%04x, ", int32_t(yi)&0xffff); +                    }                  } else {                      printf("%.9g%s ", y, debug ? "," : "f,");                  } @@ -254,9 +281,12 @@ int main(int argc, char** argv)                  double y = kaiser(i+N, 2*N, beta) * sinc(x) * 2.0 * Fcr;;                  y *= atten;                  if (!format) { -                    int64_t yi = floor(y * ((1ULL<<(nc-1))) + 0.5); -                    if (yi >= (1LL<<(nc-1))) yi = (1LL<<(nc-1))-1; -                    printf("0x%08x", int32_t(yi)); +                    int64_t yi = toint(y, 1ULL<<(nc-1)); +                    if (nc > 16) { +                        printf("0x%08x, ", int32_t(yi)); +                    } else { +                        printf("0x%04x, ", int32_t(yi)&0xffff); +                    }                  } else {                      printf("%.9g%s", y, debug ? "" : "f");                  } @@ -277,5 +307,3 @@ int main(int argc, char** argv)  }  // http://www.csee.umbc.edu/help/sound/AFsp-V2R1/html/audio/ResampAudio.html - -  | 
