Modify libatrac to use fixed-point arithmetic.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@22298 a1c6a512-1295-4272-9138-f99709370657
2009-08-13 20:38:59 +00:00 · 2009-08-13 20:38:59 +00:00 · 432e2ecc13
commit 432e2ecc13
parent c956059ec5
10 changed files with 324 additions and 5756 deletions
--- a/apps/codecs/libatrac/Makefile.test
+++ b/apps/codecs/libatrac/Makefile.test
@ -1,8 +1,8 @@
-CFLAGS = -Wall -O3 -DTEST -D"DEBUGF=printf"
+CFLAGS = -Wall -O3 -DTEST -D"DEBUGF=printf" -D"ROCKBOX_LITTLE_ENDIAN=1" -D"ICONST_ATTR=" -D"ICODE_ATTR="
-OBJS = atrac3.o dsputil.o bitstream.o fft.o mdct.o libavutil/log.o libavutil/mem.o ../librm/rm.o
+OBJS = atrac3.o bitstream.o libavutil/log.o libavutil/mem.o ../librm/rm.o fixp_math.o ../lib/mdct2.o ../lib/mdct_lookup.o
 atractest: $(OBJS)
-	gcc -o atractest $(OBJS) -lm
+	gcc -o atractest $(OBJS)
 .c.o :
 	$(CC) $(CFLAGS) -c -o $@ $<
--- a/apps/codecs/libatrac/README.rockbox
+++ b/apps/codecs/libatrac/README.rockbox
@ -8,13 +8,19 @@ ffmpeg is licensed under the Lesser GNU General Public License.
 IMPORT DETAILS
-The decoder is based on ffmpeg-svn r18079. It still uses floating 
+The decoder is based on ffmpeg-svn r18079.
-point math and not suitable to be used in rockbox.
+The decoder had been modified to use fixed-point arithmetic.
 TESTING
 The test program should compile in any Unix-like environment using the
 command "make -f Makefile.test".
 For ARM targets add -DCPU_ARM to CFLAGS in Makefile.test to make use of 
 the asm ARM optimisations in rockbox's mdct library.
 For Big-endian targets, change -D"ROCKBOX_LITTLE_ENDIAN=1" 
 to -D"ROCKBOX_BIG_ENDIAN=1" in Makefile.test.
 Running "./atractest file.rm" will decode the audio data to a WAV file
 called "output.wav" in the current directory.
--- a/apps/codecs/libatrac/atrac3.c
+++ b/apps/codecs/libatrac/atrac3.c
@ -38,7 +38,6 @@
 #include "avcodec.h"
 #include "bitstream.h"
 #include "dsputil.h"
 #include "bytestream.h"
 #include <stdint.h>
@ -50,6 +49,10 @@
 #include "../librm/rm.h"
 #include "atrac3data.h"
 #include "atrac3data_fixed.h"
 #include "fixp_math.h"
 //#include "fixp_mdct.h"
 #include "../lib/mdct2.h"
 #define JOINT_STEREO    0x12
 #define STEREO          0x2
@ -70,23 +73,23 @@ typedef struct {
 typedef struct {
    int     pos;
    int     numCoefs;
-    float   coef[8];
+    int32_t coef[8];
 } tonal_component;
 typedef struct {
    int               bandsCoded;
    int               numComponents;
    tonal_component   components[64];
-    float             prevFrame[1024];
+    int32_t           prevFrame[1024];
    int               gcBlkSwitch;
    gain_block        gainBlock[2];
-    DECLARE_ALIGNED_16(float, spectrum[1024]);
+    int32_t           spectrum[1024] __attribute__((aligned(16)));
-    DECLARE_ALIGNED_16(float, IMDCT_buf[1024]);
+    int32_t           IMDCT_buf[1024] __attribute__((aligned(16)));
-    float             delayBuf1[46]; ///<qmf delay buffers
+    int32_t           delayBuf1[46]; ///<qmf delay buffers
-    float             delayBuf2[46];
+    int32_t           delayBuf2[46];
-    float             delayBuf3[46];
+    int32_t           delayBuf3[46];
 } channel_unit;
 typedef struct {
@ -114,9 +117,9 @@ typedef struct {
    //@}
    //@{
    /** data buffers */
-    float               outSamples[2048];
+    int32_t             outSamples[2048];
    uint8_t*            decoded_bytes_buffer;
-    float               tempBuf[1070];
+    int32_t             tempBuf[1070];
    //@}
    //@{
    /** extradata */
@ -127,17 +130,8 @@ typedef struct {
    //@}
 } ATRAC3Context;
-static DECLARE_ALIGNED_16(float,mdct_window[512]);
+static int32_t          qmf_window[48];
 static float            qmf_window[48];
 static VLC              spectral_coeff_tab[7];
 static float            SFTable[64];
 static float            gain_tab1[16];
 static float            gain_tab2[31];
 static MDCTContext      mdct_ctx;
 static DSPContext       dsp;
 /* quadrature mirror synthesis filter */
 /**
 * Quadrature mirror synthesis filter.
@ -149,14 +143,12 @@ static DSPContext       dsp;
 * @param delayBuf  delayBuf buffer
 * @param temp      temp buffer
 */
-
+static void iqmf (int32_t *inlo, int32_t *inhi, unsigned int nIn, int32_t *pOut, int32_t *delayBuf, int32_t *temp)
 static void iqmf (float *inlo, float *inhi, unsigned int nIn, float *pOut, float *delayBuf, float *temp)
 {
    int   i, j;
-    float   *p1, *p3;
+    int32_t   *p1, *p3;
-    memcpy(temp, delayBuf, 46*sizeof(float));
+    memcpy(temp, delayBuf, 46*sizeof(int32_t));
    p3 = temp + 46;
@ -171,12 +163,12 @@ static void iqmf (float *inlo, float *inhi, unsigned int nIn, float *pOut, float
    /* loop2 */
    p1 = temp;
    for (j = nIn; j != 0; j--) {
-        float s1 = 0.0;
+        int32_t s1 = 0;
-        float s2 = 0.0;
+        int32_t s2 = 0;
        for (i = 0; i < 48; i += 2) {
-            s1 += p1[i] * qmf_window[i];
+            s1 += fixmul31(p1[i], qmf_window[i]);
-            s2 += p1[i+1] * qmf_window[i+1];
+            s2 += fixmul31(p1[i+1], qmf_window[i+1]);
        }
        pOut[0] = s2;
@ -187,7 +179,7 @@ static void iqmf (float *inlo, float *inhi, unsigned int nIn, float *pOut, float
    }
    /* Update the delay buffer. */
-    memcpy(delayBuf, temp + nIn*2, 46*sizeof(float));
+    memcpy(delayBuf, temp + (nIn << 1), 46*sizeof(int32_t));
 }
 /**
@ -199,10 +191,9 @@ static void iqmf (float *inlo, float *inhi, unsigned int nIn, float *pOut, float
 * @param odd_band  1 if the band is an odd band
 */
-static void IMLT(float *pInput, float *pOutput, int odd_band)
+static void IMLT(int32_t *pInput, int32_t *pOutput, int odd_band)
 {
    int     i;
    if (odd_band) {
        /**
        * Reverse the odd bands before IMDCT, this is an effect of the QMF transform
@ -214,13 +205,15 @@ static void IMLT(float *pInput, float *pOutput, int odd_band)
        */
        for (i=0; i<128; i++)
-            FFSWAP(float, pInput[i], pInput[255-i]);
+            FFSWAP(int32_t, pInput[i], pInput[255-i]);
    }
    /* Apply the imdct. */
    mdct_backward(512, pInput, pOutput);
-    ff_imdct_calc(&mdct_ctx,pOutput,pInput);
+    /* Windowing. */
-
+    for(i = 0; i<512; i++)
-    /* Perform windowing on the output. */
+        pOutput[i] = fixmul31(pOutput[i], window_lookup[i]);
    dsp.vector_fmul(pOutput,mdct_window,512);
 }
@ -259,30 +252,20 @@ static int decode_bytes(const uint8_t* inbuffer, uint8_t* out, int bytes){
 static av_cold void init_atrac3_transforms(ATRAC3Context *q) {
-    float enc_window[256];
+    int32_t s;
    float s;
    int i;
    /* Generate the mdct window, for details see
     * http://wiki.multimedia.cx/index.php?title=RealAudio_atrc#Windows */
    for (i=0 ; i<256; i++)
        enc_window[i] = (sin(((i + 0.5) / 256.0 - 0.5) * M_PI) + 1.0) * 0.5;
-    if (!mdct_window[0])
+    /* mdct window had been generated and saved as a lookup table in atrac3data_fixed.h */
        for (i=0 ; i<256; i++) {
            mdct_window[i] = enc_window[i]/(enc_window[i]*enc_window[i] + enc_window[255-i]*enc_window[255-i]);
            mdct_window[511-i] = mdct_window[i];
        }
    /* Generate the QMF window. */
    for (i=0 ; i<24; i++) {
-        s = qmf_48tap_half[i] * 2.0;
+        s = qmf_48tap_half_fix[i] << 1;
        qmf_window[i] = s;
        qmf_window[47 - i] = s;
    }
    /* Initialize the MDCT transform. */
    ff_mdct_init(&mdct_ctx, 9, 1);
 }
 /**
@ -367,12 +350,12 @@ static void readQuantSpectralCoeffs (GetBitContext *gb, int selector, int coding
 * @return outSubbands   subband counter, fix for broken specification/files
 */
-static int decodeSpectrum (GetBitContext *gb, float *pOut)
+static int decodeSpectrum (GetBitContext *gb, int32_t *pOut)
 {
    int   numSubbands, codingMode, cnt, first, last, subbWidth, *pIn;
    int   subband_vlc_index[32], SF_idxs[32];
    int   mantissas[128];
-    float SF;
+    int32_t SF;
    numSubbands = get_bits(gb, 5); // number of coded subbands
    codingMode = get_bits1(gb); // coding Mode: 0 - VLC/ 1-CLC
@ -400,20 +383,20 @@ static int decodeSpectrum (GetBitContext *gb, float *pOut)
            readQuantSpectralCoeffs (gb, subband_vlc_index[cnt], codingMode, mantissas, subbWidth);
            /* Decode the scale factor for this subband. */
-            SF = SFTable[SF_idxs[cnt]] * iMaxQuant[subband_vlc_index[cnt]];
+            SF = fixmul31(SFTable_fixed[SF_idxs[cnt]], iMaxQuant_fix[subband_vlc_index[cnt]]);
            /* Inverse quantize the coefficients. */
            for (pIn=mantissas ; first<last; first++, pIn++)
-                pOut[first] = *pIn * SF;
+                pOut[first] = fixmul16(*pIn, SF);
        } else {
            /* This subband was not coded, so zero the entire subband. */
-            memset(pOut+first, 0, subbWidth*sizeof(float));
+            memset(pOut+first, 0, subbWidth*sizeof(int32_t));
        }
    }
    /* Clear the subbands that were not coded. */
    first = subbandTab[cnt];
-    memset(pOut+first, 0, (1024 - first) * sizeof(float));
+    memset(pOut+first, 0, (1024 - first) * sizeof(int32_t));
    return numSubbands;
 }
@ -431,8 +414,8 @@ static int decodeTonalComponents (GetBitContext *gb, tonal_component *pComponent
    int   components, coding_mode_selector, coding_mode, coded_values_per_component;
    int   sfIndx, coded_values, max_coded_values, quant_step_index, coded_components;
    int   band_flags[4], mantissa[8];
-    float  *pCoef;
+    int32_t  *pCoef;
-    float  scalefactor;
+    int32_t  scalefactor;
    int   component_count = 0;
    components = get_bits(gb,5);
@ -473,7 +456,7 @@ static int decodeTonalComponents (GetBitContext *gb, tonal_component *pComponent
                coded_values = coded_values_per_component + 1;
                coded_values = FFMIN(max_coded_values,coded_values);
-                scalefactor = SFTable[sfIndx] * iMaxQuant[quant_step_index];
+                scalefactor = fixmul31(SFTable_fixed[sfIndx], iMaxQuant_fix[quant_step_index]);
                readQuantSpectralCoeffs(gb, quant_step_index, coding_mode, mantissa, coded_values);
@ -482,7 +465,7 @@ static int decodeTonalComponents (GetBitContext *gb, tonal_component *pComponent
                /* inverse quant */
                pCoef = pComponent[component_count].coef;
                for (cnt = 0; cnt < coded_values; cnt++)
-                    pCoef[cnt] = mantissa[cnt] * scalefactor;
+                    pCoef[cnt] = fixmul16(mantissa[cnt], scalefactor);
                component_count++;
            }
@ -539,21 +522,21 @@ static int decodeGainControl (GetBitContext *gb, gain_block *pGb, int numBands)
 * @param pGain2        next band gain info
 */
-static void gainCompensateAndOverlap (float *pIn, float *pPrev, float *pOut, gain_info *pGain1, gain_info *pGain2)
+static void gainCompensateAndOverlap (int32_t *pIn, int32_t *pPrev, int32_t *pOut, gain_info *pGain1, gain_info *pGain2)
 {
    /* gain compensation function */
-    float  gain1, gain2, gain_inc;
+    int32_t  gain1, gain2, gain_inc;
    int   cnt, numdata, nsample, startLoc, endLoc;
    if (pGain2->num_gain_data == 0)
-        gain1 = 1.0;
+        gain1 = ONE_16;
    else
        gain1 = gain_tab1[pGain2->levcode[0]];
    if (pGain1->num_gain_data == 0) {
        for (cnt = 0; cnt < 256; cnt++)
-            pOut[cnt] = pIn[cnt] * gain1 + pPrev[cnt];
+            pOut[cnt] = fixmul16(pIn[cnt], gain1) + pPrev[cnt];
    } else {
        numdata = pGain1->num_gain_data;
        pGain1->loccode[numdata] = 32;
@ -570,36 +553,38 @@ static void gainCompensateAndOverlap (float *pIn, float *pPrev, float *pOut, gai
            /* interpolate */
            for (; nsample < startLoc; nsample++)
-                pOut[nsample] = (pIn[nsample] * gain1 + pPrev[nsample]) * gain2;
+                pOut[nsample] = fixmul16((fixmul16(pIn[nsample], gain1) + pPrev[nsample]), gain2);
            /* interpolation is done over eight samples */
            for (; nsample < endLoc; nsample++) {
-                pOut[nsample] = (pIn[nsample] * gain1 + pPrev[nsample]) * gain2;
+                pOut[nsample] = fixmul16((fixmul16(pIn[nsample], gain1) + pPrev[nsample]),gain2);
-                gain2 *= gain_inc;
+                gain2 = fixmul16(gain2, gain_inc);
            }
        }
        for (; nsample < 256; nsample++)
-            pOut[nsample] = (pIn[nsample] * gain1) + pPrev[nsample];
+            pOut[nsample] = fixmul16(pIn[nsample], gain1) + pPrev[nsample];
    }
    /* Delay for the overlapping part. */
-    memcpy(pPrev, &pIn[256], 256*sizeof(float));
+    memcpy(pPrev, &pIn[256], 256*sizeof(int32_t));
 }
 /**
 * Combine the tonal band spectrum and regular band spectrum
 * Return position of the last tonal coefficient
 *
 * @param pSpectrum     output spectrum buffer
 * @param numComponents amount of tonal components
 * @param pComponent    tonal components for this band
 */
-static int addTonalComponents (float *pSpectrum, int numComponents, tonal_component *pComponent)
+static int addTonalComponents (int32_t *pSpectrum, int numComponents, tonal_component *pComponent)
 {
    int   cnt, i, lastPos = -1;
-    float   *pIn, *pOut;
+    int32_t *pOut;
    int32_t *pIn;
    for (cnt = 0; cnt < numComponents; cnt++){
        lastPos = FFMAX(pComponent[cnt].pos + pComponent[cnt].numCoefs, lastPos);
@ -614,13 +599,13 @@ static int addTonalComponents (float *pSpectrum, int numComponents, tonal_compon
 }
-#define INTERPOLATE(old,new,nsample) ((old) + (nsample)*0.125*((new)-(old)))
+#define INTERPOLATE(old,new,nsample)  ((old*ONE_16) + fixmul16(((nsample*ONE_16)>>3), (((new) - (old))*ONE_16)))
-static void reverseMatrixing(float *su1, float *su2, int *pPrevCode, int *pCurrCode)
+static void reverseMatrixing(int32_t *su1, int32_t *su2, int *pPrevCode, int *pCurrCode)
 {
    int    i, band, nsample, s1, s2;
-    float    c1, c2;
+    int32_t    c1, c2;
-    float    mc1_l, mc1_r, mc2_l, mc2_r;
+    int32_t    mc1_l, mc1_r, mc2_l, mc2_r;
    for (i=0,band = 0; band < 4*256; band+=256,i++) {
        s1 = pPrevCode[i];
@ -629,18 +614,18 @@ static void reverseMatrixing(float *su1, float *su2, int *pPrevCode, int *pCurrC
        if (s1 != s2) {
            /* Selector value changed, interpolation needed. */
-            mc1_l = matrixCoeffs[s1*2];
+            mc1_l = matrixCoeffs_fix[s1<<1];
-            mc1_r = matrixCoeffs[s1*2+1];
+            mc1_r = matrixCoeffs_fix[(s1<<1)+1];
-            mc2_l = matrixCoeffs[s2*2];
+            mc2_l = matrixCoeffs_fix[s2<<1];
-            mc2_r = matrixCoeffs[s2*2+1];
+            mc2_r = matrixCoeffs_fix[(s2<<1)+1];
            /* Interpolation is done over the first eight samples. */
            for(; nsample < 8; nsample++) {
                c1 = su1[band+nsample];
                c2 = su2[band+nsample];
-                c2 = c1 * INTERPOLATE(mc1_l,mc2_l,nsample) + c2 * INTERPOLATE(mc1_r,mc2_r,nsample);
+                c2 = fixmul16(c1, INTERPOLATE(mc1_l, mc2_l, nsample)) + fixmul16(c2, INTERPOLATE(mc1_r, mc2_r, nsample));
                su1[band+nsample] = c2;
-                su2[band+nsample] = c1 * 2.0 - c2;
+                su2[band+nsample] = (c1 << 1) - c2;
            }
        }
@ -650,8 +635,8 @@ static void reverseMatrixing(float *su1, float *su2, int *pPrevCode, int *pCurrC
                for (; nsample < 256; nsample++) {
                    c1 = su1[band+nsample];
                    c2 = su2[band+nsample];
-                    su1[band+nsample] = c2 * 2.0;
+                    su1[band+nsample] = c2 << 1;
-                    su2[band+nsample] = (c1 - c2) * 2.0;
+                    su2[band+nsample] = (c1 - c2) << 1;
                }
                break;
@ -659,8 +644,8 @@ static void reverseMatrixing(float *su1, float *su2, int *pPrevCode, int *pCurrC
                for (; nsample < 256; nsample++) {
                    c1 = su1[band+nsample];
                    c2 = su2[band+nsample];
-                    su1[band+nsample] = (c1 + c2) * 2.0;
+                    su1[band+nsample] = (c1 + c2) << 1;
-                    su2[band+nsample] = c2 * -2.0;
+                    su2[band+nsample] = -1*(c2 << 1);
                }
                break;
            case 2:
@ -678,24 +663,23 @@ static void reverseMatrixing(float *su1, float *su2, int *pPrevCode, int *pCurrC
    }
 }
-static void getChannelWeights (int indx, int flag, float ch[2]){
+static void getChannelWeights (int indx, int flag, int32_t ch[2]){
    if (indx == 7) {
-        ch[0] = 1.0;
+        ch[0] = ONE_16;
-        ch[1] = 1.0;
+        ch[1] = ONE_16;
    } else {
-        ch[0] = (float)(indx & 7) / 7.0;
+        ch[0] = fixdiv16(((indx & 7)*ONE_16), 7*ONE_16);
-        ch[1] = sqrt(2 - ch[0]*ch[0]);
+        ch[1] = fastSqrt((ONE_16 << 1) - fixmul16(ch[0], ch[0]));
        if(flag)
-            FFSWAP(float, ch[0], ch[1]);
+            FFSWAP(int32_t, ch[0], ch[1]);
    }
 }
-static void channelWeighting (float *su1, float *su2, int *p3)
+static void channelWeighting (int32_t *su1, int32_t *su2, int *p3)
 {
    int   band, nsample;
    /* w[x][y] y=0 is left y=1 is right */
-    float w[2][2];
+    int32_t w[2][2];
    if (p3[1] != 7 || p3[3] != 7){
        getChannelWeights(p3[1], p3[0], w[0]);
@ -704,13 +688,13 @@ static void channelWeighting (float *su1, float *su2, int *p3)
        for(band = 1; band < 4; band++) {
            /* scale the channels by the weights */
            for(nsample = 0; nsample < 8; nsample++) {
-                su1[band*256+nsample] *= INTERPOLATE(w[0][0], w[0][1], nsample);
+                su1[band*256+nsample] = fixmul16(su1[band*256+nsample], INTERPOLATE(w[0][0], w[0][1], nsample));
-                su2[band*256+nsample] *= INTERPOLATE(w[1][0], w[1][1], nsample);
+                su2[band*256+nsample] = fixmul16(su2[band*256+nsample], INTERPOLATE(w[1][0], w[1][1], nsample));
            }
            for(; nsample < 256; nsample++) {
-                su1[band*256+nsample] *= w[1][0];
+                su1[band*256+nsample] = fixmul16(su1[band*256+nsample], w[1][0]);
-                su2[band*256+nsample] *= w[1][1];
+                su2[band*256+nsample] = fixmul16(su2[band*256+nsample], w[1][1]);
            }
        }
    }
@ -728,10 +712,9 @@ static void channelWeighting (float *su1, float *su2, int *p3)
 */
-static int decodeChannelSoundUnit (ATRAC3Context *q, GetBitContext *gb, channel_unit *pSnd, float *pOut, int channelNum, int codingMode)
+static int decodeChannelSoundUnit (ATRAC3Context *q, GetBitContext *gb, channel_unit *pSnd, int32_t *pOut, int channelNum, int codingMode)
 {
    int   band, result=0, numSubbands, lastTonal, numBands;
    if (codingMode == JOINT_STEREO && channelNum == 1) {
        if (get_bits(gb,2) != 3) {
            av_log(NULL,AV_LOG_ERROR,"JS mono Sound Unit id != 3.\n");
@ -771,7 +754,7 @@ static int decodeChannelSoundUnit (ATRAC3Context *q, GetBitContext *gb, channel_
        if (band <= numBands) {
            IMLT(&(pSnd->spectrum[band*256]), pSnd->IMDCT_buf, band&1);
        } else
-            memset(pSnd->IMDCT_buf, 0, 512 * sizeof(float));
+            memset(pSnd->IMDCT_buf, 0, 512 * sizeof(int32_t));
        /* gain compensation and overlapping */
        gainCompensateAndOverlap (pSnd->IMDCT_buf, &(pSnd->prevFrame[band*256]), &(pOut[band*256]),
@ -795,7 +778,7 @@ static int decodeChannelSoundUnit (ATRAC3Context *q, GetBitContext *gb, channel_
 static int decodeFrame(ATRAC3Context *q, const uint8_t* databuf)
 {
    int   result, i;
-    float   *p1, *p2, *p3, *p4;
+    int32_t   *p1, *p2, *p3, *p4;
    uint8_t *ptr1;
    if (q->codingMode == JOINT_STEREO) {
@ -893,7 +876,6 @@ static int decodeFrame(ATRAC3Context *q, const uint8_t* databuf)
 static int atrac3_decode_frame(RMContext *rmctx, ATRAC3Context *q,
            void *data, int *data_size,
            const uint8_t *buf, int buf_size) {
    //ATRAC3Context *q = rmctx->priv_data;
    int result = 0, i;
    const uint8_t* databuf;
    int16_t* samples = data;
@ -919,13 +901,13 @@ static int atrac3_decode_frame(RMContext *rmctx, ATRAC3Context *q,
    if (q->channels == 1) {
        /* mono */
        for (i = 0; i<1024; i++)
-            samples[i] = av_clip_int16(round(q->outSamples[i]));
+            samples[i] = av_clip_int16(q->outSamples[i]);
        *data_size = 1024 * sizeof(int16_t);
    } else {
        /* stereo */
        for (i = 0; i < 1024; i++) {
-            samples[i*2] = av_clip_int16(round(q->outSamples[i]));
+            samples[i*2] = av_clip_int16(q->outSamples[i]);
-            samples[i*2+1] = av_clip_int16(round(q->outSamples[1024+i]));
+            samples[i*2+1] = av_clip_int16(q->outSamples[1024+i]);
        }
        *data_size = 2048 * sizeof(int16_t);
    }
@ -944,7 +926,6 @@ static av_cold int atrac3_decode_init(ATRAC3Context *q, RMContext *rmctx)
 {
    int i;
    const uint8_t *edata_ptr = rmctx->codec_extradata;
    //ATRAC3Context *q = rmctx->priv_data;
    static VLC_TYPE atrac3_vlc_table[4096][2];
    static int vlcs_initialized = 0;
@ -1051,17 +1032,6 @@ static av_cold int atrac3_decode_init(ATRAC3Context *q, RMContext *rmctx)
    init_atrac3_transforms(q);
    /* Generate the scale factors. */
    for (i=0 ; i<64 ; i++)
        SFTable[i] = pow(2.0, (i - 15) / 3.0);
    /* Generate gain tables. */
    for (i=0 ; i<16 ; i++)
        gain_tab1[i] = powf (2.0, (4 - i));
    for (i=-15 ; i<16 ; i++)
        gain_tab2[i+15] = powf (2.0, i * -0.125);
    /* init the joint-stereo decoding data */
    q->weighting_delay[0] = 0;
    q->weighting_delay[1] = 7;
@ -1076,8 +1046,6 @@ static av_cold int atrac3_decode_init(ATRAC3Context *q, RMContext *rmctx)
        q->matrix_coeff_index_next[i] = 3;
    }
    dsputil_init(&dsp);
    q->pUnits = av_mallocz(sizeof(channel_unit)*q->channels);
    if (!q->pUnits) {
        av_free(q->decoded_bytes_buffer);
--- a/apps/codecs/libatrac/atrac3data_fixed.h
+++ b/apps/codecs/libatrac/atrac3data_fixed.h
@ -0,0 +1,145 @@
 /* tables for the scalefactor decoding */
 /* scaled by 2^31*/
 static const int32_t iMaxQuant_fix[8] = {
    0x0, 0x55555580, 0x33333340, 0x24924940, 0x1c71c720, 0x11111120, 0x8421080, 
    0x4104108
 };
 /* scaled by 2^16 */
 static const int32_t SFTable_fixed[64] = {
    0x00000800, 0x00000a14, 0x00000cb3, 0x00001000, 0x00001429, 0x00001966, 
    0x00002000, 0x00002851, 0x000032cc, 0x00004000, 0x000050a3, 0x00006598, 
    0x00008000, 0x0000a145, 0x0000cb30, 0x00010000, 0x0001428a, 0x00019660, 
    0x00020000, 0x00028514, 0x00032cc0, 0x00040000, 0x00050a29, 0x00065980, 
    0x00080000, 0x000a1452, 0x000cb2ff, 0x00100000, 0x001428a3, 0x001965ff, 
    0x00200000, 0x00285146, 0x0032cbfd, 0x00400000, 0x0050a28c, 0x006597fb, 
    0x00800000, 0x00a14518, 0x00cb2ff5, 0x01000000, 0x01428a30, 0x01965fea, 
    0x02000000, 0x02851460, 0x032cbfd4, 0x04000000, 0x050a28c0, 0x06597fa8, 
    0x08000000, 0x0a145180, 0x0cb2ff50, 0x10000000, 0x1428a300, 0x1965fea0, 
    0x20000000, 0x28514600, 0x32cbfd40, 0x40000000, 0x50a28c00, 0x6597fa80, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000,
 };
 /* transform data */
 /* floating point values scaled by 2^31  */
 static const int32_t qmf_48tap_half_fix[24] = {
    0xffff855e, 0xfffcfbca, 0xfffe28eb, 0x9de6b,    0x7f028,    0xffe40d08, 
    0xffeef140, 0x42a692,   0x19ab1f,   0xff75dec7, 0xffe738f5, 0x100e928, 
    0xfffdfedf, 0xfe478b84, 0x50b279,   0x2c83f88,  0xff005ad7, 0xfba2ee80, 
    0x2685970,  0x6f42798,  0xfa6b6f10, 0xf3475f80, 0x10e7f7c0, 0x3b6c44c0 
 };
 /* mdct window scaled by 2^31 */
 static const int32_t window_lookup[512] = {
    0xffffb10c, 0xfffd394b, 0xfff8494f, 0xfff0e025, 0xffe6fc5f, 0xffda9c15, 
    0xffcbbce6, 0xffba5bf4, 0xffa675e8, 0xff9006f0, 0xff770aba, 0xff5b7c7e, 
    0xff3d56f2, 0xff1c9452, 0xfef92e59, 0xfed31e45, 0xfeaa5cd5, 0xfe7ee247, 
    0xfe50a657, 0xfe1fa041, 0xfdebc6c1, 0xfdb5100d, 0xfd7b71d5, 0xfd3ee149, 
    0xfcff5311, 0xfcbcbb49, 0xfc770d99, 0xfc2e3d15, 0xfbe23c39, 0xfb92fd29, 
    0xfb407141, 0xfaea8989, 0xfa913661, 0xfa3467b1, 0xf9d40cd9, 0xf9701499, 
    0xf9086d41, 0xf89d04a9, 0xf82dc7f1, 0xf7baa3e1, 0xf74384b1, 0xf6c85611, 
    0xf6490321, 0xf5c576b1, 0xf53d9b21, 0xf4b15a01, 0xf4209ce1, 0xf38b4c71, 
    0xf2f15171, 0xf2529411, 0xf1aefbf1, 0xf10670a1, 0xf058d941, 0xefa61cc1, 
    0xeeee21c1, 0xee30cec1, 0xed6e0a41, 0xeca5ba61, 0xebd7c5c1, 0xeb041241, 
    0xea2a8601, 0xe94b0861, 0xe8657f61, 0xe779d241, 0xe687e861, 0xe58fa9e1, 
    0xe490fec1, 0xe38bd101, 0xe28009c1, 0xe16d93e1, 0xe0545ba1, 0xdf344dc1, 
    0xde0d5881, 0xdcdf6bc1, 0xdbaa7801, 0xda6e70c1, 0xd92b4ac1, 0xd7e0fc81, 
    0xd68f7ec1, 0xd536cd41, 0xd3d6e5c1, 0xd26fc901, 0xd10179c1, 0xcf8bff41, 
    0xce0f6301, 0xcc8bb241, 0xcb00fdc1, 0xc96f5b01, 0xc7d6e141, 0xc637af41, 
    0xc491e4c1, 0xc2e5a801, 0xc1332401, 0xbf7a8701, 0xbdbc0681, 0xbbf7da01, 
    0xba2e4181, 0xb85f7f81, 0xb68bde01, 0xb4b3a981, 0xb2d73781, 0xb0f6df01, 
    0xaf12ff01, 0xad2bfa81, 0xab423981, 0xa9562981, 0xa7683c01, 0xa578e701, 
    0xa388a681, 0xa197f801, 0x9fa75e81, 0x9db75f01, 0x9bc88201, 0x99db5301, 
    0x97f06001, 0x96083601, 0x94236601, 0x92427f81, 0x90661481, 0x8e8eb481, 
    0x8cbced01, 0x8af14d81, 0x892c5f81, 0x876eab01, 0x85b8b681, 0x840b0301, 
    0x82660c01, 0x80ca4a01, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 
    0x80ca4a01, 0x82660c01, 0x840b0301, 0x85b8b681, 0x876eab01, 0x892c5f81, 
    0x8af14d81, 0x8cbced01, 0x8e8eb481, 0x90661481, 0x92427f81, 0x94236601, 
    0x96083601, 0x97f06001, 0x99db5301, 0x9bc88201, 0x9db75f01, 0x9fa75e81, 
    0xa197f801, 0xa388a681, 0xa578e701, 0xa7683c01, 0xa9562981, 0xab423981, 
    0xad2bfa81, 0xaf12ff01, 0xb0f6df01, 0xb2d73781, 0xb4b3a981, 0xb68bde01, 
    0xb85f7f81, 0xba2e4181, 0xbbf7da01, 0xbdbc0681, 0xbf7a8701, 0xc1332401, 
    0xc2e5a801, 0xc491e4c1, 0xc637af41, 0xc7d6e141, 0xc96f5b01, 0xcb00fdc1, 
    0xcc8bb241, 0xce0f6301, 0xcf8bff41, 0xd10179c1, 0xd26fc901, 0xd3d6e5c1, 
    0xd536cd41, 0xd68f7ec1, 0xd7e0fc81, 0xd92b4ac1, 0xda6e70c1, 0xdbaa7801, 
    0xdcdf6bc1, 0xde0d5881, 0xdf344dc1, 0xe0545ba1, 0xe16d93e1, 0xe28009c1, 
    0xe38bd101, 0xe490fec1, 0xe58fa9e1, 0xe687e861, 0xe779d241, 0xe8657f61, 
    0xe94b0861, 0xea2a8601, 0xeb041241, 0xebd7c5c1, 0xeca5ba61, 0xed6e0a41, 
    0xee30cec1, 0xeeee21c1, 0xefa61cc1, 0xf058d941, 0xf10670a1, 0xf1aefbf1, 
    0xf2529411, 0xf2f15171, 0xf38b4c71, 0xf4209ce1, 0xf4b15a01, 0xf53d9b21, 
    0xf5c576b1, 0xf6490321, 0xf6c85611, 0xf74384b1, 0xf7baa3e1, 0xf82dc7f1, 
    0xf89d04a9, 0xf9086d41, 0xf9701499, 0xf9d40cd9, 0xfa3467b1, 0xfa913661, 
    0xfaea8989, 0xfb407141, 0xfb92fd29, 0xfbe23c39, 0xfc2e3d15, 0xfc770d99, 
    0xfcbcbb49, 0xfcff5311, 0xfd3ee149, 0xfd7b71d5, 0xfdb5100d, 0xfdebc6c1, 
    0xfe1fa041, 0xfe50a657, 0xfe7ee247, 0xfeaa5cd5, 0xfed31e45, 0xfef92e59, 
    0xff1c9452, 0xff3d56f2, 0xff5b7c7e, 0xff770aba, 0xff9006f0, 0xffa675e8, 
    0xffba5bf4, 0xffcbbce6, 0xffda9c15, 0xffe6fc5f, 0xfff0e025, 0xfff8494f, 
    0xfffd394b, 0xffffb10c, 
 };
 /* Gain tables scaled by 2^16 */
 static const int32_t gain_tab1[16] = {
    0x00100000, 0x00080000, 0x00040000, 0x00020000, 0x00010000, 0x00008000, 
    0x00004000, 0x00002000, 0x00001000, 0x00000800, 0x00000400, 0x00000200, 
    0x00000100, 0x00000080, 0x00000040, 0x00000020, 
 };
 static const int32_t gain_tab2[31] = {
    0x0003ab03, 0x00035d14, 0x0003159d, 0x0002d414, 0x000297fb, 0x000260e0, 
    0x00022e57, 0x00020000, 0x0001d582, 0x0001ae8a, 0x00018ace, 0x00016a0a, 
    0x00014bfe, 0x00013070, 0x0001172c, 0x00010000, 0x0000eac1, 0x0000d745, 
    0x0000c567, 0x0000b505, 0x0000a5ff, 0x00009838, 0x00008b96, 0x00008000, 
    0x00007560, 0x00006ba2, 0x000062b4, 0x00005a82, 0x000052ff, 0x00004c1c, 
    0x000045cb, 
 };
 /* Joint-Stereo related tables, scaled by 2^16 */
 static const int32_t matrixCoeffs_fix[8] = {
    0x00000000, 0x00020000, 0x00020000, 0x00020000, 
    0x00000000, 0x00000000, 0x00010000, 0x00010000, 
 };
--- a/apps/codecs/libatrac/dsputil.c
+++ b/apps/codecs/libatrac/dsputil.c
--- a/apps/codecs/libatrac/dsputil.h
+++ b/apps/codecs/libatrac/dsputil.h
@ -1,898 +0,0 @@
 /*
 * DSP utils
 * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 /**
 * @file libavcodec/dsputil.h
 * DSP utils.
 * note, many functions in here may use MMX which trashes the FPU state, it is
 * absolutely necessary to call emms_c() between dsp & float/double code
 */
 #ifndef AVCODEC_DSPUTIL_H
 #define AVCODEC_DSPUTIL_H
 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 //#define DEBUG
 /* dct code */
 typedef short DCTELEM;
 typedef int DWTELEM;
 typedef short IDWTELEM;
 void fdct_ifast (DCTELEM *data);
 void fdct_ifast248 (DCTELEM *data);
 void ff_jpeg_fdct_islow (DCTELEM *data);
 void ff_fdct248_islow (DCTELEM *data);
 void j_rev_dct (DCTELEM *data);
 void j_rev_dct4 (DCTELEM *data);
 void j_rev_dct2 (DCTELEM *data);
 void j_rev_dct1 (DCTELEM *data);
 void ff_wmv2_idct_c(DCTELEM *data);
 void ff_fdct_mmx(DCTELEM *block);
 void ff_fdct_mmx2(DCTELEM *block);
 void ff_fdct_sse2(DCTELEM *block);
 void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
 void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
 void ff_h264_idct_add16_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
 void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
 void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
 void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1,
                              const float *src2, int src3, int blocksize, int step);
 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
                             const float *win, float add_bias, int len);
 void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels);
 /* encoding scans */
 extern const uint8_t ff_alternate_horizontal_scan[64];
 extern const uint8_t ff_alternate_vertical_scan[64];
 extern const uint8_t ff_zigzag_direct[64];
 extern const uint8_t ff_zigzag248_direct[64];
 /* pixel operations */
 #define MAX_NEG_CROP 1024
 /* temporary */
 extern uint32_t ff_squareTbl[512];
 extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
 /* VP3 DSP functions */
 void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
 void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
 void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
 void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
 void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
 /* VP6 DSP functions */
 void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, int stride,
                           const int16_t *h_weights, const int16_t *v_weights);
 /* 1/2^n downscaling functions from imgconvert.c */
 void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
 void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
 void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
 void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
 /* minimum alignment rules ;)
 If you notice errors in the align stuff, need more alignment for some ASM code
 for some CPU or need to use a function with less aligned data then send a mail
 to the ffmpeg-devel mailing list, ...
 !warning These alignments might not match reality, (missing attribute((align))
 stuff somewhere possible).
 I (Michael) did not check them, these are just the alignments which I think
 could be reached easily ...
 !future video codecs might need functions with less strict alignment
 */
 /*
 void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size);
 void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
 void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
 void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
 void clear_blocks_c(DCTELEM *blocks);
 */
 /* add and put pixel (decoding) */
 // blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16
 //h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller then 4
 typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h);
 typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
 typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
 typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
 typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
 typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
 #define DEF_OLD_QPEL(name)\
 void ff_put_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
 void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
 void ff_avg_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
 DEF_OLD_QPEL(qpel16_mc11_old_c)
 DEF_OLD_QPEL(qpel16_mc31_old_c)
 DEF_OLD_QPEL(qpel16_mc12_old_c)
 DEF_OLD_QPEL(qpel16_mc32_old_c)
 DEF_OLD_QPEL(qpel16_mc13_old_c)
 DEF_OLD_QPEL(qpel16_mc33_old_c)
 DEF_OLD_QPEL(qpel8_mc11_old_c)
 DEF_OLD_QPEL(qpel8_mc31_old_c)
 DEF_OLD_QPEL(qpel8_mc12_old_c)
 DEF_OLD_QPEL(qpel8_mc32_old_c)
 DEF_OLD_QPEL(qpel8_mc13_old_c)
 DEF_OLD_QPEL(qpel8_mc33_old_c)
 #define CALL_2X_PIXELS(a, b, n)\
 static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    b(block  , pixels  , line_size, h);\
    b(block+n, pixels+n, line_size, h);\
 }
 /* motion estimation */
 // h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller then 2
 // although currently h<4 is not used as functions with width <8 are neither used nor implemented
 typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
 // for snow slices
 typedef struct slice_buffer_s slice_buffer;
 /**
 * Scantable.
 */
 typedef struct ScanTable{
    const uint8_t *scantable;
    uint8_t permutated[64];
    uint8_t raster_end[64];
 #if ARCH_PPC
                /** Used by dct_quantize_altivec to find last-non-zero */
    DECLARE_ALIGNED(16, uint8_t, inverse[64]);
 #endif
 } ScanTable;
 void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);
 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize,
                         int block_w, int block_h,
                         int src_x, int src_y, int w, int h);
 /**
 * DSPContext.
 */
 typedef struct DSPContext {
    /* pixel ops : interface with DCT */
    void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);
    void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride);
    void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
    void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
    void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
    void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size);
    void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size);
    int (*sum_abs_dctelem)(DCTELEM *block/*align 16*/);
    /**
     * translational global motion compensation.
     */
    void (*gmc1)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder);
    /**
     * global motion compensation.
     */
    void (*gmc )(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
    void (*clear_block)(DCTELEM *block/*align 16*/);
    void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
    int (*pix_sum)(uint8_t * pix, int line_size);
    int (*pix_norm1)(uint8_t * pix, int line_size);
 // 16x16 8x8 4x4 2x2 16x8 8x4 4x2 8x16 4x8 2x4
    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
    me_cmp_func sse[6];
    me_cmp_func hadamard8_diff[6];
    me_cmp_func dct_sad[6];
    me_cmp_func quant_psnr[6];
    me_cmp_func bit[6];
    me_cmp_func rd[6];
    me_cmp_func vsad[6];
    me_cmp_func vsse[6];
    me_cmp_func nsse[6];
    me_cmp_func w53[6];
    me_cmp_func w97[6];
    me_cmp_func dct_max[6];
    me_cmp_func dct264_sad[6];
    me_cmp_func me_pre_cmp[6];
    me_cmp_func me_cmp[6];
    me_cmp_func me_sub_cmp[6];
    me_cmp_func mb_cmp[6];
    me_cmp_func ildct_cmp[6]; //only width 16 used
    me_cmp_func frame_skip_cmp[6]; //only width 8 used
    int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
                             int size);
    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
     * this is an array[4][4] of motion compensation functions for 4
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    op_pixels_func put_pixels_tab[4][4];
    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
     * This is an array[4][4] of motion compensation functions for 4
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * @param block destination into which the result is averaged (a+b+1)>>1
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    op_pixels_func avg_pixels_tab[4][4];
    /**
     * Halfpel motion compensation with no rounding (a+b)>>1.
     * this is an array[2][4] of motion compensation functions for 2
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    op_pixels_func put_no_rnd_pixels_tab[4][4];
    /**
     * Halfpel motion compensation with no rounding (a+b)>>1.
     * this is an array[2][4] of motion compensation functions for 2
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * @param block destination into which the result is averaged (a+b)>>1
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    op_pixels_func avg_no_rnd_pixels_tab[4][4];
    void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h);
    /**
     * Thirdpel motion compensation with rounding (a+b+1)>>1.
     * this is an array[12] of motion compensation functions for the 9 thirdpe
     * positions<br>
     * *pixels_tab[ xthirdpel + 4*ythirdpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    tpel_mc_func put_tpel_pixels_tab[11]; //FIXME individual func ptr per width?
    tpel_mc_func avg_tpel_pixels_tab[11]; //FIXME individual func ptr per width?
    qpel_mc_func put_qpel_pixels_tab[2][16];
    qpel_mc_func avg_qpel_pixels_tab[2][16];
    qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
    qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
    qpel_mc_func put_mspel_pixels_tab[8];
    /**
     * h264 Chroma MC
     */
    h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
    /* This is really one func used in VC-1 decoding */
    h264_chroma_mc_func put_no_rnd_h264_chroma_pixels_tab[3];
    h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
    qpel_mc_func put_h264_qpel_pixels_tab[4][16];
    qpel_mc_func avg_h264_qpel_pixels_tab[4][16];
    qpel_mc_func put_2tap_qpel_pixels_tab[4][16];
    qpel_mc_func avg_2tap_qpel_pixels_tab[4][16];
    h264_weight_func weight_h264_pixels_tab[10];
    h264_biweight_func biweight_h264_pixels_tab[10];
    /* AVS specific */
    qpel_mc_func put_cavs_qpel_pixels_tab[2][16];
    qpel_mc_func avg_cavs_qpel_pixels_tab[2][16];
    void (*cavs_filter_lv)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
    void (*cavs_filter_lh)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
    void (*cavs_filter_cv)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
    void (*cavs_filter_ch)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
    void (*cavs_idct8_add)(uint8_t *dst, DCTELEM *block, int stride);
    me_cmp_func pix_abs[2][4];
    /* huffyuv specific */
    void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w);
    void (*add_bytes_l2)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 16*/, int w);
    void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w);
    /**
     * subtract huffyuv's variant of median prediction
     * note, this might read from src1[-1], src2[-1]
     */
    void (*sub_hfyu_median_prediction)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top);
    void (*add_hfyu_median_prediction)(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top);
    /* this might write to dst[w] */
    void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
    void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);
    void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
    void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0);
    /* v/h_loop_filter_luma_intra: align 16 */
    void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
    void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
    void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0);
    void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0);
    void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
    void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
    // h264_loop_filter_strength: simd only. the C version is inlined in h264.c
    void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                      int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field);
    void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
    void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
    void (*h261_loop_filter)(uint8_t *src, int stride);
    void (*x8_v_loop_filter)(uint8_t *src, int stride, int qscale);
    void (*x8_h_loop_filter)(uint8_t *src, int stride, int qscale);
    void (*vp3_v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
    void (*vp3_h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
    void (*vp6_filter_diag4)(uint8_t *dst, uint8_t *src, int stride,
                             const int16_t *h_weights,const int16_t *v_weights);
    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
    void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
    void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
    /* no alignment needed */
    void (*flac_compute_autocorr)(const int32_t *data, int len, int lag, double *autoc);
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
    void (*vector_fmul)(float *dst, const float *src, int len);
    void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
    /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
    void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step);
    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
    /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767]
     * simd versions: convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
    void (*float_to_int16)(int16_t *dst, const float *src, long len);
    void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);
    /* (I)DCT */
    void (*fdct)(DCTELEM *block/* align 16*/);
    void (*fdct248)(DCTELEM *block/* align 16*/);
    /* IDCT really*/
    void (*idct)(DCTELEM *block/* align 16*/);
    /**
     * block -> idct -> clip to unsigned 8 bit -> dest.
     * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
     * @param line_size size in bytes of a horizontal line of dest
     */
    void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
    /**
     * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
     * @param line_size size in bytes of a horizontal line of dest
     */
    void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
    /**
     * idct input permutation.
     * several optimized IDCTs need a permutated input (relative to the normal order of the reference
     * IDCT)
     * this permutation must be performed before the idct_put/add, note, normally this can be merged
     * with the zigzag/alternate scan<br>
     * an example to avoid confusion:
     * - (->decode coeffs -> zigzag reorder -> dequant -> reference idct ->...)
     * - (x -> referece dct -> reference idct -> x)
     * - (x -> referece dct -> simple_mmx_perm = idct_permutation -> simple_idct_mmx -> x)
     * - (->decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant -> simple_idct_mmx ->...)
     */
    uint8_t idct_permutation[64];
    int idct_permutation_type;
 #define FF_NO_IDCT_PERM 1
 #define FF_LIBMPEG2_IDCT_PERM 2
 #define FF_SIMPLE_IDCT_PERM 3
 #define FF_TRANSPOSE_IDCT_PERM 4
 #define FF_PARTTRANS_IDCT_PERM 5
 #define FF_SSE2_IDCT_PERM 6
    int (*try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale);
    void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
 #define BASIS_SHIFT 16
 #define RECON_SHIFT 6
    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w);
 #define EDGE_WIDTH 16
    /* h264 functions */
    /* NOTE!!! if you implement any of h264_idct8_add, h264_idct8_add4 then you must implement all of them
       NOTE!!! if you implement any of h264_idct_add, h264_idct_add16, h264_idct_add16intra, h264_idct_add8 then you must implement all of them
        The reason for above, is that no 2 out of one list may use a different permutation.
    */
    void (*h264_idct_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
    void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
    void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
    void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
    void (*h264_dct)(DCTELEM block[4][4]);
    void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
    void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
    void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
    void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
    /* snow wavelet */
    void (*vertical_compose97i)(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
    void (*horizontal_compose97i)(IDWTELEM *b, int width);
    void (*inner_add_yblock)(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
    void (*prefetch)(void *mem, int stride, int h);
    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
    /* vc1 functions */
    void (*vc1_inv_trans_8x8)(DCTELEM *b);
    void (*vc1_inv_trans_8x4)(uint8_t *dest, int line_size, DCTELEM *block);
    void (*vc1_inv_trans_4x8)(uint8_t *dest, int line_size, DCTELEM *block);
    void (*vc1_inv_trans_4x4)(uint8_t *dest, int line_size, DCTELEM *block);
    void (*vc1_v_overlap)(uint8_t* src, int stride);
    void (*vc1_h_overlap)(uint8_t* src, int stride);
    /* put 8x8 block with bicubic interpolation and quarterpel precision
     * last argument is actually round value instead of height
     */
    op_pixels_func put_vc1_mspel_pixels_tab[16];
    /* intrax8 functions */
    void (*x8_spatial_compensation[12])(uint8_t *src , uint8_t *dst, int linesize);
    void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
           int * range, int * sum,  int edges);
    /* ape functions */
    /**
     * Add contents of the second vector to the first one.
     * @param len length of vectors, should be multiple of 16
     */
    void (*add_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
    /**
     * Add contents of the second vector to the first one.
     * @param len length of vectors, should be multiple of 16
     */
    void (*sub_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
    /**
     * Calculate scalar product of two vectors.
     * @param len length of vectors, should be multiple of 16
     * @param shift number of bits to discard from product
     */
    int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift);
    /* rv30 functions */
    qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
    qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
    /* rv40 functions */
    qpel_mc_func put_rv40_qpel_pixels_tab[4][16];
    qpel_mc_func avg_rv40_qpel_pixels_tab[4][16];
    h264_chroma_mc_func put_rv40_chroma_pixels_tab[3];
    h264_chroma_mc_func avg_rv40_chroma_pixels_tab[3];
 } DSPContext;
 void dsputil_static_init(void);
 void dsputil_init(DSPContext* p);
 int ff_check_alignment(void);
 /**
 * permute block according to permuatation.
 * @param last last non zero element in scantable order
 */
 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last);
 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type);
 #define         BYTE_VEC32(c)   ((c)*0x01010101UL)
 static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
 {
    return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
 }
 static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
 {
    return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
 }
 static inline int get_penalty_factor(int lambda, int lambda2, int type){
    switch(type&0xFF){
    default:
    case FF_CMP_SAD:
        return lambda>>FF_LAMBDA_SHIFT;
    case FF_CMP_DCT:
        return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
    case FF_CMP_W53:
        return (4*lambda)>>(FF_LAMBDA_SHIFT);
    case FF_CMP_W97:
        return (2*lambda)>>(FF_LAMBDA_SHIFT);
    case FF_CMP_SATD:
    case FF_CMP_DCT264:
        return (2*lambda)>>FF_LAMBDA_SHIFT;
    case FF_CMP_RD:
    case FF_CMP_PSNR:
    case FF_CMP_SSE:
    case FF_CMP_NSSE:
        return lambda2>>FF_LAMBDA_SHIFT;
    case FF_CMP_BIT:
        return 1;
    }
 }
 /**
 * Empty mmx state.
 * this must be called between any dsp function and float/double code.
 * for example sin(); dsp->idct_put(); emms_c(); cos()
 */
 #define emms_c()
 /* should be defined by architectures supporting
   one or more MultiMedia extension */
 int mm_support(void);
 void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
 #define DECLARE_ALIGNED_16(t, v) DECLARE_ALIGNED(16, t, v)
 #if HAVE_MMX
 #undef emms_c
 extern int mm_flags;
 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
 static inline void emms(void)
 {
    __asm__ volatile ("emms;":::"memory");
 }
 #define emms_c() \
 {\
    if (mm_flags & FF_MM_MMX)\
        emms();\
 }
 void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
 #elif ARCH_ARM
 extern int mm_flags;
 #if HAVE_NEON
 #   define DECLARE_ALIGNED_8(t, v) DECLARE_ALIGNED(16, t, v)
 #   define STRIDE_ALIGN 16
 #endif
 #elif ARCH_PPC
 extern int mm_flags;
 #define DECLARE_ALIGNED_8(t, v) DECLARE_ALIGNED(16, t, v)
 #define STRIDE_ALIGN 16
 #elif HAVE_MMI
 #define DECLARE_ALIGNED_8(t, v) DECLARE_ALIGNED(16, t, v)
 #define STRIDE_ALIGN 16
 #else
 #define mm_flags 0
 #define mm_support() 0
 #endif
 #ifndef DECLARE_ALIGNED_8
 #   define DECLARE_ALIGNED_8(t, v) DECLARE_ALIGNED(8, t, v)
 #endif
 #ifndef STRIDE_ALIGN
 #   define STRIDE_ALIGN 8
 #endif
 /* PSNR */
 void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3],
              int orig_linesize[3], int coded_linesize,
              AVCodecContext *avctx);
 /* FFT computation */
 /* NOTE: soon integer code will be added, so you must use the
   FFTSample type */
 typedef float FFTSample;
 struct MDCTContext;
 typedef struct FFTComplex {
    FFTSample re, im;
 } FFTComplex;
 typedef struct FFTContext {
    int nbits;
    int inverse;
    uint16_t *revtab;
    FFTComplex *exptab;
    FFTComplex *exptab1; /* only used by SSE code */
    FFTComplex *tmp_buf;
    void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
    void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
    void (*imdct_calc)(struct MDCTContext *s, FFTSample *output, const FFTSample *input);
    void (*imdct_half)(struct MDCTContext *s, FFTSample *output, const FFTSample *input);
 } FFTContext;
 extern FFTSample* ff_cos_tabs[13];
 /**
 * Sets up a complex FFT.
 * @param nbits           log2 of the length of the input array
 * @param inverse         if 0 perform the forward transform, if 1 perform the inverse
 */
 int ff_fft_init(FFTContext *s, int nbits, int inverse);
 void ff_fft_permute_c(FFTContext *s, FFTComplex *z);
 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
 /**
 * Do the permutation needed BEFORE calling ff_fft_calc().
 */
 static inline void ff_fft_permute(FFTContext *s, FFTComplex *z)
 {
    s->fft_permute(s, z);
 }
 /**
 * Do a complex FFT with the parameters defined in ff_fft_init(). The
 * input data must be permuted before. No 1.0/sqrt(n) normalization is done.
 */
 static inline void ff_fft_calc(FFTContext *s, FFTComplex *z)
 {
    s->fft_calc(s, z);
 }
 void ff_fft_end(FFTContext *s);
 /* MDCT computation */
 typedef struct MDCTContext {
    int n;  /* size of MDCT (i.e. number of input data * 2) */
    int nbits; /* n = 2^nbits */
    /* pre/post rotation tables */
    FFTSample *tcos;
    FFTSample *tsin;
    FFTContext fft;
 } MDCTContext;
 static inline void ff_imdct_calc(MDCTContext *s, FFTSample *output, const FFTSample *input)
 {
    s->fft.imdct_calc(s, output, input);
 }
 static inline void ff_imdct_half(MDCTContext *s, FFTSample *output, const FFTSample *input)
 {
    s->fft.imdct_half(s, output, input);
 }
 /**
 * Generate a Kaiser-Bessel Derived Window.
 * @param   window  pointer to half window
 * @param   alpha   determines window shape
 * @param   n       size of half window
 */
 void ff_kbd_window_init(float *window, float alpha, int n);
 /**
 * Generate a sine window.
 * @param   window  pointer to half window
 * @param   n       size of half window
 */
 void ff_sine_window_init(float *window, int n);
 extern float ff_sine_128 [ 128];
 extern float ff_sine_256 [ 256];
 extern float ff_sine_512 [ 512];
 extern float ff_sine_1024[1024];
 extern float ff_sine_2048[2048];
 extern float ff_sine_4096[4096];
 extern float *ff_sine_windows[6];
 int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
 void ff_imdct_calc_c(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_c(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_3dn(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_3dn(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_mdct_calc(MDCTContext *s, FFTSample *out, const FFTSample *input);
 void ff_mdct_end(MDCTContext *s);
 /* Real Discrete Fourier Transform */
 enum RDFTransformType {
    RDFT,
    IRDFT,
    RIDFT,
    IRIDFT,
 };
 typedef struct {
    int nbits;
    int inverse;
    int sign_convention;
    /* pre/post rotation tables */
    FFTSample *tcos;
    FFTSample *tsin;
    FFTContext fft;
 } RDFTContext;
 /**
 * Sets up a real FFT.
 * @param nbits           log2 of the length of the input array
 * @param trans           the type of transform
 */
 int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans);
 void ff_rdft_calc(RDFTContext *s, FFTSample *data);
 void ff_rdft_end(RDFTContext *s);
 #define WRAPPER8_16(name8, name16)\
 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    return name8(s, dst           , src           , stride, h)\
          +name8(s, dst+8         , src+8         , stride, h);\
 }
 #define WRAPPER8_16_SQ(name8, name16)\
 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst           , src           , stride, 8);\
    score +=name8(s, dst+8         , src+8         , stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst           , src           , stride, 8);\
        score +=name8(s, dst+8         , src+8         , stride, 8);\
    }\
    return score;\
 }
 static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
 {
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN16(dst   , AV_RN16(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
 }
 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
 {
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
 }
 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
 {
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        AV_WN32(dst+4 , AV_RN32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
 }
 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
 {
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        AV_WN32(dst+4 , AV_RN32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
 }
 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
 {
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        AV_WN32(dst+4 , AV_RN32(src+4 ));
        AV_WN32(dst+8 , AV_RN32(src+8 ));
        AV_WN32(dst+12, AV_RN32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
 }
 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
 {
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        AV_WN32(dst+4 , AV_RN32(src+4 ));
        AV_WN32(dst+8 , AV_RN32(src+8 ));
        AV_WN32(dst+12, AV_RN32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
 }
 #endif /* AVCODEC_DSPUTIL_H */
--- a/apps/codecs/libatrac/fft.c
+++ b/apps/codecs/libatrac/fft.c
@ -1,374 +0,0 @@
 /*
 * FFT/IFFT transforms
 * Copyright (c) 2008 Loren Merritt
 * Copyright (c) 2002 Fabrice Bellard
 * Partly based on libdjbfft by D. J. Bernstein
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 /**
 * @file libavcodec/fft.c
 * FFT/IFFT transforms.
 */
 #include "dsputil.h"
 /* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
 DECLARE_ALIGNED_16(FFTSample, ff_cos_16[8]);
 DECLARE_ALIGNED_16(FFTSample, ff_cos_32[16]);
 DECLARE_ALIGNED_16(FFTSample, ff_cos_64[32]);
 DECLARE_ALIGNED_16(FFTSample, ff_cos_128[64]);
 DECLARE_ALIGNED_16(FFTSample, ff_cos_256[128]);
 DECLARE_ALIGNED_16(FFTSample, ff_cos_512[256]);
 DECLARE_ALIGNED_16(FFTSample, ff_cos_1024[512]);
 DECLARE_ALIGNED_16(FFTSample, ff_cos_2048[1024]);
 DECLARE_ALIGNED_16(FFTSample, ff_cos_4096[2048]);
 DECLARE_ALIGNED_16(FFTSample, ff_cos_8192[4096]);
 DECLARE_ALIGNED_16(FFTSample, ff_cos_16384[8192]);
 DECLARE_ALIGNED_16(FFTSample, ff_cos_32768[16384]);
 DECLARE_ALIGNED_16(FFTSample, ff_cos_65536[32768]);
 FFTSample *ff_cos_tabs[] = {
    ff_cos_16, ff_cos_32, ff_cos_64, ff_cos_128, ff_cos_256, ff_cos_512, ff_cos_1024,
    ff_cos_2048, ff_cos_4096, ff_cos_8192, ff_cos_16384, ff_cos_32768, ff_cos_65536,
 };
 static int split_radix_permutation(int i, int n, int inverse)
 {
    int m;
    if(n <= 2) return i&1;
    m = n >> 1;
    if(!(i&m))            return split_radix_permutation(i, m, inverse)*2;
    m >>= 1;
    if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1;
    else                  return split_radix_permutation(i, m, inverse)*4 - 1;
 }
 av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
 {
    int i, j, m, n;
    float alpha, c1, s1, s2;
    int split_radix = 1;
    int av_unused has_vectors;
    if (nbits < 2 || nbits > 16)
        goto fail;
    s->nbits = nbits;
    n = 1 << nbits;
    s->tmp_buf = NULL;
    s->exptab  = av_malloc((n / 2) * sizeof(FFTComplex));
    if (!s->exptab)
        goto fail;
    s->revtab = av_malloc(n * sizeof(uint16_t));
    if (!s->revtab)
        goto fail;
    s->inverse = inverse;
    s2 = inverse ? 1.0 : -1.0;
    s->fft_permute = ff_fft_permute_c;
    s->fft_calc    = ff_fft_calc_c;
    s->imdct_calc  = ff_imdct_calc_c;
    s->imdct_half  = ff_imdct_half_c;
    s->exptab1     = NULL;
 #if HAVE_MMX && HAVE_YASM
    has_vectors = mm_support();
    if (has_vectors & FF_MM_SSE && HAVE_SSE) {
        /* SSE for P3/P4/K8 */
        s->imdct_calc  = ff_imdct_calc_sse;
        s->imdct_half  = ff_imdct_half_sse;
        s->fft_permute = ff_fft_permute_sse;
        s->fft_calc    = ff_fft_calc_sse;
    } else if (has_vectors & FF_MM_3DNOWEXT && HAVE_AMD3DNOWEXT) {
        /* 3DNowEx for K7 */
        s->imdct_calc = ff_imdct_calc_3dn2;
        s->imdct_half = ff_imdct_half_3dn2;
        s->fft_calc   = ff_fft_calc_3dn2;
    } else if (has_vectors & FF_MM_3DNOW && HAVE_AMD3DNOW) {
        /* 3DNow! for K6-2/3 */
        s->imdct_calc = ff_imdct_calc_3dn;
        s->imdct_half = ff_imdct_half_3dn;
        s->fft_calc   = ff_fft_calc_3dn;
    }
 #elif HAVE_ALTIVEC
    has_vectors = mm_support();
    if (has_vectors & FF_MM_ALTIVEC) {
        s->fft_calc = ff_fft_calc_altivec;
        split_radix = 0;
    }
 #endif
    if (split_radix) {
        for(j=4; j<=nbits; j++) {
            int m = 1<<j;
            double freq = 2*M_PI/m;
            FFTSample *tab = ff_cos_tabs[j-4];
            for(i=0; i<=m/4; i++)
                tab[i] = cos(i*freq);
            for(i=1; i<m/4; i++)
                tab[m/2-i] = tab[i];
        }
        for(i=0; i<n; i++)
            s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i;
        s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
    } else {
        int np, nblocks, np2, l;
        FFTComplex *q;
        for(i=0; i<(n/2); i++) {
            alpha = 2 * M_PI * (float)i / (float)n;
            c1 = cos(alpha);
            s1 = sin(alpha) * s2;
            s->exptab[i].re = c1;
            s->exptab[i].im = s1;
        }
        np = 1 << nbits;
        nblocks = np >> 3;
        np2 = np >> 1;
        s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex));
        if (!s->exptab1)
            goto fail;
        q = s->exptab1;
        do {
            for(l = 0; l < np2; l += 2 * nblocks) {
                *q++ = s->exptab[l];
                *q++ = s->exptab[l + nblocks];
                q->re = -s->exptab[l].im;
                q->im = s->exptab[l].re;
                q++;
                q->re = -s->exptab[l + nblocks].im;
                q->im = s->exptab[l + nblocks].re;
                q++;
            }
            nblocks = nblocks >> 1;
        } while (nblocks != 0);
        av_freep(&s->exptab);
        /* compute bit reverse table */
        for(i=0;i<n;i++) {
            m=0;
            for(j=0;j<nbits;j++) {
                m |= ((i >> j) & 1) << (nbits-j-1);
            }
            s->revtab[i]=m;
        }
    }
    return 0;
 fail:
    av_freep(&s->revtab);
    av_freep(&s->exptab);
    av_freep(&s->exptab1);
    av_freep(&s->tmp_buf);
    return -1;
 }
 void ff_fft_permute_c(FFTContext *s, FFTComplex *z)
 {
    int j, k, np;
    FFTComplex tmp;
    const uint16_t *revtab = s->revtab;
    np = 1 << s->nbits;
    if (s->tmp_buf) {
        /* TODO: handle split-radix permute in a more optimal way, probably in-place */
        for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j];
        memcpy(z, s->tmp_buf, np * sizeof(FFTComplex));
        return;
    }
    /* reverse */
    for(j=0;j<np;j++) {
        k = revtab[j];
        if (k < j) {
            tmp = z[k];
            z[k] = z[j];
            z[j] = tmp;
        }
    }
 }
 av_cold void ff_fft_end(FFTContext *s)
 {
    av_freep(&s->revtab);
    av_freep(&s->exptab);
    av_freep(&s->exptab1);
    av_freep(&s->tmp_buf);
 }
 #define sqrthalf (float)M_SQRT1_2
 #define BF(x,y,a,b) {\
    x = a - b;\
    y = a + b;\
 }
 #define BUTTERFLIES(a0,a1,a2,a3) {\
    BF(t3, t5, t5, t1);\
    BF(a2.re, a0.re, a0.re, t5);\
    BF(a3.im, a1.im, a1.im, t3);\
    BF(t4, t6, t2, t6);\
    BF(a3.re, a1.re, a1.re, t4);\
    BF(a2.im, a0.im, a0.im, t6);\
 }
 // force loading all the inputs before storing any.
 // this is slightly slower for small data, but avoids store->load aliasing
 // for addresses separated by large powers of 2.
 #define BUTTERFLIES_BIG(a0,a1,a2,a3) {\
    FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\
    BF(t3, t5, t5, t1);\
    BF(a2.re, a0.re, r0, t5);\
    BF(a3.im, a1.im, i1, t3);\
    BF(t4, t6, t2, t6);\
    BF(a3.re, a1.re, r1, t4);\
    BF(a2.im, a0.im, i0, t6);\
 }
 #define TRANSFORM(a0,a1,a2,a3,wre,wim) {\
    t1 = a2.re * wre + a2.im * wim;\
    t2 = a2.im * wre - a2.re * wim;\
    t5 = a3.re * wre - a3.im * wim;\
    t6 = a3.im * wre + a3.re * wim;\
    BUTTERFLIES(a0,a1,a2,a3)\
 }
 #define TRANSFORM_ZERO(a0,a1,a2,a3) {\
    t1 = a2.re;\
    t2 = a2.im;\
    t5 = a3.re;\
    t6 = a3.im;\
    BUTTERFLIES(a0,a1,a2,a3)\
 }
 /* z[0...8n-1], w[1...2n-1] */
 #define PASS(name)\
 static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\
 {\
    FFTSample t1, t2, t3, t4, t5, t6;\
    int o1 = 2*n;\
    int o2 = 4*n;\
    int o3 = 6*n;\
    const FFTSample *wim = wre+o1;\
    n--;\
 \
    TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\
    TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
    do {\
        z += 2;\
        wre += 2;\
        wim -= 2;\
        TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\
        TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
    } while(--n);\
 }
 PASS(pass)
 #undef BUTTERFLIES
 #define BUTTERFLIES BUTTERFLIES_BIG
 PASS(pass_big)
 #define DECL_FFT(n,n2,n4)\
 static void fft##n(FFTComplex *z)\
 {\
    fft##n2(z);\
    fft##n4(z+n4*2);\
    fft##n4(z+n4*3);\
    pass(z,ff_cos_##n,n4/2);\
 }
 static void fft4(FFTComplex *z)
 {
    FFTSample t1, t2, t3, t4, t5, t6, t7, t8;
    BF(t3, t1, z[0].re, z[1].re);
    BF(t8, t6, z[3].re, z[2].re);
    BF(z[2].re, z[0].re, t1, t6);
    BF(t4, t2, z[0].im, z[1].im);
    BF(t7, t5, z[2].im, z[3].im);
    BF(z[3].im, z[1].im, t4, t8);
    BF(z[3].re, z[1].re, t3, t7);
    BF(z[2].im, z[0].im, t2, t5);
 }
 static void fft8(FFTComplex *z)
 {
    FFTSample t1, t2, t3, t4, t5, t6, t7, t8;
    fft4(z);
    BF(t1, z[5].re, z[4].re, -z[5].re);
    BF(t2, z[5].im, z[4].im, -z[5].im);
    BF(t3, z[7].re, z[6].re, -z[7].re);
    BF(t4, z[7].im, z[6].im, -z[7].im);
    BF(t8, t1, t3, t1);
    BF(t7, t2, t2, t4);
    BF(z[4].re, z[0].re, z[0].re, t1);
    BF(z[4].im, z[0].im, z[0].im, t2);
    BF(z[6].re, z[2].re, z[2].re, t7);
    BF(z[6].im, z[2].im, z[2].im, t8);
    TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf);
 }
 #if !CONFIG_SMALL
 static void fft16(FFTComplex *z)
 {
    FFTSample t1, t2, t3, t4, t5, t6;
    fft8(z);
    fft4(z+8);
    fft4(z+12);
    TRANSFORM_ZERO(z[0],z[4],z[8],z[12]);
    TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf);
    TRANSFORM(z[1],z[5],z[9],z[13],ff_cos_16[1],ff_cos_16[3]);
    TRANSFORM(z[3],z[7],z[11],z[15],ff_cos_16[3],ff_cos_16[1]);
 }
 #else
 DECL_FFT(16,8,4)
 #endif
 DECL_FFT(32,16,8)
 DECL_FFT(64,32,16)
 DECL_FFT(128,64,32)
 DECL_FFT(256,128,64)
 DECL_FFT(512,256,128)
 #if !CONFIG_SMALL
 #define pass pass_big
 #endif
 DECL_FFT(1024,512,256)
 DECL_FFT(2048,1024,512)
 DECL_FFT(4096,2048,1024)
 DECL_FFT(8192,4096,2048)
 DECL_FFT(16384,8192,4096)
 DECL_FFT(32768,16384,8192)
 DECL_FFT(65536,32768,16384)
 static void (*fft_dispatch[])(FFTComplex*) = {
    fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024,
    fft2048, fft4096, fft8192, fft16384, fft32768, fft65536,
 };
 void ff_fft_calc_c(FFTContext *s, FFTComplex *z)
 {
    fft_dispatch[s->nbits-2](z);
 }
--- a/apps/codecs/libatrac/fixp_math.c
+++ b/apps/codecs/libatrac/fixp_math.c
@ -0,0 +1,66 @@
 #include "fixp_math.h"
 inline int32_t fixmul31(int32_t x, int32_t y)
 {
    int64_t temp;
    temp = x;
    temp *= y;
    temp >>= 31;        //16+31-16 = 31 bits
    return (int32_t)temp;
 }
 /*
 * Fast integer square root adapted from algorithm, 
 * Martin Guy @ UKC, June 1985.
 * Originally from a book on programming abaci by Mr C. Woo.
 * This is taken from :
 * http://wiki.forum.nokia.com/index.php/How_to_use_fixed_point_maths#How_to_get_square_root_for_integers
 * with a added shift up of the result by 8 bits to return result in 16.16 fixed-point representation.
 */
 inline int32_t fastSqrt(int32_t n)
 {
   /*
    * Logically, these are unsigned. 
    * We need the sign bit to test
    *	whether (op - res - one) underflowed.
    */
    int32_t op, res, one;
    op = n;
    res = 0;
    /* "one" starts at the highest power of four <= than the argument. */
    one = 1 << 30;	/* second-to-top bit set */
    while (one > op) one >>= 2;
    while (one != 0) 
    {
        if (op >= res + one) 
        {
            op = op - (res + one);
            res = res +  (one<<1);
        }
        res >>= 1;
        one >>= 2;
    }
    return(res << 8);
 }
 inline int32_t fixmul16(int32_t x, int32_t y)
 {
    int64_t temp;
    temp = x;
    temp *= y;
    temp >>= 16;
    return (int32_t)temp;
 }
 inline int32_t fixdiv16(int32_t x, int32_t y)
 {
    int64_t temp;
    temp = x << 16;
    temp /= y;
    return (int32_t)temp;
 }
--- a/apps/codecs/libatrac/fixp_math.h
+++ b/apps/codecs/libatrac/fixp_math.h
@ -0,0 +1,14 @@
 #include <stdlib.h>
 /* Macros for converting between various fixed-point representations and floating point. */
 #define ONE_16 (1L << 16)
 #define fixtof64(x)       (float)((float)(x) / (float)(1 << 16))        //does not work on int64_t!
 #define ftofix32(x)       ((int32_t)((x) * (float)(1 << 16) + ((x) < 0 ? -0.5 : 0.5)))
 #define ftofix31(x)       ((int32_t)((x) * (float)(1 << 31) + ((x) < 0 ? -0.5 : 0.5)))
 #define fix31tof64(x)     (float)((float)(x) / (float)(1 << 31))
 /* Fixed point math routines for use in atrac3.c */
 inline int32_t fixdiv16(int32_t x, int32_t y);
 inline int32_t fixmul16(int32_t x, int32_t y);
 inline int32_t fixmul31(int32_t x, int32_t y);
 inline int32_t fastSqrt(int32_t n);
--- a/apps/codecs/libatrac/mdct.c
+++ b/apps/codecs/libatrac/mdct.c
@ -1,245 +0,0 @@
 /*
 * MDCT/IMDCT transforms
 * Copyright (c) 2002 Fabrice Bellard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include "dsputil.h"
 #ifndef M_E
 #define M_E            2.7182818284590452354   /* e */
 #endif
 #ifndef M_LN2
 #define M_LN2          0.69314718055994530942  /* log_e 2 */
 #endif
 #ifndef M_LN10
 #define M_LN10         2.30258509299404568402  /* log_e 10 */
 #endif
 #ifndef M_PI
 #define M_PI           3.14159265358979323846  /* pi */
 #endif
 #ifndef M_SQRT1_2
 #define M_SQRT1_2      0.70710678118654752440  /* 1/sqrt(2) */
 #endif
 /**
 * @file libavcodec/mdct.c
 * MDCT/IMDCT transforms.
 */
 // Generate a Kaiser-Bessel Derived Window.
 #define BESSEL_I0_ITER 50 // default: 50 iterations of Bessel I0 approximation
 av_cold void ff_kbd_window_init(float *window, float alpha, int n)
 {
   int i, j;
   double sum = 0.0, bessel, tmp;
   double local_window[n];
   double alpha2 = (alpha * M_PI / n) * (alpha * M_PI / n);
   for (i = 0; i < n; i++) {
       tmp = i * (n - i) * alpha2;
       bessel = 1.0;
       for (j = BESSEL_I0_ITER; j > 0; j--)
           bessel = bessel * tmp / (j * j) + 1;
       sum += bessel;
       local_window[i] = sum;
   }
   sum++;
   for (i = 0; i < n; i++)
       window[i] = sqrt(local_window[i] / sum);
 }
 DECLARE_ALIGNED(16, float, ff_sine_128 [ 128]);
 DECLARE_ALIGNED(16, float, ff_sine_256 [ 256]);
 DECLARE_ALIGNED(16, float, ff_sine_512 [ 512]);
 DECLARE_ALIGNED(16, float, ff_sine_1024[1024]);
 DECLARE_ALIGNED(16, float, ff_sine_2048[2048]);
 DECLARE_ALIGNED(16, float, ff_sine_4096[4096]);
 float *ff_sine_windows[6] = {
    ff_sine_128, ff_sine_256, ff_sine_512, ff_sine_1024, ff_sine_2048, ff_sine_4096
 };
 // Generate a sine window.
 av_cold void ff_sine_window_init(float *window, int n) {
    int i;
    for(i = 0; i < n; i++)
        window[i] = sinf((i + 0.5) * (M_PI / (2.0 * n)));
 }
 /**
 * init MDCT or IMDCT computation.
 */
 av_cold int ff_mdct_init(MDCTContext *s, int nbits, int inverse)
 {
    int n, n4, i;
    double alpha;
    memset(s, 0, sizeof(*s));
    n = 1 << nbits;
    s->nbits = nbits;
    s->n = n;
    n4 = n >> 2;
    s->tcos = av_malloc(n4 * sizeof(FFTSample));
    if (!s->tcos)
        goto fail;
    s->tsin = av_malloc(n4 * sizeof(FFTSample));
    if (!s->tsin)
        goto fail;
    for(i=0;i<n4;i++) {
        alpha = 2 * M_PI * (i + 1.0 / 8.0) / n;
        s->tcos[i] = -cos(alpha);
        s->tsin[i] = -sin(alpha);
    }
    if (ff_fft_init(&s->fft, s->nbits - 2, inverse) < 0)
        goto fail;
    return 0;
 fail:
    av_freep(&s->tcos);
    av_freep(&s->tsin);
    return -1;
 }
 /* complex multiplication: p = a * b */
 #define CMUL(pre, pim, are, aim, bre, bim) \
 {\
    FFTSample _are = (are);\
    FFTSample _aim = (aim);\
    FFTSample _bre = (bre);\
    FFTSample _bim = (bim);\
    (pre) = _are * _bre - _aim * _bim;\
    (pim) = _are * _bim + _aim * _bre;\
 }
 /**
 * Compute the middle half of the inverse MDCT of size N = 2^nbits,
 * thus excluding the parts that can be derived by symmetry
 * @param output N/2 samples
 * @param input N/2 samples
 */
 void ff_imdct_half_c(MDCTContext *s, FFTSample *output, const FFTSample *input)
 {
    int k, n8, n4, n2, n, j;
    const uint16_t *revtab = s->fft.revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    const FFTSample *in1, *in2;
    FFTComplex *z = (FFTComplex *)output;
    n = 1 << s->nbits;
    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;
    /* pre rotation */
    in1 = input;
    in2 = input + n2 - 1;
    for(k = 0; k < n4; k++) {
        j=revtab[k];
        CMUL(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]);
        in1 += 2;
        in2 -= 2;
    }
    ff_fft_calc(&s->fft, z);
    /* post rotation + reordering */
    output += n4;
    for(k = 0; k < n8; k++) {
        FFTSample r0, i0, r1, i1;
        CMUL(r0, i1, z[n8-k-1].im, z[n8-k-1].re, tsin[n8-k-1], tcos[n8-k-1]);
        CMUL(r1, i0, z[n8+k  ].im, z[n8+k  ].re, tsin[n8+k  ], tcos[n8+k  ]);
        z[n8-k-1].re = r0;
        z[n8-k-1].im = i0;
        z[n8+k  ].re = r1;
        z[n8+k  ].im = i1;
    }
 }
 /**
 * Compute inverse MDCT of size N = 2^nbits
 * @param output N samples
 * @param input N/2 samples
 */
 void ff_imdct_calc_c(MDCTContext *s, FFTSample *output, const FFTSample *input)
 {
    int k;
    int n = 1 << s->nbits;
    int n2 = n >> 1;
    int n4 = n >> 2;
    ff_imdct_half_c(s, output+n4, input);
    for(k = 0; k < n4; k++) {
        output[k] = -output[n2-k-1];
        output[n-k-1] = output[n2+k];
    }
 }
 /**
 * Compute MDCT of size N = 2^nbits
 * @param input N samples
 * @param out N/2 samples
 */
 void ff_mdct_calc(MDCTContext *s, FFTSample *out, const FFTSample *input)
 {
    int i, j, n, n8, n4, n2, n3;
    FFTSample re, im;
    const uint16_t *revtab = s->fft.revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    FFTComplex *x = (FFTComplex *)out;
    n = 1 << s->nbits;
    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;
    n3 = 3 * n4;
    /* pre rotation */
    for(i=0;i<n8;i++) {
        re = -input[2*i+3*n4] - input[n3-1-2*i];
        im = -input[n4+2*i] + input[n4-1-2*i];
        j = revtab[i];
        CMUL(x[j].re, x[j].im, re, im, -tcos[i], tsin[i]);
        re = input[2*i] - input[n2-1-2*i];
        im = -(input[n2+2*i] + input[n-1-2*i]);
        j = revtab[n8 + i];
        CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]);
    }
    ff_fft_calc(&s->fft, x);
    /* post rotation */
    for(i=0;i<n8;i++) {
        FFTSample r0, i0, r1, i1;
        CMUL(i1, r0, x[n8-i-1].re, x[n8-i-1].im, -tsin[n8-i-1], -tcos[n8-i-1]);
        CMUL(i0, r1, x[n8+i  ].re, x[n8+i  ].im, -tsin[n8+i  ], -tcos[n8+i  ]);
        x[n8-i-1].re = r0;
        x[n8-i-1].im = i0;
        x[n8+i  ].re = r1;
        x[n8+i  ].im = i1;
    }
 }
 av_cold void ff_mdct_end(MDCTContext *s)
 {
    av_freep(&s->tcos);
    av_freep(&s->tsin);
    ff_fft_end(&s->fft);
 }