1
0
Fork 0
forked from len0rd/rockbox

Submit FS#11461. Major speedup for aac he profile (PP5002 +20%, PP5020 +15%, PP5022 +19%, MCF5249 +35%, MCF5250 +80%), still not realtime on most targets though. This change does a lot of refactoring in the sbr filters and the dct, switching to our optimized codeclib fft and tweaking IRAM usage.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27358 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Andree Buschmann 2010-07-09 18:32:37 +00:00
parent f3e0207384
commit 811af5968a
7 changed files with 203 additions and 545 deletions

View file

@ -319,7 +319,7 @@ static const uint32_t pow2_tab[] ICONST_ATTR = {
UFIX_CONST(2.000000000000000,POWTBL_PRECIS)
};
static const real_t log2_tab[] ICONST_ATTR = {
static const real_t log2_tab[] ICONST_ATTR_FAAD_LARGE_IRAM = {
REAL_CONST(0.000000000000000), REAL_CONST(0.022367813028455), REAL_CONST(0.044394119358453),
REAL_CONST(0.066089190457772), REAL_CONST(0.087462841250339), REAL_CONST(0.108524456778169),
REAL_CONST(0.129283016944966), REAL_CONST(0.149747119504682), REAL_CONST(0.169925001442312),

View file

@ -26,6 +26,9 @@
**/
#include "common.h"
#include "../lib/fft.h"
#include "../lib/mdct_lookup.h"
#ifdef SBR_DEC
@ -1447,267 +1450,9 @@ void DCT2_32_unscaled(real_t *y, real_t *x)
y[17] = f286 - f285;
}
#else
#else /* #ifdef SBR_LOW_POWER */
#define n 32
#define log2n 5
// w_array_real[i] = cos(2*M_PI*i/32)
// Real parts of the stage-1 twiddle factors for the 32-point DIF FFT
// (fft_dif) below; 16 entries cover i = 0..15.  Values are pre-quantized
// to the real_t format via FRAC_CONST, which is why nominally symmetric
// entries (e.g. +/-0.1950903...) differ slightly in the low digits.
static const real_t w_array_real[] = {
FRAC_CONST(1.000000000000000), FRAC_CONST(0.980785279337272),
FRAC_CONST(0.923879528329380), FRAC_CONST(0.831469603195765),
FRAC_CONST(0.707106765732237), FRAC_CONST(0.555570210304169),
FRAC_CONST(0.382683402077046), FRAC_CONST(0.195090284503576),
FRAC_CONST(0.000000000000000), FRAC_CONST(-0.195090370246552),
FRAC_CONST(-0.382683482845162), FRAC_CONST(-0.555570282993553),
FRAC_CONST(-0.707106827549476), FRAC_CONST(-0.831469651765257),
FRAC_CONST(-0.923879561784627), FRAC_CONST(-0.980785296392607)
};
// w_array_imag[i] = sin(-2*M_PI*i/32)
// Imaginary parts of the stage-1 twiddle factors for the 32-point DIF
// FFT (fft_dif) below; 16 entries cover i = 0..15.  The negative sign
// makes these forward-transform twiddles (exp(-j*2*pi*i/32)).  As with
// w_array_real, values are pre-quantized via FRAC_CONST.
static const real_t w_array_imag[] = {
FRAC_CONST(0.000000000000000), FRAC_CONST(-0.195090327375064),
FRAC_CONST(-0.382683442461104), FRAC_CONST(-0.555570246648862),
FRAC_CONST(-0.707106796640858), FRAC_CONST(-0.831469627480512),
FRAC_CONST(-0.923879545057005), FRAC_CONST(-0.980785287864940),
FRAC_CONST(-1.000000000000000), FRAC_CONST(-0.980785270809601),
FRAC_CONST(-0.923879511601754), FRAC_CONST(-0.831469578911016),
FRAC_CONST(-0.707106734823616), FRAC_CONST(-0.555570173959476),
FRAC_CONST(-0.382683361692986), FRAC_CONST(-0.195090241632088)
};
// FFT decimation in frequency
// 4*16*2+16=128+16=144 multiplications
// 6*16*2+10*8+4*16*2=192+80+128=400 additions
//
// 32-point complex FFT, radix-2 decimation-in-frequency, with all five
// stages fully unrolled.  Real/Imag are in/out arrays of 32 real_t each,
// transformed in place.  Stages 1-2 use the full twiddle tables above;
// stages 3-5 exploit the trivial twiddles (1, -j, +/-sqrt(2)/2) to avoid
// general complex multiplies.  Unless REORDER_IN_FFT is defined, the
// result is left in bit-reversed order (the usual DIF property) and the
// caller is expected to account for that.
static void fft_dif(real_t * Real, real_t * Imag)
{
real_t w_real, w_imag; // For faster access
real_t point1_real, point1_imag, point2_real, point2_imag; // For faster access
uint32_t j, i, i2, w_index; // Counters
// First 2 stages of 32 point FFT decimation in frequency
// 4*16*2=64*2=128 multiplications
// 6*16*2=96*2=192 additions
// Stage 1 of 32 point FFT decimation in frequency
// Butterflies pair x[i] with x[i+16]; the difference leg is rotated by
// the stage-1 twiddle w_array[i].
for (i = 0; i < 16; i++)
{
point1_real = Real[i];
point1_imag = Imag[i];
i2 = i+16;
point2_real = Real[i2];
point2_imag = Imag[i2];
w_real = w_array_real[i];
w_imag = w_array_imag[i];
// temp1 = x[i] - x[i2]
point1_real -= point2_real;
point1_imag -= point2_imag;
// x[i1] = x[i] + x[i2]
Real[i] += point2_real;
Imag[i] += point2_imag;
// x[i2] = (x[i] - x[i2]) * w
Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag));
Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real));
}
// Stage 2 of 32 point FFT decimation in frequency
// Two half-size butterflies per iteration (j and j+16), stride 8,
// sharing the twiddle w_array[2*j].
for (j = 0, w_index = 0; j < 8; j++, w_index += 2)
{
w_real = w_array_real[w_index];
w_imag = w_array_imag[w_index];
i = j;
point1_real = Real[i];
point1_imag = Imag[i];
i2 = i+8;
point2_real = Real[i2];
point2_imag = Imag[i2];
// temp1 = x[i] - x[i2]
point1_real -= point2_real;
point1_imag -= point2_imag;
// x[i1] = x[i] + x[i2]
Real[i] += point2_real;
Imag[i] += point2_imag;
// x[i2] = (x[i] - x[i2]) * w
Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag));
Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real));
i = j+16;
point1_real = Real[i];
point1_imag = Imag[i];
i2 = i+8;
point2_real = Real[i2];
point2_imag = Imag[i2];
// temp1 = x[i] - x[i2]
point1_real -= point2_real;
point1_imag -= point2_imag;
// x[i1] = x[i] + x[i2]
Real[i] += point2_real;
Imag[i] += point2_imag;
// x[i2] = (x[i] - x[i2]) * w
Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag));
Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real));
}
// Stage 3 of 32 point FFT decimation in frequency
// 2*4*2=16 multiplications
// 4*4*2+6*4*2=10*8=80 additions
// Stride-4 butterflies, split into four loops by residue (i mod 8) so
// each loop uses one fixed, trivial twiddle and skips the general
// complex multiply.
// Residue 0: twiddle = 1, plain add/subtract butterfly.
for (i = 0; i < n; i += 8)
{
i2 = i+4;
point1_real = Real[i];
point1_imag = Imag[i];
point2_real = Real[i2];
point2_imag = Imag[i2];
// out[i1] = point1 + point2
Real[i] += point2_real;
Imag[i] += point2_imag;
// out[i2] = point1 - point2
Real[i2] = point1_real - point2_real;
Imag[i2] = point1_imag - point2_imag;
}
w_real = w_array_real[4]; // = sqrt(2)/2
// w_imag = -w_real; // = w_array_imag[4]; // = -sqrt(2)/2
// Residue 1: twiddle = (1 - j)*sqrt(2)/2, reduced to two real multiplies.
for (i = 1; i < n; i += 8)
{
i2 = i+4;
point1_real = Real[i];
point1_imag = Imag[i];
point2_real = Real[i2];
point2_imag = Imag[i2];
// temp1 = x[i] - x[i2]
point1_real -= point2_real;
point1_imag -= point2_imag;
// x[i1] = x[i] + x[i2]
Real[i] += point2_real;
Imag[i] += point2_imag;
// x[i2] = (x[i] - x[i2]) * w
Real[i2] = MUL_F(point1_real+point1_imag, w_real);
Imag[i2] = MUL_F(point1_imag-point1_real, w_real);
}
// Residue 2: twiddle = -j, multiply replaced by a real/imag swap+negate.
for (i = 2; i < n; i += 8)
{
i2 = i+4;
point1_real = Real[i];
point1_imag = Imag[i];
point2_real = Real[i2];
point2_imag = Imag[i2];
// x[i] = x[i] + x[i2]
Real[i] += point2_real;
Imag[i] += point2_imag;
// x[i2] = (x[i] - x[i2]) * (-i)
Real[i2] = point1_imag - point2_imag;
Imag[i2] = point2_real - point1_real;
}
w_real = w_array_real[12]; // = -sqrt(2)/2
// w_imag = w_real; // = w_array_imag[12]; // = -sqrt(2)/2
// Residue 3: twiddle = (-1 - j)*sqrt(2)/2, again two real multiplies.
for (i = 3; i < n; i += 8)
{
i2 = i+4;
point1_real = Real[i];
point1_imag = Imag[i];
point2_real = Real[i2];
point2_imag = Imag[i2];
// temp1 = x[i] - x[i2]
point1_real -= point2_real;
point1_imag -= point2_imag;
// x[i1] = x[i] + x[i2]
Real[i] += point2_real;
Imag[i] += point2_imag;
// x[i2] = (x[i] - x[i2]) * w
Real[i2] = MUL_F(point1_real-point1_imag, w_real);
Imag[i2] = MUL_F(point1_real+point1_imag, w_real);
}
// Stage 4 of 32 point FFT decimation in frequency (no multiplications)
// 16*4=64 additions
// Stride-2 butterflies; even offsets use twiddle 1, odd offsets use -j.
for (i = 0; i < n; i += 4)
{
i2 = i+2;
point1_real = Real[i];
point1_imag = Imag[i];
point2_real = Real[i2];
point2_imag = Imag[i2];
// x[i1] = x[i] + x[i2]
Real[i] += point2_real;
Imag[i] += point2_imag;
// x[i2] = x[i] - x[i2]
Real[i2] = point1_real - point2_real;
Imag[i2] = point1_imag - point2_imag;
}
for (i = 1; i < n; i += 4)
{
i2 = i+2;
point1_real = Real[i];
point1_imag = Imag[i];
point2_real = Real[i2];
point2_imag = Imag[i2];
// x[i] = x[i] + x[i2]
Real[i] += point2_real;
Imag[i] += point2_imag;
// x[i2] = (x[i] - x[i2]) * (-i)
Real[i2] = point1_imag - point2_imag;
Imag[i2] = point2_real - point1_real;
}
// Stage 5 of 32 point FFT decimation in frequency (no multiplications)
// 16*4=64 additions
// Final adjacent-pair butterflies; all twiddles are 1 at this stage.
for (i = 0; i < n; i += 2)
{
i2 = i+1;
point1_real = Real[i];
point1_imag = Imag[i];
point2_real = Real[i2];
point2_imag = Imag[i2];
// out[i1] = point1 + point2
Real[i] += point2_real;
Imag[i] += point2_imag;
// out[i2] = point1 - point2
Real[i2] = point1_real - point2_real;
Imag[i2] = point1_imag - point2_imag;
}
// Without this optional reorder the output stays bit-reversed.
#ifdef REORDER_IN_FFT
FFTReorder(Real, Imag);
#endif // #ifdef REORDER_IN_FFT
}
#undef n
#undef log2n
static const real_t dct4_64_tab[] = {
static const real_t dct4_64_tab[] ICONST_ATTR = {
COEF_CONST(0.999924719333649), COEF_CONST(0.998118102550507),
COEF_CONST(0.993906974792480), COEF_CONST(0.987301409244537),
COEF_CONST(0.978317379951477), COEF_CONST(0.966976463794708),
@ -1806,57 +1551,65 @@ static const real_t dct4_64_tab[] = {
COEF_CONST(0.897167563438416), COEF_CONST(0.949727773666382)
};
/* size 64 only! */
void dct4_kernel(real_t * in_real, real_t * in_imag, real_t * out_real, real_t * out_imag)
{
// Tables with bit reverse values for 5 bits, bit reverse of i at i-th position
const uint8_t bit_rev_tab[32] = { 0,16,8,24,4,20,12,28,2,18,10,26,6,22,14,30,1,17,9,25,5,21,13,29,3,19,11,27,7,23,15,31 };
uint16_t i, i_rev;
// Table adapted from codeclib to fit into IRAM
const uint32_t dct4_revtab[32] ICONST_ATTR = {
0, 24, 12, 22, 6, 30, 11, 19, 3, 27, 15, 21, 5, 29, 9, 17,
1, 25, 13, 23, 7, 31, 10, 18, 2, 26, 14, 20, 4, 28, 8, 16};
/* Step 2: modulate */
/* size 64 only! */
void dct4_kernel(real_t *real, real_t *imag)
{
uint32_t i, idx;
real_t x_re, x_im, tmp;
FFTComplex xc[32]; /* used for calling codeclib's fft implementation */
/* Step 2: modulate and pre-rotate for codeclib's fft implementation */
// 3*32=96 multiplications
// 3*32=96 additions
for (i = 0; i < 32; i++)
{
real_t x_re, x_im, tmp;
x_re = in_real[i];
x_im = in_imag[i];
idx = dct4_revtab[i];
x_re = real[i];
x_im = imag[i];
tmp = MUL_C(x_re + x_im, dct4_64_tab[i ]);
in_real[i] = MUL_C(x_im, dct4_64_tab[i + 64]) + tmp;
in_imag[i] = MUL_C(x_re, dct4_64_tab[i + 32]) + tmp;
xc[idx].re = MUL_C(x_im , dct4_64_tab[i + 64]) + tmp;
xc[idx].im = MUL_C(x_re , dct4_64_tab[i + 32]) + tmp;
}
/* Step 3: FFT, but with output in bit reverse order */
fft_dif(in_real, in_imag);
/* Step 3: FFT (codeclib's implementation) */
ff_fft_calc_c(5, xc);
/* Step 4: modulate + bitreverse reordering */
/* Step 4: modulate + reordering */
// 3*31+2=95 multiplications
// 3*31+2=95 additions
for (i = 0; i < 16; i++)
x_re = xc[0].re;
x_im = xc[0].im;
tmp = MUL_C(x_re + x_im, dct4_64_tab[0 + 3*32]);
real[0] = MUL_C(x_im , dct4_64_tab[0 + 5*32]) + tmp;
imag[0] = MUL_C(x_re , dct4_64_tab[0 + 4*32]) + tmp;
for (i = 1; i < 16; i++)
{
real_t x_re, x_im, tmp;
i_rev = bit_rev_tab[i];
x_re = in_real[i_rev];
x_im = in_imag[i_rev];
idx = 32-i;
x_re = xc[idx].re;
x_im = xc[idx].im;
tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]);
out_real[i] = MUL_C(x_im, dct4_64_tab[i + 5*32]) + tmp;
out_imag[i] = MUL_C(x_re, dct4_64_tab[i + 4*32]) + tmp;
real[i] = MUL_C(x_im , dct4_64_tab[i + 5*32]) + tmp;
imag[i] = MUL_C(x_re , dct4_64_tab[i + 4*32]) + tmp;
}
// i = 16, i_rev = 1 = rev(16);
out_imag[16] = MUL_C(in_imag[1] - in_real[1], dct4_64_tab[16 + 3*32]);
out_real[16] = MUL_C(in_real[1] + in_imag[1], dct4_64_tab[16 + 3*32]);
// i = 16, idx = 16 = reorder_tab[16];
x_re = xc[16].re;
x_im = xc[16].im;
imag[16] = MUL_C(x_im - x_re, dct4_64_tab[16 + 3*32]);
real[16] = MUL_C(x_re + x_im, dct4_64_tab[16 + 3*32]);
for (i = 17; i < 32; i++)
{
real_t x_re, x_im, tmp;
i_rev = bit_rev_tab[i];
x_re = in_real[i_rev];
x_im = in_imag[i_rev];
idx = 32-i;
x_re = xc[idx].re;
x_im = xc[idx].im;
tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]);
out_real[i] = MUL_C(x_im, dct4_64_tab[i + 5*32]) + tmp;
out_imag[i] = MUL_C(x_re, dct4_64_tab[i + 4*32]) + tmp;
real[i] = MUL_C(x_im , dct4_64_tab[i + 5*32]) + tmp;
imag[i] = MUL_C(x_re , dct4_64_tab[i + 4*32]) + tmp;
}
}
void DST4_32(real_t *y, real_t *x)
@ -2266,6 +2019,6 @@ void DST4_32(real_t *y, real_t *x)
y[0] = MUL_R(REAL_CONST(20.3738781672314530), f304);
}
#endif
#endif /* #ifdef SBR_LOW_POWER */
#endif
#endif /* #ifdef SBR_DEC */

View file

@ -32,7 +32,7 @@
extern "C" {
#endif
void dct4_kernel(real_t * in_real, real_t * in_imag, real_t * out_real, real_t * out_imag);
void dct4_kernel(real_t *real, real_t *imag);
void DCT3_32_unscaled(real_t *y, real_t *x);
void DCT4_32(real_t *y, real_t *x);

View file

@ -454,6 +454,7 @@ uint8_t sbrDecodeCoupleFrame(sbr_info *sbr, real_t *left_chan, real_t *right_cha
}
ALIGN qmf_t X[MAX_NTSR][64];
uint8_t sbrDecodeSingleFrame(sbr_info *sbr, real_t *channel,
const uint8_t just_seeked, const uint8_t downSampledSBR)
{
@ -520,9 +521,8 @@ uint8_t sbrDecodeSingleFrame(sbr_info *sbr, real_t *channel,
return 0;
}
ALIGN qmf_t X_left[38][64];// = {{0}};
ALIGN qmf_t X_right[38][64];// = {{0}}; /* must set this to 0 */
ALIGN qmf_t X_left[MAX_NTSRHFG][64];// = {{0}};
ALIGN qmf_t X_right[MAX_NTSRHFG][64];// = {{0}}; /* must set this to 0 */
#if (defined(PS_DEC) || defined(DRM_PS))
uint8_t sbrDecodeSingleFramePS(sbr_info *sbr, real_t *left_channel, real_t *right_channel,

View file

@ -38,6 +38,16 @@
#include "sbr_qmf_c.h"
#include "sbr_syntax.h"
#ifdef FIXED_POINT
#define FAAD_SYNTHESIS_SCALE(X) ((X)>>1)
#define FAAD_ANALYSIS_SCALE1(X) ((X)>>4)
#define FAAD_ANALYSIS_SCALE2(X) ((X))
#else
#define FAAD_ANALYSIS_SCALE1(X) ((X)*scale)
#define FAAD_ANALYSIS_SCALE1(X) ((X))
#define FAAD_ANALYSIS_SCALE2(X) (2.*(X))
#endif
qmfa_info *qmfa_init(uint8_t channels)
{
qmfa_info *qmfa = (qmfa_info*)faad_malloc(sizeof(qmfa_info));
@ -68,40 +78,44 @@ void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input,
{
ALIGN real_t u[64];
#ifndef SBR_LOW_POWER
static ALIGN real_t in_real[32], in_imag[32], out_real[32], out_imag[32];
ALIGN real_t real[32];
ALIGN real_t imag[32];
#else
ALIGN real_t y[32];
#endif
uint16_t in = 0;
uint8_t l;
qmf_t *pX;
uint32_t in = 0;
uint32_t l, idx0, idx1;
/* qmf subsample l */
for (l = 0; l < sbr->numTimeSlotsRate; l++)
{
int16_t n;
int32_t n;
/* shift input buffer x */
/* input buffer is not shifted anymore, x is implemented as double ringbuffer */
//memmove(qmfa->x + 32, qmfa->x, (320-32)*sizeof(real_t));
/* add new samples to input buffer x */
for (n = 32 - 1; n >= 0; n--)
idx0 = qmfa->x_index + 31; idx1 = idx0 + 320;
for (n = 32 - 1; n >= 0; n-=4)
{
#ifdef FIXED_POINT
qmfa->x[qmfa->x_index + n] = qmfa->x[qmfa->x_index + n + 320] = (input[in++]) >> 4;
#else
qmfa->x[qmfa->x_index + n] = qmfa->x[qmfa->x_index + n + 320] = input[in++];
#endif
qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]);
qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]);
qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]);
qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]);
}
/* window and summation to create array u */
for (n = 0; n < 64; n++)
{
u[n] = MUL_F(qmfa->x[qmfa->x_index + n], qmf_c[2*n]) +
MUL_F(qmfa->x[qmfa->x_index + n + 64], qmf_c[2*(n + 64)]) +
MUL_F(qmfa->x[qmfa->x_index + n + 128], qmf_c[2*(n + 128)]) +
MUL_F(qmfa->x[qmfa->x_index + n + 192], qmf_c[2*(n + 192)]) +
MUL_F(qmfa->x[qmfa->x_index + n + 256], qmf_c[2*(n + 256)]);
idx0 = qmfa->x_index + n; idx1 = n * 2;
u[n] = FAAD_ANALYSIS_SCALE1(
MUL_F(qmfa->x[idx0 ], qmf_c[idx1]) +
MUL_F(qmfa->x[idx0 + 64], qmf_c[idx1 + 2 * 64]) +
MUL_F(qmfa->x[idx0 + 128], qmf_c[idx1 + 2 * 128]) +
MUL_F(qmfa->x[idx0 + 192], qmf_c[idx1 + 2 * 192]) +
MUL_F(qmfa->x[idx0 + 256], qmf_c[idx1 + 2 * 256]));
}
/* update ringbuffer index */
@ -123,64 +137,52 @@ void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input,
{
if (n < kx)
{
#ifdef FIXED_POINT
QMF_RE(X[l + offset][n]) = u[n] /*<< 1*/;
#else
QMF_RE(X[l + offset][n]) = 2. * u[n];
#endif
QMF_RE(X[l + offset][n]) = FAAD_ANALYSIS_SCALE2(u[n]);
} else {
QMF_RE(X[l + offset][n]) = 0;
}
}
#else
#else /* #ifdef SBR_LOW_POWER */
// Reordering of data moved from DCT_IV to here
in_imag[31] = u[1];
in_real[0] = u[0];
for (n = 1; n < 31; n++)
idx0 = 30; idx1 = 63;
imag[31] = u[ 1]; real[ 0] = u[ 0];
for (n = 1; n < 31; n+=3)
{
in_imag[31 - n] = u[n+1];
in_real[n] = -u[64-n];
imag[idx0--] = u[n+1]; real[n ] = -u[idx1--];
imag[idx0--] = u[n+2]; real[n+1] = -u[idx1--];
imag[idx0--] = u[n+3]; real[n+2] = -u[idx1--];
}
in_imag[0] = u[32];
in_real[31] = -u[33];
imag[ 0] = u[32]; real[31] = -u[33];
// dct4_kernel is DCT_IV without reordering which is done before and after FFT
dct4_kernel(in_real, in_imag, out_real, out_imag);
dct4_kernel(real, imag);
// Reordering of data moved from DCT_IV to here
for (n = 0; n < 16; n++) {
if (2*n+1 < kx) {
#ifdef FIXED_POINT
QMF_RE(X[l + offset][2*n]) = out_real[n];
QMF_IM(X[l + offset][2*n]) = out_imag[n];
QMF_RE(X[l + offset][2*n+1]) = -out_imag[31-n];
QMF_IM(X[l + offset][2*n+1]) = -out_real[31-n];
#else
QMF_RE(X[l + offset][2*n]) = 2. * out_real[n];
QMF_IM(X[l + offset][2*n]) = 2. * out_imag[n];
QMF_RE(X[l + offset][2*n+1]) = -2. * out_imag[31-n];
QMF_IM(X[l + offset][2*n+1]) = -2. * out_real[31-n];
#endif
} else {
if (2*n < kx) {
#ifdef FIXED_POINT
QMF_RE(X[l + offset][2*n]) = out_real[n];
QMF_IM(X[l + offset][2*n]) = out_imag[n];
#else
QMF_RE(X[l + offset][2*n]) = 2. * out_real[n];
QMF_IM(X[l + offset][2*n]) = 2. * out_imag[n];
#endif
/* Step 1: Calculate all non-zero pairs */
pX = X[l + offset];
for (n = 0; n < kx/2; n++) {
idx0 = 2*n; idx1 = idx0 + 1;
QMF_RE(pX[idx0]) = FAAD_ANALYSIS_SCALE2( real[n ]);
QMF_IM(pX[idx0]) = FAAD_ANALYSIS_SCALE2( imag[n ]);
QMF_RE(pX[idx1]) = FAAD_ANALYSIS_SCALE2(-imag[31-n]);
QMF_IM(pX[idx1]) = FAAD_ANALYSIS_SCALE2(-real[31-n]);
}
else {
QMF_RE(X[l + offset][2*n]) = 0;
QMF_IM(X[l + offset][2*n]) = 0;
/* Step 2: Calculate a single pair with half zero'ed */
if (kx&1) {
idx0 = 2*n; idx1 = idx0 + 1;
QMF_RE(pX[idx0]) = FAAD_ANALYSIS_SCALE2( real[n]);
QMF_IM(pX[idx0]) = FAAD_ANALYSIS_SCALE2( imag[n]);
QMF_RE(pX[idx1]) = QMF_IM(pX[idx1]) = 0;
n++;
}
QMF_RE(X[l + offset][2*n+1]) = 0;
QMF_IM(X[l + offset][2*n+1]) = 0;
/* Step 3: All other are zero'ed */
for (; n < 16; n++) {
idx0 = 2*n; idx1 = idx0 + 1;
QMF_RE(pX[idx0]) = QMF_IM(pX[idx0]) = 0;
QMF_RE(pX[idx1]) = QMF_IM(pX[idx1]) = 0;
}
}
#endif
#endif /* #ifdef SBR_LOW_POWER */
}
}
@ -384,17 +386,26 @@ void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
qmfs->v_index = (1280-128);
}
}
#else
#else /* #ifdef SBR_LOW_POWER */
#define FAAD_CMPLX_PRETWIDDLE_SUB(k) \
(MUL_F(QMF_RE(X[l][k]), RE(qmf32_pre_twiddle[k])) - \
MUL_F(QMF_IM(X[l][k]), IM(qmf32_pre_twiddle[k]))) \
#define FAAD_CMPLX_PRETWIDDLE_ADD(k) \
(MUL_F(QMF_IM(X[l][k]), RE(qmf32_pre_twiddle[k])) + \
MUL_F(QMF_RE(X[l][k]), IM(qmf32_pre_twiddle[k]))) \
void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
real_t *output)
{
ALIGN real_t x1[32], x2[32];
ALIGN real_t x1[32];
ALIGN real_t x2[32];
#ifndef FIXED_POINT
real_t scale = 1.f/64.f;
#endif
int16_t n, k, out = 0;
uint8_t l;
int32_t n, k, idx0, idx1, out = 0;
uint32_t l;
/* qmf subsample l */
for (l = 0; l < sbr->numTimeSlotsRate; l++)
@ -405,43 +416,43 @@ void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
/* calculate 64 samples */
/* complex pre-twiddle */
for (k = 0; k < 32; k++)
for (k = 0; k < 32;)
{
x1[k] = MUL_F(QMF_RE(X[l][k]), RE(qmf32_pre_twiddle[k])) - MUL_F(QMF_IM(X[l][k]), IM(qmf32_pre_twiddle[k]));
x2[k] = MUL_F(QMF_IM(X[l][k]), RE(qmf32_pre_twiddle[k])) + MUL_F(QMF_RE(X[l][k]), IM(qmf32_pre_twiddle[k]));
#ifndef FIXED_POINT
x1[k] *= scale;
x2[k] *= scale;
#else
x1[k] >>= 1;
x2[k] >>= 1;
#endif
x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++;
x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++;
x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++;
x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++;
}
/* transform */
DCT4_32(x1, x1);
DST4_32(x2, x2);
for (n = 0; n < 32; n++)
idx0 = qmfs->v_index;
idx1 = qmfs->v_index + 63;
for (n = 0; n < 32; n+=2)
{
qmfs->v[qmfs->v_index + n] = qmfs->v[qmfs->v_index + 640 + n] = -x1[n] + x2[n];
qmfs->v[qmfs->v_index + 63 - n] = qmfs->v[qmfs->v_index + 640 + 63 - n] = x1[n] + x2[n];
qmfs->v[idx0] = qmfs->v[idx0 + 640] = -x1[n ] + x2[n ]; idx0++;
qmfs->v[idx1] = qmfs->v[idx1 + 640] = x1[n ] + x2[n ]; idx1--;
qmfs->v[idx0] = qmfs->v[idx0 + 640] = -x1[n+1] + x2[n+1]; idx0++;
qmfs->v[idx1] = qmfs->v[idx1 + 640] = x1[n+1] + x2[n+1]; idx1--;
}
/* calculate 32 output samples and window */
for (k = 0; k < 32; k++)
{
output[out++] = MUL_F(qmfs->v[qmfs->v_index + k], qmf_c[2*k]) +
MUL_F(qmfs->v[qmfs->v_index + 96 + k], qmf_c[64 + 2*k]) +
MUL_F(qmfs->v[qmfs->v_index + 128 + k], qmf_c[128 + 2*k]) +
MUL_F(qmfs->v[qmfs->v_index + 224 + k], qmf_c[192 + 2*k]) +
MUL_F(qmfs->v[qmfs->v_index + 256 + k], qmf_c[256 + 2*k]) +
MUL_F(qmfs->v[qmfs->v_index + 352 + k], qmf_c[320 + 2*k]) +
MUL_F(qmfs->v[qmfs->v_index + 384 + k], qmf_c[384 + 2*k]) +
MUL_F(qmfs->v[qmfs->v_index + 480 + k], qmf_c[448 + 2*k]) +
MUL_F(qmfs->v[qmfs->v_index + 512 + k], qmf_c[512 + 2*k]) +
MUL_F(qmfs->v[qmfs->v_index + 608 + k], qmf_c[576 + 2*k]);
idx0 = qmfs->v_index + k; idx1 = 2*k;
output[out++] = FAAD_SYNTHESIS_SCALE(
MUL_F(qmfs->v[idx0 ], qmf_c[idx1 ]) +
MUL_F(qmfs->v[idx0 + 96], qmf_c[idx1 + 64]) +
MUL_F(qmfs->v[idx0 + 128], qmf_c[idx1 + 128]) +
MUL_F(qmfs->v[idx0 + 224], qmf_c[idx1 + 192]) +
MUL_F(qmfs->v[idx0 + 256], qmf_c[idx1 + 256]) +
MUL_F(qmfs->v[idx0 + 352], qmf_c[idx1 + 320]) +
MUL_F(qmfs->v[idx0 + 384], qmf_c[idx1 + 384]) +
MUL_F(qmfs->v[idx0 + 480], qmf_c[idx1 + 448]) +
MUL_F(qmfs->v[idx0 + 512], qmf_c[idx1 + 512]) +
MUL_F(qmfs->v[idx0 + 608], qmf_c[idx1 + 576]));
}
/* update ringbuffer index */
@ -454,30 +465,17 @@ void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
real_t *output)
{
// ALIGN real_t x1[64], x2[64];
#ifndef SBR_LOW_POWER
static ALIGN real_t in_real1[32], in_imag1[32], out_real1[32], out_imag1[32];
static ALIGN real_t in_real2[32], in_imag2[32], out_real2[32], out_imag2[32];
#endif
ALIGN real_t real1[32];
ALIGN real_t imag1[32];
ALIGN real_t real2[32];
ALIGN real_t imag2[32];
qmf_t * pX;
real_t * pring_buffer_1, * pring_buffer_3;
// real_t * ptemp_1, * ptemp_2;
#ifdef PREFER_POINTERS
// These pointers are used if target platform has autoinc address generators
real_t * pring_buffer_2, * pring_buffer_4;
real_t * pring_buffer_5, * pring_buffer_6;
real_t * pring_buffer_7, * pring_buffer_8;
real_t * pring_buffer_9, * pring_buffer_10;
const real_t * pqmf_c_1, * pqmf_c_2, * pqmf_c_3, * pqmf_c_4;
const real_t * pqmf_c_5, * pqmf_c_6, * pqmf_c_7, * pqmf_c_8;
const real_t * pqmf_c_9, * pqmf_c_10;
#endif // #ifdef PREFER_POINTERS
real_t * p_buf_1, * p_buf_3;
#ifndef FIXED_POINT
real_t scale = 1.f/64.f;
#endif
int16_t n, k, out = 0;
uint8_t l;
int32_t n, k, idx0, idx1, out = 0;
uint32_t l;
/* qmf subsample l */
for (l = 0; l < sbr->numTimeSlotsRate; l++)
@ -487,139 +485,46 @@ void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
//memmove(qmfs->v + 128, qmfs->v, (1280-128)*sizeof(real_t));
/* calculate 128 samples */
#ifndef FIXED_POINT
pX = X[l];
in_imag1[31] = scale*QMF_RE(pX[1]);
in_real1[0] = scale*QMF_RE(pX[0]);
in_imag2[31] = scale*QMF_IM(pX[63-1]);
in_real2[0] = scale*QMF_IM(pX[63-0]);
for (k = 1; k < 31; k++)
for (k = 0; k < 32; k++)
{
in_imag1[31 - k] = scale*QMF_RE(pX[2*k + 1]);
in_real1[ k] = scale*QMF_RE(pX[2*k ]);
in_imag2[31 - k] = scale*QMF_IM(pX[63 - (2*k + 1)]);
in_real2[ k] = scale*QMF_IM(pX[63 - (2*k )]);
idx0 = 2*k; idx1 = idx0+1;
real1[ k] = QMF_RE(pX[idx0]); imag2[ k] = QMF_IM(pX[idx0]);
imag1[31-k] = QMF_RE(pX[idx1]); real2[31-k] = QMF_IM(pX[idx1]);
}
in_imag1[0] = scale*QMF_RE(pX[63]);
in_real1[31] = scale*QMF_RE(pX[62]);
in_imag2[0] = scale*QMF_IM(pX[63-63]);
in_real2[31] = scale*QMF_IM(pX[63-62]);
#else
pX = X[l];
in_imag1[31] = QMF_RE(pX[1]) >> 1;
in_real1[0] = QMF_RE(pX[0]) >> 1;
in_imag2[31] = QMF_IM(pX[62]) >> 1;
in_real2[0] = QMF_IM(pX[63]) >> 1;
for (k = 1; k < 31; k++)
{
in_imag1[31 - k] = QMF_RE(pX[2*k + 1]) >> 1;
in_real1[ k] = QMF_RE(pX[2*k ]) >> 1;
in_imag2[31 - k] = QMF_IM(pX[63 - (2*k + 1)]) >> 1;
in_real2[ k] = QMF_IM(pX[63 - (2*k )]) >> 1;
}
in_imag1[0] = QMF_RE(pX[63]) >> 1;
in_real1[31] = QMF_RE(pX[62]) >> 1;
in_imag2[0] = QMF_IM(pX[0]) >> 1;
in_real2[31] = QMF_IM(pX[1]) >> 1;
#endif
// dct4_kernel is DCT_IV without reordering which is done before and after FFT
dct4_kernel(in_real1, in_imag1, out_real1, out_imag1);
dct4_kernel(in_real2, in_imag2, out_real2, out_imag2);
dct4_kernel(real1, imag1);
dct4_kernel(real2, imag2);
p_buf_1 = qmfs->v + qmfs->v_index;
p_buf_3 = p_buf_1 + 1280;
pring_buffer_1 = qmfs->v + qmfs->v_index;
pring_buffer_3 = pring_buffer_1 + 1280;
#ifdef PREFER_POINTERS
pring_buffer_2 = pring_buffer_1 + 127;
pring_buffer_4 = pring_buffer_1 + (1280 + 127);
#endif // #ifdef PREFER_POINTERS
// ptemp_1 = x1;
// ptemp_2 = x2;
#ifdef PREFER_POINTERS
idx0 = 0; idx1 = 127;
for (n = 0; n < 32; n++)
{
//real_t x1 = *ptemp_1++;
//real_t x2 = *ptemp_2++;
// pring_buffer_3 and pring_buffer_4 are needed only for double ring buffer
*pring_buffer_1++ = *pring_buffer_3++ = out_real2[n] - out_real1[n];
*pring_buffer_2-- = *pring_buffer_4-- = out_real2[n] + out_real1[n];
//x1 = *ptemp_1++;
//x2 = *ptemp_2++;
*pring_buffer_1++ = *pring_buffer_3++ = out_imag2[31-n] + out_imag1[31-n];
*pring_buffer_2-- = *pring_buffer_4-- = out_imag2[31-n] - out_imag1[31-n];
}
#else // #ifdef PREFER_POINTERS
for (n = 0; n < 32; n++)
{
// pring_buffer_3 and pring_buffer_4 are needed only for double ring buffer
pring_buffer_1[2*n] = pring_buffer_3[2*n] = out_real2[n] - out_real1[n];
pring_buffer_1[127-2*n] = pring_buffer_3[127-2*n] = out_real2[n] + out_real1[n];
pring_buffer_1[2*n+1] = pring_buffer_3[2*n+1] = out_imag2[31-n] + out_imag1[31-n];
pring_buffer_1[127-(2*n+1)] = pring_buffer_3[127-(2*n+1)] = out_imag2[31-n] - out_imag1[31-n];
p_buf_1[idx0] = p_buf_3[idx0] = real2[ n] - real1[ n]; idx0++;
p_buf_1[idx1] = p_buf_3[idx1] = real2[ n] + real1[ n]; idx1--;
p_buf_1[idx0] = p_buf_3[idx0] = imag2[31-n] + imag1[31-n]; idx0++;
p_buf_1[idx1] = p_buf_3[idx1] = imag2[31-n] - imag1[31-n]; idx1--;
}
#endif // #ifdef PREFER_POINTERS
pring_buffer_1 = qmfs->v + qmfs->v_index;
#ifdef PREFER_POINTERS
pring_buffer_2 = pring_buffer_1 + 192;
pring_buffer_3 = pring_buffer_1 + 256;
pring_buffer_4 = pring_buffer_1 + (256 + 192);
pring_buffer_5 = pring_buffer_1 + 512;
pring_buffer_6 = pring_buffer_1 + (512 + 192);
pring_buffer_7 = pring_buffer_1 + 768;
pring_buffer_8 = pring_buffer_1 + (768 + 192);
pring_buffer_9 = pring_buffer_1 + 1024;
pring_buffer_10 = pring_buffer_1 + (1024 + 192);
pqmf_c_1 = qmf_c;
pqmf_c_2 = qmf_c + 64;
pqmf_c_3 = qmf_c + 128;
pqmf_c_4 = qmf_c + 192;
pqmf_c_5 = qmf_c + 256;
pqmf_c_6 = qmf_c + 320;
pqmf_c_7 = qmf_c + 384;
pqmf_c_8 = qmf_c + 448;
pqmf_c_9 = qmf_c + 512;
pqmf_c_10 = qmf_c + 576;
#endif // #ifdef PREFER_POINTERS
p_buf_1 = qmfs->v + qmfs->v_index;
/* calculate 64 output samples and window */
for (k = 0; k < 64; k++)
{
#ifdef PREFER_POINTERS
output[out++] =
MUL_F(*pring_buffer_1++, *pqmf_c_1++) +
MUL_F(*pring_buffer_2++, *pqmf_c_2++) +
MUL_F(*pring_buffer_3++, *pqmf_c_3++) +
MUL_F(*pring_buffer_4++, *pqmf_c_4++) +
MUL_F(*pring_buffer_5++, *pqmf_c_5++) +
MUL_F(*pring_buffer_6++, *pqmf_c_6++) +
MUL_F(*pring_buffer_7++, *pqmf_c_7++) +
MUL_F(*pring_buffer_8++, *pqmf_c_8++) +
MUL_F(*pring_buffer_9++, *pqmf_c_9++) +
MUL_F(*pring_buffer_10++, *pqmf_c_10++);
#else // #ifdef PREFER_POINTERS
output[out++] =
MUL_F(pring_buffer_1[k+0], qmf_c[k+0]) +
MUL_F(pring_buffer_1[k+192], qmf_c[k+64]) +
MUL_F(pring_buffer_1[k+256], qmf_c[k+128]) +
MUL_F(pring_buffer_1[k+(256+192)], qmf_c[k+192]) +
MUL_F(pring_buffer_1[k+512], qmf_c[k+256]) +
MUL_F(pring_buffer_1[k+(512+192)], qmf_c[k+320]) +
MUL_F(pring_buffer_1[k+768], qmf_c[k+384]) +
MUL_F(pring_buffer_1[k+(768+192)], qmf_c[k+448]) +
MUL_F(pring_buffer_1[k+1024], qmf_c[k+512]) +
MUL_F(pring_buffer_1[k+(1024+192)], qmf_c[k+576]);
#endif // #ifdef PREFER_POINTERS
output[out++] = FAAD_SYNTHESIS_SCALE(
MUL_F(p_buf_1[k ], qmf_c[k ]) +
MUL_F(p_buf_1[k+ 192 ], qmf_c[k+ 64]) +
MUL_F(p_buf_1[k+ 256 ], qmf_c[k+128]) +
MUL_F(p_buf_1[k+ 256+192], qmf_c[k+192]) +
MUL_F(p_buf_1[k+ 512 ], qmf_c[k+256]) +
MUL_F(p_buf_1[k+ 512+192], qmf_c[k+320]) +
MUL_F(p_buf_1[k+ 768 ], qmf_c[k+384]) +
MUL_F(p_buf_1[k+ 768+192], qmf_c[k+448]) +
MUL_F(p_buf_1[k+1024 ], qmf_c[k+512]) +
MUL_F(p_buf_1[k+1024+192], qmf_c[k+576]));
}
/* update ringbuffer index */
@ -628,6 +533,6 @@ void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
qmfs->v_index = (1280 - 128);
}
}
#endif
#endif /* #ifdef SBR_LOW_POWER */
#endif
#endif /* #ifdef SBR_DEC */

View file

@ -38,7 +38,7 @@ extern "C" {
#pragma warning(disable:4244)
#endif
ALIGN static const real_t qmf_c[640] = {
ALIGN static const real_t qmf_c[640] ICONST_ATTR_FAAD_LARGE_IRAM = {
FRAC_CONST(0), FRAC_CONST(-0.00055252865047),
FRAC_CONST(-0.00056176925738), FRAC_CONST(-0.00049475180896),
FRAC_CONST(-0.00048752279712), FRAC_CONST(-0.00048937912498),

View file

@ -458,14 +458,14 @@ static INLINE real_t iquant(int16_t q, const real_t *tab, uint8_t *error)
if (q < 0)
{
/* tab contains a value for all possible q [0,8192] */
if (-q < IQ_TABLE_SIZE)
if (LIKELY(-q < IQ_TABLE_SIZE))
return -tab[-q];
*error = 17;
return 0;
} else {
/* tab contains a value for all possible q [0,8192] */
if (q < IQ_TABLE_SIZE)
if (LIKELY(q < IQ_TABLE_SIZE))
return tab[q];
*error = 17;
@ -523,17 +523,17 @@ ALIGN static const real_t pow2sf_tab[] = {
- Within a scalefactor window band, the coefficients are in ascending
spectral order.
*/
static uint8_t quant_to_spec(NeAACDecHandle hDecoder,
ic_stream *ics, int16_t *quant_data,
real_t *spec_data, uint16_t frame_len)
{
ALIGN static const real_t pow2_table[] ICONST_ATTR =
{
COEF_CONST(1.0),
COEF_CONST(1.1892071150027210667174999705605), /* 2^0.25 */
COEF_CONST(1.4142135623730950488016887242097), /* 2^0.5 */
COEF_CONST(1.4142135623730950488016887242097), /* 2^0.50 */
COEF_CONST(1.6817928305074290860622509524664) /* 2^0.75 */
};
static uint8_t quant_to_spec(NeAACDecHandle hDecoder,
ic_stream *ics, int16_t *quant_data,
real_t *spec_data, uint16_t frame_len)
{
const real_t *tab = iq_table;
(void)frame_len;