1
0
Fork 0
forked from len0rd/rockbox

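Speed up AAC-HE SBR decoding by 2% on S5L8701. Use MEM_ALIGN_ATTR on critical arrays and avoid stalls in the ARM asm code by issuing each load (alternating between r4 and r7) ahead of the multiply-accumulate that consumes it.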

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@29209 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Andree Buschmann 2011-02-05 14:29:47 +00:00
parent 62fb090ac4
commit 8b540fa9d9
3 changed files with 33 additions and 33 deletions

View file

@ -1453,7 +1453,7 @@ void DCT2_32_unscaled(real_t *y, real_t *x)
#else /* #ifdef SBR_LOW_POWER */ #else /* #ifdef SBR_LOW_POWER */
/* table for pre-processing within dct4_kernel() */ /* table for pre-processing within dct4_kernel() */
static const real_t dct4_pre_tab[] ICONST_ATTR = { static const real_t dct4_pre_tab[] ICONST_ATTR MEM_ALIGN_ATTR = {
COEF_CONST(0.999924719333649), COEF_CONST(-1.01219630241394), COEF_CONST(-0.987653195858002), COEF_CONST(0.999924719333649), COEF_CONST(-1.01219630241394), COEF_CONST(-0.987653195858002),
COEF_CONST(0.998118102550507), COEF_CONST(-1.05943882465363), COEF_CONST(-0.936797380447388), COEF_CONST(0.998118102550507), COEF_CONST(-1.05943882465363), COEF_CONST(-0.936797380447388),
COEF_CONST(0.993906974792480), COEF_CONST(-1.10412919521332), COEF_CONST(-0.883684754371643), COEF_CONST(0.993906974792480), COEF_CONST(-1.10412919521332), COEF_CONST(-0.883684754371643),
@ -1489,7 +1489,7 @@ static const real_t dct4_pre_tab[] ICONST_ATTR = {
}; };
/* table for post-processing within dct4_kernel() */ /* table for post-processing within dct4_kernel() */
static const real_t dct4_post_tab[] ICONST_ATTR = { static const real_t dct4_post_tab[] ICONST_ATTR MEM_ALIGN_ATTR = {
COEF_CONST(1 ), COEF_CONST(-1 ), COEF_CONST(-1 ), COEF_CONST(1 ), COEF_CONST(-1 ), COEF_CONST(-1 ),
COEF_CONST(0.998795449733734), COEF_CONST(-1.04786312580109), COEF_CONST(-0.949727773666382), COEF_CONST(0.998795449733734), COEF_CONST(-1.04786312580109), COEF_CONST(-0.949727773666382),
COEF_CONST(0.995184719562531), COEF_CONST(-1.09320187568665), COEF_CONST(-0.897167563438416), COEF_CONST(0.995184719562531), COEF_CONST(-1.09320187568665), COEF_CONST(-0.897167563438416),
@ -1525,7 +1525,7 @@ static const real_t dct4_post_tab[] ICONST_ATTR = {
}; };
// Table adapted from codeclib to fit into IRAM // Table adapted from codeclib to fit into IRAM
const uint32_t dct4_revtab[32] ICONST_ATTR = { const uint32_t dct4_revtab[32] ICONST_ATTR MEM_ALIGN_ATTR = {
0, 24, 12, 22, 6, 30, 11, 19, 3, 27, 15, 21, 5, 29, 9, 17, 0, 24, 12, 22, 6, 30, 11, 19, 3, 27, 15, 21, 5, 29, 9, 17,
1, 25, 13, 23, 7, 31, 10, 18, 2, 26, 14, 20, 4, 28, 8, 16}; 1, 25, 13, 23, 7, 31, 10, 18, 2, 26, 14, 20, 4, 28, 8, 16};

View file

@ -521,8 +521,8 @@ uint8_t sbrDecodeSingleFrame(sbr_info *sbr, real_t *channel,
return 0; return 0;
} }
ALIGN qmf_t X_left[MAX_NTSRHFG][64];// = {{0}}; qmf_t X_left [MAX_NTSRHFG][64] MEM_ALIGN_ATTR;// = {{0}};
ALIGN qmf_t X_right[MAX_NTSRHFG][64];// = {{0}}; /* must set this to 0 */ qmf_t X_right[MAX_NTSRHFG][64] MEM_ALIGN_ATTR;// = {{0}}; /* must set this to 0 */
#if (defined(PS_DEC) || defined(DRM_PS)) #if (defined(PS_DEC) || defined(DRM_PS))
uint8_t sbrDecodeSingleFramePS(sbr_info *sbr, real_t *left_channel, real_t *right_channel, uint8_t sbrDecodeSingleFramePS(sbr_info *sbr, real_t *left_channel, real_t *right_channel,

View file

@ -78,12 +78,12 @@ void qmfa_end(qmfa_info *qmfa)
void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input, void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input,
qmf_t X[MAX_NTSRHFG][64], uint8_t offset, uint8_t kx) qmf_t X[MAX_NTSRHFG][64], uint8_t offset, uint8_t kx)
{ {
ALIGN real_t u[64]; real_t u[64] MEM_ALIGN_ATTR;
#ifndef SBR_LOW_POWER #ifndef SBR_LOW_POWER
ALIGN real_t real[32]; real_t real[32] MEM_ALIGN_ATTR;
ALIGN real_t imag[32]; real_t imag[32] MEM_ALIGN_ATTR;
#else #else
ALIGN real_t y[32]; real_t y[32] MEM_ALIGN_ATTR;
#endif #endif
qmf_t *pX; qmf_t *pX;
uint32_t in = 0; uint32_t in = 0;
@ -227,8 +227,8 @@ void qmfs_end(qmfs_info *qmfs)
void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64], void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
real_t *output) real_t *output)
{ {
ALIGN real_t x[16]; real_t x[16] MEM_ALIGN_ATTR;
ALIGN real_t y[16]; real_t y[16] MEM_ALIGN_ATTR;
int16_t n, k, out = 0; int16_t n, k, out = 0;
uint8_t l; uint8_t l;
@ -291,8 +291,8 @@ void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64], void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
real_t *output) real_t *output)
{ {
ALIGN real_t x[64]; real_t x[64] MEM_ALIGN_ATTR;
ALIGN real_t y[64]; real_t y[64] MEM_ALIGN_ATTR;
int16_t n, k, out = 0; int16_t n, k, out = 0;
uint8_t l; uint8_t l;
@ -401,8 +401,8 @@ static const complex_t qmf32_pre_twiddle[] =
void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64], void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
real_t *output) real_t *output)
{ {
ALIGN real_t x1[32]; real_t x1[32] MEM_ALIGN_ATTR;
ALIGN real_t x2[32]; real_t x2[32] MEM_ALIGN_ATTR;
int32_t n, k, idx0, idx1, out = 0; int32_t n, k, idx0, idx1, out = 0;
uint32_t l; uint32_t l;
@ -464,10 +464,10 @@ void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64], void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
real_t *output) real_t *output)
{ {
ALIGN real_t real1[32]; real_t real1[32] MEM_ALIGN_ATTR;
ALIGN real_t imag1[32]; real_t imag1[32] MEM_ALIGN_ATTR;
ALIGN real_t real2[32]; real_t real2[32] MEM_ALIGN_ATTR;
ALIGN real_t imag2[32]; real_t imag2[32] MEM_ALIGN_ATTR;
qmf_t *pX; qmf_t *pX;
real_t *p_buf_1, *p_buf_3; real_t *p_buf_1, *p_buf_3;
int32_t n, k, idx0, idx1, out = 0; int32_t n, k, idx0, idx1, out = 0;
@ -517,36 +517,36 @@ void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
asm volatile ( asm volatile (
"ldmia %[qtab]!, { r0-r3 } \n\t" "ldmia %[qtab]!, { r0-r3 } \n\t"
"ldr r4, [%[pbuf]] \n\t" "ldr r4, [%[pbuf]] \n\t"
"ldr r7, [%[pbuf], #192*4] \n\t"
"smull r5, r6, r4, r0 \n\t" "smull r5, r6, r4, r0 \n\t"
"ldr r4, [%[pbuf], #192*4] \n\t"
"smlal r5, r6, r4, r1 \n\t"
"ldr r4, [%[pbuf], #256*4] \n\t" "ldr r4, [%[pbuf], #256*4] \n\t"
"smlal r5, r6, r7, r1 \n\t"
"ldr r7, [%[pbuf], #448*4] \n\t"
"smlal r5, r6, r4, r2 \n\t" "smlal r5, r6, r4, r2 \n\t"
"ldr r4, [%[pbuf], #448*4] \n\t" "ldr r4, [%[pbuf], #512*4] \n\t"
"smlal r5, r6, r4, r3 \n\t" "smlal r5, r6, r7, r3 \n\t"
"ldmia %[qtab]!, { r0-r3 } \n\t" "ldmia %[qtab]!, { r0-r3 } \n\t"
"ldr r4, [%[pbuf], #512*4] \n\t" "ldr r7, [%[pbuf], #704*4] \n\t"
"smlal r5, r6, r4, r0 \n\t" "smlal r5, r6, r4, r0 \n\t"
"ldr r4, [%[pbuf], #704*4] \n\t"
"smlal r5, r6, r4, r1 \n\t"
"ldr r4, [%[pbuf], #768*4] \n\t" "ldr r4, [%[pbuf], #768*4] \n\t"
"smlal r5, r6, r7, r1 \n\t"
"ldr r7, [%[pbuf], #960*4] \n\t"
"smlal r5, r6, r4, r2 \n\t" "smlal r5, r6, r4, r2 \n\t"
"ldr r4, [%[pbuf], #960*4] \n\t" "mov r2, #1024*4 \n\t"
"smlal r5, r6, r4, r3 \n\t"
"ldmia %[qtab]!, { r0-r1 } \n\t" "ldmia %[qtab]!, { r0-r1 } \n\t"
"mov r2, #1024*4 \n\t"
"ldr r4, [%[pbuf], r2] \n\t" "ldr r4, [%[pbuf], r2] \n\t"
"smlal r5, r6, r4, r0 \n\t" "smlal r5, r6, r7, r3 \n\t"
"mov r2, #1216*4 \n\t" "mov r2, #1216*4 \n\t"
"ldr r4, [%[pbuf], r2] \n\t" "ldr r7, [%[pbuf], r2] \n\t"
"smlal r5, r6, r4, r1 \n\t" "smlal r5, r6, r4, r0 \n\t"
"smlal r5, r6, r7, r1 \n\t"
"str r6, [%[pout]] \n" "str r6, [%[pout]] \n"
: [qtab] "+r" (qtab) : [qtab] "+r" (qtab)
: [pbuf] "r" (pbuf), [pout] "r" (pout) : [pbuf] "r" (pbuf), [pout] "r" (pout)
: "r0", "r1", "r2", "r3", "r4", "r5", "r6", "memory"); : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "memory");
} }
#elif defined CPU_COLDFIRE #elif defined CPU_COLDFIRE
const real_t *qtab = qmf_c; const real_t *qtab = qmf_c;