1
0
Fork 0
forked from len0rd/rockbox

Commit FS#11709 by me. Introduces ARMv5E optimized iQMF for atrac3 based on packed multiply instructions. Additionally, improves scheduling on arm9 and arm11 and forces cache alignment of buffers on all targets. Accuracy is slightly reduced, but still greater than 16 bits. Clip+ CPU clock required for LP2 files decreases by 13MHz and ARM11 by 18MHz. No performance or accuracy changes on armv4 or non-arm.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28549 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Michael Giacomelli 2010-11-10 18:29:29 +00:00
parent 33af0dec28
commit f51189fa4d
5 changed files with 205 additions and 9 deletions

View file

@ -30,7 +30,7 @@ CODEC_HEADER
static RMContext rmctx; static RMContext rmctx;
static RMPacket pkt; static RMPacket pkt;
static ATRAC3Context q IBSS_ATTR; static ATRAC3Context q IBSS_ATTR __attribute__ ((aligned (32)));
static void init_rm(RMContext *rmctx) static void init_rm(RMContext *rmctx)
{ {

View file

@ -1,5 +1,8 @@
atrac3.c atrac3.c
#if defined(CPU_ARM) #if defined(CPU_ARM)
atrac3_arm.S atrac3_arm.S
#if (ARM_ARCH >= 5)
atrac3_armv5e.S
#endif
#endif #endif

View file

@ -55,7 +55,11 @@
#define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0) #define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0)
static VLC spectral_coeff_tab[7]; static VLC spectral_coeff_tab[7];
#if defined(CPU_ARM) && (ARM_ARCH >= 5) /*ARMv5e+ uses 32x16 multiplication*/
static int16_t qmf_window[48] IBSS_ATTR __attribute__ ((aligned (32)));
#else
static int32_t qmf_window[48] IBSS_ATTR; static int32_t qmf_window[48] IBSS_ATTR;
#endif
static int32_t atrac3_spectrum [2][1024] IBSS_ATTR __attribute__((aligned(16))); static int32_t atrac3_spectrum [2][1024] IBSS_ATTR __attribute__((aligned(16)));
static int32_t atrac3_IMDCT_buf[2][ 512] IBSS_ATTR __attribute__((aligned(16))); static int32_t atrac3_IMDCT_buf[2][ 512] IBSS_ATTR __attribute__((aligned(16)));
static int32_t atrac3_prevFrame[2][1024] IBSS_ATTR; static int32_t atrac3_prevFrame[2][1024] IBSS_ATTR;
@ -118,12 +122,30 @@ static channel_unit channel_units[2] IBSS_ATTR_LARGE_IRAM;
* } * }
*/ */
#if defined(CPU_ARM) #if defined(CPU_ARM) && (ARM_ARCH >= 5)
/* ARMv5E+ dewindowing kernel, implemented in atrac3_armv5e.S.
 * 'win' points at the 16 bit QMF window coefficients. */
extern void
atrac3_iqmf_dewindowing_armv5e(int32_t *out,
                               int32_t *in,
                               int16_t *win,
                               unsigned int nIn);

/* Dewindowing step of the iQMF on ARMv5E+ targets.
 *
 * Thin wrapper that keeps the generic atrac3_iqmf_dewindowing() name used by
 * the rest of the decoder and forwards all arguments unchanged to the
 * assembly routine.
 *
 *   out  destination buffer for the filtered samples
 *   in   input samples
 *   win  16 bit window coefficients
 *   nIn  iteration count handed to the assembly loop
 */
static inline void
atrac3_iqmf_dewindowing(int32_t *out,
                        int32_t *in,
                        int16_t *win,
                        unsigned int nIn)
{
    /* The call must not be disabled: without it nothing is written to 'out'
     * and the ARMv5E code path produces no iQMF output at all. */
    atrac3_iqmf_dewindowing_armv5e(out, in, win, nIn);
}
#elif defined(CPU_ARM)
/* ARMv4 dewindowing kernel (assembly). This path keeps the 32 bit window
 * table, so 'win' must stay int32_t * to match qmf_window on these targets. */
extern void
atrac3_iqmf_dewindowing(int32_t *out,
                        int32_t *in,
                        int32_t *win,
                        unsigned int nIn);
#elif defined (CPU_COLDFIRE) #elif defined (CPU_COLDFIRE)
#define MULTIPLY_ADD_BLOCK \ #define MULTIPLY_ADD_BLOCK \
"movem.l (%[win]), %%d0-%%d7 \n\t" \ "movem.l (%[win]), %%d0-%%d7 \n\t" \
@ -206,7 +228,9 @@ static channel_unit channel_units[2] IBSS_ATTR_LARGE_IRAM;
out[0] = s2; out[0] = s2;
out[1] = s1; out[1] = s1;
} }
} }
#endif #endif
@ -244,6 +268,7 @@ atrac3_imdct_windowing(int32_t *buffer,
static void iqmf (int32_t *inlo, int32_t *inhi, unsigned int nIn, int32_t *pOut, int32_t *delayBuf, int32_t *temp) static void iqmf (int32_t *inlo, int32_t *inhi, unsigned int nIn, int32_t *pOut, int32_t *delayBuf, int32_t *temp)
{ {
/* Restore the delay buffer */ /* Restore the delay buffer */
memcpy(temp, delayBuf, 46*sizeof(int32_t)); memcpy(temp, delayBuf, 46*sizeof(int32_t));
@ -274,6 +299,7 @@ static void IMLT(int32_t *pInput, int32_t *pOutput)
/* Windowing. */ /* Windowing. */
atrac3_imdct_windowing(pOutput, window_lookup); atrac3_imdct_windowing(pOutput, window_lookup);
} }
@ -320,9 +346,13 @@ static void init_atrac3_transforms(void)
/* Generate the QMF window. */ /* Generate the QMF window. */
for (i=0 ; i<24; i++) { for (i=0 ; i<24; i++) {
s = qmf_48tap_half_fix[i] << 1; s = qmf_48tap_half_fix[i] << 1;
qmf_window[i] = s; #if defined(CPU_ARM) && (ARM_ARCH >= 5)
qmf_window[47 - i] = s; qmf_window[i] = qmf_window[47-i] = (int16_t)((s+(1<<15))>>16);
#else
qmf_window[i] = qmf_window[47-i] = s;
#endif
} }
} }
@ -1229,7 +1259,7 @@ int atrac3_decode_init(ATRAC3Context *q, struct mp3entry *id3)
vlcs_initialized = 1; vlcs_initialized = 1;
} }
init_atrac3_transforms(); init_atrac3_transforms();
/* init the joint-stereo decoding data */ /* init the joint-stereo decoding data */

View file

@ -67,6 +67,7 @@ typedef struct {
} channel_unit; } channel_unit;
typedef struct { typedef struct {
int32_t outSamples[2048];
GetBitContext gb; GetBitContext gb;
//@{ //@{
/** stream data */ /** stream data */
@ -90,8 +91,7 @@ typedef struct {
int weighting_delay[6]; int weighting_delay[6];
//@} //@}
//@{ //@{
/** data buffers */ /** data buffers */
int32_t outSamples[2048];
uint8_t decoded_bytes_buffer[1024]; uint8_t decoded_bytes_buffer[1024];
int32_t tempBuf[1070]; int32_t tempBuf[1070];
//@} //@}

View file

@ -0,0 +1,163 @@
/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id:
*
* Copyright (C) 2010 by Michael Giacomelli
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
.section .text, "ax", %progbits
/****************************************************************************
* atrac3_iqmf_dewindowing_armv5e(int32_t *out,
* int32_t *in,
* int16_t *win,
* unsigned int nIn);
*
* Dewindowing step within iqmf of atrac3 synthesis using 16 bit filter
* coefficients and armv5e packed multiply instructions. Uses 2.5 cycles
* per filter coefficient (ideal). Benchmarked 3.54 per coefficient (Clip+).
*
* Arguments:
* out - receives two 32 bit results per outer iteration (out[0]=s2, out[1]=s1)
* in - input samples; 48 are read per iteration, then in advances by 2
* win - 48 window coefficients of 16 bit each, loaded two per 32 bit word.
* The "b" multiplies use the bottom halfword and the "t" multiplies the
* top halfword of each word (even index in the bottom halfword, i.e.
* assumes little-endian data layout -- TODO confirm for other targets)
* nIn - number of outer loop iterations
*
* Reference implementation (with 16 bit coefficients each product is
* (in[i] * win[i]) >> 16, as noted on the multiply instructions below):
*
* for (j = nIn; j != 0; j--) {
* s1 = fixmul32(in[0], win[0]);
* s2 = fixmul32(in[1], win[1]);
* for (i = 2; i < 48; i += 2) {
* s1 += fixmul32(in[i ], win[i ]);
* s2 += fixmul32(in[i+1], win[i+1]);
* }
* out[0] = s2 << 1;
* out[1] = s1 << 1;
* in += 2;
* out += 2;
* }
* Note: r12 is a scratch register and can be used without being restored.
* Note: unlike the reference loop, the assembly loop is tested at the bottom,
* so the body is executed once even when nIn is 0.
****************************************************************************/
.align 2
.global atrac3_iqmf_dewindowing_armv5e
.type atrac3_iqmf_dewindowing_armv5e, %function
atrac3_iqmf_dewindowing_armv5e:
/* r0 = dest */
/* r1 = input samples */
/* r2 = window coefficients */
/* r3 = counter */
/* lr accumulates s1, r12 accumulates s2 */
stmfd sp!, {r4-r11, lr} /* save non-scratch registers */
.iqmf_dewindow_outer_loop: /* outer loop 0...counter-1 */
/* 0.. 7 */
ldmia r2!, {r4, r5, r8, r9} /* load win[0..7] */
ldmia r1!, {r6, r7, r10, r11} /* load in[0..3] to avoid stall on arm11 */
smulwb lr, r6, r4 /* s1 = in[0] * win[0] */
smulwt r12, r7, r4 /* s2 = in[1] * win[1] */
smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
ldmia r1!, {r6, r7, r10, r11} /* load in[4..7] */
smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r11, r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
/* 8..15 */
ldmia r2!, {r4, r5, r8, r9} /* load win[8..15] */
ldmia r1!, {r6, r7, r10, r11} /* load in[8..11] */
smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
ldmia r1!, {r6, r7, r10, r11} /* load in[12..15] */
smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
/* 16..23 */
ldmia r2!, {r4, r5, r8, r9} /* load win[16..23] */
ldmia r1!, {r6, r7, r10, r11} /* load in[16..19] */
smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
ldmia r1!, {r6, r7, r10, r11} /* load in[20..23] */
smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
/* 24..31 */
ldmia r2!, {r4, r5, r8, r9} /* load win[24..31] */
ldmia r1!, {r6, r7, r10, r11} /* load in[24..27] */
smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
ldmia r1!, {r6, r7, r10, r11} /* load in[28..31] */
smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
/* 32..39 */
ldmia r2!, {r4, r5, r8, r9} /* load win[32..39] */
ldmia r1!, {r6, r7, r10, r11} /* load in[32..35] */
smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
ldmia r1!, {r6, r7, r10, r11} /* load in[36..39] */
smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
/* 40..47 */
ldmia r2!, {r4, r5, r8, r9} /* load win[40..47] */
ldmia r1!, {r6, r7, r10, r11} /* load in[40..43] */
smlawb lr, r6, r4, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r7, r4, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
smlawb lr, r10, r5, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r11,r5, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
ldmia r1!, {r6, r7, r10, r11} /* load in[44..47] */
smlawb lr, r6, r8, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r7, r8, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
smlawb lr, r10, r9, lr /* s1 += in[i ] * win[i ] >> 16 */
smlawt r12, r11,r9, r12 /* s2 += in[i+1] * win[i+1] >> 16 */
mov lr , lr , lsl #1 /* s1 <<= 1 */
mov r12, r12, lsl #1 /* s2 <<= 1 */
stmia r0!, {r12, lr} /* store result out[0]=s2, out[1]=s1 */
sub r1, r1, #184 /* 48 entries (192 bytes) were loaded; roll back 46 entries = 184 bytes, net in += 2 */
sub r2, r2, #96 /* roll back 48 entries * 2 bytes = 96 bytes = win[0] */
subs r3, r3, #1 /* outer loop -= 1 */
bgt .iqmf_dewindow_outer_loop
ldmpc regs=r4-r11 /* restore registers; ldmpc is a macro not defined here, presumably also loads the saved lr into pc to return */
.atrac3_iqmf_dewindowing_armv5e_end:
.size atrac3_iqmf_dewindowing_armv5e,.atrac3_iqmf_dewindowing_armv5e_end-atrac3_iqmf_dewindowing_armv5e