1
0
Fork 0
forked from len0rd/rockbox

Reorder the operands of mul/smull/mla/smlal so that D[] is always the second multiplication operand. Additionally, do not pre-scale D[] for the 64-bit precision synthesizer. Together, these changes speed up the 64-bit multiplication path by 1.5 MHz without loss of precision.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@17719 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Andree Buschmann 2008-06-14 11:22:31 +00:00
parent 95cd24d5a2
commit c0951d95b3
2 changed files with 73 additions and 63 deletions

View file

@ -53,10 +53,17 @@
#define MPC_V_PRESHIFT(X) MPC_SHR_RND(X, 12) #define MPC_V_PRESHIFT(X) MPC_SHR_RND(X, 12)
// in this configuration a post-shift by >>1 is needed after synthesis // in this configuration a post-shift by >>1 is needed after synthesis
#else
#if defined(CPU_ARM)
// do not up-scale D-values to achieve higher speed in smull/mlal
// operations. saves ~14/8 = 1.75 cycles per multiplication
#define D(value) (value)
// in this configuration a post-shift by >>16 is needed after synthesis
#else #else
// saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17 // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17
#define D(value) (value << (14)) #define D(value) (value << (14))
#endif
// do not perform pre-shift // do not perform pre-shift
#define MPC_V_PRESHIFT(X) (X) #define MPC_V_PRESHIFT(X) (X)
#endif #endif

View file

@ -26,7 +26,8 @@
* *
* 2nd step within synthesis filter. Does the dewindowing. * 2nd step within synthesis filter. Does the dewindowing.
* 32=32x32 multiplies (OPTIMIZE_FOR_SPEED) * 32=32x32 multiplies (OPTIMIZE_FOR_SPEED)
* Uses pre-shifted V[] and D[] values. * Uses pre-shifted V[] and D[] values. D[] will always be the second operand
* of mul/mla to achieve higher speed as D[] has lower amplitude than V[].
****************************************************************************/ ****************************************************************************/
#if defined(OPTIMIZE_FOR_SPEED) #if defined(OPTIMIZE_FOR_SPEED)
.align 2 .align 2
@ -42,40 +43,40 @@ mpc_decoder_windowing_D:
mov lr, #32 mov lr, #32
.loop32: .loop32:
ldmia r2!, { r3-r10 } /* load first 8 window coefficients */ ldmia r2!, { r3-r10 } /* load D[00..07] */
ldr r11, [r1] /* 0 */ ldr r11, [r1] /* 0 */
mul r12, r3, r11 mul r12, r11, r3
ldr r11, [r1, #96*4] /* 1 */ ldr r11, [r1, #96*4] /* 1 */
mla r12, r4, r11, r12 mla r12, r11, r4, r12
ldr r11, [r1, #128*4] /* 2 */ ldr r11, [r1, #128*4] /* 2 */
mla r12, r5, r11, r12 mla r12, r11, r5, r12
ldr r11, [r1, #224*4] /* 3 */ ldr r11, [r1, #224*4] /* 3 */
mla r12, r6, r11, r12 mla r12, r11, r6, r12
ldr r11, [r1, #256*4] /* 4 */ ldr r11, [r1, #256*4] /* 4 */
mla r12, r7, r11, r12 mla r12, r11, r7, r12
ldr r11, [r1, #352*4] /* 5 */ ldr r11, [r1, #352*4] /* 5 */
mla r12, r8, r11, r12 mla r12, r11, r8, r12
ldr r11, [r1, #384*4] /* 6 */ ldr r11, [r1, #384*4] /* 6 */
mla r12, r9, r11, r12 mla r12, r11, r9, r12
ldr r11, [r1, #480*4] /* 7 */ ldr r11, [r1, #480*4] /* 7 */
mla r12, r10, r11, r12 mla r12, r11, r10, r12
ldmia r2!, { r3-r10 } /* load last 8 window coefficients */ ldmia r2!, { r3-r10 } /* load D[08..15] */
ldr r11, [r1, #512*4] /* 8 */ ldr r11, [r1, #512*4] /* 8 */
mla r12, r3, r11, r12 mla r12, r11, r3, r12
ldr r11, [r1, #608*4] /* 9 */ ldr r11, [r1, #608*4] /* 9 */
mla r12, r4, r11, r12 mla r12, r11, r4, r12
ldr r11, [r1, #640*4] /* 10 */ ldr r11, [r1, #640*4] /* 10 */
mla r12, r5, r11, r12 mla r12, r11, r5, r12
ldr r11, [r1, #736*4] /* 11 */ ldr r11, [r1, #736*4] /* 11 */
mla r12, r6, r11, r12 mla r12, r11, r6, r12
ldr r11, [r1, #768*4] /* 12 */ ldr r11, [r1, #768*4] /* 12 */
mla r12, r7, r11, r12 mla r12, r11, r7, r12
ldr r11, [r1, #864*4] /* 13 */ ldr r11, [r1, #864*4] /* 13 */
mla r12, r8, r11, r12 mla r12, r11, r8, r12
ldr r11, [r1, #896*4] /* 14 */ ldr r11, [r1, #896*4] /* 14 */
mla r12, r9, r11, r12 mla r12, r11, r9, r12
ldr r11, [r1, #992*4] /* 15 */ ldr r11, [r1, #992*4] /* 15 */
mla r12, r10, r11, r12 mla r12, r11, r10, r12
mov r12, r12, asr #1 /* post shift to compensate for pre-shifting */ mov r12, r12, asr #1 /* post shift to compensate for pre-shifting */
str r12, [r0], #4 /* store Data */ str r12, [r0], #4 /* store Data */
add r1, r1, #4 /* V++ */ add r1, r1, #4 /* V++ */
@ -92,9 +93,8 @@ mpc_decoder_windowing_D:
* *
* 2nd step within synthesis filter. Does the dewindowing. * 2nd step within synthesis filter. Does the dewindowing.
* 64=32x32 multiplies * 64=32x32 multiplies
* Drops lo-part of 64bit multiply results and will therefor loose 1 bit * Uses un-shifted D[]-values. D[] will always be the second operand of
* accuracy. The decoder output is binary identical as this imprecision is * smull/smlal to achieve higher speed as D[] has lower amplitude than V[].
* far below the output's 16bit resolution.
****************************************************************************/ ****************************************************************************/
.align 2 .align 2
.global mpc_decoder_windowing_D .global mpc_decoder_windowing_D
@ -105,52 +105,55 @@ mpc_decoder_windowing_D:
/* r2 = D[] */ /* r2 = D[] */
/* lr = counter */ /* lr = counter */
stmfd sp!, {r4-r12, lr} stmfd sp!, {r4-r9, lr}
mov lr, #32 mov lr, #32
.loop32: .loop32:
ldmia r2!, { r3-r10 } /* load first 8 window coefficients */ ldmia r2!, { r3-r6 } /* load D[00..03] */
ldr r11, [r1] /* 0 */ ldr r7, [r1] /* 0 */
smull r11, r12, r3, r11 smull r8, r9, r7, r3
ldr r11, [r1, #96*4] /* 1 */ ldr r7, [r1, #96*4] /* 1 */
smlal r11, r12, r4, r11 smlal r8, r9, r7, r4
ldr r11, [r1, #128*4] /* 2 */ ldr r7, [r1, #128*4] /* 2 */
smlal r11, r12, r5, r11 smlal r8, r9, r7, r5
ldr r11, [r1, #224*4] /* 3 */ ldr r7, [r1, #224*4] /* 3 */
smlal r11, r12, r6, r11 smlal r8, r9, r7, r6
ldr r11, [r1, #256*4] /* 4 */ ldmia r2!, { r3-r6 } /* load D[04..07] */
smlal r11, r12, r7, r11 ldr r7, [r1, #256*4] /* 4 */
ldr r11, [r1, #352*4] /* 5 */ smlal r8, r9, r7, r3
smlal r11, r12, r8, r11 ldr r7, [r1, #352*4] /* 5 */
ldr r11, [r1, #384*4] /* 6 */ smlal r8, r9, r7, r4
smlal r11, r12, r9, r11 ldr r7, [r1, #384*4] /* 6 */
ldr r11, [r1, #480*4] /* 7 */ smlal r8, r9, r7, r5
smlal r11, r12, r10, r11 ldr r7, [r1, #480*4] /* 7 */
ldmia r2!, { r3-r10 } /* load last 8 window coefficients */ smlal r8, r9, r7, r6
ldr r11, [r1, #512*4] /* 8 */ ldmia r2!, { r3-r6 } /* load D[08..11] */
smlal r11, r12, r3, r11 ldr r7, [r1, #512*4] /* 8 */
ldr r11, [r1, #608*4] /* 9 */ smlal r8, r9, r7, r3
smlal r11, r12, r4, r11 ldr r7, [r1, #608*4] /* 9 */
ldr r11, [r1, #640*4] /* 10 */ smlal r8, r9, r7, r4
smlal r11, r12, r5, r11 ldr r7, [r1, #640*4] /* 10 */
ldr r11, [r1, #736*4] /* 11 */ smlal r8, r9, r7, r5
smlal r11, r12, r6, r11 ldr r7, [r1, #736*4] /* 11 */
ldr r11, [r1, #768*4] /* 12 */ smlal r8, r9, r7, r6
smlal r11, r12, r7, r11 ldmia r2!, { r3-r6 } /* load D[12..15] */
ldr r11, [r1, #864*4] /* 13 */ ldr r7, [r1, #768*4] /* 12 */
smlal r11, r12, r8, r11 smlal r8, r9, r7, r3
ldr r11, [r1, #896*4] /* 14 */ ldr r7, [r1, #864*4] /* 13 */
smlal r11, r12, r9, r11 smlal r8, r9, r7, r4
ldr r11, [r1, #992*4] /* 15 */ ldr r7, [r1, #896*4] /* 14 */
smlal r11, r12, r10, r11 smlal r8, r9, r7, r5
mov r4, r12, lsl #2 /* get result from hi-part, loose 2 bits */ ldr r7, [r1, #992*4] /* 15 */
str r4, [r0], #4 /* store Data */ smlal r8, r9, r7, r6
mov r8, r8, lsr #16
orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
str r8, [r0], #4 /* store Data */
add r1, r1, #4 /* V++ */ add r1, r1, #4 /* V++ */
subs lr, lr, #1 subs lr, lr, #1
bgt .loop32 bgt .loop32
ldmfd sp!, {r4-r12, pc} ldmfd sp!, {r4-r9, pc}
.mpc_dewindowing_end: .mpc_dewindowing_end:
.size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D .size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D
#endif #endif