diff --git a/apps/eq_arm.S b/apps/eq_arm.S index 85617dc2fb..0c1961d2d3 100644 --- a/apps/eq_arm.S +++ b/apps/eq_arm.S @@ -7,7 +7,7 @@ * \/ \/ \/ \/ \/ * $Id$ * - * Copyright (C) 2006 Thom Johansen + * Copyright (C) 2006-2007 Thom Johansen * * All files in this archive are subject to the GNU General Public License. * See the file COPYING in the source tree root for full license agreement. @@ -17,6 +17,15 @@ * ****************************************************************************/ +/* uncomment this to make filtering calculate lower bits after shifting. + * without this, the lowest "shift" bits of every output sample stay zero. + */ +/* #define HIGH_PRECISION */ + +/* + * void eq_filter(int32_t **x, struct eqfilter *f, unsigned num, + * unsigned channels, unsigned shift) + */ .text .global eq_filter eq_filter: @@ -33,35 +42,40 @@ eq_filter: ldr r14, [sp, #8] @ r14 = numsamples ldmia r10, { r0-r3 } @ load history, r10 should be filter struct addr str r10, [sp, #4] @ save it for loop end -.loop: + /* r0-r3 = history, r4-r8 = coefs, r9 = x[], r10..r11 = accumulator, - r12 = shift amount, r14 = number of samples. - See eq_cf.S for explanation of what this loop does. Primary difference - is the reordering of the equation we do here, which is done for register - reuse reasons, we're pretty short on regs. + * r12 = shift amount, r14 = number of samples. */ - smull r10, r11, r6, r1 @ acc = b2*x[i - 2] - mov r1, r0 @ fix input history - smlal r10, r11, r5, r0 @ acc += b1*x[i - 1] - ldr r0, [r9] @ load input and fix history in same operation - smlal r10, r11, r4, r0 @ acc += b0*x[i] - smlal r10, r11, r7, r2 @ acc += a1*y[i - 1] - smlal r10, r11, r8, r3 @ acc += a2*y[i - 2] - mov r3, r2 @ fix output history - mov r2, r11, lsl r12 @ get result - @ TODO: arm makes it easy to mix in lower bits from r10 for extended - @ precision here, but we don't have enough regs to save the shift factor - @ we would need (32 - r12). 
- str r2, [r9], #4 @ save result - subs r14, r14, #1 @ are we done with this channel? +.loop: + /* Direct form 1 filtering code. + * y[i] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2], + * where y[] is output and x[] is input. This is performed out of order to + * reuse registers, we're pretty short on regs. + */ + smull r10, r11, r6, r1 @ acc = b2*x[i - 2] + mov r1, r0 @ fix input history + smlal r10, r11, r5, r0 @ acc += b1*x[i - 1] + ldr r0, [r9] @ load input and fix history in same operation + smlal r10, r11, r4, r0 @ acc += b0*x[i] + smlal r10, r11, r7, r2 @ acc += a1*y[i - 1] + smlal r10, r11, r8, r3 @ acc += a2*y[i - 2] + mov r3, r2 @ fix output history + mov r2, r11, asl r12 @ get upper part of result and shift left +#ifdef HIGH_PRECISION + rsb r11, r12, #32 @ r11 = 32 - shift; r11 is free, smull reloads it + orr r2, r2, r10, lsr r11 @ then mix in correctly shifted lower part +#endif + str r2, [r9], #4 @ save result + subs r14, r14, #1 @ are we done with this channel? bne .loop - ldr r10, [sp, #4] @ load filter struct pointer - stmia r10!, { r0-r3 } @ save back history - ldr r11, [sp, #12] @ load number of channels - subs r11, r11, #1 @ all channels processed? + ldr r10, [sp, #4] @ load filter struct pointer + stmia r10!, { r0-r3 } @ save back history + ldr r11, [sp, #12] @ load number of channels + subs r11, r11, #1 @ all channels processed? strne r11, [sp, #12] bne .filterloop - add sp, sp, #16 @ compensate for temp storage + add sp, sp, #16 @ compensate for temp storage ldmia sp!, { r4-r11, pc } + diff --git a/apps/eq_cf.S b/apps/eq_cf.S index c9458cdc77..75bfcafb3a 100644 --- a/apps/eq_cf.S +++ b/apps/eq_cf.S @@ -7,7 +7,7 @@ * \/ \/ \/ \/ \/ * $Id$ * - * Copyright (C) 2006 Thom Johansen + * Copyright (C) 2006-2007 Thom Johansen * * All files in this archive are subject to the GNU General Public License. * See the file COPYING in the source tree root for full license agreement. 
@@ -17,14 +17,27 @@ * ****************************************************************************/ +/* uncomment this to make filtering calculate lower bits after shifting. + * without this, "shift" - 1 of the lower bits will be lost here. + */ +/* #define HIGH_PRECISION */ + +/* + * void eq_filter(int32_t **x, struct eqfilter *f, unsigned num, + * unsigned channels, unsigned shift) + */ .text .global eq_filter eq_filter: lea.l (-11*4, %sp), %sp movem.l %d2-%d7/%a2-%a6, (%sp) | save clobbered regs move.l (11*4+8, %sp), %a5 | fetch filter structure address - movem.l (11*4+16, %sp), %d6-%d7 | load num. channels and shift count + move.l (11*4+20, %sp), %d7 | load shift count subq.l #1, %d7 | EMAC gives us one free shift +#ifdef HIGH_PRECISION + moveq.l #8, %d6 | low accumulator extension is one byte wide + sub.l %d7, %d6 | shift for lower part of accumulator +#endif movem.l (%a5), %a0-%a4 | load coefs lea.l (5*4, %a5), %a5 | point to filter history @@ -34,11 +47,16 @@ eq_filter: move.l (%a6), %a6 move.l (11*4+12, %sp), %d5 | number of samples movem.l (%a5), %d0-%d3 | load filter history + + /* d0-d3 = history, d4 = low-bits scratch (HIGH_PRECISION), d5 = sample count, + * d6 = lower shift amount, d7 = upper shift amount, a0-a4 = coefs, + * a5 = history pointer, a6 = x[]; channel count stays in its stack slot + */ .loop: /* Direct form 1 filtering code. We assume DSP has put EMAC in frac mode. - y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2], - where y[] is output and x[] is input. This is performed out of order - to do parallel load of input value. + * y[i] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2], + * where y[] is output and x[] is input. This is performed out of order + * to do parallel load of input value. 
*/ mac.l %a2, %d1, %acc0 | acc = b2*x[i - 2] move.l %d0, %d1 | fix input history @@ -47,15 +65,23 @@ eq_filter: mac.l %a3, %d2, %acc0 | acc += a1*y[i - 1] mac.l %a4, %d3, %acc0 | acc += a2*y[i - 2] move.l %d2, %d3 | fix output history - movclr.l %acc0, %d2 | fetch and write result +#ifdef HIGH_PRECISION + move.l %accext01, %d4 | fetch extension bytes before movclr wipes acc0 + and.l #0xff, %d4 | keep low byte only (assumed acc0 lower ext) + lsr.l %d6, %d4 | shift lower bits +#endif + movclr.l %acc0, %d2 | fetch upper part of result asl.l %d7, %d2 | restore fixed point format +#ifdef HIGH_PRECISION + or.l %d4, %d2 | merge lower bits into d2, the value stored below +#endif move.l %d2, (%a6)+ | save result subq.l #1, %d5 | are we done with this channel? jne .loop movem.l %d0-%d3, (%a5) | save history back to struct lea.l (4*4, %a5), %a5 | point to next channel's history - subq.l #1, %d6 | have we processed both channels? + subq.l #1, (11*4+16, %sp) | all channels done? (counts down the arg slot) jne .filterloop movem.l (%sp), %d2-%d7/%a2-%a6