
Coldfire assembler version of qmf_synth(). Wideband and ultra-wideband Speex files should see a great speedup. Also add faster and symmetric clipping in iir_mem16().

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15292 a1c6a512-1295-4272-9138-f99709370657
Thom Johansen 2007-10-24 22:39:08 +00:00
parent fb70952228
commit cd9fc7a2b9
2 changed files with 168 additions and 17 deletions
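The iir_mem16() change replaces the previous bias-and-clip sequence, which clamped results to the asymmetric range [-32768..32767], with one that clamps to the symmetric range [-32767..32767]. In C terms the new clipping amounts to the sketch below (illustrative only, not code from this commit); the assembler realises it with a bias of 32767, an unsigned compare against 65534, and a trapf.w shadow so the low-clip path falls past the high clip without an extra branch.

static inline int clip_symmetric(int x)
{
    x += 32767;            /* bias: [-32767..32767] maps onto [0..65534] */
    if (x < 0)
        x = 0;             /* clip low  */
    else if (x > 65534)
        x = 65534;         /* clip high */
    return x - 32767;      /* remove the bias again */
}

Clipping to ±32767 matches what the comments in the new qmf_synth() code below call the Speex standard range.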


@@ -47,6 +47,7 @@
#include "filters_arm4.h"
#elif defined (COLDFIRE_ASM)
#define OVERRIDE_IIR_MEM16
+#define OVERRIDE_QMF_SYNTH
#elif defined (BFIN_ASM)
#include "filters_bfin.h"
#endif
@@ -475,6 +476,7 @@ void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_word16_t *y1
   }
}
+#ifndef OVERRIDE_QMF_SYNTH
/* Re-synthesised a signal from the QMF low-band and high-band signals */
void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack)
/* assumptions:
@@ -566,6 +568,7 @@ void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_
   for (i = 0; i < M2; i++)
      mem2[2*i+1] = xx2[i];
}
+#endif
#ifdef FIXED_POINT
#if 0


@@ -31,7 +31,6 @@
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.text
/* void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack) */
.global iir_mem16
@@ -59,14 +58,18 @@ iir_mem16:
move.w (%a3)+, %d0
ext.l %d0
add.l %d1, %d0 | Add with x[i]
-move.l #32768, %d1
-add.l %d1, %d0 | Bias result to [0..65535]
-cmp.l #65535, %d0 | Clip to [0..65535] range
-jle 1f
-spl.b %d0
-ext.w %d0
+move.l #32767, %d1
+move.l #65534, %a6
+add.l %d1, %d0 | Bias result to [-1..65534]
+cmp.l %a6, %d0 | Now do clip to [0..65534] range
+jls 2f
+jpl 1f
+clr.l %d0 | Clip low
+.word 0x51fa | trapf.w, shadow next insn
1:
-sub.l %d1, %d0 | Bias clipped result back to [-32768..32767]
+move.l %a6, %d0 | Clip high
+2:
+sub.l %d1, %d0 | Bias clipped result back to [-32767..32767]
neg.l %d0 | msac.w is bugged in gas, do this for now
move.w %d0, (%a5)+ | Write result to y[i]
move.l (%a4)+, %a6 | Fetch den[0] and den[1]
@@ -111,14 +114,18 @@ iir_mem16:
move.w (%a3)+, %d0
ext.l %d0
add.l %d1, %d0 | Add with x[i]
-move.l #32768, %d1
-add.l %d1, %d0 | Bias result to [0..65535]
-cmp.l #65535, %d0 | Clip to [0..65535] range
-jle 1f
-spl.b %d0
-ext.w %d0
+move.l #32767, %d1
+move.l #65534, %a6
+add.l %d1, %d0 | Bias result to [-1..65534]
+cmp.l %a6, %d0 | Now do clip to [0..65534] range
+jls 2f
+jpl 1f
+clr.l %d0 | Clip low
+.word 0x51fa | trapf.w, shadow next insn
1:
-sub.l %d1, %d0 | Bias clipped result back to [-32768..32767]
+move.l %a6, %d0 | Clip high
+2:
+sub.l %d1, %d0 | Bias clipped result back to [-32767..32767]
neg.l %d0 | msac.w is bugged in gas, do this for now
move.w %d0, (%a5)+ | Write result to y[i]
move.l (%a4)+, %a6 | Fetch den[0] and den[1]
@@ -159,7 +166,148 @@ iir_mem16:
movem.l %d1-%d7/%a0-%a2, (%a6) | Save back mem[]
.exit:
movem.l (%sp), %d2-%d7/%a2-%a6
lea.l (44, %sp), %sp
rts
/* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */
.global qmf_synth
qmf_synth:
lea.l (-44, %sp), %sp
movem.l %d2-%d7/%a2-%a6, (%sp)
movem.l (44+4, %sp), %a0-%a3 | a0 = x1, a1 = x2, a2 = a, a3 = y
movem.l (44+20, %sp), %d0-%d1/%a4-%a5 | d0 = N, d1 = M, a4 = mem1,a5 = mem2
move.l #0x80, %macsr | Enable saturation
| Comments make more sense when compared to the reference C version
move.l %a2, %d6 | Backup a
lsr.l #1, %d0 | N2 = N >> 1
lsr.l #1, %d1 | M2 = M >> 1
move.l %d1, %d7 | Backup M2
clr.l %d2
sub.l %d0, %d2
sub.l %d1, %d2 | d2 = -(N2 + M2)
lea.l (%sp, %d2.l*2), %a2 | Alloc two buffers of N2 + M2 shorts
lea.l (%a2, %d2.l*2), %a6 | a2 = xx1, a6 = xx2
move.l %sp, %d3
move.l %a6, %sp | Update sp
move.l %d3, -(%sp) | Stack old %sp
| Backwards copy x1 and x2 arrays to xx1 and xx2
| TODO: these copying loops probably have more potential for optimization
lea.l (%a0, %d0.l*2), %a0 | x1 += N2
lea.l (%a1, %d0.l*2), %a1 | x2 += N2
move.l %d0, %d2 | Loop counter is N2
0:
move.w -(%a0), (%a2)+
move.w -(%a1), (%a6)+
subq.l #1, %d2
jne 0b
| Copy alternate members of mem1 and mem2 to last part of xx1 and xx2
move.l %d1, %d2 | Loop counter is M2
addq.l #4, %a4 | a4 = &mem1[1]
addq.l #4, %a5 | a5 = &mem2[1]
move.l %a4, %d3 | Backup mem1 and mem2
move.l %a5, %d4
0:
move.l (%a4), %d5
move.w %d5, (%a2)+
move.l (%a5), %d5
move.w %d5, (%a6)+
addq.l #8, %a4
addq.l #8, %a5
subq.l #1, %d2
jne 0b
move.l %d3, %a4 | a4 = &mem1[1]
move.l %d4, %a5 | a5 = &mem2[1]
clr.l %d2
sub.l %d1, %d2 | d2 = -M2
lea.l (-4, %a2, %d2.l*2), %a0 | a0 = &xx1[N2 - 2]
lea.l (-4, %a6, %d2.l*2), %a1 | a1 = &xx2[N2 - 2]
move.l %d6, %a2 | a2 = a
| Main loop, register usage:
| d0 = N2 counter, d1 = M2 counter, d7 = M2 backup
| d2 = x10, d3 = x11, d4 = x20, d5 = x21, d6 = [a0, a1]
| a0 = xx1, a1 = xx2, a2 = a, a3 = y, a4 = mem1, a5 = mem2
0: | Outerloop
move.l #32768, %d2 | Rounding constant
move.l %d2, %acc0
move.l %d2, %acc1
move.l %d2, %acc2
move.l %d2, %acc3
move.w (%a0)+, %d2 | d2 = x10
move.w (%a1)+, %d4 | d4 = x20
move.l (%a2)+, %d6 | d6 = [a0, a1]
1: | Innerloop
move.w (%a0)+, %d3 | d3 = x11
move.w (%a1)+, %d5 | d5 = x21
mac.w %d6u, %d3l, #1, %acc0 | acc0 += a0*x11
msac.w %d6u, %d5l, #1, %acc0 | acc0 -= a0*x21
mac.w %d6l, %d3l, #1, %acc1 | acc1 += a1*x11
mac.w %d6l, %d5l, #1, %acc1 | acc1 += a1*x21
mac.w %d6u, %d2l, #1, %acc2 | acc2 += a0*x10
msac.w %d6u, %d4l, #1, %acc2 | acc2 -= a0*x20
mac.w %d6l, %d2l, #1, %acc3 | acc3 += a1*x10
mac.w %d6l, %d4l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x20
move.w (%a0)+, %d2 | d2 = x10
move.w (%a1)+, %d4 | d4 = x20
mac.w %d6u, %d2l, #1, %acc0 | acc0 += a0*x10
msac.w %d6u, %d4l, #1, %acc0 | acc0 -= a0*x20
mac.w %d6l, %d2l, #1, %acc1 | acc1 += a1*x10
mac.w %d6l, %d4l, #1, %acc1 | acc1 += a1*x20
mac.w %d6u, %d3l, #1, %acc2 | acc2 += a0*x11
msac.w %d6u, %d5l, #1, %acc2 | acc2 -= a0*x21
mac.w %d6l, %d3l, #1, %acc3 | acc3 += a1*x11
mac.w %d6l, %d5l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x21
subq.l #2, %d1
jne 1b
sub.l %d7, %d1 | d1 = -M2
lea.l (-4, %a2, %d1.l*4), %a2 | a2 = &a[0]
lea.l (-6, %a0, %d1.l*2), %a0 | a0 = &xx1[N2 - 2 - i]
lea.l (-6, %a1, %d1.l*2), %a1 | a1 = &xx2[N2 - 2 - i]
neg.l %d1 | d1 = M2
movclr.l %acc0, %d2
movclr.l %acc1, %d3
movclr.l %acc2, %d4
movclr.l %acc3, %d5
swap.w %d2 | Shift 16 right
swap.w %d3
swap.w %d4
swap.w %d5
| Thanks to the extra shift in the mac chain, we get clipping for free.
| The clipping will be [-32768..32767], not Speex standard [-32767..32767],
| but since qmf_synth() is called so late in the signal chain, it should
| work fine.
move.w %d2, (%a3)+ | Write results to y[]
move.w %d3, (%a3)+
move.w %d4, (%a3)+
move.w %d5, (%a3)+
subq.l #2, %d0
jne 0b
| Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries
addq.l #4, %a0 | a0 = &xx1[0]
addq.l #4, %a1 | a1 = &xx2[0]
0:
move.w (%a0)+, %d2
move.w (%a1)+, %d3
ext.l %d2
ext.l %d3
move.l %d2, (%a4)
move.l %d3, (%a5)
addq.l #8, %a4
addq.l #8, %a5
subq.l #1, %d1
jne 0b
move.l #0, %macsr
move.l (%sp), %sp
movem.l (%sp), %d2-%d7/%a2-%a6
lea.l (44, %sp), %sp
rts
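The comments in the qmf_synth() code above refer to the reference C version; as an orientation aid, the following is a rough C sketch of the data flow they describe, reconstructed from the register-usage comments rather than copied from the Speex sources. Indexing details are simplified, and a plain 64-bit integer stands in for the saturating 48-bit EMAC accumulators (which is what lets the assembler get output clipping for free).

/* Rough sketch of what the Coldfire qmf_synth() above computes.  Derived from
 * the assembler comments, not copied from Speex; indexing is simplified.
 * xx1/xx2 are caller-provided scratch buffers of N/2 + M/2 shorts each. */
static void qmf_synth_sketch(const short *x1, const short *x2, const short *a,
                             short *y, int N, int M, int *mem1, int *mem2,
                             short *xx1, short *xx2)
{
    int N2 = N >> 1, M2 = M >> 1;
    int i, j;

    /* Backwards copy of the band signals, followed by the filter history
       kept in the odd entries of mem1[]/mem2[]. */
    for (i = 0; i < N2; i++) {
        xx1[i] = x1[N2 - 1 - i];
        xx2[i] = x2[N2 - 1 - i];
    }
    for (i = 0; i < M2; i++) {
        xx1[N2 + i] = (short)mem1[2*i + 1];
        xx2[N2 + i] = (short)mem2[2*i + 1];
    }

    /* Per pair of output samples: even taps filter the difference of the two
       bands (acc0/acc2 above), odd taps filter their sum (acc1/acc3). */
    for (i = 0; i < N2; i++) {
        long long acc_even = 32768, acc_odd = 32768;  /* rounding constants */
        const short *p1 = &xx1[N2 - 1 - i];
        const short *p2 = &xx2[N2 - 1 - i];

        for (j = 0; j < M2; j++) {
            /* the '#1' MAC scale factor shifts each product left by one */
            acc_even += ((long long)a[2*j]     * (p1[j] - p2[j])) << 1;
            acc_odd  += ((long long)a[2*j + 1] * (p1[j] + p2[j])) << 1;
        }
        /* swap.w in the assembler takes the top 16 bits; the EMAC saturates
           on the way there, this sketch does not. */
        y[2*i]     = (short)(acc_even >> 16);
        y[2*i + 1] = (short)(acc_odd  >> 16);
    }

    /* Save the filter history for the next call. */
    for (i = 0; i < M2; i++) {
        mem1[2*i + 1] = xx1[i];
        mem2[2*i + 1] = xx2[i];
    }
}

The assembler keeps four accumulators so that two output pairs share each coefficient fetch; per sample, the arithmetic follows the sketch.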