1
0
Fork 0
forked from len0rd/rockbox

Coldfire assembler version of qmf_synth(). Wideband and ultra-wideband Speex files should see a great speedup. Also add faster and symmetric clipping in iir_mem16().

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15292 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Thom Johansen 2007-10-24 22:39:08 +00:00
parent fb70952228
commit cd9fc7a2b9
2 changed files with 168 additions and 17 deletions

View file

@ -47,6 +47,7 @@
#include "filters_arm4.h"
#elif defined (COLDFIRE_ASM)
#define OVERRIDE_IIR_MEM16
#define OVERRIDE_QMF_SYNTH
#elif defined (BFIN_ASM)
#include "filters_bfin.h"
#endif
@ -475,6 +476,7 @@ void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_word16_t *y1
}
}
#ifndef OVERRIDE_QMF_SYNTH
/* Re-synthesise a signal from the QMF low-band and high-band signals */
void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack)
/* assumptions:
@ -566,6 +568,7 @@ void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_
for (i = 0; i < M2; i++)
mem2[2*i+1] = xx2[i];
}
#endif
#ifdef FIXED_POINT
#if 0

View file

@ -31,7 +31,6 @@
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.text
/* void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack) */
.global iir_mem16
@ -59,14 +58,18 @@ iir_mem16:
move.w (%a3)+, %d0
ext.l %d0
add.l %d1, %d0 | Add with x[i]
move.l #32768, %d1
add.l %d1, %d0 | Bias result to [0..65535]
cmp.l #65535, %d0 | Clip to [0..65535] range
jle 1f
spl.b %d0
ext.w %d0
move.l #32767, %d1
move.l #65534, %a6
add.l %d1, %d0 | Bias result to [-1..65534]
cmp.l %a6, %d0 | Now do clip to [0..65534] range
jls 2f
jpl 1f
clr.l %d0 | Clip low
.word 0x51fa | trapf.w, shadow next insn
1:
sub.l %d1, %d0 | Bias clipped result back to [-32768..32767]
move.l %a6, %d0 | Clip high
2:
sub.l %d1, %d0 | Bias clipped result back to [-32767..32767]
neg.l %d0 | msac.w is bugged in gas, do this for now
move.w %d0, (%a5)+ | Write result to y[i]
move.l (%a4)+, %a6 | Fetch den[0] and den[1]
@ -111,14 +114,18 @@ iir_mem16:
move.w (%a3)+, %d0
ext.l %d0
add.l %d1, %d0 | Add with x[i]
move.l #32768, %d1
add.l %d1, %d0 | Bias result to [0..65535]
cmp.l #65535, %d0 | Clip to [0..65535] range
jle 1f
spl.b %d0
ext.w %d0
move.l #32767, %d1
move.l #65534, %a6
add.l %d1, %d0 | Bias result to [-1..65534]
cmp.l %a6, %d0 | Now do clip to [0..65534] range
jls 2f
jpl 1f
clr.l %d0 | Clip low
.word 0x51fa | trapf.w, shadow next insn
1:
sub.l %d1, %d0 | Bias clipped result back to [-32768..32767]
move.l %a6, %d0 | Clip high
2:
sub.l %d1, %d0 | Bias clipped result back to [-32767..32767]
neg.l %d0 | msac.w is bugged in gas, do this for now
move.w %d0, (%a5)+ | Write result to y[i]
move.l (%a4)+, %a6 | Fetch den[0] and den[1]
@ -159,7 +166,148 @@ iir_mem16:
movem.l %d1-%d7/%a0-%a2, (%a6) | Save back mem[]
.exit:
movem.l (%sp), %d2-%d7/%a2-%a6
lea.l (44, %sp), %sp
movem.l (%sp), %d2-%d7/%a2-%a6
lea.l (44, %sp), %sp
rts
/* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack)
 *
 * Coldfire assembler replacement for the C qmf_synth(): re-synthesises a
 * signal from the QMF low-band (x1) and high-band (x2) signals, using the
 * four MAC accumulators %acc0-%acc3 to produce four output samples per
 * outer-loop pass.
 *
 * Arguments are read from the stack (offsets below are relative to %sp on
 * entry, before the register save area is reserved):
 *    4  x1    first input, N/2 16-bit samples are read
 *    8  x2    second input, N/2 16-bit samples are read
 *   12  a     filter coefficients, 16-bit, fetched two at a time as longwords
 *   16  y     output, N 16-bit samples are written
 *   20  N     halved to N2 before use
 *   24  M     halved to M2 before use
 *   28  mem1  32-bit filter memory; only odd-indexed entries are touched
 *   32  mem2  32-bit filter memory; only odd-indexed entries are touched
 *   36  stack never read by this routine
 *
 * Scratch space (two buffers of N2 + M2 shorts plus one longword holding the
 * old %sp) is carved directly out of the hardware stack and released again
 * before returning.
 *
 * Loop shape constraints visible in the code: every loop is bottom-tested,
 * and the inner/outer loops count down in steps of 2 until exactly zero, so
 * N2 and M2 must both be non-zero and even (i.e. N and M multiples of 4).
 *
 * %d2-%d7/%a2-%a6 are saved and restored. %macsr is set to 0x80 on entry and
 * to 0 on exit; the caller's previous %macsr value is not restored.
 */
.global qmf_synth
qmf_synth:
lea.l (-44, %sp), %sp | Reserve 11 longwords for the register save area
movem.l %d2-%d7/%a2-%a6, (%sp)
movem.l (44+4, %sp), %a0-%a3 | a0 = x1, a1 = x2, a2 = a, a3 = y
movem.l (44+20, %sp), %d0-%d1/%a4-%a5 | d0 = N, d1 = M, a4 = mem1, a5 = mem2
move.l #0x80, %macsr | Enable saturation
| Comments make more sense when compared to the reference C version
move.l %a2, %d6 | Backup a (a2 is reused as the xx1 write pointer below)
lsr.l #1, %d0 | N2 = N >> 1
lsr.l #1, %d1 | M2 = M >> 1
move.l %d1, %d7 | Backup M2
clr.l %d2
sub.l %d0, %d2
sub.l %d1, %d2 | d2 = -(N2 + M2)
lea.l (%sp, %d2.l*2), %a2 | Alloc two buffers of N2 + M2 shorts
lea.l (%a2, %d2.l*2), %a6 | a2 = xx1, a6 = xx2 (xx2 sits directly below xx1)
move.l %sp, %d3
move.l %a6, %sp | Update sp so both buffers lie above the stack pointer
move.l %d3, -(%sp) | Stack old %sp, reloaded in the epilogue
| Backwards copy x1 and x2 arrays to xx1 and xx2
| i.e. xx1[i] = x1[N2 - 1 - i] and xx2[i] = x2[N2 - 1 - i] for i = 0..N2-1
| TODO: these copying loops probably have more potential for optimization
lea.l (%a0, %d0.l*2), %a0 | x1 += N2
lea.l (%a1, %d0.l*2), %a1 | x2 += N2
move.l %d0, %d2 | Loop counter is N2
0:
move.w -(%a0), (%a2)+
move.w -(%a1), (%a6)+
subq.l #1, %d2
jne 0b
| Copy alternate members of mem1 and mem2 to last part of xx1 and xx2
| i.e. xx1[N2 + i] = low 16 bits of mem1[2*i + 1], likewise for xx2/mem2
move.l %d1, %d2 | Loop counter is M2
addq.l #4, %a4 | a4 = &mem1[1]
addq.l #4, %a5 | a5 = &mem2[1]
move.l %a4, %d3 | Backup mem1 and mem2
move.l %a5, %d4
0:
move.l (%a4), %d5 | Fetch the 32-bit entry, keep only its low word
move.w %d5, (%a2)+
move.l (%a5), %d5
move.w %d5, (%a6)+
addq.l #8, %a4 | Step over two 32-bit entries to the next odd index
addq.l #8, %a5
subq.l #1, %d2
jne 0b
move.l %d3, %a4 | a4 = &mem1[1]
move.l %d4, %a5 | a5 = &mem2[1]
| a2/a6 now point one past the end of xx1/xx2, i.e. at index N2 + M2
clr.l %d2
sub.l %d1, %d2 | d2 = -M2
lea.l (-4, %a2, %d2.l*2), %a0 | a0 = &xx1[N2 - 2]
lea.l (-4, %a6, %d2.l*2), %a1 | a1 = &xx2[N2 - 2]
move.l %d6, %a2 | a2 = a
| Main loop, register usage:
| d0 = N2 counter, d1 = M2 counter, d7 = M2 backup
| d2 = x10, d3 = x11, d4 = x20, d5 = x21, d6 = [a0, a1]
| a0 = xx1, a1 = xx2, a2 = a, a3 = y, a4 = mem1, a5 = mem2
| acc0..acc3 accumulate the four outputs y0..y3 of the reference C version.
| Every product is shifted left by one (#1) and the result is later taken
| from the upper 16 bits, which together amount to a 15 bit right shift.
0: | Outerloop
move.l #32768, %d2 | Rounding constant (half an LSB of the upper word)
move.l %d2, %acc0
move.l %d2, %acc1
move.l %d2, %acc2
move.l %d2, %acc3
move.w (%a0)+, %d2 | d2 = x10
move.w (%a1)+, %d4 | d4 = x20
move.l (%a2)+, %d6 | d6 = [a0, a1]
1: | Innerloop, handles four coefficients per pass
move.w (%a0)+, %d3 | d3 = x11
move.w (%a1)+, %d5 | d5 = x21
mac.w %d6u, %d3l, #1, %acc0 | acc0 += a0*x11
msac.w %d6u, %d5l, #1, %acc0 | acc0 -= a0*x21
mac.w %d6l, %d3l, #1, %acc1 | acc1 += a1*x11
mac.w %d6l, %d5l, #1, %acc1 | acc1 += a1*x21
mac.w %d6u, %d2l, #1, %acc2 | acc2 += a0*x10
msac.w %d6u, %d4l, #1, %acc2 | acc2 -= a0*x20
mac.w %d6l, %d2l, #1, %acc3 | acc3 += a1*x10
mac.w %d6l, %d4l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x20, load next [a0, a1]
move.w (%a0)+, %d2 | d2 = x10
move.w (%a1)+, %d4 | d4 = x20
mac.w %d6u, %d2l, #1, %acc0 | acc0 += a0*x10
msac.w %d6u, %d4l, #1, %acc0 | acc0 -= a0*x20
mac.w %d6l, %d2l, #1, %acc1 | acc1 += a1*x10
mac.w %d6l, %d4l, #1, %acc1 | acc1 += a1*x20
mac.w %d6u, %d3l, #1, %acc2 | acc2 += a0*x11
msac.w %d6u, %d5l, #1, %acc2 | acc2 -= a0*x21
mac.w %d6l, %d3l, #1, %acc3 | acc3 += a1*x11
mac.w %d6l, %d5l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x21, load next [a0, a1]
| NOTE(review): on the last pass the parallel load above fetches one longword
| beyond the 2*M2 coefficients actually multiplied; the value is never used.
subq.l #2, %d1
jne 1b
| d1 is zero here; rewind the pointers for the next outer pass
sub.l %d7, %d1 | d1 = -M2
lea.l (-4, %a2, %d1.l*4), %a2 | a2 = &a[0]
lea.l (-6, %a0, %d1.l*2), %a0 | a0 = &xx1[N2 - 2 - i]
lea.l (-6, %a1, %d1.l*2), %a1 | a1 = &xx2[N2 - 2 - i]
neg.l %d1 | d1 = M2
| Read out and clear the accumulators; d2-d5 now hold y0..y3, not x samples
movclr.l %acc0, %d2
movclr.l %acc1, %d3
movclr.l %acc2, %d4
movclr.l %acc3, %d5
swap.w %d2 | Shift 16 right
swap.w %d3
swap.w %d4
swap.w %d5
| Thanks to the extra shift in the mac chain, we get clipping for free.
| The clipping will be [-32768..32767], not Speex standard [-32767..32767],
| but since qmf_synth() is called so late in the signal chain, it should
| work fine.
move.w %d2, (%a3)+ | Write results to y[]
move.w %d3, (%a3)+
move.w %d4, (%a3)+
move.w %d5, (%a3)+
subq.l #2, %d0
jne 0b
| Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries
| i.e. mem1[2*i + 1] = xx1[i] and mem2[2*i + 1] = xx2[i] for i = 0..M2-1
| The last rewind left a0/a1 two shorts before the buffer starts; d1 = M2
addq.l #4, %a0 | a0 = &xx1[0]
addq.l #4, %a1 | a1 = &xx2[0]
0:
move.w (%a0)+, %d2
move.w (%a1)+, %d3
ext.l %d2 | Sign extend to the 32-bit width of the mem entries
ext.l %d3
move.l %d2, (%a4)
move.l %d3, (%a5)
addq.l #8, %a4 | Next odd-indexed entry
addq.l #8, %a5
subq.l #1, %d1
jne 0b
move.l #0, %macsr | Clear MACSR (the value from before the call is not restored)
move.l (%sp), %sp | Reload the %sp stacked at entry, releasing xx1 and xx2
movem.l (%sp), %d2-%d7/%a2-%a6
lea.l (44, %sp), %sp
rts