forked from len0rd/rockbox
Coldfire assembler version of qmf_synth(). Wideband and ultra-wideband Speex files should see a great speedup. Also add faster and symmetric clipping in iir_mem16().
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15292 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
fb70952228
commit
cd9fc7a2b9
2 changed files with 168 additions and 17 deletions
|
@ -47,6 +47,7 @@
|
|||
#include "filters_arm4.h"
|
||||
#elif defined (COLDFIRE_ASM)
|
||||
#define OVERRIDE_IIR_MEM16
|
||||
#define OVERRIDE_QMF_SYNTH
|
||||
#elif defined (BFIN_ASM)
|
||||
#include "filters_bfin.h"
|
||||
#endif
|
||||
|
@ -475,6 +476,7 @@ void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_word16_t *y1
|
|||
}
|
||||
}
|
||||
|
||||
#ifndef OVERRIDE_QMF_SYNTH
|
||||
/* Re-synthesise a signal from the QMF low-band and high-band signals */
|
||||
void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack)
|
||||
/* assumptions:
|
||||
|
@ -566,6 +568,7 @@ void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_
|
|||
for (i = 0; i < M2; i++)
|
||||
mem2[2*i+1] = xx2[i];
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef FIXED_POINT
|
||||
#if 0
|
||||
|
|
|
@ -31,7 +31,6 @@
|
|||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
.text
|
||||
/* void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack) */
|
||||
.global iir_mem16
|
||||
|
@ -59,14 +58,18 @@ iir_mem16:
|
|||
move.w (%a3)+, %d0
|
||||
ext.l %d0
|
||||
add.l %d1, %d0 | Add with x[i]
|
||||
move.l #32768, %d1
|
||||
add.l %d1, %d0 | Bias result to [0..65535]
|
||||
cmp.l #65535, %d0 | Clip to [0..65535] range
|
||||
jle 1f
|
||||
spl.b %d0
|
||||
ext.w %d0
|
||||
move.l #32767, %d1
|
||||
move.l #65534, %a6
|
||||
add.l %d1, %d0 | Bias result to [-1..65534]
|
||||
cmp.l %a6, %d0 | Now do clip to [0..65534] range
|
||||
jls 2f
|
||||
jpl 1f
|
||||
clr.l %d0 | Clip low
|
||||
.word 0x51fa | trapf.w, shadow next insn
|
||||
1:
|
||||
sub.l %d1, %d0 | Bias clipped result back to [-32768..32767]
|
||||
move.l %a6, %d0 | Clip high
|
||||
2:
|
||||
sub.l %d1, %d0 | Bias clipped result back to [-32767..32767]
|
||||
neg.l %d0 | msac.w is bugged in gas, do this for now
|
||||
move.w %d0, (%a5)+ | Write result to y[i]
|
||||
move.l (%a4)+, %a6 | Fetch den[0] and den[1]
|
||||
|
@ -111,14 +114,18 @@ iir_mem16:
|
|||
move.w (%a3)+, %d0
|
||||
ext.l %d0
|
||||
add.l %d1, %d0 | Add with x[i]
|
||||
move.l #32768, %d1
|
||||
add.l %d1, %d0 | Bias result to [0..65535]
|
||||
cmp.l #65535, %d0 | Clip to [0..65535] range
|
||||
jle 1f
|
||||
spl.b %d0
|
||||
ext.w %d0
|
||||
move.l #32767, %d1
|
||||
move.l #65534, %a6
|
||||
add.l %d1, %d0 | Bias result to [-1..65534]
|
||||
cmp.l %a6, %d0 | Now do clip to [0..65534] range
|
||||
jls 2f
|
||||
jpl 1f
|
||||
clr.l %d0 | Clip low
|
||||
.word 0x51fa | trapf.w, shadow next insn
|
||||
1:
|
||||
sub.l %d1, %d0 | Bias clipped result back to [-32768..32767]
|
||||
move.l %a6, %d0 | Clip high
|
||||
2:
|
||||
sub.l %d1, %d0 | Bias clipped result back to [-32767..32767]
|
||||
neg.l %d0 | msac.w is bugged in gas, do this for now
|
||||
move.w %d0, (%a5)+ | Write result to y[i]
|
||||
move.l (%a4)+, %a6 | Fetch den[0] and den[1]
|
||||
|
@ -159,7 +166,148 @@ iir_mem16:
|
|||
movem.l %d1-%d7/%a0-%a2, (%a6) | Save back mem[]
|
||||
|
||||
.exit:
|
||||
movem.l (%sp), %d2-%d7/%a2-%a6
|
||||
lea.l (44, %sp), %sp
|
||||
movem.l (%sp), %d2-%d7/%a2-%a6
|
||||
lea.l (44, %sp), %sp
|
||||
rts
|
||||
|
||||
/* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */
/*
 * Coldfire (EMAC) implementation of qmf_synth().
 *
 * Arguments are read from the stack; only x1, x2, a, y, N, M, mem1 and mem2
 * are loaded -- the trailing 'stack' argument is never touched.
 * d2-d7/a2-a6 are saved on entry and restored on exit (11 registers,
 * 44 bytes).
 *
 * Two temporary buffers, xx1 and xx2, of N2 + M2 16-bit words each
 * (N2 = N >> 1, M2 = M >> 1) are carved out of the CPU stack below the
 * register save area; the previous %sp is pushed beneath them and reloaded
 * before returning.
 *
 * Loop-count requirements visible in the code: every loop tests its counter
 * with jne after decrementing, and the main inner/outer loops step by 2, so
 * N2 and M2 must both be non-zero and even (N and M non-zero multiples
 * of 4). Nothing here checks that.
 *
 * %macsr is set to 0x80 on entry and to 0 on exit; it is not restored to
 * whatever value the caller had.
 */
|
||||
.global qmf_synth
|
||||
qmf_synth:
|
||||
lea.l (-44, %sp), %sp
|
||||
movem.l %d2-%d7/%a2-%a6, (%sp)
|
||||
movem.l (44+4, %sp), %a0-%a3 | a0 = x1, a1 = x2, a2 = a, a3 = y
|
||||
movem.l (44+20, %sp), %d0-%d1/%a4-%a5 | d0 = N, d1 = M, a4 = mem1,a5 = mem2
|
||||
move.l #0x80, %macsr | Enable saturation
|
||||
|
||||
| Comments make more sense when compared to the reference C version
|
||||
move.l %a2, %d6 | Backup a
|
||||
lsr.l #1, %d0 | N2 = N >> 1
|
||||
lsr.l #1, %d1 | M2 = M >> 1
|
||||
move.l %d1, %d7 | Backup M2
|
||||
clr.l %d2
|
||||
sub.l %d0, %d2
|
||||
sub.l %d1, %d2 | d2 = -(N2 + M2)
|
||||
/* xx1 sits directly below the register save area, xx2 directly below xx1 */
lea.l (%sp, %d2.l*2), %a2 | Alloc two buffers of N2 + M2 shorts
|
||||
lea.l (%a2, %d2.l*2), %a6 | a2 = xx1, a6 = xx2
|
||||
move.l %sp, %d3
|
||||
move.l %a6, %sp | Update sp
|
||||
move.l %d3, -(%sp) | Stack old %sp
|
||||
|
||||
| Backwards copy x1 and x2 arrays to xx1 and xx2
|
||||
| TODO: these copying loops probably have more potential for optimization
|
||||
lea.l (%a0, %d0.l*2), %a0 | x1 += N2
|
||||
lea.l (%a1, %d0.l*2), %a1 | x2 += N2
|
||||
move.l %d0, %d2 | Loop counter is N2
|
||||
0:
|
||||
move.w -(%a0), (%a2)+
|
||||
move.w -(%a1), (%a6)+
|
||||
subq.l #1, %d2
|
||||
jne 0b
|
||||
|
||||
| Copy alternate members of mem1 and mem2 to last part of xx1 and xx2
|
||||
/* mem1[]/mem2[] are stepped 8 bytes at a time starting at index 1; each
   32-bit entry is loaded whole and only its low 16 bits are stored */
move.l %d1, %d2 | Loop counter is M2
|
||||
addq.l #4, %a4 | a4 = &mem1[1]
|
||||
addq.l #4, %a5 | a5 = &mem2[1]
|
||||
move.l %a4, %d3 | Backup mem1 and mem2
|
||||
move.l %a5, %d4
|
||||
0:
|
||||
move.l (%a4), %d5
|
||||
move.w %d5, (%a2)+
|
||||
move.l (%a5), %d5
|
||||
move.w %d5, (%a6)+
|
||||
addq.l #8, %a4
|
||||
addq.l #8, %a5
|
||||
subq.l #1, %d2
|
||||
jne 0b
|
||||
move.l %d3, %a4 | a4 = &mem1[1]
|
||||
move.l %d4, %a5 | a5 = &mem2[1]
|
||||
|
||||
/* a2/a6 now point one past the end of xx1/xx2 (N2 + M2 words in) */
clr.l %d2
|
||||
sub.l %d1, %d2 | d2 = -M2
|
||||
lea.l (-4, %a2, %d2.l*2), %a0 | a0 = &xx1[N2 - 2]
|
||||
lea.l (-4, %a6, %d2.l*2), %a1 | a1 = &xx2[N2 - 2]
|
||||
move.l %d6, %a2 | a2 = a
|
||||
|
||||
| Main loop, register usage:
|
||||
| d0 = N2 counter, d1 = M2 counter, d7 = M2 backup
|
||||
| d2 = x10, d3 = x11, d4 = x20, d5 = x21, d6 = [a0, a1]
|
||||
| a0 = xx1, a1 = xx2, a2 = a, a3 = y, a4 = mem1, a5 = mem2
|
||||
/* Each outer iteration produces four output samples (acc0..acc3) and
   consumes 2 from the N2 counter in d0 */
0: | Outerloop
|
||||
move.l #32768, %d2 | Rounding constant
|
||||
move.l %d2, %acc0
|
||||
move.l %d2, %acc1
|
||||
move.l %d2, %acc2
|
||||
move.l %d2, %acc3
|
||||
move.w (%a0)+, %d2 | d2 = x10
|
||||
move.w (%a1)+, %d4 | d4 = x20
|
||||
move.l (%a2)+, %d6 | d6 = [a0, a1]
|
||||
/* Each inner iteration reads two more words from each of xx1 and xx2 and
   two coefficient longwords; the last mac.w of each half reloads d6 from
   (%a2)+ in parallel with the multiply */
1: | Innerloop
|
||||
move.w (%a0)+, %d3 | d3 = x11
|
||||
move.w (%a1)+, %d5 | d5 = x21
|
||||
mac.w %d6u, %d3l, #1, %acc0 | acc0 += a0*x11
|
||||
msac.w %d6u, %d5l, #1, %acc0 | acc0 -= a0*x21
|
||||
mac.w %d6l, %d3l, #1, %acc1 | acc1 += a1*x11
|
||||
mac.w %d6l, %d5l, #1, %acc1 | acc1 += a1*x21
|
||||
mac.w %d6u, %d2l, #1, %acc2 | acc2 += a0*x10
|
||||
msac.w %d6u, %d4l, #1, %acc2 | acc2 -= a0*x20
|
||||
mac.w %d6l, %d2l, #1, %acc3 | acc3 += a1*x10
|
||||
mac.w %d6l, %d4l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x20
|
||||
|
||||
move.w (%a0)+, %d2 | d2 = x10
|
||||
move.w (%a1)+, %d4 | d4 = x20
|
||||
mac.w %d6u, %d2l, #1, %acc0 | acc0 += a0*x10
|
||||
msac.w %d6u, %d4l, #1, %acc0 | acc0 -= a0*x20
|
||||
mac.w %d6l, %d2l, #1, %acc1 | acc1 += a1*x10
|
||||
mac.w %d6l, %d4l, #1, %acc1 | acc1 += a1*x20
|
||||
mac.w %d6u, %d3l, #1, %acc2 | acc2 += a0*x11
|
||||
msac.w %d6u, %d5l, #1, %acc2 | acc2 -= a0*x21
|
||||
mac.w %d6l, %d3l, #1, %acc3 | acc3 += a1*x11
|
||||
mac.w %d6l, %d5l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x21
|
||||
subq.l #2, %d1
|
||||
jne 1b
|
||||
|
||||
/* d1 is 0 here. a2 has advanced 4*M2 + 4 bytes (the initial load plus two
   parallel loads per inner iteration), i.e. the final parallel load fetched
   one longword beyond the 4*M2 bytes of coefficients actually multiplied.
   a0/a1 have advanced M2 + 1 words; the -6 displacement leaves them two
   words lower than at the start of this outer iteration. */
sub.l %d7, %d1 | d1 = -M2
|
||||
lea.l (-4, %a2, %d1.l*4), %a2 | a2 = &a[0]
|
||||
lea.l (-6, %a0, %d1.l*2), %a0 | a0 = &xx1[N2 - 2 - i]
|
||||
lea.l (-6, %a1, %d1.l*2), %a1 | a1 = &xx2[N2 - 2 - i]
|
||||
neg.l %d1 | d1 = M2
|
||||
movclr.l %acc0, %d2
|
||||
movclr.l %acc1, %d3
|
||||
movclr.l %acc2, %d4
|
||||
movclr.l %acc3, %d5
|
||||
swap.w %d2 | Shift 16 right
|
||||
swap.w %d3
|
||||
swap.w %d4
|
||||
swap.w %d5
|
||||
| Thanks to the extra shift in the mac chain, we get clipping for free.
|
||||
| The clipping will be [-32768..32767], not Speex standard [-32767..32767],
|
||||
| but since qmf_synth() is called so late in the signal chain, it should
|
||||
| work fine.
|
||||
move.w %d2, (%a3)+ | Write results to y[]
|
||||
move.w %d3, (%a3)+
|
||||
move.w %d4, (%a3)+
|
||||
move.w %d5, (%a3)+
|
||||
subq.l #2, %d0
|
||||
jne 0b
|
||||
|
||||
| Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries
|
||||
/* After the last outer iteration a0/a1 sit two words before xx1/xx2.
   d1 = M2 entries are sign-extended to 32 bits and stored at mem1[1],
   mem1[3], ... and mem2[1], mem2[3], ... (a4/a5 still hold &mem1[1] and
   &mem2[1]) */
addq.l #4, %a0 | a0 = &xx1[0]
|
||||
addq.l #4, %a1 | a1 = &xx2[0]
|
||||
0:
|
||||
move.w (%a0)+, %d2
|
||||
move.w (%a1)+, %d3
|
||||
ext.l %d2
|
||||
ext.l %d3
|
||||
move.l %d2, (%a4)
|
||||
move.l %d3, (%a5)
|
||||
addq.l #8, %a4
|
||||
addq.l #8, %a5
|
||||
subq.l #1, %d1
|
||||
jne 0b
|
||||
|
||||
/* (%sp) holds the stack pointer value pushed before the buffers were
   filled; reloading it frees xx1/xx2 and points at the register save area */
move.l #0, %macsr
|
||||
move.l (%sp), %sp
|
||||
movem.l (%sp), %d2-%d7/%a2-%a6
|
||||
lea.l (44, %sp), %sp
|
||||
rts
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue