
ARM assembler versions of iir_mem16() and qmf_synth(), yielding a very nice speedup. Touch some comments in filters_cf.S

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15393 a1c6a512-1295-4272-9138-f99709370657
Thom Johansen 2007-11-01 21:11:26 +00:00
parent 9e23e9d43e
commit 6d88717f69
4 changed files with 321 additions and 13 deletions


@@ -34,4 +34,6 @@ window.c
#ifdef CPU_COLDFIRE
filters_cf.S
ltp_cf.S
#elif defined(CPU_ARM)
filters_arm4.S
#endif


@@ -45,6 +45,8 @@
#include "filters_sse.h"
#elif defined (ARM4_ASM) || defined(ARM5E_ASM)
#include "filters_arm4.h"
#define OVERRIDE_IIR_MEM16
#define OVERRIDE_QMF_SYNTH
#elif defined (COLDFIRE_ASM)
#define OVERRIDE_IIR_MEM16
#define OVERRIDE_QMF_SYNTH


@@ -0,0 +1,302 @@
/* Copyright (C) 2007 Thom Johansen */
/**
@file filters_arm4.S
@brief Various analysis/synthesis filters (ARMv4 version)
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of the Xiph.org Foundation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.text
/* void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack) */
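/* The loops below implement the Speex fixed-point IIR recurrence, sketched
   here in C for reference (yi is the clipped 16 bit output sample):
     yi = CLIP(x[i] + ((mem[0] + 4096) >> 13));
     for (j = 0; j < ord - 1; j++)
         mem[j] = mem[j + 1] - den[j]*yi;
     mem[ord - 1] = -den[ord - 1]*yi;
*/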
.global iir_mem16
iir_mem16:
stmdb sp!, { r4-r11, lr }
ldr r5, [sp, #36] @ r0 = x, r1 = den, r2 = y, r3 = N
ldr r4, [sp, #40] @ r4 = mem, r5 = ord
cmp r5, #10
beq .order_10
cmp r5, #8
beq .order_8
ldmia sp!, { r4-r11, pc } @ Unsupported order, return
@ TODO: try using direct form 1 filtering
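@ Register usage (same scheme as the ColdFire version in filters_cf.S):
@ r0 = x, r1 = den, r2 = y, r3 = N, r4 = den[] scratch,
@ r5-r12 = mem[0..7], r14 = y[i]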
.order_8:
ldmia r4, { r5-r12 } @ r5-r12 = mem[0..7]
0:
add r5, r5, #4096 @ Rounding constant
ldrsh r14, [r0], #2
add r14, r14, r5, asr #13 @ x[i] + ((mem[0] + 4096) >> 13)
mov r5, #0x7f00
orr r5, r5, #0xff @ r5 = 32767
cmp r14, r5
movgt r14, r5 @ Clip positive
cmn r14, r5
rsblt r14, r5, #0 @ Clip negative
strh r14, [r2], #2 @ Write result to y[i]
ldrsh r4, [r1]
mul r5, r4, r14
sub r5, r6, r5 @ mem[0] = mem[1] - den[0]*y[i]
ldrsh r4, [r1, #2]
mul r6, r4, r14
sub r6, r7, r6 @ mem[1] = mem[2] - den[1]*y[i]
ldrsh r4, [r1, #4]
mul r7, r4, r14
sub r7, r8, r7 @ mem[2] = mem[3] - den[2]*y[i]
ldrsh r4, [r1, #6]
mul r8, r4, r14
sub r8, r9, r8 @ mem[3] = mem[4] - den[3]*y[i]
ldrsh r4, [r1, #8]
mul r9, r4, r14
sub r9, r10, r9 @ mem[4] = mem[5] - den[4]*y[i]
ldrsh r4, [r1, #10]
mul r10, r4, r14
sub r10, r11, r10 @ mem[5] = mem[6] - den[5]*y[i]
ldrsh r4, [r1, #12]
mul r11, r4, r14
sub r11, r12, r11 @ mem[6] = mem[7] - den[6]*y[i]
ldrsh r4, [r1, #14]
mul r12, r4, r14
rsb r12, r12, #0 @ mem[7] = -den[7]*y[i]
subs r3, r3, #1
bne 0b
ldr r4, [sp, #40] @ r4 = mem
stmia r4, { r5-r12 } @ Save back mem[]
ldmia sp!, { r4-r11, pc } @ Exit
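@ Ten filter states plus coefficient and output scratch won't fit in the
@ register file at once, so mem[0..4] are reloaded from memory at the top
@ of every iteration.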
.order_10:
ldmia r4, { r5-r9 } @ r5-r9 = mem[0..4]
add r5, r5, #4096 @ Rounding constant
ldrsh r14, [r0], #2
add r14, r14, r5, asr #13 @ x[i] + ((mem[0] + 4096) >> 13)
mov r5, #0x7f00
orr r5, r5, #0xff @ r5 = 32767
cmp r14, r5
movgt r14, r5 @ Clip positive
cmn r14, r5
rsblt r14, r5, #0 @ Clip negative
strh r14, [r2], #2 @ Write result to y[i]
ldmia r1!, { r10-r12 } @ r10-r12 = den[0..5]
mov r5, r10, lsl #16
mov r5, r5, asr #16
mul r5, r14, r5
sub r5, r6, r5 @ mem[0] = mem[1] - den[0]*y[i]
mov r10, r10, asr #16
mul r6, r14, r10
sub r6, r7, r6 @ mem[1] = mem[2] - den[1]*y[i]
mov r10, r11, lsl #16
mov r10, r10, asr #16
mul r7, r14, r10
sub r7, r8, r7 @ mem[2] = mem[3] - den[2]*y[i]
mov r10, r11, asr #16
mul r8, r14, r10
sub r8, r9, r8 @ mem[3] = mem[4] - den[3]*y[i]
stmia r4!, { r5-r8 } @ Write back mem[0..3], r4 = &mem[4]
mov r10, r12, lsl #16
mov r10, r10, asr #16
mul r5, r14, r10
ldmib r4, { r6-r10 } @ r6-r10 = mem[5..9]
sub r5, r6, r5 @ mem[4] = mem[5] - den[4]*y[i]
mov r12, r12, asr #16
mul r6, r14, r12
sub r6, r7, r6 @ mem[5] = mem[6] - den[5]*y[i]
ldmia r1!, { r11-r12 } @ r11-r12 = den[6..9]
mov r7, r11, lsl #16
mov r7, r7, asr #16
mul r7, r14, r7
sub r7, r8, r7 @ mem[6] = mem[7] - den[6]*y[i]
mov r11, r11, asr #16
mul r8, r14, r11
sub r8, r9, r8 @ mem[7] = mem[8] - den[7]*y[i]
mov r11, r12, lsl #16
mov r11, r11, asr #16
mul r9, r14, r11
sub r9, r10, r9 @ mem[8] = mem[9] - den[8]*y[i]
mov r12, r12, asr #16
mul r10, r14, r12
rsb r10, r10, #0 @ mem[9] = -den[9]*y[i]
stmia r4!, { r5-r10 } @ Write back mem[4..9]
sub r4, r4, #10*4
sub r1, r1, #10*2
subs r3, r3, #1
bne .order_10
ldmia sp!, { r4-r11, pc } @ Exit
/* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */
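/* Reconstructs one full-rate frame from the low and high subbands. The
   subbands are copied in reverse (with the filter memory appended) so the
   MAC loop can walk forward; even-indexed taps are then applied to the
   difference (x1 - x2) and odd-indexed taps to the sum (x1 + x2), the QMF
   mirror symmetry referred to in the loop comments below. */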
.global qmf_synth
qmf_synth:
stmdb sp!, { r4-r11, lr }
add r7, sp, #36 @ r0 = x1, r1 = x2, r2 = a, r3 = y
ldmia r7, { r4-r7 } @ r4 = N, r5 = M, r6 = mem1, r7 = mem2
add r8, r4, r5
sub r9, sp, r8 @ r9 = sp - (N + M) bytes = xx2 ((N + M)/2 shorts)
sub r8, r9, r8 @ r8 = r9 - (N + M) bytes = xx1
str sp, [r8, #-4] @ Stack old sp
sub sp, r8, #4 @ Update sp
add r0, r0, r4 @ x1 += N >> 1
add r1, r1, r4 @ x2 += N >> 1
mov r14, r4 @ Loop counter is N
0:
@ Backwards copy x1 and x2 arrays to xx1 and xx2, assume N2 is a power of two
@ N should always be a multiple of four, so this should be OK
ldmdb r0!, { r10-r11 }
mov r12, r10, ror #16
mov r11, r11, ror #16
stmia r8!, { r11-r12 }
ldmdb r1!, { r10-r11 }
mov r12, r10, ror #16
mov r11, r11, ror #16
stmia r9!, { r11-r12 }
subs r14, r14, #8
bne 0b
@ Copy alternate members of mem1 and mem2 to last part of xx1 and xx2
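@ mem1 and mem2 are arrays of 32 bit words, but only the 16 bit halfword at
@ byte offset 2 of each entry is read in here, and the same halves are
@ written back at the end of the function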
mov r14, r5 @ Loop counter is M
add r6, r6, #2
add r7, r7, #2
stmdb sp!, { r6-r7 } @ Stack &mem1[1], &mem2[1]
0:
ldrh r10, [r6], #4
ldrh r11, [r6], #4
ldrh r12, [r7], #4
orr r10, r10, r11, lsl #16
ldrh r11, [r7], #4
orr r11, r12, r11, lsl #16
str r10, [r8], #4
str r11, [r9], #4
subs r14, r14, #4
bne 0b
sub r0, r8, r5 @ r0 = &xx1[N2]
sub r1, r9, r5 @ r1 = &xx2[N2]
str r4, [sp, #-4] @ Stack N
mov r4, r5
str r4, [sp, #-8] @ Stack M
@ sp doesn't point to the end of the stack frame from here on, but we're not
@ calling anything so it shouldn't matter
@ Main loop, register usage:
@ r0 = xx1, r1 = xx2, r2 = a, r3 = y, r4 = M, r5 = x10, r6 = x11, r7 = x20
@ r8 = x21, r9 = [a1, a0], r10 = acc0, r11 = acc1, r12 = acc2, r14 = acc3
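@ The inner loop is unrolled to four taps per iteration. Note how each sum
@ is recovered from the corresponding difference with a single add,
@ x + y = (x - y) + 2*y, instead of being computed from scratch.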
0: @ Outer loop
mov r10, #16384 @ Init accumulators to rounding const
mov r11, #16384
mov r12, #16384
mov r14, #16384
ldrsh r5, [r0, #-4]! @ r5 = x10, r0 = &xx1[N2 - 2]
ldrsh r7, [r1, #-4]! @ r7 = x20, r1 = &xx2[N2 - 2]
1: @ Inner loop
ldrsh r9, [r2], #2 @ r9 = a0
ldrsh r6, [r0, #2]! @ r6 = x11
ldrsh r8, [r1, #2]! @ r8 = x21
sub r5, r5, r7 @ r5 = x10 - x20
add r7, r5, r7, asl #1 @ r7 = x10 + x20
mla r12, r9, r5, r12 @ acc2 += a0*(x10 - x20)
sub r5, r6, r8 @ r5 = x11 - x21
mla r10, r9, r5, r10 @ acc0 += a0*(x11 - x21)
ldrsh r9, [r2], #2 @ r9 = a1
add r5, r6, r8 @ r5 = x11 + x21
mla r14, r9, r7, r14 @ acc3 += a1*(x10 + x20)
mla r11, r9, r5, r11 @ acc1 += a1*(x11 + x21)
ldrsh r9, [r2], #2 @ r9 = a0
ldrsh r5, [r0, #2]! @ r5 = x10
ldrsh r7, [r1, #2]! @ r7 = x20
sub r6, r6, r8 @ r6 = x11 - x21
add r8, r6, r8, asl #1 @ r8 = x11 + x21
mla r12, r9, r6, r12 @ acc2 += a0*(x11 - x21)
sub r6, r5, r7 @ r6 = x10 - x20
mla r10, r9, r6, r10 @ acc0 += a0*(x10 - x20)
ldrsh r9, [r2], #2 @ r9 = a1
add r6, r5, r7 @ r6 = x10 + x20
mla r14, r9, r8, r14 @ acc3 += a1*(x11 + x21)
mla r11, r9, r6, r11 @ acc1 += a1*(x10 + x20)
subs r4, r4, #4
bne 1b
ldr r4, [sp, #-8] @ r4 = M
sub r2, r2, r4, lsl #1 @ r2 = &a[0]
sub r0, r0, r4 @ r0 = &xx1[N2 - 2 - i]
sub r1, r1, r4 @ r1 = &xx2[N2 - 2 - i]
mov r10, r10, asr #15 @ Shift outputs down
mov r11, r11, asr #15
mov r12, r12, asr #15
mov r14, r14, asr #15
@ TODO: this can be optimized further
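@ Unlike the ColdFire version (see the note in filters_cf.S), this clips
@ to the standard Speex range [-32767, 32767]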
mov r9, #0x7f00 @ Clip all four outputs
orr r9, r9, #0xff @ r9 = 32767
cmp r10, r9
movgt r10, r9
cmn r10, r9
rsblt r10, r9, #0
cmp r11, r9
movgt r11, r9
cmn r11, r9
rsblt r11, r9, #0
cmp r12, r9
movgt r12, r9
cmn r12, r9
rsblt r12, r9, #0
cmp r14, r9
movgt r14, r9
cmn r14, r9
rsblt r14, r9, #0
strh r10, [r3], #2 @ Write outputs
strh r11, [r3], #2
strh r12, [r3], #2
strh r14, [r3], #2
ldr r10, [sp, #-4] @ Load N
subs r10, r10, #4 @ Are we done?
strne r10, [sp, #-4]
bne 0b
@ Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries
@ r0 and r1 are &xx1[0] and &xx2[0] at this point
ldmia sp, { r5-r6, sp } @ Fetch &mem1[1], &mem2[1], restore sp
0:
ldr r7, [r0], #4
ldr r8, [r1], #4
strh r7, [r5], #4
strh r8, [r6], #4
mov r7, r7, lsr #16
mov r8, r8, lsr #16
strh r7, [r5], #4
strh r8, [r6], #4
subs r4, r4, #4
bne 0b
ldmia sp!, { r4-r11, pc } @ Exit


@@ -48,6 +48,7 @@ iir_mem16:
jeq .order_10
jra .exit
| TODO: try using direct form 1 filtering
| d0 = y[i], d1-d7, a0 = mem[0] .. mem[7]
| a3 = x, a4 = den, a5 = y, a6 = temp
.order_8:
@@ -171,6 +172,7 @@ iir_mem16:
lea.l (44, %sp), %sp
rts
/* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */
.global qmf_synth
qmf_synth:
@@ -210,10 +212,10 @@ qmf_synth:
jne 0b
| Copy alternate members of mem1 and mem2 to last part of xx1 and xx2
move.l %d1, %d2 | Loop counter is M2
addq.l #2, %a4 | a4 = &mem1[1]
addq.l #2, %a5 | a5 = &mem2[1]
move.l %a4, %d3 | Backup mem1 and mem2
move.l %a5, %d4
0:
move.w (%a4), (%a2)+
@@ -222,14 +224,14 @@ qmf_synth:
addq.l #4, %a5
subq.l #1, %d2
jne 0b
move.l %d3, %a4 | a4 = &mem1[1]
move.l %d4, %a5 | a5 = &mem2[1]
clr.l %d2
sub.l %d1, %d2 | d2 = -M2
lea.l (-4, %a2, %d2.l*2), %a0 | a0 = &xx1[N2 - 2]
lea.l (-4, %a6, %d2.l*2), %a1 | a1 = &xx2[N2 - 2]
move.l %d6, %a2 | a2 = a
| Main loop, register usage:
| d0 = N2 counter, d1 = M2 counter, d7 = M2 backup
@@ -286,7 +288,7 @@ qmf_synth:
| The clipping will be [-32768..32767], not Speex standard [-32767..32767],
| but since qmf_synth() is called so late in the signal chain, it should
| work fine.
move.w %d2, (%a3)+ | Write results to y[]
move.w %d3, (%a3)+
move.w %d4, (%a3)+
move.w %d5, (%a3)+
@@ -294,8 +296,8 @@ qmf_synth:
jne 0b
| Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries
addq.l #4, %a0 | a0 = &xx1[0]
addq.l #4, %a1 | a1 = &xx2[0]
0:
move.w (%a0)+, (%a4)
move.w (%a1)+, (%a5)