Coldfire assembler version of qmf_synth(). Wideband and ultra-wideband Speex files should see a great speedup. Also add faster and symmetric clipping in iir_mem16().

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15292 a1c6a512-1295-4272-9138-f99709370657
2007-10-24 22:39:08 +00:00 · 2007-10-24 22:39:08 +00:00 · cd9fc7a2b9
commit cd9fc7a2b9
parent fb70952228
2 changed files with 168 additions and 17 deletions
--- a/apps/codecs/libspeex/filters.c
+++ b/apps/codecs/libspeex/filters.c
@ -47,6 +47,7 @@
 #include "filters_arm4.h"
 #elif defined (COLDFIRE_ASM)
 #define OVERRIDE_IIR_MEM16
+#define OVERRIDE_QMF_SYNTH
 #elif defined (BFIN_ASM)
 #include "filters_bfin.h"
 #endif
@ -475,6 +476,7 @@ void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_word16_t *y1
   }
 }

+#ifndef OVERRIDE_QMF_SYNTH
 /* Re-synthesised a signal from the QMF low-band and high-band signals */
 void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack)
   /* assumptions:
@ -566,6 +568,7 @@ void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_
   for (i = 0; i < M2; i++)
      mem2[2*i+1] = xx2[i];
 }
+#endif

 #ifdef FIXED_POINT
 #if 0
--- a/apps/codecs/libspeex/filters_cf.S
+++ b/apps/codecs/libspeex/filters_cf.S
@ -31,7 +31,6 @@
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-
    .text
 /* void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack) */
    .global iir_mem16
@ -59,14 +58,18 @@ iir_mem16:
    move.w   (%a3)+, %d0
    ext.l    %d0
    add.l    %d1, %d0           | Add with x[i]
-    move.l   #32768, %d1
-    add.l    %d1, %d0           | Bias result to [0..65535]
-    cmp.l    #65535, %d0        | Clip to [0..65535] range
-    jle      1f
-    spl.b    %d0                
-    ext.w    %d0
+    move.l   #32767, %d1
+    move.l   #65534, %a6
+    add.l    %d1, %d0           | Bias result to [-1..65534]
+    cmp.l    %a6, %d0           | Now do clip to [0..65534] range
+    jls      2f
+    jpl      1f
+    clr.l    %d0                | Clip low
+    .word    0x51fa             | trapf.w, shadow next insn
 1:
-    sub.l    %d1, %d0           | Bias clipped result back to [-32768..32767]
+    move.l   %a6, %d0           | Clip high
+2:
+    sub.l    %d1, %d0           | Bias clipped result back to [-32767..32767]
    neg.l    %d0                | msac.w is bugged in gas, do this for now
    move.w   %d0, (%a5)+        | Write result to y[i]
    move.l   (%a4)+, %a6        | Fetch den[0] and den[1]
@ -111,14 +114,18 @@ iir_mem16:
    move.w   (%a3)+, %d0
    ext.l    %d0
    add.l    %d1, %d0           | Add with x[i]
-    move.l   #32768, %d1
-    add.l    %d1, %d0           | Bias result to [0..65535]
-    cmp.l    #65535, %d0        | Clip to [0..65535] range
-    jle      1f
-    spl.b    %d0                
-    ext.w    %d0
+    move.l   #32767, %d1
+    move.l   #65534, %a6
+    add.l    %d1, %d0           | Bias result to [-1..65534]
+    cmp.l    %a6, %d0           | Now do clip to [0..65534] range
+    jls      2f
+    jpl      1f
+    clr.l    %d0                | Clip low
+    .word    0x51fa             | trapf.w, shadow next insn
 1:
-    sub.l    %d1, %d0           | Bias clipped result back to [-32768..32767]
+    move.l   %a6, %d0           | Clip high
+2:
+    sub.l    %d1, %d0           | Bias clipped result back to [-32767..32767]
    neg.l    %d0                | msac.w is bugged in gas, do this for now
    move.w   %d0, (%a5)+        | Write result to y[i]
    move.l   (%a4)+, %a6        | Fetch den[0] and den[1]
@ -159,7 +166,148 @@ iir_mem16:
    movem.l  %d1-%d7/%a0-%a2, (%a6) | Save back mem[]

 .exit:
-    movem.l (%sp), %d2-%d7/%a2-%a6
-    lea.l (44, %sp), %sp
+    movem.l  (%sp), %d2-%d7/%a2-%a6
+    lea.l    (44, %sp), %sp
+    rts
+
+/* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack)
+ *
+ * Coldfire version of the QMF synthesis filter, using the four MAC
+ * accumulators %acc0-%acc3 to produce four output samples per outer pass.
+ *
+ * Notes for callers/maintainers:
+ * - N and M are halved to N2 and M2. The copy loops count N2/M2 down by one
+ *   and the main loops count them down by two, all exiting only when the
+ *   counter reaches exactly zero, so N2 and M2 must be even and non-zero.
+ * - The xx1/xx2 scratch buffers (N2 + M2 shorts each) are allocated on the
+ *   CPU stack; the 'stack' argument is never read.
+ * - Only the odd entries mem1[2*i+1] / mem2[2*i+1] are read and written.
+ * - %macsr is set to 0x80 on entry and left at 0 on return; the value it
+ *   had on entry is not preserved.
+ */
+    .global qmf_synth
+qmf_synth:
+    lea.l    (-44, %sp), %sp                | Room for d2-d7/a2-a6 (11 longs)
+    movem.l  %d2-%d7/%a2-%a6, (%sp)
+    movem.l  (44+4, %sp), %a0-%a3          | a0 = x1, a1 = x2, a2 = a, a3 = y
+    movem.l  (44+20, %sp), %d0-%d1/%a4-%a5 | d0 = N, d1 = M, a4 = mem1, a5 = mem2
+    move.l   #0x80, %macsr                 | Enable saturation
+
+    | Comments make more sense when compared to the reference C version
+    move.l   %a2, %d6                   | Backup a
+    lsr.l    #1, %d0                    | N2 = N >> 1
+    lsr.l    #1, %d1                    | M2 = M >> 1
+    move.l   %d1, %d7                   | Backup M2
+    clr.l    %d2
+    sub.l    %d0, %d2
+    sub.l    %d1, %d2                   | d2 = -(N2 + M2)
+    lea.l    (%sp, %d2.l*2), %a2        | Alloc two buffers of N2 + M2 shorts
+    lea.l    (%a2, %d2.l*2), %a6        | a2 = xx1, a6 = xx2
+    move.l   %sp, %d3                   | d3 = sp before the allocation
+    move.l   %a6, %sp                   | Update sp
+    move.l   %d3, -(%sp)                | Push old %sp; reloaded from (%sp) on exit
+
+    | Backwards copy x1 and x2 arrays to xx1 and xx2
+    | TODO: these copying loops probably have more potential for optimization
+    lea.l    (%a0, %d0.l*2), %a0        | x1 += N2
+    lea.l    (%a1, %d0.l*2), %a1        | x2 += N2
+    move.l   %d0, %d2                   | Loop counter is N2
+0:
+    move.w   -(%a0), (%a2)+             | xx1[i] = x1[N2 - 1 - i]
+    move.w   -(%a1), (%a6)+             | xx2[i] = x2[N2 - 1 - i]
+    subq.l   #1, %d2
+    jne      0b
+
+    | Copy alternate members of mem1 and mem2 to last part of xx1 and xx2
+    move.l   %d1, %d2                   | Loop counter is M2
+    addq.l   #4, %a4                    | a4 = &mem1[1]
+    addq.l   #4, %a5                    | a5 = &mem2[1]
+    move.l   %a4, %d3                   | Backup mem1 and mem2
+    move.l   %a5, %d4
+0:
+    move.l   (%a4), %d5                 | d5 = mem1[2*i + 1]
+    move.w   %d5, (%a2)+                | xx1[N2 + i] = low 16 bits of d5
+    move.l   (%a5), %d5                 | d5 = mem2[2*i + 1]
+    move.w   %d5, (%a6)+                | xx2[N2 + i] = low 16 bits of d5
+    addq.l   #8, %a4                    | Advance two longs to the next odd entry
+    addq.l   #8, %a5
+    subq.l   #1, %d2
+    jne      0b
+    move.l   %d3, %a4                   | a4 = &mem1[1]
+    move.l   %d4, %a5                   | a5 = &mem2[1]
+
+    clr.l    %d2
+    sub.l    %d1, %d2                   | d2 = -M2
+    lea.l    (-4, %a2, %d2.l*2), %a0    | a0 = &xx1[N2 - 2]
+    lea.l    (-4, %a6, %d2.l*2), %a1    | a1 = &xx2[N2 - 2]
+    move.l   %d6, %a2                   | a2 = a
+
+    | Main loop, register usage:
+    | d0 = N2 counter, d1 = M2 counter, d7 = M2 backup
+    | d2 = x10, d3 = x11, d4 = x20, d5 = x21, d6 = coefficient pair [a0, a1] (upper, lower word)
+    | a0 = xx1, a1 = xx2, a2 = a, a3 = y, a4 = mem1, a5 = mem2
+0:  | Outerloop
+    move.l   #32768, %d2                        | Rounding constant
+    move.l   %d2, %acc0
+    move.l   %d2, %acc1
+    move.l   %d2, %acc2
+    move.l   %d2, %acc3
+    move.w   (%a0)+, %d2                        | d2 = x10
+    move.w   (%a1)+, %d4                        | d4 = x20
+    move.l   (%a2)+, %d6                        | d6 = [a0, a1]
+1:  | Innerloop
+    move.w   (%a0)+, %d3                        | d3 = x11
+    move.w   (%a1)+, %d5                        | d5 = x21
+    mac.w    %d6u, %d3l, #1, %acc0              | acc0 += a0*x11
+    msac.w   %d6u, %d5l, #1, %acc0              | acc0 -= a0*x21
+    mac.w    %d6l, %d3l, #1, %acc1              | acc1 += a1*x11
+    mac.w    %d6l, %d5l, #1, %acc1              | acc1 += a1*x21
+    mac.w    %d6u, %d2l, #1, %acc2              | acc2 += a0*x10
+    msac.w   %d6u, %d4l, #1, %acc2              | acc2 -= a0*x20
+    mac.w    %d6l, %d2l, #1, %acc3              | acc3 += a1*x10
+    mac.w    %d6l, %d4l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x20
+
+    move.w   (%a0)+, %d2                        | d2 = x10
+    move.w   (%a1)+, %d4                        | d4 = x20
+    mac.w    %d6u, %d2l, #1, %acc0              | acc0 += a0*x10
+    msac.w   %d6u, %d4l, #1, %acc0              | acc0 -= a0*x20
+    mac.w    %d6l, %d2l, #1, %acc1              | acc1 += a1*x10
+    mac.w    %d6l, %d4l, #1, %acc1              | acc1 += a1*x20
+    mac.w    %d6u, %d3l, #1, %acc2              | acc2 += a0*x11
+    msac.w   %d6u, %d5l, #1, %acc2              | acc2 -= a0*x21
+    mac.w    %d6l, %d3l, #1, %acc3              | acc3 += a1*x11
+    mac.w    %d6l, %d5l, #1, (%a2)+, %d6, %acc3 | acc3 += a1*x21
+    subq.l   #2, %d1                            | Two coefficient pairs consumed per pass
+    jne      1b
+ 
+    sub.l    %d7, %d1                           | d1 was 0 at loop exit, so d1 = -M2
+    lea.l    (-4, %a2, %d1.l*4), %a2            | a2 = &a[0]
+    lea.l    (-6, %a0, %d1.l*2), %a0            | a0 = &xx1[N2 - 2 - i] for the next i
+    lea.l    (-6, %a1, %d1.l*2), %a1            | a1 = &xx2[N2 - 2 - i]
+    neg.l    %d1                                | d1 = M2
+    movclr.l %acc0, %d2                         | Fetch the four sums
+    movclr.l %acc1, %d3
+    movclr.l %acc2, %d4
+    movclr.l %acc3, %d5
+    swap.w   %d2                                | Shift 16 right
+    swap.w   %d3
+    swap.w   %d4
+    swap.w   %d5
+    | Thanks to the extra shift in the mac chain, we get clipping for free.
+    | The clipping will be [-32768..32767], not Speex standard [-32767..32767],
+    | but since qmf_synth() is called so late in the signal chain, it should
+    | work fine.
+    move.w   %d2, (%a3)+        | Write results to y[]
+    move.w   %d3, (%a3)+
+    move.w   %d4, (%a3)+
+    move.w   %d5, (%a3)+
+    subq.l   #2, %d0            | Four outputs written per pass
+    jne      0b
+
+    | Copy start of xx1 and xx2 back to alternate mem1 and mem2 entries
+    addq.l   #4, %a0            | a0 = &xx1[0]
+    addq.l   #4, %a1            | a1 = &xx2[0]
+0:
+    move.w   (%a0)+, %d2
+    move.w   (%a1)+, %d3
+    ext.l    %d2                | Sign-extend to 32 bits for the mem entries
+    ext.l    %d3
+    move.l   %d2, (%a4)         | mem1[2*i + 1] = xx1[i]
+    move.l   %d3, (%a5)         | mem2[2*i + 1] = xx2[i]
+    addq.l   #8, %a4
+    addq.l   #8, %a5
+    subq.l   #1, %d1
+    jne      0b
+
+    move.l   #0, %macsr         | Leave macsr at 0 (entry value is not saved)
+    move.l   (%sp), %sp         | Reload the %sp pushed after the allocation
+    movem.l  (%sp), %d2-%d7/%a2-%a6
+    lea.l    (44, %sp), %sp
    rts