Fundamentally rewrite much of the audio DSP.

Creates a standard buffer passing, local data passing and messaging system for processing stages. Stages can be moved to their own source files to reduce clutter and ease assimilation of new ones. dsp.c becomes dsp_core.c which supports an engine and framework for effects. Formats and change notifications are passed along with the buffer so that they arrive at the correct time at each stage in the chain regardless of the internal delays of a particular one. Removes restrictions on the number of samples that can be processed at a time and it pays attention to destination buffer size restrictions without having to limit input count, which also allows pcmbuf to remain fuller and safely set its own buffer limits as it sees fit. There is no longer a need to query input/output counts given a certain number of input samples; just give it the sizes of the source and destination buffers. Works in harmony with stages that are not deterministic in terms of sample input/output ratio (like both resamplers but most notably the timestretch). As a result it fixes quirks with timestretch hanging up with certain settings and it now operates properly throughout its full settings range. Change-Id: Ib206ec78f6f6c79259c5af9009fe021d68be9734 Reviewed-on: http://gerrit.rockbox.org/200 Reviewed-by: Michael Sevakis <jethead71@rockbox.org> Tested-by: Michael Sevakis <jethead71@rockbox.org>
2012-03-27 19:52:15 -04:00 · 2012-03-27 19:52:15 -04:00 · c9bcbe202d
commit c9bcbe202d
parent c9c1349773
56 changed files with 4823 additions and 2998 deletions
--- a/lib/rbcodec/dsp/dsp_arm.S
+++ b/lib/rbcodec/dsp/dsp_arm.S
@ -21,20 +21,19 @@
 #include "config.h"

 /****************************************************************************
- *  void channels_process_sound_chan_mono(int count, int32_t *buf[])
+ *  void channel_mode_proc_mono(struct dsp_proc_entry *this,
+ *                              struct dsp_buffer **buf_p)
 */
-
-#include "config.h"
-
-    .section .icode, "ax", %progbits
-    .align  2
-    .global channels_process_sound_chan_mono
-    .type   channels_process_sound_chan_mono, %function
-channels_process_sound_chan_mono:
-    @ input: r0 = count, r1 = buf
+    .section .icode
+    .global channel_mode_proc_mono
+    .type   channel_mode_proc_mono, %function
+channel_mode_proc_mono:
+    @ input: r0 = this, r1 = buf_p
+    ldr     r1, [r1]                   @ r1 = buf = *buf_p;
    stmfd   sp!, { r4, lr }            @
                                       @
-    ldmia   r1, { r1, r2 }             @ r1 = buf[0], r2 = buf[1]
+    ldmia   r1, { r0-r2 }              @ r0 = buf->remcount, r1 = buf->p32[0],
+                                       @ r2 = buf->p32[1]
    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
    beq     .mono_singlesample         @ Zero? Only one sample!
                                       @
@ -61,25 +60,26 @@ channels_process_sound_chan_mono:
    str     r12, [r2]                  @ store Mo
                                       @
    ldmpc   regs=r4                    @
-    .size   channels_process_sound_chan_mono, \
-                .-channels_process_sound_chan_mono
+    .size   channel_mode_proc_mono, .-channel_mode_proc_mono

 /****************************************************************************
- * void channels_process_sound_chan_custom(int count, int32_t *buf[])
+ * void channel_mode_proc_custom(struct dsp_proc_entry *this,
+ *                               struct dsp_buffer **buf_p)
 */
-    .section .icode, "ax", %progbits
-    .align  2
-    .global channels_process_sound_chan_custom
-    .type   channels_process_sound_chan_custom, %function
-channels_process_sound_chan_custom:
+    .section .icode
+    .global channel_mode_proc_custom
+    .type   channel_mode_proc_custom, %function
+channel_mode_proc_custom:
+    @ input: r0 = this, r1 = buf_p
+    ldr     r2, [r0]                   @ r2 = &channel_mode_data = this->data
+    ldr     r1, [r1]                   @ r1 = buf = *buf_p;
+
    stmfd   sp!, { r4-r10, lr }

-    ldr     r3, =dsp_sw_gain
-    ldr     r4, =dsp_sw_cross
+    ldmia   r2, { r3, r4 }             @ r3 = sw_gain, r4 = sw_cross

-    ldmia   r1, { r1, r2 }             @ r1 = buf[0], r2 = buf[1]
-    ldr     r3, [r3]                   @ r3 = dsp_sw_gain
-    ldr     r4, [r4]                   @ r4 = dsp_sw_cross
+    ldmia   r1, { r0-r2 }              @ r0 = buf->remcount, r1 = buf->p32[0],
+                                       @ r2 = buf->p32[1]

    subs    r0, r0, #1
    beq     .custom_single_sample      @ Zero? Only one sample!
@ -135,21 +135,22 @@ channels_process_sound_chan_custom:
    str     r7, [r2]                   @ Store Rc0

    ldmpc   regs=r4-r10
-    .size   channels_process_sound_chan_custom, \
-                .-channels_process_sound_chan_custom
+    .size   channel_mode_proc_custom, .-channel_mode_proc_custom

 /****************************************************************************
- *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
+ *  void channel_mode_proc_karaoke(struct dsp_proc_entry *this,
+ *                                 struct dsp_buffer **buf_p)
 */
-    .section .icode, "ax", %progbits
-    .align  2
-    .global channels_process_sound_chan_karaoke
-    .type   channels_process_sound_chan_karaoke, %function
-channels_process_sound_chan_karaoke:
-    @ input: r0 = count, r1 = buf
+    .section .icode
+    .global channel_mode_proc_karaoke
+    .type   channel_mode_proc_karaoke, %function
+channel_mode_proc_karaoke:
+    @ input: r0 = this, r1 = buf_p
+    ldr     r1, [r1]                   @ r1 = buf = *buf_p;
    stmfd   sp!, { r4, lr }            @
                                       @
-    ldmia   r1, { r1, r2 }             @ r1 = buf[0], r2 = buf[1]
+    ldmia   r1, { r0-r2 }              @ r0 = buf->remcount, r1 = buf->p32[0],
+                                       @ r2 = buf->p32[1]
    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
    beq     .karaoke_singlesample      @ Zero? Only one sample!
                                       @
@ -179,24 +180,313 @@ channels_process_sound_chan_karaoke:
    str     r12, [r2]                  @ store Ro
                                       @
    ldmpc   regs=r4                    @
-    .size   channels_process_sound_chan_karaoke, \
-                .-channels_process_sound_chan_karaoke
+    .size   channel_mode_proc_karaoke, .-channel_mode_proc_karaoke
+
+/****************************************************************************
+ * void crossfeed_process(struct dsp_proc_entry *this,
+ *                        struct dsp_buffer **buf_p)
+ *
+ * Applies crossfeed in place to the stereo buffer *buf_p. For every sample
+ * pair, each output channel is the direct-gain-scaled input of that channel
+ * plus a first-order filter (b0, b1, a1) run over the delayed input of the
+ * opposite channel.
+ *
+ * State block (this->data), as accessed here: eight words holding the
+ * direct gain, b0, b1, a1 and four filter history words, followed by a
+ * 13-entry delay line of { left, right } pairs, followed by the saved
+ * delay line pointer.
+ *
+ * The sample count is tested at the bottom of the loop, so at least one
+ * sample pair is always processed.
+ */
+    .section .text
+    .global crossfeed_process
+crossfeed_process:
+    @ input: r0 = this, r1 = buf_p
+    @ unfortunately, we ended up in a bit of a register squeeze here, and need
+    @ to keep the count on the stack :/
+    ldr     r1, [r1]                   @ r1 = buf = *buf_p;
+    stmfd   sp!, { r4-r11, lr }        @ stack modified regs
+    ldr     r12, [r1]                  @ r12 = buf->remcount
+    ldr     r14, [r0]                  @ r14 = this->data = &crossfeed_state
+    ldmib   r1, { r2-r3 }              @ r2 = buf->p32[0], r3 = buf->p32[1]
+    ldmia   r14!, { r4-r11 }           @ load direct gain and filter data, r14 = delay line start
+    add     r0, r14, #13*2*4           @ calculate end of delay (13 entries * 2 ch * 4 bytes)
+    stmfd   sp!, { r0, r12 }           @ stack end of delay adr and count
+    ldr     r0, [r0]                   @ fetch current delay line address (stored right after the delay line)
+
+    /* Register usage in loop:
+     * r0 = &delay[index][0], r1 = accumulator high, r2 = buf->p32[0],
+     * r3 = buf->p32[1], r4 = direct gain, r5-r7 = b0, b1, a1 (filter coefs),
+     * r8-r11 = filter history, r12 = temp, r14 = accumulator low
+     *
+     * r12 and r14 are reloaded from the stack near the end of each pass with
+     * the delay line end address and the remaining count respectively.
+     */
+.cfloop:
+    smull   r14, r1, r6, r8            @ acc = b1*dr[n - 1]
+    smlal   r14, r1, r7, r9            @ acc += a1*y_l[n - 1]
+    ldr     r8, [r0, #4]               @ r8 = dr[n]
+    smlal   r14, r1, r5, r8            @ acc += b0*dr[n]
+    mov     r9, r1, lsl #1             @ fix format for filter history (taken before the direct signal is added)
+    ldr     r12, [r2]                  @ load left input
+    smlal   r14, r1, r4, r12           @ acc += gain*x_l[n]
+    mov     r1, r1, lsl #1             @ fix format
+    str     r1, [r2], #4               @ save result
+
+    smull   r14, r1, r6, r10           @ acc = b1*dl[n - 1]
+    smlal   r14, r1, r7, r11           @ acc += a1*y_r[n - 1]
+    ldr     r10, [r0]                  @ r10 = dl[n]
+    str     r12, [r0], #4              @ save left input to delay line
+    smlal   r14, r1, r5, r10           @ acc += b0*dl[n]
+    mov     r11, r1, lsl #1            @ fix format for filter history (taken before the direct signal is added)
+    ldr     r12, [r3]                  @ load right input
+    smlal   r14, r1, r4, r12           @ acc += gain*x_r[n]
+    str     r12, [r0], #4              @ save right input to delay line
+    mov     r1, r1, lsl #1             @ fix format
+    ldmia   sp, { r12, r14 }           @ fetch delay line end addr and count from stack
+    str     r1, [r3], #4               @ save result
+
+    cmp     r0, r12                    @ need to wrap to start of delay?
+    subhs   r0, r12, #13*2*4           @ wrap back delay line ptr to start
+
+    subs    r14, r14, #1               @ are we finished?
+    strgt   r14, [sp, #4]              @ nope, save count back to stack
+    bgt     .cfloop
+
+    @ save data back to struct
+    str     r0, [r12]                  @ save delay line index (pointer slot sits at the delay line end)
+    sub     r12, r12, #13*2*4 + 4*4    @ r12 = data->history
+    stmia   r12, { r8-r11 }            @ save filter history
+    add     sp, sp, #8                 @ remove temp variables from stack
+    ldmpc   regs=r4-r11
+    .size   crossfeed_process, .-crossfeed_process
+
+/****************************************************************************
+ * int lin_resample_resample(struct resample_data *data,
+ *                           struct dsp_buffer *src,
+ *                           struct dsp_buffer *dst)
+ *
+ * Linear-interpolating resampler. Each output sample is
+ * last + frac*(s[pos] - last), where last is s[pos - 1] or, at the start of
+ * a buffer, the per-channel sample saved by the previous call.
+ *
+ * As accessed here, data holds delta at offset 0, phase at offset 4
+ * (pos in the upper halfword, frac in the lower halfword) and the saved
+ * per-channel last samples in the words following phase.
+ *
+ * Channels are processed from the highest to the lowest and every channel
+ * restarts from the stored data->phase. At most 0x8000 source samples are
+ * considered per call, and output stops once dst->bufcount samples have been
+ * written or the source is used up.
+ *
+ * On exit data->phase has the consumed position removed
+ * (phase - (pos << 16)), dst->remcount holds the number of samples written
+ * and the last consumed source sample of each channel is saved in data.
+ * r0 is left holding pos. NOTE(review): presumably that is the int return
+ * value (source samples consumed) - confirm against the C caller.
+ */
+    .section    .text
+    .global     lin_resample_resample
+lin_resample_resample:
+    @input: r0 = data, r1 = src, r2 = dst
+    stmfd   sp!, { r4-r11, lr }     @ stack modified regs
+    ldr     r4, [r0]                @ r4 = data->delta
+    add     r10, r0, #4             @ r10 = &data->phase
+    ldrb    r3, [r1, #17]           @ r3 = ch = src->format.num_channels
+    stmfd   sp!, { r1, r10 }        @ stack src, &data->phase
+.lrs_channel_loop:
+    ldr     r5, [r10]               @ r5 = data->phase
+    ldr     r6, [r1]                @ r6 = srcrem = src->remcount
+    ldr     r7, [r1, r3, lsl #2]    @ r7 = src->p32[ch]
+    ldr     r8, [r2, r3, lsl #2]    @ r8 = dst->p32[ch]
+    ldr     r9, [r2, #12]           @ r9 = dstrem = dst->bufcount
+
+    cmp     r6, #0x8000             @ srcrem = MIN(srcrem, 0x8000)
+    movgt   r6, #0x8000             @
+    mov     r0, r5, lsr #16         @ r0 = pos = phase >> 16
+    cmp     r0, r6                  @
+    movgt   r0, r6                  @ pos = MIN(pos, srcrem)
+    cmp     r0, #0                  @
+    ldrle   r11, [r10, r3, lsl #2]  @ pos <= 0? r11 = last = last_sample[ch]
+    addgt   r12, r7, r0, lsl #2     @ pos > 0? r11 = last = s[pos - 1]
+    ldrgt   r11, [r12, #-4]         @
+    cmp     r0, r6                  @
+    bge     .lrs_channel_done       @ pos >= count? channel complete
+
+    cmp     r4, #0x10000            @ delta >= 1.0?
+    ldrhs   r12, [r7, r0, lsl #2]   @ yes? r12 = s[pos]
+    bhs     .lrs_dsstart            @ yes? is downsampling
+
+    /** Upsampling **/
+    mov     r5, r5, lsl #16         @ Move phase into high halfword
+    add     r7, r7, r0, lsl #2      @ r7 = &s[pos]
+    sub     r0, r6, r0              @ r0 = dte = srcrem - pos
+.lrs_usloop_1:
+    ldr     r12, [r7], #4           @ r12 = s[pos]
+    sub     r14, r12, r11           @ r14 = diff = s[pos] - s[pos - 1]
+.lrs_usloop_0:
+    mov     r1, r5, lsr #16         @ r1 = frac = phase >> 16
+    @ keep frac in Rs to take advantage of multiplier early termination
+    smull   r1, r10, r14, r1        @ r1, r10 = diff * frac (lo, hi)
+    add     r1, r11, r1, lsr #16    @ r1 = out = last + frac*diff
+    add     r1, r1, r10, lsl #16    @
+    str     r1, [r8], #4            @ *d++ = out
+    subs    r9, r9, #1              @ destination full?
+    bls     .lrs_usfull             @ yes? channel is done
+    adds    r5, r5, r4, lsl #16     @ phase += delta << 16
+    bcc     .lrs_usloop_0           @ if carry is set, pos is incremented
+    subs    r0, r0, #1              @ if srcrem > 0, do another sample
+    mov     r11, r12                @ r11 = last = s[pos-1] (pos changed)
+    bgt     .lrs_usloop_1
+    b       .lrs_usdone
+
+.lrs_usfull:
+    adds    r5, r5, r4, lsl #16     @ do missed phase increment
+    subcs   r0, r0, #1              @ do missed srcrem decrement
+    movcs   r11, r12                @ r11 = s[pos-1] (pos changed)
+
+.lrs_usdone:
+    sub     r0, r6, r0              @ r0 = pos = srcrem - dte
+    orr     r5, r5, r0              @ reconstruct swapped phase
+    mov     r5, r5, ror #16         @ swap pos and frac for phase
+    b       .lrs_channel_done       @
+
+    /** Downsampling **/
+.lrs_dsloop:
+    add     r10, r7, r0, lsl #2     @ r10 = &s[pos]
+    ldmda   r10, { r11, r12 }       @ r11 = last, r12 = s[pos]
+.lrs_dsstart:
+    sub     r14, r12, r11           @ r14 = diff = s[pos] - s[pos - 1]
+    @ keep frac in Rs to take advantage of multiplier early termination
+    bic     r1, r5, r0, lsl #16     @ frac = phase & 0xffff (clears the pos bits)
+    smull   r1, r10, r14, r1        @ r1, r10 = diff * frac (lo, hi)
+    add     r5, r5, r4              @ phase += delta
+    subs    r9, r9, #1              @ destination full? ...
+    mov     r0, r5, lsr #16         @ pos = phase >> 16
+    add     r1, r11, r1, lsr #16    @ r1 = out = last + frac*diff
+    add     r1, r1, r10, lsl #16    @
+    str     r1, [r8], #4            @ *d++ = out
+    cmpgt   r6, r0                  @ ... || pos >= srcrem? ...
+    bgt     .lrs_dsloop             @ ... no, do more samples
+
+    cmp     r0, r6                  @ pos = MIN(pos, srcrem)
+    movgt   r0, r6                  @
+    sub     r1, r0, #1              @ pos must always be > 0 since step >= 1.0
+    ldr     r11, [r7, r1, lsl #2]   @ r11 = s[pos - 1]
+
+.lrs_channel_done:
+    ldmia   sp, { r1, r10 }         @ recover src, &data->phase
+    str     r11, [r10, r3, lsl #2]  @ last_sample[ch] = last
+    subs    r3, r3, #1              @
+    bgt     .lrs_channel_loop       @
+
+    ldr     r6, [r2, #12]           @ r6 = dst->bufcount
+    sub     r5, r5, r0, lsl #16     @ r5 = phase - (pos << 16)
+    str     r5, [r10]               @ data->phase = r5
+    sub     r6, r6, r9              @ r6 = dst->bufcount - dstrem = dstcount
+    str     r6, [r2]                @ dst->remcount = dstcount
+    add     sp, sp, #8              @ adjust stack for temp variables
+    ldmpc   regs=r4-r11             @ ... and we're out
+    .size   lin_resample_resample, .-lin_resample_resample
+
+/****************************************************************************
+ *  void pga_process(struct dsp_proc_entry *this, struct dsp_buffer **buf_p)
+ *
+ *  Scales every sample of every channel of *buf_p in place by data->gain.
+ *  Each result is the 64-bit product sample*gain shifted right by 23 bits
+ *  (FRACMUL_SHL(sample, gain, 8)).
+ *
+ *  The inner loop handles two samples per pass. A single trailing sample
+ *  (odd count, or a count of one) is handled at .pga_singlesample.
+ */
+    .section .icode
+    .global pga_process
+    .type   pga_process, %function
+pga_process:
+    @ input: r0 = this, r1 = buf_p
+    ldr     r0, [r0]                @ r0 = data = this->data (&pga_data)
+    ldr     r1, [r1]                @ r1 = buf = *buf_p;
+    stmfd   sp!, { r4-r8, lr }
+
+    ldr     r4, [r0]                @ r4 = data->gain
+    ldr     r0, [r1], #4            @ r0 = buf->remcount, r1 = buf->p32
+    ldrb    r3, [r1, #13]           @ r3 = buf->format.num_channels (offset is relative to buf->p32)
+
+.pga_channelloop:
+    ldr     r2, [r1], #4            @ r2 = buf->p32[ch] and inc index of p32
+    subs    r12, r0, #1             @ r12 = count - 1
+    beq     .pga_singlesample       @ Zero? Only one sample!
+
+.pga_loop:
+    ldmia   r2, { r5, r6 }          @ load r5, r6 from r2 (*p32[ch])
+    smull   r7, r8, r5, r4          @ r7 = FRACMUL_SHL(r5, r4, 8)
+    smull   r14, r5, r6, r4         @ r14 = FRACMUL_SHL(r6, r4, 8)
+    subs    r12, r12, #2            @ two samples done; flags are consumed by bgt/blt below
+    mov     r7, r7, lsr #23
+    mov     r14, r14, lsr #23
+    orr     r7, r7, r8, asl #9
+    orr     r14, r14, r5, asl #9
+    stmia   r2!, { r7, r14 }        @ save r7, r14 to *p32[ch] and increment
+    bgt     .pga_loop               @ end of pga loop
+
+    blt     .pga_evencount          @ < 0? even count
+
+.pga_singlesample:
+    ldr     r5, [r2]                @ handle odd sample
+    smull   r7, r8, r5, r4          @ r7 = FRACMUL_SHL(r5, r4, 8)
+    mov     r7, r7, lsr #23
+    orr     r7, r7, r8, asl #9
+    str     r7, [r2]
+
+.pga_evencount:
+    subs    r3, r3, #1
+    bgt     .pga_channelloop        @ end of channel loop
+
+    ldmpc   regs=r4-r8
+    .size   pga_process, .-pga_process
+
+/****************************************************************************
+ * void filter_process(struct dsp_filter *f, int32_t *buf[], int count,
+ *                     unsigned int channels)
+ *
+ * Runs the direct form 1 second-order filter described by f over count
+ * samples of each channel in buf[], in place.
+ *
+ * As accessed here, f holds five coefficient words (b0, b1, b2, a1, a2),
+ * then four history words per channel (x[i - 1], x[i - 2], y[i - 1],
+ * y[i - 2]), and a shift byte 32 bytes past the start of the history.
+ *
+ * The sample count is tested at the bottom of the inner loop, so at least
+ * one sample per channel is always processed.
+ *
+ * define HIGH_PRECISION as '1' to make filtering calculate lower bits after
+ * shifting. without this, "shift" - 1 of the lower bits will be lost here.
+ */
+#define HIGH_PRECISION 0
+
+#if CONFIG_CPU == PP5002
+    .section    .icode,"ax",%progbits
+#else
+    .text
+#endif
+    .global filter_process
+filter_process:
+    @input: r0 = f, r1 = buf, r2 = count, r3 = channels
+    stmfd   sp!, { r4-r11, lr }     @ save all clobbered regs
+    ldmia   r0!, { r4-r8 }          @ load coefs, r0 = f->history
+    sub     r3, r3, #1              @ r3 = ch = channels - 1
+    stmfd   sp!, { r0-r3 }          @ save adjusted params
+    ldrb    r14, [r0, #32]          @ r14 = shift
+
+    @ Channels are processed high to low while history is saved low to high
+    @ It's really no one's business how we do this
+.fp_channelloop:
+    ldmia   r0, { r9-r12 }          @ load history, r0 = history[channels-ch-1]
+    ldr     r3, [r1, r3, lsl #2]    @ r3 = buf[ch]
+
+    @ r9-r12 = history, r4-r8 = coefs, r0..r1 = accumulator,
+    @ r2 = number of samples, r3 = buf[ch], r14 = shift amount
+.fp_loop:
+    @ Direct form 1 filtering code.
+    @ y[i] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
+    @ where y[] is output and x[] is input. This is performed out of order to
+    @ reuse registers, we're pretty short on regs.
+    smull   r0, r1, r5, r9          @ acc = b1*x[i - 1]
+    smlal   r0, r1, r6, r10         @ acc += b2*x[i - 2]
+    mov     r10, r9                 @ fix input history
+    ldr     r9, [r3]                @ load input and fix history
+    smlal   r0, r1, r7, r11         @ acc += a1*y[i - 1]
+    smlal   r0, r1, r8, r12         @ acc += a2*y[i - 2]
+    smlal   r0, r1, r4, r9          @ acc += b0*x[i] /* avoid stall on arm9 */
+    mov     r12, r11                @ fix output history
+    mov     r11, r1, asl r14        @ get upper part of result and shift left
+#if HIGH_PRECISION
+    rsb     r1, r14, #32            @ get shift amount for lower part
+    orr     r11, r11, r0, lsr r1    @ then mix in correctly shifted lower part
+#endif
+    str     r11, [r3], #4           @ save result
+    subs    r2, r2, #1              @ are we done with this channel?
+    bgt     .fp_loop                @
+
+    ldr     r3, [sp, #12]           @ r3 = ch
+    ldr     r0, [sp]                @ r0 = history[channels-ch-1]
+    subs    r3, r3, #1              @ all channels processed? (carry stays set while ch >= 0)
+    stmia   r0!, { r9-r12 }         @ save back history, history++
+    ldmhsib sp, { r1-r2 }           @ r1 = buf, r2 = count
+    strhs   r3, [sp, #12]           @ store ch
+    strhs   r0, [sp]                @ store history[channels-ch-1]
+    bhs     .fp_channelloop
+
+    add     sp, sp, #16             @ compensate for temp storage
+    ldmpc   regs=r4-r11
+    .size   filter_process, .-filter_process

 #if ARM_ARCH < 6
 /****************************************************************************
- *  void sample_output_mono(int count, struct dsp_data *data,
- *                          const int32_t *src[], int16_t *dst)
+ *  void sample_output_mono(struct sample_io_data *this,
+ *                          struct dsp_buffer *src,
+ *                          struct dsp_buffer *dst)
 */
-    .section .icode, "ax", %progbits
-    .align  2
+    .section .icode
    .global sample_output_mono
    .type   sample_output_mono, %function
 sample_output_mono:
-    @ input: r0 = count, r1 = data, r2 = src, r3 = dst
+    @ input: r0 = this, r1 = src, r2 = dst
    stmfd   sp!, { r4-r6, lr }

-    ldr     r1, [r1]                   @ lr = data->output_scale
-    ldr     r2, [r2]                   @ r2 = src[0]
+    ldr     r0, [r0]                   @ r0 = this->outcount
+    ldr     r3, [r2, #4]               @ r2 = dst->p16out
+    ldr     r2, [r1, #4]               @ r1 = src->p32[0]
+    ldrb    r1, [r1, #19]              @ r2 = src->format.output_scale

    mov     r4, #1
    mov     r4, r4, lsl r1             @ r4 = 1 << (scale-1)
@ -246,19 +536,21 @@ sample_output_mono:
    .size   sample_output_mono, .-sample_output_mono

 /****************************************************************************
- * void sample_output_stereo(int count, struct dsp_data *data,
- *                           const int32_t *src[], int16_t *dst)
+ *  void sample_output_stereo(struct sample_io_data *this,
+ *                          struct dsp_buffer *src,
+ *                          struct dsp_buffer *dst)
 */
-    .section .icode, "ax", %progbits
-    .align  2
+    .section .icode
    .global sample_output_stereo
    .type   sample_output_stereo, %function
 sample_output_stereo:
-    @ input: r0 = count, r1 = data, r2 = src, r3 = dst
+    @ input: r0 = this, r1 = src, r2 = dst
    stmfd   sp!, { r4-r9, lr }

-    ldr     r1, [r1]                   @ r1 = data->output_scale
-    ldmia   r2, { r2, r5 }             @ r2 = src[0], r5 = src[1]
+    ldr     r0, [r0]                   @ r0 = this->outcount
+    ldr     r3, [r2, #4]               @ r3 = dsp->p16out
+    ldmib   r1, { r2, r5 }             @ r2 = src->p32[0], r5 = src->p32[1]
+    ldrb    r1, [r1, #19]              @ r1 = src->format.output_scale

    mov     r4, #1
    mov     r4, r4, lsl r1             @ r4 = 1 << (scale-1)
@ -330,232 +622,3 @@ sample_output_stereo:
    ldmpc   regs=r4-r9
    .size   sample_output_stereo, .-sample_output_stereo
 #endif /* ARM_ARCH < 6 */
-
-/****************************************************************************
- * void apply_crossfeed(int count, int32_t* src[])
- *
- * Applies crossfeed in place to count stereo sample pairs, using the state
- * held in the global crossfeed_data. The loop exits only when the count
- * reaches exactly zero, so count must be at least 1.
- */
-    .section .text
-    .global apply_crossfeed
-apply_crossfeed:
-    @ unfortunately, we ended up in a bit of a register squeeze here, and need
-    @ to keep the count on the stack :/
-    stmdb   sp!, { r4-r11, lr }        @ stack modified regs
-    ldmia   r1, { r2-r3 }              @ r2 = src[0], r3 = src[1]
-
-    ldr     r1, =crossfeed_data
-    ldmia   r1!, { r4-r11 }            @ load direct gain and filter data
-    mov     r12, r0                    @ better to ldm delay + count later
-    add     r0, r1, #13*4*2            @ calculate end of delay
-    stmdb   sp!, { r0, r12 }           @ stack end of delay adr and count
-    ldr     r0, [r1, #13*4*2]          @ fetch current delay line address
-
-    /* Register usage in loop:
-     * r0 = &delay[index][0], r1 = accumulator high, r2 = src[0], r3 = src[1],
-     * r4 = direct gain, r5-r7 = b0, b1, a1 (filter coefs),
-     * r8-r11 = filter history, r12 = temp, r14 = accumulator low
-     */
-.cfloop:
-    smull   r14, r1, r6, r8            @ acc = b1*dr[n - 1]
-    smlal   r14, r1, r7, r9            @ acc += a1*y_l[n - 1]
-    ldr     r8, [r0, #4]               @ r8 = dr[n]
-    smlal   r14, r1, r5, r8            @ acc += b0*dr[n]
-    mov     r9, r1, lsl #1             @ fix format for filter history
-    ldr     r12, [r2]                  @ load left input
-    smlal   r14, r1, r4, r12           @ acc += gain*x_l[n]
-    mov     r1, r1, lsl #1             @ fix format
-    str     r1, [r2], #4               @ save result
-
-    smull   r14, r1, r6, r10           @ acc = b1*dl[n - 1]
-    smlal   r14, r1, r7, r11           @ acc += a1*y_r[n - 1]
-    ldr     r10, [r0]                  @ r10 = dl[n]
-    str     r12, [r0], #4              @ save left input to delay line
-    smlal   r14, r1, r5, r10           @ acc += b0*dl[n]
-    mov     r11, r1, lsl #1            @ fix format for filter history
-    ldr     r12, [r3]                  @ load right input
-    smlal   r14, r1, r4, r12           @ acc += gain*x_r[n]
-    str     r12, [r0], #4              @ save right input to delay line
-    mov     r1, r1, lsl #1             @ fix format
-    ldmia   sp, { r12, r14 }           @ fetch delay line end addr and count from stack
-    str     r1, [r3], #4               @ save result
-
-    cmp     r0, r12                    @ need to wrap to start of delay?
-    subeq   r0, r0, #13*4*2            @ wrap back delay line ptr to start
-
-    subs    r14, r14, #1               @ are we finished?
-    strne   r14, [sp, #4]              @ nope, save count back to stack
-    bne     .cfloop
-
-    @ save data back to struct
-    ldr     r12, =crossfeed_data + 4*4
-    stmia   r12, { r8-r11 }            @ save filter history
-    str     r0, [r12, #30*4]           @ save delay line index
-    add     sp, sp, #8                 @ remove temp variables from stack
-    ldmpc   regs=r4-r11
-    .size   apply_crossfeed, .-apply_crossfeed
-
-/****************************************************************************
- * int dsp_downsample(int count, struct dsp_data *data,
- *                    int32_t *src[], int32_t *dst[])
- *
- * Linear-interpolating resampler that advances the full 16.16 phase by
- * delta per output sample. Returns the number of samples written per
- * channel, computed from the write pointer of the last channel processed.
- */
-    .section    .text
-    .global     dsp_downsample
-dsp_downsample:
-    stmdb   sp!, { r4-r11, lr }     @ stack modified regs
-    ldmib   r1, { r5-r6 }           @ r5 = num_channels,r6 = resample_data.delta
-    sub     r5, r5, #1              @ pre-decrement num_channels for use
-    add     r4, r1, #12             @ r4 = &resample_data.phase
-    mov     r12, #0xff
-    orr     r12, r12, #0xff00       @ r12 = 0xffff
-.dschannel_loop:
-    ldr     r1, [r4]                @ r1 = resample_data.phase
-    ldr     r7, [r2, r5, lsl #2]    @ r7 = s = src[ch - 1]
-    ldr     r8, [r3, r5, lsl #2]    @ r8 = d = dst[ch - 1]
-    add     r9, r4, #4              @ r9 = &last_sample[0]
-    ldr     r10, [r9, r5, lsl #2]   @ r10 = last_sample[ch - 1]
-    sub     r11, r0, #1
-    ldr     r14, [r7, r11, lsl #2]  @ load last sample in s[] ...
-    str     r14, [r9, r5, lsl #2]   @ and write as next frame's last_sample
-    movs    r9, r1, lsr #16         @ r9 = pos = phase >> 16
-    ldreq   r11, [r7]               @ if pos = 0, load src[0] and jump into loop
-    beq     .dsuse_last_start
-    cmp     r9, r0                  @ if pos >= count, we're already done
-    bge     .dsloop_skip
-
-    @ Register usage in loop:
-    @ r0 = count, r1 = phase, r4 = &resample_data.phase, r5 = cur_channel,
-    @ r6 = delta, r7 = s, r8 = d, r9 = pos, r10 = s[pos - 1], r11 = s[pos]
-.dsloop:
-    add     r9, r7, r9, lsl #2      @ r9 = &s[pos]
-    ldmda   r9, { r10, r11 }        @ r10 = s[pos - 1], r11 = s[pos]
-.dsuse_last_start:
-    sub     r11, r11, r10           @ r11 = diff = s[pos] - s[pos - 1]
-    @ keep frac in lower bits to take advantage of multiplier early termination
-    and     r9, r1, r12             @ frac = phase & 0xffff
-    smull   r9, r14, r11, r9
-    add     r1, r1, r6              @ phase += delta
-    add     r10, r10, r9, lsr #16   @ r10 = out = s[pos - 1] + frac*diff
-    add     r10, r10, r14, lsl #16
-    str     r10, [r8], #4           @ *d++ = out
-    mov     r9, r1, lsr #16         @ pos = phase >> 16
-    cmp     r9, r0                  @ pos < count?
-    blt     .dsloop                 @ yup, do more samples
-.dsloop_skip:
-    subs    r5, r5, #1
-    bpl     .dschannel_loop         @ if (--ch) >= 0, do another channel
-    sub     r1, r1, r0, lsl #16     @ wrap phase back to start
-    str     r1, [r4]                @ store back
-    ldr     r1, [r3]                @ r1 = dst[0]
-    sub     r8, r8, r1              @ d - dst[0] = bytes written
-    mov     r0, r8, lsr #2          @ convert bytes->samples
-    ldmpc   regs=r4-r11             @ ... and we're out
-    .size   dsp_downsample, .-dsp_downsample
-
-/****************************************************************************
- * int dsp_upsample(int count, struct dsp_data *dsp,
- *                  int32_t *src[], int32_t *dst[])
- *
- * Linear-interpolating resampler that keeps the fractional phase in the
- * upper halfword and uses the carry out of each phase add to detect a
- * source position increment. Returns the number of samples written per
- * channel, computed from the write pointer of the last channel processed.
- */
-    .section    .text
-    .global     dsp_upsample
-dsp_upsample:
-    stmfd   sp!, { r4-r11, lr }     @ stack modified regs
-    ldmib   r1, { r5-r6 }           @ r5 = num_channels,r6 = resample_data.delta
-    sub     r5, r5, #1              @ pre-decrement num_channels for use
-    add     r4, r1, #12             @ r4 = &resample_data.phase
-    mov     r6, r6, lsl #16         @ we'll use carry to detect pos increments
-    stmfd   sp!, { r0, r4 }         @ stack count and &resample_data.phase
-.uschannel_loop:
-    ldr     r12, [r4]               @ r12 = resample_data.phase
-    ldr     r7, [r2, r5, lsl #2]    @ r7 = s = src[ch - 1]
-    ldr     r8, [r3, r5, lsl #2]    @ r8 = d = dst[ch - 1]
-    add     r9, r4, #4              @ r9 = &last_sample[0]
-    mov     r1, r12, lsl #16        @ we'll use carry to detect pos increments
-    sub     r11, r0, #1
-    ldr     r14, [r7, r11, lsl #2]  @ load last sample in s[] ...
-    ldr     r10, [r9, r5, lsl #2]   @ r10 = last_sample[ch - 1]
-    str     r14, [r9, r5, lsl #2]   @ and write as next frame's last_sample
-    movs    r14, r12, lsr #16       @ pos = resample_data.phase >> 16
-    beq     .usstart_0              @ pos = 0
-    cmp     r14, r0                 @ if pos >= count, we're already done
-    bge     .usloop_skip
-    add     r7, r7, r14, lsl #2     @ r7 = &s[pos]
-    ldr     r10, [r7, #-4]          @ r10 = s[pos - 1]
-    b       .usstart_0
-
-    @ Register usage in loop:
-    @ r0 = count, r1 = phase, r4 = &resample_data.phase, r5 = cur_channel,
-    @ r6 = delta, r7 = s, r8 = d, r9 = diff, r10 = s[pos - 1], r11 = s[pos]
-.usloop_1:
-    mov     r10, r11                @ r10 = previous sample
-.usstart_0:
-    ldr     r11, [r7], #4           @ r11 = next sample
-    mov     r4, r1, lsr #16         @ r4 = frac = phase >> 16
-    sub     r9, r11, r10            @ r9 = diff = s[pos] - s[pos - 1]
-.usloop_0:
-    smull   r12, r14, r4, r9
-    adds    r1, r1, r6              @ phase += delta << 16
-    mov     r4, r1, lsr #16         @ r4 = frac = phase >> 16
-    add     r14, r10, r14, lsl #16
-    add     r14, r14, r12, lsr #16  @ r14 = out = s[pos - 1] + frac*diff
-    str     r14, [r8], #4           @ *d++ = out
-    bcc     .usloop_0               @ if carry is set, pos is incremented
-    subs    r0, r0, #1              @ if count > 0, do another sample
-    bgt     .usloop_1
-.usloop_skip:
-    subs    r5, r5, #1
-    ldmfd   sp, { r0, r4 }          @ reload count and &resample_data.phase
-    bpl     .uschannel_loop         @ if (--ch) >= 0, do another channel
-    mov     r1, r1, lsr #16         @ wrap phase back to start of next frame
-    ldr     r2, [r3]                @ r2 = dst[0]
-    str     r1, [r4]                @ store phase
-    sub     r8, r8, r2              @ d - dst[0] = bytes written
-    mov     r0, r8, lsr #2          @ convert bytes->samples
-    add     sp, sp, #8              @ adjust stack for temp variables
-    ldmpc   regs=r4-r11             @ ... and we're out
-    .size       dsp_upsample, .-dsp_upsample
-
-/****************************************************************************
- *  void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
- *
- *  Scales count samples of each of data->num_channels buffers in place by
- *  data->gain. Each result is the 64-bit product sample*gain shifted right
- *  by 23 bits. Two samples are handled per inner-loop pass, with a single
- *  trailing sample handled at .dag_singlesample.
- */
-    .section .icode, "ax", %progbits
-    .align  2
-    .global dsp_apply_gain
-    .type   dsp_apply_gain, %function
-dsp_apply_gain:
-    @ input: r0 = count, r1 = data, r2 = buf[]
-    stmfd   sp!, { r4-r8, lr }
-
-    ldr     r3, [r1,  #4]           @ r3 = data->num_channels
-    ldr     r4, [r1, #32]           @ r4 = data->gain
-
-.dag_outerloop:
-    ldr     r1, [r2], #4            @ r1 = buf[ch] and increment index of buf[]
-    subs    r12, r0, #1             @ r12 = count - 1
-    beq     .dag_singlesample       @ Zero? Only one sample!
-
-.dag_innerloop:
-    ldmia   r1, { r5, r6 }          @ load r5, r6 from r1
-    smull   r7, r8, r5, r4          @ r7 = FRACMUL_SHL(r5, r4, 8)
-    smull   r14, r5, r6, r4         @ r14 = FRACMUL_SHL(r6, r4, 8)
-    subs    r12, r12, #2
-    mov     r7, r7, lsr #23
-    mov     r14, r14, lsr #23
-    orr     r7, r7, r8, asl #9
-    orr     r14, r14, r5, asl #9
-    stmia   r1!, { r7, r14 }        @ save r7, r14 to [r1] and increment r1
-    bgt     .dag_innerloop          @ end of inner loop
-
-    blt     .dag_evencount          @ < 0? even count
-
-.dag_singlesample:
-    ldr     r5, [r1]                @ handle odd sample
-    smull   r7, r8, r5, r4          @ r7 = FRACMUL_SHL(r5, r4, 8)
-    mov     r7, r7, lsr #23
-    orr     r7, r7, r8, asl #9
-    str     r7, [r1]
-
-.dag_evencount:
-    subs    r3, r3, #1
-    bgt     .dag_outerloop          @ end of outer loop
-
-    ldmpc   regs=r4-r8
-    .size   dsp_apply_gain, .-dsp_apply_gain