1
0
Fork 0
forked from len0rd/rockbox

Fundamentally rewrite much of the audio DSP.

Creates a standard buffer passing, local data passing and messaging
system for processing stages. Stages can be moved to their own source
files to reduce clutter and ease assimilation of new ones. dsp.c
becomes dsp_core.c which supports an engine and framework for effects.

Formats and change notifications are passed along with the buffer so
that they arrive at the correct time at each stage in the chain
regardless of the internal delays of a particular one.

Removes restrictions on the number of samples that can be processed at
a time and it pays attention to destination buffer size restrictions
without having to limit input count, which also allows pcmbuf to
remain fuller and safely set its own buffer limits as it sees fit.
There is no longer a need to query input/output counts given a certain
number of input samples; just give it the sizes of the source and
destination buffers.

Works in harmony with stages that are not deterministic in terms of
sample input/output ratio (like both resamplers but most notably
the timestretch). As a result it fixes quirks with timestretch hanging
up with certain settings and it now operates properly throughout its
full settings range.
Change-Id: Ib206ec78f6f6c79259c5af9009fe021d68be9734
Reviewed-on: http://gerrit.rockbox.org/200
Reviewed-by: Michael Sevakis <jethead71@rockbox.org>
Tested-by: Michael Sevakis <jethead71@rockbox.org>
This commit is contained in:
Michael Sevakis 2012-03-27 19:52:15 -04:00
parent c9c1349773
commit c9bcbe202d
56 changed files with 4823 additions and 2998 deletions

View file

@ -21,20 +21,19 @@
#include "config.h"
/****************************************************************************
* void channels_process_sound_chan_mono(int count, int32_t *buf[])
* void channel_mode_proc_mono(struct dsp_proc_entry *this,
* struct dsp_buffer **buf_p)
*/
#include "config.h"
.section .icode, "ax", %progbits
.align 2
.global channels_process_sound_chan_mono
.type channels_process_sound_chan_mono, %function
channels_process_sound_chan_mono:
@ input: r0 = count, r1 = buf
.section .icode
.global channel_mode_proc_mono
.type channel_mode_proc_mono, %function
channel_mode_proc_mono:
@ input: r0 = this, r1 = buf_p
ldr r1, [r1] @ r1 = buf = *buf_p;
stmfd sp!, { r4, lr } @
@
ldmia r1, { r1, r2 } @ r1 = buf[0], r2 = buf[1]
ldmia r1, { r0-r2 } @ r0 = buf->remcount, r1 = buf->p32[0],
@ r2 = buf->p32[1]
subs r0, r0, #1 @ odd: end at 0; even: end at -1
beq .mono_singlesample @ Zero? Only one sample!
@
@ -61,25 +60,26 @@ channels_process_sound_chan_mono:
str r12, [r2] @ store Mo
@
ldmpc regs=r4 @
.size channels_process_sound_chan_mono, \
.-channels_process_sound_chan_mono
.size channel_mode_proc_mono, .-channel_mode_proc_mono
/****************************************************************************
* void channels_process_sound_chan_custom(int count, int32_t *buf[])
* void channel_mode_proc_custom(struct dsp_proc_entry *this,
* struct dsp_buffer **buf_p)
*/
.section .icode, "ax", %progbits
.align 2
.global channels_process_sound_chan_custom
.type channels_process_sound_chan_custom, %function
channels_process_sound_chan_custom:
.section .icode
.global channel_mode_proc_custom
.type channel_mode_proc_custom, %function
channel_mode_proc_custom:
@ input: r0 = this, r1 = buf_p
ldr r2, [r0] @ r2 = &channel_mode_data = this->data
ldr r1, [r1] @ r1 = buf = *buf_p;
stmfd sp!, { r4-r10, lr }
ldr r3, =dsp_sw_gain
ldr r4, =dsp_sw_cross
ldmia r2, { r3, r4 } @ r3 = sw_gain, r4 = sw_cross
ldmia r1, { r1, r2 } @ r1 = buf[0], r2 = buf[1]
ldr r3, [r3] @ r3 = dsp_sw_gain
ldr r4, [r4] @ r4 = dsp_sw_cross
ldmia r1, { r0-r2 } @ r0 = buf->remcount, r1 = buf->p32[0],
@ r2 = buf->p32[1]
subs r0, r0, #1
beq .custom_single_sample @ Zero? Only one sample!
@ -135,21 +135,22 @@ channels_process_sound_chan_custom:
str r7, [r2] @ Store Rc0
ldmpc regs=r4-r10
.size channels_process_sound_chan_custom, \
.-channels_process_sound_chan_custom
.size channel_mode_proc_custom, .-channel_mode_proc_custom
/****************************************************************************
* void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
* void channel_mode_proc_karaoke(struct dsp_proc_entry *this,
* struct dsp_buffer **buf_p)
*/
.section .icode, "ax", %progbits
.align 2
.global channels_process_sound_chan_karaoke
.type channels_process_sound_chan_karaoke, %function
channels_process_sound_chan_karaoke:
@ input: r0 = count, r1 = buf
.section .icode
.global channel_mode_proc_karaoke
.type channel_mode_proc_karaoke, %function
channel_mode_proc_karaoke:
@ input: r0 = this, r1 = buf_p
ldr r1, [r1] @ r1 = buf = *buf_p;
stmfd sp!, { r4, lr } @
@
ldmia r1, { r1, r2 } @ r1 = buf[0], r2 = buf[1]
ldmia r1, { r0-r2 } @ r0 = buf->remcount, r1 = buf->p32[0],
@ r2 = buf->p32[1]
subs r0, r0, #1 @ odd: end at 0; even: end at -1
beq .karaoke_singlesample @ Zero? Only one sample!
@
@ -179,24 +180,313 @@ channels_process_sound_chan_karaoke:
str r12, [r2] @ store Ro
@
ldmpc regs=r4 @
.size channels_process_sound_chan_karaoke, \
.-channels_process_sound_chan_karaoke
.size channel_mode_proc_karaoke, .-channel_mode_proc_karaoke
/****************************************************************************
* void crossfeed_process(struct dsp_proc_entry *this,
* struct dsp_buffer **buf_p)
*
* Applies the crossfeed filter in place to both channels of the buffer
* that buf_p points to.
*
* In:    r0 = this  (word 0 is this->data = &crossfeed_state)
*        r1 = buf_p (buf: +0 remcount, +4 p32[0], +8 p32[1])
* Out:   no return value; buf->p32[0] and buf->p32[1] are overwritten,
*        the filter history and the delay line position are written back
*        to the state
* Clobb: r0-r3, r12, r14, flags (r4-r11 are stacked and restored)
*
* State layout as accessed below (word offsets from this->data):
*   0       direct gain
*   1..3    b0, b1, a1 (filter coefs)
*   4..7    filter history
*   8..33   delay line, 13 entries of { left, right }
*   34      pointer to the current delay line entry
*
* The sample loop body runs before the count is tested, so buf->remcount
* is expected to be at least 1.
*/
.section .text
.global crossfeed_process
.type crossfeed_process, %function
crossfeed_process:
@ input: r0 = this, r1 = buf_p
@ unfortunately, we ended up in a bit of a register squeeze here, and need
@ to keep the count on the stack :/
ldr r1, [r1] @ r1 = buf = *buf_p;
stmfd sp!, { r4-r11, lr } @ stack modified regs
ldr r12, [r1] @ r12 = buf->remcount
ldr r14, [r0] @ r14 = this->data = &crossfeed_state
ldmib r1, { r2-r3 } @ r2 = buf->p32[0], r3 = buf->p32[1]
ldmia r14!, { r4-r11 } @ load direct gain and filter data, r14 = &delay[0]
add r0, r14, #13*2*4 @ calculate end of delay (= address of index word)
stmfd sp!, { r0, r12 } @ stack end of delay adr and count
ldr r0, [r0] @ fetch current delay line address
/* Register usage in loop:
* r0 = &delay[index][0], r1 = accumulator high, r2 = buf->p32[0],
* r3 = buf->p32[1], r4 = direct gain, r5-r7 = b0, b1, a1 (filter coefs),
* r8-r11 = filter history, r12 = temp, r14 = accumulator low
* Stack: [sp] = end of delay line, [sp, #4] = remaining count
*/
.cfloop:
smull r14, r1, r6, r8 @ acc = b1*dr[n - 1]
smlal r14, r1, r7, r9 @ acc += a1*y_l[n - 1]
ldr r8, [r0, #4] @ r8 = dr[n]
smlal r14, r1, r5, r8 @ acc += b0*dr[n]
mov r9, r1, lsl #1 @ fix format for filter history
ldr r12, [r2] @ load left input
smlal r14, r1, r4, r12 @ acc += gain*x_l[n]
mov r1, r1, lsl #1 @ fix format
str r1, [r2], #4 @ save result
smull r14, r1, r6, r10 @ acc = b1*dl[n - 1]
smlal r14, r1, r7, r11 @ acc += a1*y_r[n - 1]
ldr r10, [r0] @ r10 = dl[n]
str r12, [r0], #4 @ save left input to delay line
smlal r14, r1, r5, r10 @ acc += b0*dl[n]
mov r11, r1, lsl #1 @ fix format for filter history
ldr r12, [r3] @ load right input
smlal r14, r1, r4, r12 @ acc += gain*x_r[n]
str r12, [r0], #4 @ save right input to delay line
mov r1, r1, lsl #1 @ fix format
ldmia sp, { r12, r14 } @ fetch delay line end addr and count from stack
str r1, [r3], #4 @ save result
cmp r0, r12 @ need to wrap to start of delay?
subhs r0, r12, #13*2*4 @ wrap back delay line ptr to start
subs r14, r14, #1 @ are we finished?
strgt r14, [sp, #4] @ nope, save count back to stack
bgt .cfloop
@ save data back to struct
str r0, [r12] @ save delay line index
sub r12, r12, #13*2*4 + 4*4 @ r12 = data->history
stmia r12, { r8-r11 } @ save filter history
add sp, sp, #8 @ remove temp variables from stack
ldmpc regs=r4-r11
.size crossfeed_process, .-crossfeed_process
/****************************************************************************
* int lin_resample_resample(struct resample_data *data,
* struct dsp_buffer *src,
* struct dsp_buffer *dst)
*
* Linear interpolation resampler. The phase is 16.16 fixed point
* (pos in the high halfword, frac in the low halfword) and advances by
* data->delta per output sample; delta >= 0x10000 takes the downsampling
* path, anything smaller the upsampling path.
*
* In:    r0 = data (+0 delta, +4 phase, +8.. last_sample[])
*        r1 = src  (+0 remcount, +4.. p32[], byte +17 num_channels)
*        r2 = dst  (+0 remcount, +4.. p32[], +12 bufcount)
* Out:   r0 = pos reached in the source by the last channel processed
*             (presumably the consumed source sample count; confirm
*             against the C caller)
*        dst->remcount = number of samples written per channel
*        data->phase and data->last_sample[] are updated
* Clobb: r1, r3, r12, r14, flags (r4-r11 are stacked and restored)
*
* At most 0x8000 source samples are considered per call. The channel
* counter r3 runs from num_channels down to 1 and is used directly as a
* word index from the struct base, so index ch addresses the slot right
* after the leading word(s) of each struct.
*/
.section .text
.global lin_resample_resample
.type lin_resample_resample, %function
lin_resample_resample:
@input: r0 = data, r1 = src, r2 = dst
stmfd sp!, { r4-r11, lr } @ stack modified regs
ldr r4, [r0] @ r4 = data->delta
add r10, r0, #4 @ r10 = &data->phase
ldrb r3, [r1, #17] @ r3 = ch = src->format.num_channels
stmfd sp!, { r1, r10 } @ stack src, &data->phase
.lrs_channel_loop:
ldr r5, [r10] @ r5 = data->phase
ldr r6, [r1] @ r6 = srcrem = src->remcount
ldr r7, [r1, r3, lsl #2] @ r7 = src->p32[ch]
ldr r8, [r2, r3, lsl #2] @ r8 = dst->p32[ch]
ldr r9, [r2, #12] @ r9 = dstrem = dst->bufcount
cmp r6, #0x8000 @ srcrem = MIN(srcrem, 0x8000)
movgt r6, #0x8000 @
mov r0, r5, lsr #16 @ r0 = pos = phase >> 16
cmp r0, r6 @ pos = MIN(pos, srcrem)
movgt r0, r6 @
cmp r0, #0 @
ldrle r11, [r10, r3, lsl #2] @ pos <= 0? r11 = last = last_sample[ch]
addgt r12, r7, r0, lsl #2 @ pos > 0? r11 = last = s[pos - 1]
ldrgt r11, [r12, #-4] @
cmp r0, r6 @
bge .lrs_channel_done @ pos >= count? channel complete
cmp r4, #0x10000 @ delta >= 1.0?
ldrhs r12, [r7, r0, lsl #2] @ yes? r12 = s[pos]
bhs .lrs_dsstart @ yes? is downsampling
/** Upsampling **/
@ r5 holds frac in its high halfword so that the carry out of the phase
@ addition signals a step to the next source sample
mov r5, r5, lsl #16 @ Move phase into high halfword
add r7, r7, r0, lsl #2 @ r7 = &s[pos]
sub r0, r6, r0 @ r0 = dte = srcrem - pos
.lrs_usloop_1:
ldr r12, [r7], #4 @ r12 = s[pos]
sub r14, r12, r11 @ r14 = diff = s[pos] - s[pos - 1]
.lrs_usloop_0:
mov r1, r5, lsr #16 @ r1 = frac = phase >> 16
@ keep frac in Rs to take advantage of multiplier early termination
smull r1, r10, r14, r1 @ r1, r10 = diff * frac (lo, hi)
add r1, r11, r1, lsr #16 @ r1 = out = last + frac*diff
add r1, r1, r10, lsl #16 @
str r1, [r8], #4 @ *d++ = out
subs r9, r9, #1 @ destination full?
bls .lrs_usfull @ yes? channel is done
adds r5, r5, r4, lsl #16 @ phase += delta << 16
bcc .lrs_usloop_0 @ if carry is set, pos is incremented
subs r0, r0, #1 @ if srcrem > 0, do another sample
mov r11, r12 @ r11 = last = s[pos-1] (pos changed)
bgt .lrs_usloop_1
b .lrs_usdone
.lrs_usfull:
adds r5, r5, r4, lsl #16 @ do missed phase increment
subcs r0, r0, #1 @ do missed srcrem decrement
movcs r11, r12 @ r11 = s[pos-1] (pos changed)
.lrs_usdone:
sub r0, r6, r0 @ r0 = pos = srcrem - dte
orr r5, r5, r0 @ reconstruct swapped phase
mov r5, r5, ror #16 @ swap pos and frac for phase
b .lrs_channel_done @
/** Downsampling **/
.lrs_dsloop:
add r10, r7, r0, lsl #2 @ r10 = &s[pos]
ldmda r10, { r11, r12 } @ r11 = last, r12 = s[pos]
.lrs_dsstart:
sub r14, r12, r11 @ r14 = diff = s[pos] - s[pos - 1]
@ keep frac in Rs to take advantage of multiplier early termination
bic r1, r5, r0, lsl #16 @ frac = phase & 0xffff
smull r1, r10, r14, r1 @ r1, r10 = diff * frac (lo, hi)
add r5, r5, r4 @ phase += delta
subs r9, r9, #1 @ destination full? ...
mov r0, r5, lsr #16 @ pos = phase >> 16
add r1, r11, r1, lsr #16 @ r1 = out = last + frac*diff
add r1, r1, r10, lsl #16 @
str r1, [r8], #4 @ *d++ = out
cmpgt r6, r0 @ ... || pos >= srcrem? ...
bgt .lrs_dsloop @ ... no, do more samples
cmp r0, r6 @ pos = MIN(pos, srcrem)
movgt r0, r6 @
sub r1, r0, #1 @ pos must always be > 0 since step >= 1.0
ldr r11, [r7, r1, lsl #2] @ r11 = s[pos - 1]
.lrs_channel_done:
ldmia sp, { r1, r10 } @ recover src, &data->phase
str r11, [r10, r3, lsl #2] @ last_sample[ch] = last
subs r3, r3, #1 @
bgt .lrs_channel_loop @
ldr r6, [r2, #12] @ r6 = dst->bufcount
sub r5, r5, r0, lsl #16 @ r5 = phase - (pos << 16)
str r5, [r10] @ data->phase = r5
sub r6, r6, r9 @ r6 = dst->bufcount - dstrem = dstcount
str r6, [r2] @ dst->remcount = dstcount
add sp, sp, #8 @ adjust stack for temp variables
ldmpc regs=r4-r11 @ ... and we're out
.size lin_resample_resample, .-lin_resample_resample
/****************************************************************************
* void pga_process(struct dsp_proc_entry *this, struct dsp_buffer **buf_p)
*
* Scales every sample of every channel in place by data->gain.
*
* In:    r0 = this  (word 0 is this->data, whose word 0 is the gain)
*        r1 = buf_p (buf: +0 remcount, +4.. p32[], byte +17 num_channels)
* Out:   no return value; samples are overwritten in buf->p32[ch]
* Clobb: r0-r3, r12, r14, flags (r4-r8 are stacked and restored)
*
* Samples are handled two per iteration; a trailing odd sample (or a
* buffer of exactly one sample) goes through .pga_singlesample.
* NOTE(review): a remcount of 0 is not special-cased and would enter the
* two-sample loop; presumably callers never pass an empty buffer.
*/
.section .icode
.global pga_process
.type pga_process, %function
pga_process:
@ input: r0 = this, r1 = buf_p
ldr r0, [r0] @ r0 = data = this->data (&pga_data)
ldr r1, [r1] @ r1 = buf = *buf_p;
stmfd sp!, { r4-r8, lr }
ldr r4, [r0] @ r4 = data->gain
ldr r0, [r1], #4 @ r0 = buf->remcount, r1 = buf->p32
ldrb r3, [r1, #13] @ r3 = buf->format.num_channels (r1 is buf + 4)
.pga_channelloop:
ldr r2, [r1], #4 @ r2 = buf->p32[ch] and inc index of p32
subs r12, r0, #1 @ r12 = count - 1
beq .pga_singlesample @ Zero? Only one sample!
.pga_loop:
ldmia r2, { r5, r6 } @ load r5, r6 from r2 (*p32[ch])
smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8)
smull r14, r5, r6, r4 @ r14 = FRACMUL_SHL(r6, r4, 8)
subs r12, r12, #2 @ flags consumed by the bgt/blt after the stores
mov r7, r7, lsr #23 @ 64-bit product >> 23, low word part ...
mov r14, r14, lsr #23
orr r7, r7, r8, asl #9 @ ... merged with the high word part
orr r14, r14, r5, asl #9
stmia r2!, { r7, r14 } @ save r7, r14 to *p32[ch] and increment
bgt .pga_loop @ end of pga loop
blt .pga_evencount @ < 0? even count
.pga_singlesample:
ldr r5, [r2] @ handle odd sample
smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8)
mov r7, r7, lsr #23
orr r7, r7, r8, asl #9
str r7, [r2]
.pga_evencount:
subs r3, r3, #1
bgt .pga_channelloop @ end of channel loop
ldmpc regs=r4-r8
.size pga_process, .-pga_process
/****************************************************************************
* void filter_process(struct dsp_filter *f, int32_t *buf[], int count,
* unsigned int channels)
*
* Runs a direct form 1 biquad over count samples of each channel, in place.
*
* In:    r0 = f (words 0..4 coefs b0, b1, b2, a1, a2; then per-channel
*                 history of 4 words each; shift byte at history + 32)
*        r1 = buf, r2 = count, r3 = channels
* Out:   no return value; buf[ch][] is overwritten and the history in f
*        is updated
* Clobb: r0-r3, r12, r14, flags (r4-r11 are stacked and restored)
*
* The sample loop body runs before count is tested, so count is expected
* to be at least 1.
*
* define HIGH_PRECISION as '1' to make filtering calculate lower bits after
* shifting. without this, "shift" - 1 of the lower bits will be lost here.
*/
#define HIGH_PRECISION 0
#if CONFIG_CPU == PP5002
.section .icode,"ax",%progbits
#else
.text
#endif
.global filter_process
.type filter_process, %function
filter_process:
@input: r0 = f, r1 = buf, r2 = count, r3 = channels
stmfd sp!, { r4-r11, lr } @ save all clobbered regs
ldmia r0!, { r4-r8 } @ load coefs, r0 = f->history
sub r3, r3, #1 @ r3 = ch = channels - 1
stmfd sp!, { r0-r3 } @ save adjusted params
ldrb r14, [r0, #32] @ r14 = shift
@ Channels are processed high to low while history is saved low to high.
@ The pairing of history slot and channel is internal to this routine.
@ Stack: [sp] = history ptr, [sp, #4] = buf, [sp, #8] = count, [sp, #12] = ch
.fp_channelloop:
ldmia r0, { r9-r12 } @ load history, r0 = history[channels-ch-1]
ldr r3, [r1, r3, lsl #2] @ r3 = buf[ch]
@ r9-r12 = history, r4-r8 = coefs, r0..r1 = accumulator,
@ r2 = number of samples, r3 = buf[ch], r14 = shift amount
.fp_loop:
@ Direct form 1 filtering code.
@ y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
@ where y[] is output and x[] is input. This is performed out of order to
@ reuse registers, we're pretty short on regs.
smull r0, r1, r5, r9 @ acc = b1*x[i - 1]
smlal r0, r1, r6, r10 @ acc += b2*x[i - 2]
mov r10, r9 @ fix input history
ldr r9, [r3] @ load input and fix history
smlal r0, r1, r7, r11 @ acc += a1*y[i - 1]
smlal r0, r1, r8, r12 @ acc += a2*y[i - 2]
smlal r0, r1, r4, r9 @ acc += b0*x[i] /* avoid stall on arm9 */
mov r12, r11 @ fix output history
mov r11, r1, asl r14 @ get upper part of result and shift left
#if HIGH_PRECISION
rsb r1, r14, #32 @ get shift amount for lower part
orr r11, r11, r0, lsr r1 @ then mix in correctly shifted lower part
#endif
str r11, [r3], #4 @ save result
subs r2, r2, #1 @ are we done with this channel?
bgt .fp_loop @
ldr r3, [sp, #12] @ r3 = ch
ldr r0, [sp] @ r0 = history[channels-ch-1]
subs r3, r3, #1 @ all channels processed?
stmia r0!, { r9-r12 } @ save back history, history++
ldmhsib sp, { r1-r2 } @ r1 = buf, r2 = count
strhs r3, [sp, #12] @ store ch
strhs r0, [sp] @ store history[channels-ch-1]
bhs .fp_channelloop
add sp, sp, #16 @ compensate for temp storage
ldmpc regs=r4-r11
.size filter_process, .-filter_process
#if ARM_ARCH < 6
/****************************************************************************
* void sample_output_mono(int count, struct dsp_data *data,
* const int32_t *src[], int16_t *dst)
* void sample_output_mono(struct sample_io_data *this,
* struct dsp_buffer *src,
* struct dsp_buffer *dst)
*/
.section .icode, "ax", %progbits
.align 2
.section .icode
.global sample_output_mono
.type sample_output_mono, %function
sample_output_mono:
@ input: r0 = count, r1 = data, r2 = src, r3 = dst
@ input: r0 = this, r1 = src, r2 = dst
stmfd sp!, { r4-r6, lr }
ldr r1, [r1] @ lr = data->output_scale
ldr r2, [r2] @ r2 = src[0]
ldr r0, [r0] @ r0 = this->outcount
ldr r3, [r2, #4] @ r2 = dst->p16out
ldr r2, [r1, #4] @ r1 = src->p32[0]
ldrb r1, [r1, #19] @ r2 = src->format.output_scale
mov r4, #1
mov r4, r4, lsl r1 @ r4 = 1 << (scale-1)
@ -246,19 +536,21 @@ sample_output_mono:
.size sample_output_mono, .-sample_output_mono
/****************************************************************************
* void sample_output_stereo(int count, struct dsp_data *data,
* const int32_t *src[], int16_t *dst)
* void sample_output_stereo(struct sample_io_data *this,
* struct dsp_buffer *src,
* struct dsp_buffer *dst)
*/
.section .icode, "ax", %progbits
.align 2
.section .icode
.global sample_output_stereo
.type sample_output_stereo, %function
sample_output_stereo:
@ input: r0 = count, r1 = data, r2 = src, r3 = dst
@ input: r0 = this, r1 = src, r2 = dst
stmfd sp!, { r4-r9, lr }
ldr r1, [r1] @ r1 = data->output_scale
ldmia r2, { r2, r5 } @ r2 = src[0], r5 = src[1]
ldr r0, [r0] @ r0 = this->outcount
ldr r3, [r2, #4] @ r3 = dsp->p16out
ldmib r1, { r2, r5 } @ r2 = src->p32[0], r5 = src->p32[1]
ldrb r1, [r1, #19] @ r1 = src->format.output_scale
mov r4, #1
mov r4, r4, lsl r1 @ r4 = 1 << (scale-1)
@ -330,232 +622,3 @@ sample_output_stereo:
ldmpc regs=r4-r9
.size sample_output_stereo, .-sample_output_stereo
#endif /* ARM_ARCH < 6 */
/****************************************************************************
* void apply_crossfeed(int count, int32_t* src[])
*
* Applies the crossfeed filter in place to src[0] and src[1], using the
* global crossfeed_data as state.
*
* In:    r0 = count, r1 = src
* Out:   no return value; src[0][] and src[1][] are overwritten, filter
*        history and the delay line position are written back to
*        crossfeed_data
* Clobb: r0-r3, r12, r14, flags (r4-r11 are stacked and restored)
*
* crossfeed_data layout as accessed below (word offsets):
*   0       direct gain
*   1..3    b0, b1, a1 (filter coefs)
*   4..7    filter history
*   8..33   delay line, 13 entries of { left, right }
*   34      pointer to the current delay line entry
*
* The loop body runs before count is tested, so one sample is always
* processed. The exit test is signed (gt), so a count of zero or less
* stops after that one sample instead of wrapping around.
*/
.section .text
.global apply_crossfeed
.type apply_crossfeed, %function
apply_crossfeed:
@ unfortunately, we ended up in a bit of a register squeeze here, and need
@ to keep the count on the stack :/
stmdb sp!, { r4-r11, lr } @ stack modified regs
ldmia r1, { r2-r3 } @ r2 = src[0], r3 = src[1]
ldr r1, =crossfeed_data
ldmia r1!, { r4-r11 } @ load direct gain and filter data
mov r12, r0 @ better to ldm delay + count later
add r0, r1, #13*4*2 @ calculate end of delay
stmdb sp!, { r0, r12 } @ stack end of delay adr and count
ldr r0, [r1, #13*4*2] @ fetch current delay line address
/* Register usage in loop:
* r0 = &delay[index][0], r1 = accumulator high, r2 = src[0], r3 = src[1],
* r4 = direct gain, r5-r7 = b0, b1, a1 (filter coefs),
* r8-r11 = filter history, r12 = temp, r14 = accumulator low
* Stack: [sp] = end of delay line, [sp, #4] = remaining count
*/
.acf_loop:
smull r14, r1, r6, r8 @ acc = b1*dr[n - 1]
smlal r14, r1, r7, r9 @ acc += a1*y_l[n - 1]
ldr r8, [r0, #4] @ r8 = dr[n]
smlal r14, r1, r5, r8 @ acc += b0*dr[n]
mov r9, r1, lsl #1 @ fix format for filter history
ldr r12, [r2] @ load left input
smlal r14, r1, r4, r12 @ acc += gain*x_l[n]
mov r1, r1, lsl #1 @ fix format
str r1, [r2], #4 @ save result
smull r14, r1, r6, r10 @ acc = b1*dl[n - 1]
smlal r14, r1, r7, r11 @ acc += a1*y_r[n - 1]
ldr r10, [r0] @ r10 = dl[n]
str r12, [r0], #4 @ save left input to delay line
smlal r14, r1, r5, r10 @ acc += b0*dl[n]
mov r11, r1, lsl #1 @ fix format for filter history
ldr r12, [r3] @ load right input
smlal r14, r1, r4, r12 @ acc += gain*x_r[n]
str r12, [r0], #4 @ save right input to delay line
mov r1, r1, lsl #1 @ fix format
ldmia sp, { r12, r14 } @ fetch delay line end addr and count from stack
str r1, [r3], #4 @ save result
cmp r0, r12 @ need to wrap to start of delay?
subeq r0, r0, #13*4*2 @ wrap back delay line ptr to start
subs r14, r14, #1 @ are we finished?
strgt r14, [sp, #4] @ nope, save count back to stack
bgt .acf_loop
@ save data back to struct
ldr r12, =crossfeed_data + 4*4
stmia r12, { r8-r11 } @ save filter history
str r0, [r12, #30*4] @ save delay line index
add sp, sp, #8 @ remove temp variables from stack
ldmpc regs=r4-r11
.size apply_crossfeed, .-apply_crossfeed
/****************************************************************************
* int dsp_downsample(int count, struct dsp_data *data,
* int32_t *src[], int32_t *dst[])
*
* Linear interpolation downsampler. The phase is 16.16 fixed point
* (pos in the high halfword, frac in the low halfword) and advances by
* resample_data.delta per output sample.
*
* In:    r0 = count (source samples per channel)
*        r1 = data  (+4 num_channels, +8 resample_data.delta,
*                    +12 resample_data.phase, +16.. last_sample[])
*        r2 = src, r3 = dst
* Out:   r0 = number of samples written to dst[0]
*        resample_data.phase and last_sample[] are updated
* Clobb: r1, r12, r14, flags (r4-r11 are stacked and restored)
*
* Channels are processed from the highest index down to 0.
*/
.section .text
.global dsp_downsample
.type dsp_downsample, %function
dsp_downsample:
stmdb sp!, { r4-r11, lr } @ stack modified regs
ldmib r1, { r5-r6 } @ r5 = num_channels,r6 = resample_data.delta
sub r5, r5, #1 @ pre-decrement num_channels for use
add r4, r1, #12 @ r4 = &resample_data.phase
mov r12, #0xff
orr r12, r12, #0xff00 @ r12 = 0xffff
.dschannel_loop:
ldr r1, [r4] @ r1 = resample_data.phase
ldr r7, [r2, r5, lsl #2] @ r7 = s = src[ch - 1]
ldr r8, [r3, r5, lsl #2] @ r8 = d = dst[ch - 1]
add r9, r4, #4 @ r9 = &last_sample[0]
ldr r10, [r9, r5, lsl #2] @ r10 = last_sample[ch - 1]
sub r11, r0, #1
ldr r14, [r7, r11, lsl #2] @ load last sample in s[] ...
str r14, [r9, r5, lsl #2] @ and write as next frame's last_sample
movs r9, r1, lsr #16 @ r9 = pos = phase >> 16
ldreq r11, [r7] @ if pos = 0, load src[0] and jump into loop
beq .dsuse_last_start
cmp r9, r0 @ if pos >= count, we're already done
bge .dsloop_skip
@ Register usage in loop:
@ r0 = count, r1 = phase, r4 = &resample_data.phase, r5 = cur_channel,
@ r6 = delta, r7 = s, r8 = d, r9 = pos, r10 = s[pos - 1], r11 = s[pos]
.dsloop:
add r9, r7, r9, lsl #2 @ r9 = &s[pos]
ldmda r9, { r10, r11 } @ r10 = s[pos - 1], r11 = s[pos]
.dsuse_last_start:
sub r11, r11, r10 @ r11 = diff = s[pos] - s[pos - 1]
@ keep frac in lower bits to take advantage of multiplier early termination
and r9, r1, r12 @ frac = phase & 0xffff
smull r9, r14, r11, r9 @ r9, r14 = diff * frac (lo, hi)
add r1, r1, r6 @ phase += delta
add r10, r10, r9, lsr #16 @ r10 = out = s[pos - 1] + frac*diff
add r10, r10, r14, lsl #16
str r10, [r8], #4 @ *d++ = out
mov r9, r1, lsr #16 @ pos = phase >> 16
cmp r9, r0 @ pos < count?
blt .dsloop @ yup, do more samples
.dsloop_skip:
subs r5, r5, #1
bpl .dschannel_loop @ if (--ch) >= 0, do another channel
sub r1, r1, r0, lsl #16 @ wrap phase back to start
str r1, [r4] @ store back
ldr r1, [r3] @ r1 = dst[0] (start of first output channel)
sub r8, r8, r1 @ d - dst[0] = bytes written
mov r0, r8, lsr #2 @ convert bytes->samples
ldmpc regs=r4-r11 @ ... and we're out
.size dsp_downsample, .-dsp_downsample
/****************************************************************************
* int dsp_upsample(int count, struct dsp_data *dsp,
* int32_t *src[], int32_t *dst[])
*
* Linear interpolation upsampler. Inside the loop the fractional phase
* lives in the high halfword of r1 so that the carry out of the phase
* addition signals a step to the next source sample.
*
* In:    r0 = count (source samples per channel)
*        r1 = dsp   (+4 num_channels, +8 resample_data.delta,
*                    +12 resample_data.phase, +16.. last_sample[])
*        r2 = src, r3 = dst
* Out:   r0 = number of samples written to dst[0]
*        resample_data.phase and last_sample[] are updated
* Clobb: r1, r2, r12, r14, flags (r4-r11 are stacked and restored)
*
* Channels are processed from the highest index down to 0.
*/
.section .text
.global dsp_upsample
.type dsp_upsample, %function
dsp_upsample:
stmfd sp!, { r4-r11, lr } @ stack modified regs
ldmib r1, { r5-r6 } @ r5 = num_channels,r6 = resample_data.delta
sub r5, r5, #1 @ pre-decrement num_channels for use
add r4, r1, #12 @ r4 = &resample_data.phase
mov r6, r6, lsl #16 @ we'll use carry to detect pos increments
stmfd sp!, { r0, r4 } @ stack count and &resample_data.phase
.uschannel_loop:
ldr r12, [r4] @ r12 = resample_data.phase
ldr r7, [r2, r5, lsl #2] @ r7 = s = src[ch - 1]
ldr r8, [r3, r5, lsl #2] @ r8 = d = dst[ch - 1]
add r9, r4, #4 @ r9 = &last_sample[0]
mov r1, r12, lsl #16 @ we'll use carry to detect pos increments
sub r11, r0, #1
ldr r14, [r7, r11, lsl #2] @ load last sample in s[] ...
ldr r10, [r9, r5, lsl #2] @ r10 = last_sample[ch - 1]
str r14, [r9, r5, lsl #2] @ and write as next frame's last_sample
movs r14, r12, lsr #16 @ pos = resample_data.phase >> 16
beq .usstart_0 @ pos = 0
cmp r14, r0 @ if pos >= count, we're already done
bge .usloop_skip
add r7, r7, r14, lsl #2 @ r7 = &s[pos]
ldr r10, [r7, #-4] @ r10 = s[pos - 1]
b .usstart_0
@ Register usage in loop:
@ r0 = count, r1 = phase, r4 = frac (reloaded with &resample_data.phase
@ after each channel), r5 = cur_channel, r6 = delta, r7 = s, r8 = d,
@ r9 = diff, r10 = s[pos - 1], r11 = s[pos]
.usloop_1:
mov r10, r11 @ r10 = previous sample
.usstart_0:
ldr r11, [r7], #4 @ r11 = next sample
mov r4, r1, lsr #16 @ r4 = frac = phase >> 16
sub r9, r11, r10 @ r9 = diff = s[pos] - s[pos - 1]
.usloop_0:
smull r12, r14, r4, r9 @ r12, r14 = frac * diff (lo, hi)
adds r1, r1, r6 @ phase += delta << 16
mov r4, r1, lsr #16 @ r4 = frac = phase >> 16
add r14, r10, r14, lsl #16
add r14, r14, r12, lsr #16 @ r14 = out = s[pos - 1] + frac*diff
str r14, [r8], #4 @ *d++ = out
bcc .usloop_0 @ if carry is set, pos is incremented
subs r0, r0, #1 @ if count > 0, do another sample
bgt .usloop_1
.usloop_skip:
subs r5, r5, #1
ldmfd sp, { r0, r4 } @ reload count and &resample_data.phase
bpl .uschannel_loop @ if (--ch) >= 0, do another channel
mov r1, r1, lsr #16 @ wrap phase back to start of next frame
ldr r2, [r3] @ r2 = dst[0] (start of first output channel)
str r1, [r4] @ store phase
sub r8, r8, r2 @ d - dst[0] = bytes written
mov r0, r8, lsr #2 @ convert bytes->samples
add sp, sp, #8 @ adjust stack for temp variables
ldmpc regs=r4-r11 @ ... and we're out
.size dsp_upsample, .-dsp_upsample
/****************************************************************************
* void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
*
* Scales count samples of every channel in place by data->gain.
*
* In:    r0 = count
*        r1 = data (+4 num_channels, +32 gain)
*        r2 = buf
* Out:   no return value; samples are overwritten in buf[ch][]
* Clobb: r1-r3, r12, r14, flags (r4-r8 are stacked and restored)
*
* Samples are handled two per iteration; a trailing odd sample (or a
* count of exactly one) goes through .dag_singlesample.
* NOTE(review): a count of 0 is not special-cased and would enter the
* two-sample loop; presumably callers never pass zero.
*/
.section .icode, "ax", %progbits
.align 2
.global dsp_apply_gain
.type dsp_apply_gain, %function
dsp_apply_gain:
@ input: r0 = count, r1 = data, r2 = buf[]
stmfd sp!, { r4-r8, lr }
ldr r3, [r1, #4] @ r3 = data->num_channels
ldr r4, [r1, #32] @ r4 = data->gain
.dag_outerloop:
ldr r1, [r2], #4 @ r1 = buf[0] and increment index of buf[]
subs r12, r0, #1 @ r12 = r0 = count - 1
beq .dag_singlesample @ Zero? Only one sample!
.dag_innerloop:
ldmia r1, { r5, r6 } @ load r5, r6 from r1
smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8)
smull r14, r5, r6, r4 @ r14 = FRACMUL_SHL(r6, r4, 8)
subs r12, r12, #2 @ flags consumed by the bgt/blt after the stores
mov r7, r7, lsr #23 @ 64-bit product >> 23, low word part ...
mov r14, r14, lsr #23
orr r7, r7, r8, asl #9 @ ... merged with the high word part
orr r14, r14, r5, asl #9
stmia r1!, { r7, r14 } @ save r7, r14 to [r1] and increment r1
bgt .dag_innerloop @ end of inner loop
blt .dag_evencount @ < 0? even count
.dag_singlesample:
ldr r5, [r1] @ handle odd sample
smull r7, r8, r5, r4 @ r7 = FRACMUL_SHL(r5, r4, 8)
mov r7, r7, lsr #23
orr r7, r7, r8, asl #9
str r7, [r1]
.dag_evencount:
subs r3, r3, #1
bgt .dag_outerloop @ end of outer loop
ldmpc regs=r4-r8
.size dsp_apply_gain, .-dsp_apply_gain