forked from len0rd/rockbox
Introduce new hermite polynomial resampler.
Uses the Catmull-Rom case of Hermite cubic splines. Vastly improves the quality and accuracy of audio resampling with a rather minor additional overhead compared to the previous linear implementation. ARM and Coldfire assembly implementations included. Change-Id: Ic45d84bc66c5b312ef373198297a952167a4be26 Reviewed-on: http://gerrit.rockbox.org/304 Reviewed-by: Michael Sevakis <jethead71@rockbox.org> Tested-by: Michael Sevakis <jethead71@rockbox.org>
This commit is contained in:
parent
91b850ec42
commit
a7dee7f447
3 changed files with 537 additions and 254 deletions
|
@ -289,114 +289,224 @@ crossfeed_meier_process:
|
|||
ldmpc regs=r4-r10 @ restore non-volatile context, return
|
||||
.size crossfeed_meier_process, .-crossfeed_meier_process
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
* int resample_linear(struct resample_data *data, struct dsp_buffer *src,
|
||||
* struct dsp_buffer *dst)
|
||||
* int resample_hermite(struct resample_data *data, struct dsp_buffer *src,
|
||||
* struct dsp_buffer *dst)
|
||||
*/
|
||||
.section .text, "ax",%progbits
|
||||
.global resample_linear
|
||||
resample_linear:
|
||||
.global resample_hermite
|
||||
resample_hermite:
|
||||
@input: r0 = data, r1 = src, r2 = dst
|
||||
stmfd sp!, { r4-r11, lr } @ stack modified regs
|
||||
ldr r4, [r0] @ r4 = data->delta
|
||||
add r10, r0, #4 @ r10 = &data->phase
|
||||
ldrb r3, [r1, #17] @ r3 = num_channels,
|
||||
stmfd sp!, { r1, r10 } @ stack src, &data->phase
|
||||
.lrs_channel_loop:
|
||||
ldr r5, [r10] @ r5 = data->phase
|
||||
ldr r6, [r1] @ r6 = srcrem = src->remcount
|
||||
ldr r7, [r1, r3, lsl #2] @ r7 = src->p32[ch]
|
||||
ldr r8, [r2, r3, lsl #2] @ r8 = dst->p32[ch]
|
||||
ldr r9, [r2, #12] @ r9 = dstrem = dst->bufcount
|
||||
stmfd sp!, { r0-r2, r4-r11, lr } @ stack parms, modified regs
|
||||
ldr r9, [r1] @ r9 = srcrem = src->remcount
|
||||
ldrb r10, [r1, #17] @ r10 = ch = num_channels
|
||||
ldr r14, [r0] @ r14 = data->delta, r0 = data
|
||||
|
||||
cmp r6, #0x8000 @ srcrem = MIN(srcrem, 0x8000)
|
||||
movgt r6, #0x8000 @
|
||||
mov r0, r5, lsr #16 @ pos = MIN(pos, srcrem)
|
||||
cmp r0, r6 @
|
||||
movgt r0, r6 @ r0 = pos = phase >> 16
|
||||
cmp r0, #0 @
|
||||
ldrle r11, [r10, r3, lsl #2] @ pos <= 0? r11 = last = last_sample[ch]
|
||||
addgt r12, r7, r0, lsl #2 @ pos > 0? r1 = last = s[pos - 1]
|
||||
ldrgt r11, [r12, #-4] @
|
||||
cmp r0, r6 @
|
||||
bge .lrs_channel_done @ pos >= count? channel complete
|
||||
cmp r9, #0x8000 @ srcrem = MIN(srcrem, 0x8000)
|
||||
movgt r9, #0x8000 @
|
||||
|
||||
cmp r4, #0x10000 @ delta >= 1.0?
|
||||
ldrhs r12, [r7, r0, lsl #2] @ yes? r12 = s[pos]
|
||||
bhs .lrs_dsstart @ yes? is downsampling
|
||||
@ Channels are processed high to low while history is saved low to high
|
||||
@ It's really no one's business how we do this
|
||||
add r12, r0, #8 @ r12 = h = data->history
|
||||
|
||||
.hrs_channel_loop:
|
||||
stmfd sp!, { r10, r12 } @ push ch, h
|
||||
ldr r5, [r0, #4] @ r5 = data->phase
|
||||
ldr r6, [r1, r10, lsl #2] @ r6 = src->p32[ch]
|
||||
ldr r7, [r2, r10, lsl #2] @ r7 = dst->p32[ch]
|
||||
ldr r8, [r2, #12] @ r8 = dstrem = dst->bufcount
|
||||
|
||||
mov r0, r5, lsr #16 @ r0 = pos = phase >> 16
|
||||
cmp r0, r9 @ r0 = pos = MIN(pos, srcrem)
|
||||
movgt r0, r9 @
|
||||
|
||||
add r6, r6, r0, lsl #2 @ r6 = &s[pos]
|
||||
|
||||
cmp r0, #3 @ pos >= 3? history not needed
|
||||
ldmgedb r6, { r1-r3 } @ x3..x1 = s[pos-3]..s[pos-1]
|
||||
bge .hrs_loadhist_done @
|
||||
add r10, r0, r0, lsl #1 @ branch pc + pos*12
|
||||
add pc, pc, r10, lsl #2 @
|
||||
nop @
|
||||
|
||||
ldmia r12, { r1-r3 } @ x3..x1 = h[0]..h[2]
|
||||
b .hrs_loadhist_done @
|
||||
nop @
|
||||
|
||||
ldmib r12, { r1-r2 } @ x3..x2 = h[1]..h[2]
|
||||
ldr r3, [r6, #-4] @ x1 = s[0]
|
||||
b .hrs_loadhist_done @
|
||||
|
||||
ldr r1, [r12, #8] @ x3 = h[2]
|
||||
ldmdb r6, { r2-r3 } @ x2..x1 = s[0]..s[1]
|
||||
.hrs_loadhist_done:
|
||||
|
||||
cmp r0, r9 @ pos past end?
|
||||
bge .hrs_channel_done
|
||||
|
||||
cmp r14, #0x10000 @ delta >= 1.0?
|
||||
bhs .hrs_dsstart @ yes? is downsampling
|
||||
|
||||
/** Upsampling **/
|
||||
mov r5, r5, lsl #16 @ Move phase into high halfword
|
||||
add r7, r7, r0, lsl #2 @ r7 = &s[pos]
|
||||
sub r0, r6, r0 @ r0 = dte = srcrem - pos
|
||||
.lrs_usloop_1:
|
||||
ldr r12, [r7], #4 @ r12 = s[pos]
|
||||
sub r14, r12, r11 @ r14 = diff = s[pos] - s[pos - 1]
|
||||
.lrs_usloop_0:
|
||||
mov r1, r5, lsr #16 @ r1 = frac = phase >> 16
|
||||
@ keep frac in Rs to take advantage of multiplier early termination
|
||||
smull r1, r10, r14, r1 @ r1, r10 = diff * frac (lo, hi)
|
||||
add r1, r11, r1, lsr #16 @ r1 = out = last + frac*diff
|
||||
add r1, r1, r10, lsl #16 @
|
||||
str r1, [r8], #4 @ *d++ = out
|
||||
subs r9, r9, #1 @ destination full?
|
||||
bls .lrs_usfull @ yes? channel is done
|
||||
adds r5, r5, r4, lsl #16 @ phase += delta << 16
|
||||
bcc .lrs_usloop_0 @ if carry is set, pos is incremented
|
||||
subs r0, r0, #1 @ if srcrem > 0, do another sample
|
||||
mov r11, r12 @ r11 = last = s[pos-1] (pos changed)
|
||||
bgt .lrs_usloop_1
|
||||
b .lrs_usdone
|
||||
str r9, [sp, #-4]! @ push srcrem
|
||||
mov r5, r5, lsl #16 @ r5 = phase << 16
|
||||
sub r0, r9, r0 @ r0 = dte = srcrem - pos
|
||||
mov r14, r14, lsl #16 @ r14 = delta << 16
|
||||
|
||||
.lrs_usfull:
|
||||
adds r5, r5, r4, lsl #16 @ do missed phase increment
|
||||
subcs r0, r0, #1 @ do missed srcrem decrement
|
||||
movcs r11, r12 @ r11 = s[pos-1] (pos changed)
|
||||
@ Register usage in loop:
|
||||
@ r0 = dte
|
||||
@ r1 = x3, r2 = x2, r3 = x1, r4 = x0
|
||||
@ r5 = phase << 16/frac, r6 = &s[pos], r7 = d, r8 = dstrem
|
||||
@ r9 = scratch/acclo, r10 = scratch/acchi
|
||||
@ r11 = c2, r12 = c3, c1 calculated in frac loop
|
||||
@ r14 = delta << 16
|
||||
@
|
||||
@ Try to avoid overflow as much as possible and at the same time preserve
|
||||
@ accuracy. Same formulas apply to downsampling but registers and
|
||||
@ instruction order differ due to specific constraints.
|
||||
@ c1 = -0.5*x3 + 0.5*x1
|
||||
@ = 0.5*(x1 - x3) <--
|
||||
@
|
||||
@ v = x1 - x2, -v = x2 - x1
|
||||
@ c2 = x3 - 2.5*x2 + 2*x1 - 0.5*x0
|
||||
@ = x3 + 2*(x1 - x2) - 0.5*(x0 + x2)
|
||||
@ = x3 + 2*v - 0.5*(x0 + x2) <--
|
||||
@
|
||||
@ c3 = -0.5*x3 + 1.5*x2 - 1.5*x1 + 0.5*x0
|
||||
@ = 0.5*(x0 - x3 + (x2 - x1)) + (x2 - x1)
|
||||
@ = 0.5*(x0 - x3 - v) - v <--
|
||||
.hrs_usloop_carry:
|
||||
ldr r4, [r6], #4 @ x0 = s[pos]
|
||||
sub r9, r3, r2 @ r9 = v, r11 = c2, r12 = c3
|
||||
add r11, r1, r9, asl #1 @
|
||||
add r10, r4, r2 @
|
||||
sub r12, r4, r1 @
|
||||
sub r12, r12, r9 @
|
||||
sub r11, r11, r10, asr #1 @
|
||||
rsb r12, r9, r12, asr #1 @
|
||||
.hrs_usloop_frac:
|
||||
mov r5, r5, lsr #16 @ r5 = phase -> frac
|
||||
smull r9, r10, r12, r5 @ acc = frac * c3 + c2
|
||||
add r9, r11, r9, lsr #16 @
|
||||
add r9, r9, r10, asl #16 @
|
||||
smull r9, r10, r5, r9 @ acc = frac * acc + c1
|
||||
mov r9, r9, lsr #16 @
|
||||
orr r9, r9, r10, asl #16 @
|
||||
sub r10, r3, r1 @
|
||||
add r9, r9, r10, asr #1 @
|
||||
smull r9, r10, r5, r9 @ acc = frac * acc + x2
|
||||
subs r8, r8, #1 @ destination full?
|
||||
add r9, r2, r9, lsr #16 @
|
||||
add r9, r9, r10, asl #16 @
|
||||
str r9, [r7], #4 @ *d++ = acc
|
||||
bls .hrs_usfull @ yes? channel is done
|
||||
adds r5, r14, r5, lsl #16 @ frac += delta
|
||||
bcc .hrs_usloop_frac @ if carry is set, pos is incremented
|
||||
|
||||
.lrs_usdone:
|
||||
sub r0, r6, r0 @ r0 = pos = srcrem - dte
|
||||
orr r5, r5, r0 @ reconstruct swapped phase
|
||||
mov r5, r5, ror #16 @ swap pos and frac for phase
|
||||
b .lrs_channel_done @
|
||||
subs r0, r0, #1 @ if dte > 0, do another sample
|
||||
mov r1, r2 @ x3 = x2
|
||||
mov r2, r3 @ x2 = x1
|
||||
mov r3, r4 @ x1 = x0
|
||||
bgt .hrs_usloop_carry
|
||||
b .hrs_usdone
|
||||
|
||||
.hrs_usfull:
|
||||
adds r5, r14, r5, lsl #16 @ do missed phase increment
|
||||
bcc .hrs_usdone @
|
||||
sub r0, r0, #1 @ do missed dte decrement
|
||||
mov r1, r2 @ do missed history update
|
||||
mov r2, r3 @
|
||||
mov r3, r4 @
|
||||
|
||||
.hrs_usdone:
|
||||
ldr r9, [sp], #4 @ r9 = pop srcrem
|
||||
mov r14, r14, lsr #16 @ restore delta for next round
|
||||
sub r0, r9, r0 @ r0 = pos = srcrem - dte
|
||||
orr r5, r5, r0 @ reconstruct swapped phase
|
||||
mov r5, r5, ror #16 @ swap pos and frac for phase
|
||||
b .hrs_channel_done
|
||||
|
||||
/** Downsampling **/
|
||||
.lrs_dsloop:
|
||||
add r10, r7, r0, lsl #2 @ r10 = &s[pos]
|
||||
ldmda r10, { r11, r12 } @ r11 = last, r12 = s[pos]
|
||||
.lrs_dsstart:
|
||||
sub r14, r12, r11 @ r14 = diff = s[pos] - s[pos - 1]
|
||||
@ keep frac in Rs to take advantage of multiplier early termination
|
||||
bic r1, r5, r0, lsl #16 @ frac = phase & 0xffff
|
||||
smull r1, r10, r14, r1 @ r1, r10 = diff * frac (lo, hi)
|
||||
add r5, r5, r4 @ phase += delta
|
||||
subs r9, r9, #1 @ destination full? ...
|
||||
mov r0, r5, lsr #16 @ pos = phase >> 16
|
||||
add r1, r11, r1, lsr #16 @ r1 = out = last + frac*diff
|
||||
add r1, r1, r10, lsl #16 @
|
||||
str r1, [r8], #4 @ *d++ = out
|
||||
cmpgt r6, r0 @ ... || pos >= srcrem? ...
|
||||
bgt .lrs_dsloop @ ... no, do more samples
|
||||
@ Register usage in loop:
|
||||
@ r0 = pos/frac
|
||||
@ r1 = x3, r2 = x2, r3 = x1, r4 = x0
|
||||
@ r5 = phase, r6 = &s[pos], r7 = d, r8 = dstrem
|
||||
@ r9 = srcrem, r10 = scratch/acclo
|
||||
@ r11 = c2/scratch, r12 = c3/acchi
|
||||
@ r14 = delta
|
||||
@ Downsampling history reload, entered by shift = pos - last_pos after
@ r6 has already been advanced by shift*4 (so r6 = &s[pos]):
@   _4 (shift >= 4): all of x3..x1 come from the source buffer
@   _3: x3 is the previous x0, x2..x1 come from the source buffer
@   _2: x3/x2 are the previous x1/x0, x1 comes from the source buffer
@   _1: plain one-step shift of the history registers
.hrs_dsloop_4:
|
||||
ldmdb r6, { r1-r3 } @ x3..x1 = s[pos-3]..s[pos-1]
|
||||
b .hrs_dsloop
|
||||
.hrs_dsloop_3:
|
||||
ldmdb r6, { r2-r3 } @ x2..x1 = s[pos-2]..s[pos-1]
|
||||
mov r1, r4 @ x3 = x0
|
||||
b .hrs_dsloop
|
||||
.hrs_dsloop_2:
|
||||
mov r1, r3 @ x3 = x1
|
||||
ldr r3, [r6, #-4] @ x1 = s[pos-1]
|
||||
mov r2, r4 @ x2 = x0
|
||||
b .hrs_dsloop
|
||||
.hrs_dsloop_1: @ expected loop destination
|
||||
mov r1, r2 @ x3 = x2
|
||||
mov r2, r3 @ x2 = x1
|
||||
mov r3, r4 @ x1 = x0
|
||||
.hrs_dsloop:
|
||||
subs r8, r8, #1 @ destination full?
|
||||
cmpgt r9, r0 @ ... || pos >= srcrem?
|
||||
ble .hrs_channel_done
|
||||
.hrs_dsstart:
|
||||
ldr r4, [r6] @ x0 = s[pos]
|
||||
sub r10, r3, r2 @ r10 = v, r11 = c2, r12 = c3
|
||||
add r11, r4, r2 @
|
||||
bic r0, r5, r0, lsl #16 @ r0 = frac = phase & 0xffff
|
||||
sub r11, r1, r11, asr #1 @
|
||||
add r11, r11, r10, asl #1 @
|
||||
sub r12, r4, r1 @
|
||||
sub r12, r12, r10 @
|
||||
rsb r12, r10, r12, asr #1 @
|
||||
smull r10, r12, r0, r12 @ acc = frac * c3 + c2
|
||||
add r10, r11, r10, lsr #16 @
|
||||
add r10, r10, r12, asl #16 @
|
||||
sub r11, r3, r1 @
|
||||
smull r10, r12, r0, r10 @ acc = frac * acc + c1
|
||||
mov r11, r11, asr #1 @
|
||||
add r10, r11, r10, lsr #16 @
|
||||
add r10, r10, r12, asl #16 @
|
||||
smull r10, r12, r0, r10 @ acc = frac * acc + x2
|
||||
mov r11, r5, lsr #16 @ r11 = last_pos
|
||||
add r5, r5, r14 @ phase += delta
|
||||
mov r0, r5, lsr #16 @ r0 = pos = phase >> 16
|
||||
add r10, r2, r10, lsr #16 @
|
||||
add r10, r10, r12, asl #16 @
|
||||
str r10, [r7], #4 @ *d++ = acc
|
||||
|
||||
cmp r0, r6 @ pos = MIN(pos, srcrem)
|
||||
movgt r0, r6 @
|
||||
sub r1, r0, #1 @ pos must always be > 0 since step >= 1.0
|
||||
ldr r11, [r7, r1, lsl #2] @ r11 = s[pos - 1]
|
||||
cmp r0, r9 @ r0 = pos = MIN(pos, srcrem)
|
||||
movgt r0, r9 @
|
||||
sub r11, r0, r11 @ shift = pos - last_pos
|
||||
cmp r11, #4 @
|
||||
add r6, r6, r11, lsl #2 @ r6 += shift * 4
|
||||
bge .hrs_dsloop_4 @
|
||||
ldr pc, [pc, r11, lsl #2] @ branch to corresponding loop address
|
||||
.word 0, 0
|
||||
.word .hrs_dsloop_1
|
||||
.word .hrs_dsloop_2
|
||||
.word .hrs_dsloop_3
|
||||
|
||||
.lrs_channel_done:
|
||||
ldmia sp, { r1, r10 } @ recover src, &data->phase
|
||||
str r11, [r10, r3, lsl #2] @ last_sample[ch] = last
|
||||
subs r3, r3, #1 @
|
||||
bgt .lrs_channel_loop @
|
||||
.hrs_channel_done:
|
||||
ldmfd sp!, { r10, r12 } @ recover ch, h
|
||||
subs r10, r10, #1 @ --ch
|
||||
stmia r12!, { r1-r3 } @ h[0..2] = x3..x1
|
||||
ldmgtia sp, { r0-r2 } @ load data, src, dst
|
||||
bgt .hrs_channel_loop
|
||||
|
||||
ldr r6, [r2, #12] @ r6 = dst->bufcount
|
||||
sub r5, r5, r0, lsl #16 @ r5 = phase - (pos << 16)
|
||||
str r5, [r10] @ data->phase = r5
|
||||
sub r6, r6, r9 @ r6 = dst->bufcount - dstrem = dstcount
|
||||
str r6, [r2] @ dst->remcount = dstcount
|
||||
add sp, sp, #8 @ adjust stack for temp variables
|
||||
ldmpc regs=r4-r11 @ ... and we're out
|
||||
.size resample_linear, .-resample_linear
|
||||
ldmfd sp!, { r1-r3 } @ pop data, src, dst
|
||||
sub r5, r5, r0, lsl #16 @ r5 = phase - (pos << 16)
|
||||
ldr r2, [r3, #12] @ r2 = dst->bufcount
|
||||
str r5, [r1, #4] @ data->phase = r5
|
||||
sub r2, r2, r8 @ r2 = dst->bufcount - dstrem
|
||||
str r2, [r3] @ dst->remcount = r2
|
||||
ldmpc regs=r4-r11 @ ... and we're out
|
||||
.size resample_hermite, .-resample_hermite
|
||||
|
||||
/****************************************************************************
|
||||
* void pga_process(struct dsp_proc_entry *this, struct dsp_buffer **buf_p)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue