Add arm assembler for dsp_apply_gain(). Speeds up this routine by 30-40% on PP502x.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25596 a1c6a512-1295-4272-9138-f99709370657
2025-12-09 05:05:20 -05:00 · 2010-04-11 19:02:43 +00:00 · 2010-04-11 19:02:43 +00:00 · efb702dc9b
commit efb702dc9b
parent 3b1c3881f0
2 changed files with 40 additions and 0 deletions
--- a/apps/dsp_arm.S
+++ b/apps/dsp_arm.S
@ -381,3 +381,42 @@ dsp_upsample:
 .usend:
    .size       dsp_upsample,.usend-dsp_upsample

+/****************************************************************************
+ *  void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
+ *
+ *  Scales the first `count` samples of each of the data->num_channels
+ *  channel buffers in place: s = FRACMUL_SHL(s, data->gain, 8), i.e. the
+ *  signed 64-bit product s * gain shifted right by 23 and truncated to
+ *  32 bits.
+ *
+ *  In:     r0 = count, r1 = data, r2 = buf[]
+ *  Out:    nothing (r0 is left unchanged)
+ *  Clobb:  r1-r3, lr, flags; r4-r8 are saved and restored on the stack
+ *
+ *  NOTE: Samples are processed two at a time. When count is odd the last
+ *        sample is handled on its own, so nothing beyond buf[ch][count - 1]
+ *        is read or written. Nothing is done when count <= 0 or
+ *        data->num_channels <= 0.
+ *        The offsets 4 and 32 into *data are hard-coded here and have to
+ *        match the layout of struct dsp_data on the C side.
+ */
+    .section .icode, "ax", %progbits
+    .align  2
+    .global dsp_apply_gain
+    .type   dsp_apply_gain, %function
+dsp_apply_gain:
+    @ input: r0 = count, r1 = data, r2 = buf[]
+    stmfd   sp!, {r4-r8, lr}
+
+    ldr     r3, [r1,  #4]           @ r3 = data->num_channels
+    ldr     r4, [r1, #32]           @ r4 = data->gain
+
+    cmp     r0, #0                  @ count <= 0 ...
+    cmpgt   r3, #0                  @ ... or num_channels <= 0?
+    ble     .dag_exit               @ then there is nothing to scale
+
+.dag_outerloop:
+    ldr     r1, [r2], #4            @ r1 = *buf++ (samples of this channel)
+    subs    lr, r0, #1              @ lr = count - 1 (samples remaining - 1)
+    beq     .dag_oddsample          @ count == 1: just the single sample
+
+.dag_innerloop:
+    ldmia   r1, {r5, r6}            @ r5, r6 = next two samples
+    smull   r7, r8, r5, r4          @ r5 = FRACMUL_SHL(r5, r4, 8)
+    mov     r8, r8, asl #9
+    orr     r5, r8, r7, lsr #23
+    smull   r7, r8, r6, r4          @ r6 = FRACMUL_SHL(r6, r4, 8)
+    mov     r8, r8, asl #9
+    orr     r6, r8, r7, lsr #23
+    stmia   r1!, {r5, r6}           @ store both back and advance r1 by 8
+    subs    lr, lr, #2              @ lr = samples remaining - 1
+    bgt     .dag_innerloop          @ at least two samples left
+    blt     .dag_nextchannel        @ none left: count was even
+
+.dag_oddsample:
+    ldr     r5, [r1]                @ exactly one sample left
+    smull   r7, r8, r5, r4          @ r5 = FRACMUL_SHL(r5, r4, 8)
+    mov     r8, r8, asl #9
+    orr     r5, r8, r7, lsr #23
+    str     r5, [r1]
+
+.dag_nextchannel:
+    subs    r3, r3, #1              @ one channel done
+    bgt     .dag_outerloop          @ end of outer loop
+
+.dag_exit:
+    ldmfd   sp!, {r4-r8, pc}        @ restore registers and return
+.dagend:
+    .size   dsp_apply_gain,.dagend-dsp_apply_gain