More comments for udiv32_armv4.S. Reduce the zero-divisor test to one cycle for the skipped branch by setting flags when inverting the divisor. 32-bit numerators are now handled by calling the 31-bit divider and fixing up the results.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24151 a1c6a512-1295-4272-9138-f99709370657
2025-11-16 16:42:33 -05:00 · 2010-01-03 04:30:13 +00:00 · 2010-01-03 04:30:13 +00:00 · c1f4d4037a
commit c1f4d4037a
parent 1d46959012
1 changed files with 56 additions and 55 deletions
--- a/apps/codecs/lib/udiv32_armv4.S
+++ b/apps/codecs/lib/udiv32_armv4.S
@ -36,11 +36,14 @@
   iteration by storing quotient and remainder together and adding the previous
   quotient bit during trial subtraction. Modified to work with any dividend
   and divisor both less than 1 << 30, and skipping trials by calculating bits
-   in output.
+   in output. */
-*/
+.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
 .macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient
    mov     \bits, #1
    /* Shift the divisor left until it aligns with the numerator. If it already
       has the high bit set, this is fine, everything inside .rept will be
       skipped, and the add before and adcs after will set the one-bit result
       to zero. */
    cmp     \divisor, \dividend, lsr #16
    movls   \divisor, \divisor, lsl #16
    addls   \bits, \bits, #16
@ -56,7 +59,8 @@
    cmp     \divisor, \dividend, lsr #1
    movls   \divisor, \divisor, lsl #1
    addls   \bits, \bits, #1
-    rsb     \divisor, \divisor, #0
+    rsbs    \divisor, \divisor, #0
    bcs     .L_div0
    adds    \result, \dividend, \divisor
    subcc   \result, \result, \divisor
    rsb     \curbit, \bits, #31
@ -64,44 +68,14 @@
    nop
    .rept   30
    adcs    \result, \divisor, \result, lsl #1
    /* Fix the remainder portion of the result. This must be done because the
       handler for 32-bit numerators needs the remainder. */
    subcc   \result, \result, \divisor
    .endr
-    /* shift remainder/quotient left one, add final quotient bit */
+    /* Shift remainder/quotient left one, add final quotient bit */
    adc     \result, \result, \result
-    mov     \dividend, \result, lsr \bits
+    mov     \remainder, \result, lsr \bits
-    eor     \quotient, \result, \dividend, lsl \bits
+    eor     \quotient, \result, \remainder, lsl \bits
 .endm
 .macro ARM_DIV_32_BODY dividend, divisor, result, curbit
    mov     \result, \dividend
    mov     \curbit, #90          @ 3 * 30, (calculating branch dest)
    cmp     \divisor, \result, lsr #16
    movls   \result,\result, lsr #16
    subls   \curbit, \curbit, #48
    cmp     \divisor, \result, lsr #8
    movls   \result,\result, lsr #8
    subls   \curbit, \curbit, #24
    cmp     \divisor, \result, lsr #4
    movls   \result,\result, lsr #4
    subls   \curbit, \curbit, #12
    cmp     \divisor, \result, lsr #2
    subls   \curbit, \curbit, #6
    @ Calculation is only done down to shift=2, because the shift=1 step
    @ would need 3 more cycles, but would only gain 1.5 cycles on average.
    mov     \result, #0
    add     pc, pc, \curbit, lsl #2
    nop
    .set    shift, 32
    .rept   31
    .set    shift, shift - 1
    cmp     \divisor, \dividend, lsr #shift
    orrls   \result, \result, #(1 << shift)
    subls   \dividend, \dividend, \divisor, lsl #shift
    .endr   @ shift==0 in the .rept would cause a warning  for lsr #0
    cmp     \divisor, \dividend
    orrls   \result, \result, #1
    @subls  \dividend, \dividend, \divisor  @ correct remainder not needed
 .endm
 #ifdef USE_IRAM
@ -114,21 +88,48 @@
    .type   udiv32_arm,%function
 udiv32_arm:
    cmp     r1, #0
    beq     20f
    tst     r0, r0
-    /* High bit must be unset, otherwise use ARM_DIV_32_BODY. High bit of
+    /* High bit must be unset, otherwise shift numerator right, calculate,
-       divisor is also unset dividend has been tested to be >= divisor.
+       and correct results. As this case is very uncommon we want to avoid
       any other delays on the main path in handling it, so the long divide
       calls the short divide as a function. */
    bmi     .L_udiv32
 .L_udiv31:
    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
    bx      lr
 .L_udiv32:
    /* Store the original numerator and divisor; we'll need them to correct
       the result. */
    stmdb   sp, { r0, r1, lr }
    /* Call __div0 here if divisor is zero, otherwise it would report the wrong
       address. */
    mov     r0, r0, lsr #1
    bl      .L_udiv31
    /* This address is never a branch target, but is used to test lr before
       calling __div0. */
 .L_udiv32_div0_trap:
    ldmdb   sp, { r2, r3, lr }
    /* Move the low bit of the original numerator to the carry bit */
    movs    r2, r2, lsr #1
    /* Shift the remainder left one and add in the carry bit */
    adc     r1, r1, r1
    /* Subtract the original divisor from the remainder, setting carry if the
       result is non-negative */
    subs    r1, r1, r3
    /* Shift quotient left one and add carry bit */
    adc     r0, r0, r0
    bx      lr
 .L_div0:
    /* Check the return address, since .L_udiv32 uses bl to wrap the 31-bit
       divider. If the return address is at .L_udiv32_div0_trap, then the
       return address of the original caller is at sp - 4.
    */
-    bmi     10f
+    adr     r2, .L_udiv32_div0_trap
-    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0
+    cmp     r2, lr
-    bx      lr
+    subeq     sp, sp, #4
-
+    bleq    __div0
-10:
+    /* Otherwise, push lr to the stack before calling __div0 */
-    ARM_DIV_32_BODY r0, r1, r2, r3
+    stmdb sp!, { lr }
-    mov     r0, r2
+    bl      __div0
-    bx      lr
+    .size udiv32_arm, . - udiv32_arm
 20:
    movne   r0, #0
    bx      lr