forked from len0rd/rockbox
ARMv4 unsigned integer division: Using an overflow-safe comparison method in the main calculation allows to put back the 1.5 cyle (average) optimisation. Shaved off another instruction, as we don't need the remainder. * Use the very efficient ffs algorithm from ffs-arm.S for dividing by a power of 2.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19032 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
35823422c2
commit
545b51e2e4
1 changed files with 31 additions and 24 deletions
|
@ -33,7 +33,7 @@
|
||||||
.macro ARM_DIV_BODY dividend, divisor, result, curbit
|
.macro ARM_DIV_BODY dividend, divisor, result, curbit
|
||||||
|
|
||||||
mov \result, \dividend
|
mov \result, \dividend
|
||||||
mov \curbit, #93 @ 3 * 31, (calculating branch dest)
|
mov \curbit, #90 @ 3 * 30, (calculating branch dest)
|
||||||
cmp \divisor, \result, lsr #16
|
cmp \divisor, \result, lsr #16
|
||||||
movls \result,\result, lsr #16
|
movls \result,\result, lsr #16
|
||||||
subls \curbit, \curbit, #48
|
subls \curbit, \curbit, #48
|
||||||
|
@ -44,40 +44,35 @@
|
||||||
movls \result,\result, lsr #4
|
movls \result,\result, lsr #4
|
||||||
subls \curbit, \curbit, #12
|
subls \curbit, \curbit, #12
|
||||||
cmp \divisor, \result, lsr #2
|
cmp \divisor, \result, lsr #2
|
||||||
movls \result,\result, lsr #2
|
|
||||||
subls \curbit, \curbit, #6
|
subls \curbit, \curbit, #6
|
||||||
cmp \divisor, \result, lsr #1
|
@ Calculation is only done down to shift=2, because the shift=1 step
|
||||||
subls \curbit, \curbit, #3
|
@ would need 3 more cycles, but would only gain 1.5 cycles on average.
|
||||||
mov \result, #0
|
mov \result, #0
|
||||||
add pc, pc, \curbit, lsl #2
|
add pc, pc, \curbit, lsl #2
|
||||||
nop
|
nop
|
||||||
.set shift, 32
|
.set shift, 32
|
||||||
.rept 32
|
.rept 31
|
||||||
.set shift, shift - 1
|
.set shift, shift - 1
|
||||||
cmp \dividend, \divisor, lsl #shift
|
cmp \divisor, \dividend, lsr #shift
|
||||||
adc \result, \result, \result
|
orrls \result, \result, #(1 << shift)
|
||||||
subcs \dividend, \dividend, \divisor, lsl #shift
|
subls \dividend, \dividend, \divisor, lsl #shift
|
||||||
.endr
|
.endr @ shift==0 in the .rept would cause a warning for lsr #0
|
||||||
|
cmp \divisor, \dividend
|
||||||
|
orrls \result, \result, #1
|
||||||
|
@subls \dividend, \dividend, \divisor @ correct remainder not needed
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro ARM_DIV2_ORDER divisor, order
|
.macro ARM_DIV2_ORDER divisor, order
|
||||||
|
|
||||||
cmp \divisor, #(1 << 16)
|
@ There's exactly one bit set in the divisor, so ffs() can be used
|
||||||
movhs \divisor, \divisor, lsr #16
|
@ This is the ffs algorithm devised by D.Seal and posted to
|
||||||
movhs \order, #16
|
@ comp.sys.arm on 16 Feb 1994.
|
||||||
movlo \order, #0
|
adr \order, L_ffs_table
|
||||||
|
orr \divisor, \divisor, \divisor, lsl #4 @ = X * 0x11
|
||||||
|
orr \divisor, \divisor, \divisor, lsl #6 @ = X * 0x451
|
||||||
|
rsb \divisor, \divisor, \divisor, lsl #16 @ = X * 0x0450fbaf
|
||||||
|
|
||||||
cmp \divisor, #(1 << 8)
|
ldrb \order, [\order, \divisor, lsr #26]
|
||||||
movhs \divisor, \divisor, lsr #8
|
|
||||||
addhs \order, \order, #8
|
|
||||||
|
|
||||||
cmp \divisor, #(1 << 4)
|
|
||||||
movhs \divisor, \divisor, lsr #4
|
|
||||||
addhs \order, \order, #4
|
|
||||||
|
|
||||||
cmp \divisor, #(1 << 2)
|
|
||||||
addhi \order, \order, #3
|
|
||||||
addls \order, \order, \divisor, lsr #1
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
@ -113,3 +108,15 @@ udiv32_arm:
|
||||||
ARM_DIV2_ORDER r1, r2
|
ARM_DIV2_ORDER r1, r2
|
||||||
mov r0, r0, lsr r2
|
mov r0, r0, lsr r2
|
||||||
bx lr
|
bx lr
|
||||||
|
|
||||||
|
L_ffs_table:
|
||||||
|
@ 0 1 2 3 4 5 6 7
|
||||||
|
@----------------------------------------------
|
||||||
|
.byte 32, 0, 1, 12, 2, 6, 0, 13 @ 0- 7
|
||||||
|
.byte 3, 0, 7, 0, 0, 0, 0, 14 @ 8-15
|
||||||
|
.byte 10, 4, 0, 0, 8, 0, 0, 25 @ 16-23
|
||||||
|
.byte 0, 0, 0, 0, 0, 21, 27, 15 @ 24-31
|
||||||
|
.byte 31, 11, 5, 0, 0, 0, 0, 0 @ 32-39
|
||||||
|
.byte 9, 0, 0, 24, 0, 0, 20, 26 @ 40-47
|
||||||
|
.byte 30, 0, 0, 0, 0, 23, 0, 19 @ 48-55
|
||||||
|
.byte 29, 0, 22, 18, 28, 17, 16, 0 @ 56-63
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue