1
0
Fork 0
forked from len0rd/rockbox

patch #919088: 17% faster bitswap(), by Jens "SH" Arnold

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4407 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Jörg Hohensohn 2004-03-18 22:44:05 +00:00
parent b61cf76aba
commit e64256d499

View file

@ -7,7 +7,7 @@
* \/ \/ \/ \/ \/ * \/ \/ \/ \/ \/
* $Id$ * $Id$
* *
* Copyright (C) 2002 by Magnus Holmgren * Copyright (C) 2004 by Jens Arnold
* *
* All files in this archive are subject to the GNU General Public License. * All files in this archive are subject to the GNU General Public License.
* See the file COPYING in the source tree root for full license agreement. * See the file COPYING in the source tree root for full license agreement.
@ -22,67 +22,87 @@
.global _bitswap .global _bitswap
.type _bitswap,@function .type _bitswap,@function
/* Registers used: /* Flips the bits of all bytes in a memory area (required for mp3 data on
* the Archos). This version is optimized for speed and size.
* *
* r0 Temporary (required by some instructions) * arguments:
* r1 Low byte * r4 - start address
* r2 High byte / final result * r5 - length
* r4 &Data *
* r5 Length * return value: void
* r7 Flip table *
*/ * register usage:
* r0 - temporary
/* The instruction order below is a bit strange, because: * r1 - bit mask for rounding to long bound / low byte (after swap)
* 1) Keeping load/stores on longword boundaries means the instruction fetch * r2 - high byte (after swap) / combined result
* won't compete with the memory access (because instructions are fetched * r4 - data address - 4
* in pairs). * r5 - end address - 4
* 2) Using the result of a fetch in the next instruction causes a stall * r7 - flip table (addressing with signed offset)
* (except in certain circumstances). *
* See the SH-1 programming manual for details. * The instruction order below is devised in a way to utilize the pipelining
* of the SH1 to the max.
*/ */
_bitswap: _bitswap:
mov.l .fliptable,r7 mova _fliptable,r0
add #-2,r4 /* ptr is used shifted by 2 */ mov r0,r7
add r4,r5 /* r5 = end_address - 2 */ add #-4,r4 /* address is shifted by 4 */
add #-1,r5 /* r5 = &last_byte - 2 */ add r4,r5 /* r5 = end_address - 4 */
cmp/hi r4,r5 /* at least something to do? */
bf .exit /* no, get out of here! */
add #-3,r5 /* end offset for flipping 4 bytes per pass */
mov r4,r0 mov r4,r0
tst #1,r0 /* even address? */ tst #1,r0 /* even address? */
bt .init /* yes */ bt .start2_w /* yes, jump into main loop */
add #1,r4 /* r4 now even */ /* no, flip first byte */
mov.b @(1,r4),r0 /* no, swap first byte */ mov.b @(4,r4),r0 /* load byte, sign extension! */
extu.b r0,r0 add #1,r4 /* early increment */
mov.b @(r0,r7),r0 mov.b @(r0,r7),r0 /* fliptable offset is signed */
mov.b r0,@(1,r4) bra .start2_w /* jump into main loop */
mov.b r0,@(3,r4) /* store byte */
.init: /* main loop: flips 2 words per pass */
cmp/hi r4,r5 /* at least 2 bytes to swap? */ .align 2
bf .last /* no, skip main loop */ .loop2_w:
mov.w @(6,r4),r0 /* load second word */
add #4,r4 /* early increment */
swap.b r0,r2 /* get high byte (2nd word) */
exts.b r0,r0 /* prepare low byte (2nd word) */
mov.b @(r0,r7),r1 /* swap low byte (2nd word) */
exts.b r2,r0 /* prepare high byte (2nd word) */
mov.b @(r0,r7),r2 /* swap high byte (2nd word) */
extu.b r1,r0 /* zero extend low byte (2nd word) */
mov.w @r4,r1 /* load first word */
shll8 r2 /* shift high byte (2nd word), low byte zeroed */
or r2,r0 /* put low byte (2nd word) in result */
swap.b r1,r2 /* get high byte (1st word) */
mov.w r0,@(2,r4) /* store result (2nd word) */
exts.b r1,r0 /* prepare low byte (1st word) */
mov.b @(r0,r7),r1 /* swap low byte (1st word) */
exts.b r2,r0 /* prepare high byte (1st word) */
mov.b @(r0,r7),r2 /* swap high byte (1st word) */
extu.b r1,r0 /* zero extend low byte (1st word) */
shll8 r2 /* shift high byte (1st word), low byte zeroed */
or r2,r0 /* put low byte (1st word) in result */
mov.w r0,@r4 /* store result (1st word) */
.start2_w:
cmp/hi r4,r5 /* runs r4 up to last long bound */
bt .loop2_w
.loop: bra .start_b2 /* jump into trailing byte loop */
mov.w @(2,r4),r0 /* data to flip */ add #3,r5 /* reset end offset */
add #2,r4 /* early increment */
swap.b r0,r2 /* get high byte */
extu.b r0,r0 /* prepare low byte */
mov.b @(r0,r7),r1 /* swap low byte */
extu.b r2,r0 /* prepare high byte */
mov.b @(r0,r7),r2 /* swap high byte */
extu.b r1,r1 /* zero extend low byte */
shll8 r2 /* shift high byte, low byte zeroed */
or r1,r2 /* put low byte in result */
mov.w r2,@r4 /* store result, ptr already incr'd */
cmp/hi r4,r5 /* while &last_byte > data */
bt .loop
.last: /* trailing byte loop: flips 0..3 bytes */
cmp/eq r4,r5 /* if behind (&last_byte - 2), exit */ .loop_b2:
bf .exit mov.b @(4,r4),r0 /* load byte, sign extension! */
add #1,r4 /* early increment */
mov.b @(2,r4),r0 /* swap last byte */ mov.b @(r0,r7),r0 /* fliptable offset is signed */
extu.b r0,r0 mov.b r0,@(3,r4) /* store byte */
mov.b @(r0,r7),r0 .start_b2:
mov.b r0,@(2,r4) cmp/hi r4,r5 /* runs r4 up to end address */
bt .loop_b2
.exit: .exit:
rts rts
@ -90,9 +110,22 @@ _bitswap:
.align 2 .align 2
.fliptable: .byte 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1
.long _fliptable .byte 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1
.byte 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9
.byte 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9
.byte 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5
.byte 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5
.byte 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed
.byte 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd
.byte 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3
.byte 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3
.byte 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb
.byte 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb
.byte 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7
.byte 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7
.byte 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef
.byte 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff
_fliptable: _fliptable:
.byte 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0 .byte 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0
.byte 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0 .byte 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0
@ -110,22 +143,7 @@ _fliptable:
.byte 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6 .byte 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6
.byte 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee .byte 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee
.byte 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe .byte 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe
.byte 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1
.byte 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1
.byte 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9
.byte 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9
.byte 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5
.byte 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5
.byte 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed
.byte 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd
.byte 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3
.byte 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3
.byte 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb
.byte 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb
.byte 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7
.byte 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7
.byte 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef
.byte 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff
.end: .end:
.size _bitswap,.end-_bitswap .size _bitswap,.end-_bitswap