1
0
Fork 0
forked from len0rd/rockbox

patch #919088: 17% faster bitswap(), by Jens "SH" Arnold

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4407 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Jörg Hohensohn 2004-03-18 22:44:05 +00:00
parent b61cf76aba
commit e64256d499

View file

@ -7,7 +7,7 @@
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2002 by Magnus Holmgren
* Copyright (C) 2004 by Jens Arnold
*
* All files in this archive are subject to the GNU General Public License.
* See the file COPYING in the source tree root for full license agreement.
@ -22,67 +22,87 @@
.global _bitswap
.type _bitswap,@function
/* Registers used:
/* Flips the bits of all bytes in a memory area (required for mp3 data on
* the Archos). This version is optimized for speed and size.
*
* r0 Temporary (required by some instructions)
* r1 Low byte
* r2 High byte / final result
* r4 &Data
* r5 Length
* r7 Flip table
*/
/* The instruction order below is a bit strange, because:
* 1) Keeping load/stores on longword boundaries means the instruction fetch
* won't compete with the memory access (because instructions are fetched
* in pairs).
* 2) Using the result of a fetch in the next instruction causes a stall
* (except in certain circumstances).
* See the SH-1 programming manual for details.
* arguments:
* r4 - start address
* r5 - length
*
* return value: void
*
* register usage:
* r0 - temporary / table index / combined result
* r1 - low byte (after swap) / first word as loaded
* r2 - high byte (after swap)
* r4 - data address - 4
* r5 - end address - 4
* r7 - flip table (addressing with signed offset)
*
* The instruction order below is devised in a way to utilize the pipelining
* of the SH1 to the max.
*/
_bitswap:
mov.l .fliptable,r7
add #-2,r4 /* ptr is used shifted by 2 */
add r4,r5 /* r5 = end_address - 2 */
add #-1,r5 /* r5 = &last_byte - 2 */
mova _fliptable,r0
mov r0,r7
add #-4,r4 /* address is shifted by 4 */
add r4,r5 /* r5 = end_address - 4 */
cmp/hi r4,r5 /* at least something to do? */
bf .exit /* no, get out of here! */
add #-3,r5 /* end offset for flipping 4 bytes per pass */
mov r4,r0
tst #1,r0 /* even address? */
bt .init /* yes */
bt .start2_w /* yes, jump into main loop */
add #1,r4 /* r4 now even */
mov.b @(1,r4),r0 /* no, swap first byte */
extu.b r0,r0
mov.b @(r0,r7),r0
mov.b r0,@(1,r4)
/* no, flip first byte */
mov.b @(4,r4),r0 /* load byte, sign extension! */
add #1,r4 /* early increment */
mov.b @(r0,r7),r0 /* fliptable offset is signed */
bra .start2_w /* jump into main loop */
mov.b r0,@(3,r4) /* store byte */
.init:
cmp/hi r4,r5 /* at least 2 bytes to swap? */
bf .last /* no, skip main loop */
/* main loop: flips 2 words (4 bytes) per pass
*
* r4 runs 4 bytes behind the data, so the words read as @(4,r4)/@(6,r4)
* before the early increment are the ones addressed as @r4/@(2,r4) after it.
* Only word accesses are used, so word alignment of the data is sufficient.
*
* Each byte is sign extended (exts.b) and used as a signed index relative
* to r7; the flip table is laid out so that r7 points at the entry for 0x00
* with the entries for 0x80..0xff placed directly in front of it.
* The second word is processed first, and the load of the first word is
* issued in the middle of that work to keep the pipeline busy.
*/
.align 2
.loop2_w:
mov.w @(6,r4),r0 /* load second word (r4 not yet incremented) */
add #4,r4 /* early increment */
swap.b r0,r2 /* get high byte (2nd word) into low 8 bits of r2 */
exts.b r0,r0 /* prepare low byte (2nd word): signed table index */
mov.b @(r0,r7),r1 /* flip low byte (2nd word) via table */
exts.b r2,r0 /* prepare high byte (2nd word): signed table index */
mov.b @(r0,r7),r2 /* flip high byte (2nd word) via table */
extu.b r1,r0 /* zero extend low byte (2nd word), undoing the load's sign extension */
mov.w @r4,r1 /* load first word */
shll8 r2 /* shift high byte (2nd word), low byte zeroed */
or r2,r0 /* r0 = flipped high byte << 8 | flipped low byte (2nd word) */
swap.b r1,r2 /* get high byte (1st word) into low 8 bits of r2 */
mov.w r0,@(2,r4) /* store result (2nd word); only the low 16 bits are written */
exts.b r1,r0 /* prepare low byte (1st word): signed table index */
mov.b @(r0,r7),r1 /* flip low byte (1st word) via table */
exts.b r2,r0 /* prepare high byte (1st word): signed table index */
mov.b @(r0,r7),r2 /* flip high byte (1st word) via table */
extu.b r1,r0 /* zero extend low byte (1st word), undoing the load's sign extension */
shll8 r2 /* shift high byte (1st word), low byte zeroed */
or r2,r0 /* r0 = flipped high byte << 8 | flipped low byte (1st word) */
mov.w r0,@r4 /* store result (1st word); only the low 16 bits are written */
/* loop entry point: also reached directly from the setup code */
.start2_w:
cmp/hi r4,r5 /* unsigned r5 > r4? r5 carries the extra -3 end offset, */
/* so this holds while at least 4 bytes are left to flip */
bt .loop2_w
.loop:
mov.w @(2,r4),r0 /* data to flip */
add #2,r4 /* early increment */
swap.b r0,r2 /* get high byte */
extu.b r0,r0 /* prepare low byte */
mov.b @(r0,r7),r1 /* swap low byte */
extu.b r2,r0 /* prepare high byte */
mov.b @(r0,r7),r2 /* swap high byte */
extu.b r1,r1 /* zero extend low byte */
shll8 r2 /* shift high byte, low byte zeroed */
or r1,r2 /* put low byte in result */
mov.w r2,@r4 /* store result, ptr already incr'd */
cmp/hi r4,r5 /* while &last_byte > data */
bt .loop
bra .start_b2 /* jump into trailing byte loop */
add #3,r5 /* reset end offset */
.last:
cmp/eq r4,r5 /* if behind (&last_byte - 2), exit */
bf .exit
mov.b @(2,r4),r0 /* swap last byte */
extu.b r0,r0
mov.b @(r0,r7),r0
mov.b r0,@(2,r4)
/* trailing byte loop: flips 0..3 bytes
*
* Handles whatever the 4-bytes-per-pass main loop left over, one byte at a
* time. r4 still runs 4 bytes behind the data, so @(4,r4) before the early
* increment and @(3,r4) after it address the same byte.
*/
.loop_b2:
mov.b @(4,r4),r0 /* load byte, sign extension! (r0 = -128..127) */
add #1,r4 /* early increment */
mov.b @(r0,r7),r0 /* fliptable offset is signed: r7 points at the entry for 0x00 */
mov.b r0,@(3,r4) /* store byte back to where it was loaded from */
/* loop entry point: the byte loop is entered here, so zero bytes is fine */
.start_b2:
cmp/hi r4,r5 /* unsigned r5 > r4? runs r4 up to end address (both biased by -4) */
bt .loop_b2
.exit:
rts
@ -90,9 +110,22 @@ _bitswap:
.align 2
.fliptable:
.long _fliptable
.byte 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1
.byte 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1
.byte 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9
.byte 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9
.byte 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5
.byte 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5
.byte 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed
.byte 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd
.byte 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3
.byte 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3
.byte 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb
.byte 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb
.byte 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7
.byte 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7
.byte 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef
.byte 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff
_fliptable:
.byte 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0
.byte 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0
@ -110,22 +143,7 @@ _fliptable:
.byte 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6
.byte 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee
.byte 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe
.byte 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1
.byte 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1
.byte 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9
.byte 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9
.byte 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5
.byte 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5
.byte 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed
.byte 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd
.byte 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3
.byte 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3
.byte 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb
.byte 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb
.byte 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7
.byte 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7
.byte 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef
.byte 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff
.end:
.size _bitswap,.end-_bitswap