1
0
Fork 0
forked from len0rd/rockbox

Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Jens Arnold 2008-11-05 00:10:05 +00:00
parent 7a835ee0c6
commit fe04e40be7
4 changed files with 137 additions and 3 deletions

View file

@ -49,6 +49,14 @@ removing the rc parameter from each function (and the RNGC macro)).
*/
#ifdef ROCKBOX
#include "../lib/codeclib.h"
/* for UDIV32() */
#endif
#ifndef UDIV32
/* Fallback used when no UDIV32() has been defined yet: plain C division.
 * Both arguments are parenthesised so that expression arguments, e.g.
 * UDIV32(x, y + 1), divide by the whole expression. */
#define UDIV32(a, b) ((a) / (b))
#endif
/* BITSTREAM READING FUNCTIONS */
@ -121,15 +129,15 @@ static inline void range_dec_normalize(void)
/* Calls range_dec_normalize(), stores rc.range / tot_f in rc.help and
 * returns rc.low / rc.help. Both divisions go through UDIV32() so that a
 * platform-specific unsigned division routine is used when one is defined.
 * NOTE(review): tot_f is an int but is divided as an unsigned value when
 * UDIV32() maps to a function taking unsigned arguments; it is presumably
 * always positive - confirm against the callers. */
static inline int range_decode_culfreq(int tot_f)
{
    range_dec_normalize();
    rc.help = UDIV32(rc.range, tot_f);
    return UDIV32(rc.low, rc.help);
}
/* Calls range_dec_normalize(), stores rc.range >> shift in rc.help and
 * returns rc.low / rc.help. The division goes through UDIV32() so that a
 * platform-specific unsigned division routine is used when one is defined. */
static inline int range_decode_culshift(int shift)
{
    range_dec_normalize();
    rc.help = rc.range >> shift;
    return UDIV32(rc.low, rc.help);
}

View file

@ -5,6 +5,9 @@ codeclib.c
mdct2.c
#ifdef CPU_ARM
mdct_arm.S
#if ARM_ARCH == 4
udiv32_armv4.S
#endif
#endif
#elif defined(SIMULATOR) && defined(__APPLE__)

View file

@ -57,6 +57,15 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con
extern void mdct_backward(int n, int32_t *in, int32_t *out);
#if defined(CPU_ARM) && (ARM_ARCH == 4)
/* optimised unsigned integer division for ARMv4, in IRAM */
unsigned udiv32_arm(unsigned a, unsigned b);
#define UDIV32(a, b) udiv32_arm(a, b)
#else
/* default: plain C division; arguments are parenthesised so that
 * expression arguments such as UDIV32(x, y + 1) expand correctly */
#define UDIV32(a, b) ((a) / (b))
#endif
/* Various codec helper functions */
int codec_init(void);

View file

@ -0,0 +1,114 @@
/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2008 by Jens Arnold
*
* Optimised unsigned integer division for ARMv4
*
* Based on: libgcc routines for ARM cpu.
* Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
* Free Software Foundation, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
/* Codecs should not normally do this, but we need to check a macro, and
* codecs.h would confuse the assembler. */
/* ARM_DIV_BODY dividend, divisor, result, curbit
 *
 * Unrolled shift-and-subtract unsigned 32 bit division.
 *
 *   In:       \dividend, \divisor
 *   Out:      \result   = quotient
 *             \dividend = remainder
 *             \divisor  is left unchanged
 *   Clobbers: \curbit, condition flags
 *
 * The first part estimates, in steps of two bit positions, how large the
 * quotient can be and turns that into a number of instructions to skip in
 * the unrolled sequence below (3 instructions per quotient bit).
 *
 * Because of the two-bit granularity the first executed step may use a
 * shift that is one larger than the divisor can take without losing its
 * top bit. Each step therefore compares the divisor against the dividend
 * shifted RIGHT, which cannot overflow, instead of comparing the dividend
 * against the divisor shifted left. The subtraction only runs when the
 * comparison succeeded, in which case (divisor << shift) <= dividend and
 * the left shift is exact.
 */
.macro ARM_DIV_BODY dividend, divisor, result, curbit
    mov     \result, \dividend
    mov     \curbit, #90                @ 3 * 30, (calculating branch dest)
    cmp     \divisor, \result, lsr #16
    movls   \result, \result, lsr #16
    subls   \curbit, \curbit, #48       @ 3 * 16
    cmp     \divisor, \result, lsr #8
    movls   \result, \result, lsr #8
    subls   \curbit, \curbit, #24       @ 3 * 8
    cmp     \divisor, \result, lsr #4
    movls   \result, \result, lsr #4
    subls   \curbit, \curbit, #12       @ 3 * 4
    cmp     \divisor, \result, lsr #2
    subls   \curbit, \curbit, #6        @ 3 * 2
    @ calculation is only done down to shift=2, because the shift=1 step
    @ would need 3 more cycles, but would only gain 1.5 cycles on average
    mov     \result, #0
    add     pc, pc, \curbit, lsl #2     @ pc reads as this instruction + 8,
    nop                                 @ so curbit == 0 lands after the nop
    .set    shift, 32
    .rept   31                          @ steps for shift = 31 .. 1
    .set    shift, shift - 1
    cmp     \divisor, \dividend, lsr #shift
    orrls   \result, \result, #(1 << shift)
    subls   \dividend, \dividend, \divisor, lsl #shift
    .endr
    cmp     \divisor, \dividend         @ final step, shift = 0
    orrls   \result, \result, #1
    subls   \dividend, \dividend, \divisor
.endm
/* ARM_DIV2_ORDER divisor, order
 *
 * Binary search for the position of the set bit in \divisor, which is
 * expected to be a power of two. The result is only a correct base-2
 * logarithm for such values.
 *
 *   In:       \divisor
 *   Out:      \order = bit index of the set bit
 *   Clobbers: \divisor (shifted down during the search), condition flags
 */
.macro ARM_DIV2_ORDER divisor, order
    mov     \order, #0
    cmp     \divisor, #0x10000          @ bit in the upper halfword?
    movhs   \order, #16
    movhs   \divisor, \divisor, lsr #16
    cmp     \divisor, #0x100            @ bit in the upper byte of the rest?
    addhs   \order, \order, #8
    movhs   \divisor, \divisor, lsr #8
    cmp     \divisor, #0x10             @ bit in the upper nibble of the rest?
    addhs   \order, \order, #4
    movhs   \divisor, \divisor, lsr #4
    @ the remaining value is 1, 2, 4 or 8 for a power of two:
    @ 8 adds 3, while 1, 2 and 4 add (value >> 1) = 0, 1 and 2
    cmp     \divisor, #4
    addhi   \order, \order, #3
    addls   \order, \order, \divisor, lsr #1
.endm
#ifdef USE_IRAM
    .section    .icode,"ax",%progbits
#else
    .text
#endif
    .align
    .global     udiv32_arm
    .type       udiv32_arm,%function

/* unsigned udiv32_arm(unsigned a, unsigned b)
 *
 *   In:       r0 = a (dividend), r1 = b (divisor)
 *   Out:      r0 = a / b, or 0 when b == 0
 *   Clobbers: r1, r2, r3 may be modified; condition flags
 *
 * Trivial cases are dispatched first, the general case runs ARM_DIV_BODY.
 */
udiv32_arm:
    subs    r2, r1, #1                  @ r2 = b - 1
    bxeq    lr                          @ b == 1: a already is the quotient
    bcc     .Ludiv32_zero               @ borrow: b == 0 (flags are NE here)
    cmp     r0, r1
    bls     .Ludiv32_small              @ a <= b: quotient is 1 or 0
    tst     r1, r2                      @ b & (b - 1) == 0: power of two
    beq     .Ludiv32_pow2

    ARM_DIV_BODY r0, r1, r2, r3         @ general case, quotient in r2
    mov     r0, r2
    bx      lr

.Ludiv32_small:
    moveq   r0, #1                      @ a == b
.Ludiv32_zero:
    movne   r0, #0                      @ a < b, or b == 0
    bx      lr

.Ludiv32_pow2:
    ARM_DIV2_ORDER r1, r2               @ r2 = log2(b)
    mov     r0, r0, lsr r2              @ a >> log2(b)
    bx      lr