forked from len0rd/rockbox
Add a further optimised (vs. libgcc) unsigned 32-bit division routine for ARMv4, based on the ARMv5(+) version from libgcc. It is placed in IRAM on PP for better performance on PP5002, and lives in the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime; they are on par now). Gigabeat F/X should also see an APE speedup.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
7a835ee0c6
commit
fe04e40be7
4 changed files with 137 additions and 3 deletions
|
@ -49,6 +49,14 @@ removing the rc parameter from each function (and the RNGC macro)).
|
|||
|
||||
*/
|
||||
|
||||
#ifdef ROCKBOX
|
||||
#include "../lib/codeclib.h"
|
||||
/* for UDIV32() */
|
||||
#endif
|
||||
|
||||
#ifndef UDIV32
|
||||
/* Plain C fallback. Both arguments are parenthesised so that expression
 * arguments such as UDIV32(x + 1, y + 1) are divided as a whole. */
#define UDIV32(a, b) ((a) / (b))
|
||||
#endif
|
||||
|
||||
/* BITSTREAM READING FUNCTIONS */
|
||||
|
||||
|
@ -121,15 +129,15 @@ static inline void range_dec_normalize(void)
|
|||
static inline int range_decode_culfreq(int tot_f)
|
||||
{
|
||||
range_dec_normalize();
|
||||
rc.help = rc.range / tot_f;
|
||||
return rc.low / rc.help;
|
||||
rc.help = UDIV32(rc.range, tot_f);
|
||||
return UDIV32(rc.low, rc.help);
|
||||
}
|
||||
|
||||
static inline int range_decode_culshift(int shift)
|
||||
{
|
||||
range_dec_normalize();
|
||||
rc.help = rc.range >> shift;
|
||||
return rc.low / rc.help;
|
||||
return UDIV32(rc.low, rc.help);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -5,6 +5,9 @@ codeclib.c
|
|||
mdct2.c
|
||||
#ifdef CPU_ARM
|
||||
mdct_arm.S
|
||||
#if ARM_ARCH == 4
|
||||
udiv32_armv4.S
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#elif defined(SIMULATOR) && defined(__APPLE__)
|
||||
|
|
|
@ -57,6 +57,15 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con
|
|||
|
||||
extern void mdct_backward(int n, int32_t *in, int32_t *out);
|
||||
|
||||
#if defined(CPU_ARM) && (ARM_ARCH == 4)
|
||||
/* optimised unsigned integer division for ARMv4, in IRAM */
|
||||
unsigned udiv32_arm(unsigned a, unsigned b);
|
||||
#define UDIV32(a, b) udiv32_arm(a, b)
|
||||
#else
|
||||
/* default */
|
||||
/* Arguments are parenthesised so that expression arguments divide as a whole. */
#define UDIV32(a, b) ((a) / (b))
|
||||
#endif
|
||||
|
||||
/* Various codec helper functions */
|
||||
|
||||
int codec_init(void);
|
||||
|
|
114
apps/codecs/lib/udiv32_armv4.S
Normal file
114
apps/codecs/lib/udiv32_armv4.S
Normal file
|
@ -0,0 +1,114 @@
|
|||
/***************************************************************************
|
||||
* __________ __ ___.
|
||||
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
|
||||
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
|
||||
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
|
||||
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
|
||||
* \/ \/ \/ \/ \/
|
||||
* $Id$
|
||||
*
|
||||
* Copyright (C) 2008 by Jens Arnold
|
||||
*
|
||||
* Optimised unsigned integer division for ARMv4
|
||||
*
|
||||
* Based on: libgcc routines for ARM cpu.
|
||||
* Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
|
||||
* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
|
||||
* Free Software Foundation, Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
*
|
||||
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
|
||||
* KIND, either express or implied.
|
||||
*
|
||||
****************************************************************************/
|
||||
|
||||
#include "config.h"
|
||||
/* Codecs should not normally do this, but we need to check a macro, and
|
||||
* codecs.h would confuse the assembler. */
|
||||
|
||||
/*
 * ARM_DIV_BODY dividend, divisor, result, curbit
 *
 * Unrolled shift-and-subtract unsigned 32 bit division.
 *
 *   in:       dividend, divisor (divisor must be non-zero)
 *   out:      result   = quotient
 *             dividend = remainder (the divisor is subtracted in place)
 *   clobbers: curbit, condition flags
 *
 * A binary search on a scratch copy of the dividend finds the shift s of
 * the first quotient bit that can be set. curbit ends up holding the
 * number of instructions to skip (3 per step), and the computed jump
 * enters the unrolled cmp/adc/subcs sequence so that only the steps for
 * shift = s .. 0 are executed.
 */
.macro ARM_DIV_BODY dividend, divisor, result, curbit

    mov     \result, \dividend
    mov     \curbit, #93                @ 3 * 31, (calculating branch dest)
    cmp     \divisor, \result, lsr #16
    movls   \result, \result, lsr #16
    subls   \curbit, \curbit, #48
    cmp     \divisor, \result, lsr #8
    movls   \result, \result, lsr #8
    subls   \curbit, \curbit, #24
    cmp     \divisor, \result, lsr #4
    movls   \result, \result, lsr #4
    subls   \curbit, \curbit, #12
    cmp     \divisor, \result, lsr #2
    movls   \result, \result, lsr #2
    subls   \curbit, \curbit, #6
    cmp     \divisor, \result, lsr #1
    subls   \curbit, \curbit, #3
    @ The search must go all the way down to shift=1. Stopping at shift=2
    @ can enter the sequence one step too early; for dividends >= 2^31 the
    @ shifted divisor then overflows 32 bits and the quotient is wrong
    @ (0xffffffff / 3 came out as 0xaaaaaaaa). With the exact start shift,
    @ divisor << shift never exceeds the dividend at the entry step.
    mov     \result, #0
    add     pc, pc, \curbit, lsl #2     @ pc reads as this insn + 8 = first step
    nop
    .set    shift, 32
    .rept   32
    .set    shift, shift - 1
    cmp     \dividend, \divisor, lsl #shift     @ C = (dividend >= divisor << shift)
    adc     \result, \result, \result           @ shift C in as the next quotient bit
    subcs   \dividend, \dividend, \divisor, lsl #shift
    .endr
.endm
|
||||
|
||||
/*
 * ARM_DIV2_ORDER divisor, order
 *
 * Binary search for the bit position of a power-of-two divisor.
 *
 *   in:       divisor (expected to be a power of two)
 *   out:      order = log2(divisor)
 *   clobbers: divisor (shifted down during the search), condition flags
 */
.macro ARM_DIV2_ORDER divisor, order

    mov     \order, #0                  @ start from bit position 0

    cmp     \divisor, #0x10000          @ bit 16 or above?
    movhs   \divisor, \divisor, lsr #16
    movhs   \order, #16

    cmp     \divisor, #0x100            @ bit 8 or above of what is left?
    movhs   \divisor, \divisor, lsr #8
    addhs   \order, \order, #8

    cmp     \divisor, #0x10             @ bit 4 or above of what is left?
    movhs   \divisor, \divisor, lsr #4
    addhs   \order, \order, #4

    @ divisor is now below 16: anything above 4 adds 3, while 1, 2 and 4
    @ add 0, 1 and 2, which is just the value shifted right by one
    cmp     \divisor, #4
    addhi   \order, \order, #3
    addls   \order, \order, \divisor, lsr #1
.endm
|
||||
|
||||
|
||||
#ifdef USE_IRAM
    .section    .icode,"ax",%progbits
#else
    .text
#endif
    .align
    .global     udiv32_arm
    .type       udiv32_arm,%function

/*
 * unsigned udiv32_arm(unsigned a, unsigned b)
 *
 *   in:       r0 = a (dividend), r1 = b (divisor)
 *   out:      r0 = a / b; 0 is returned when b == 0
 *   clobbers: r1, r2, r3, condition flags
 *
 * Trivial cases (b == 0, b == 1, a <= b, b a power of two) are handled
 * up front; everything else goes through ARM_DIV_BODY.
 */
udiv32_arm:
    subs    r2, r1, #1          @ r2 = b - 1
    bxeq    lr                  @ b == 1: the quotient is a, already in r0
    bcc     20f                 @ b == 0: borrow; Z is clear, so 20 returns 0
    cmp     r0, r1
    bls     10f                 @ a <= b: quotient is 1 (eq) or 0 (lo)
    tst     r1, r2              @ b & (b - 1)
    beq     30f                 @ zero: b is a power of two, just shift

    @ general case: quotient ends up in r2
    ARM_DIV_BODY r0, r1, r2, r3
    mov     r0, r2
    bx      lr

10:
    moveq   r0, #1              @ a == b
20:
    movne   r0, #0              @ a < b, or b == 0
    bx      lr

30:
    @ r2 = log2(b), r1 is destroyed
    ARM_DIV2_ORDER r1, r2
    mov     r0, r0, lsr r2
    bx      lr
    .size   udiv32_arm, .-udiv32_arm
|
Loading…
Add table
Add a link
Reference in a new issue