forked from len0rd/rockbox
Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
7a835ee0c6
commit
fe04e40be7
4 changed files with 137 additions and 3 deletions
|
@ -49,6 +49,14 @@ removing the rc parameter from each function (and the RNGC macro)).
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#ifdef ROCKBOX
|
||||||
|
#include "../lib/codeclib.h"
|
||||||
|
/* for UDIV32() */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef UDIV32
|
||||||
|
/* Fallback when no optimised routine is provided: plain C division.
 * Both arguments are parenthesised so that expression arguments keep
 * their own grouping after expansion. */
#define UDIV32(a, b) ((a) / (b))
|
||||||
|
#endif
|
||||||
|
|
||||||
/* BITSTREAM READING FUNCTIONS */
|
/* BITSTREAM READING FUNCTIONS */
|
||||||
|
|
||||||
|
@ -121,15 +129,15 @@ static inline void range_dec_normalize(void)
|
||||||
static inline int range_decode_culfreq(int tot_f)
|
static inline int range_decode_culfreq(int tot_f)
|
||||||
{
|
{
|
||||||
range_dec_normalize();
|
range_dec_normalize();
|
||||||
rc.help = rc.range / tot_f;
|
rc.help = UDIV32(rc.range, tot_f);
|
||||||
return rc.low / rc.help;
|
return UDIV32(rc.low, rc.help);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int range_decode_culshift(int shift)
|
static inline int range_decode_culshift(int shift)
|
||||||
{
|
{
|
||||||
range_dec_normalize();
|
range_dec_normalize();
|
||||||
rc.help = rc.range >> shift;
|
rc.help = rc.range >> shift;
|
||||||
return rc.low / rc.help;
|
return UDIV32(rc.low, rc.help);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,9 @@ codeclib.c
|
||||||
mdct2.c
|
mdct2.c
|
||||||
#ifdef CPU_ARM
|
#ifdef CPU_ARM
|
||||||
mdct_arm.S
|
mdct_arm.S
|
||||||
|
#if ARM_ARCH == 4
|
||||||
|
udiv32_armv4.S
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#elif defined(SIMULATOR) && defined(__APPLE__)
|
#elif defined(SIMULATOR) && defined(__APPLE__)
|
||||||
|
|
|
@ -57,6 +57,15 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con
|
||||||
|
|
||||||
extern void mdct_backward(int n, int32_t *in, int32_t *out);
|
extern void mdct_backward(int n, int32_t *in, int32_t *out);
|
||||||
|
|
||||||
|
#if defined(CPU_ARM) && (ARM_ARCH == 4)
|
||||||
|
/* optimised unsigned integer division for ARMv4, in IRAM */
|
||||||
|
unsigned udiv32_arm(unsigned a, unsigned b);
|
||||||
|
#define UDIV32(a, b) udiv32_arm(a, b)
|
||||||
|
#else
|
||||||
|
/* default */
|
||||||
|
/* Plain C division; arguments are parenthesised so that expression
 * arguments keep their own grouping after expansion. */
#define UDIV32(a, b) ((a) / (b))
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Various codec helper functions */
|
/* Various codec helper functions */
|
||||||
|
|
||||||
int codec_init(void);
|
int codec_init(void);
|
||||||
|
|
114
apps/codecs/lib/udiv32_armv4.S
Normal file
114
apps/codecs/lib/udiv32_armv4.S
Normal file
|
@ -0,0 +1,114 @@
|
||||||
|
/***************************************************************************
|
||||||
|
* __________ __ ___.
|
||||||
|
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
|
||||||
|
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
|
||||||
|
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
|
||||||
|
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
|
||||||
|
* \/ \/ \/ \/ \/
|
||||||
|
* $Id$
|
||||||
|
*
|
||||||
|
* Copyright (C) 2008 by Jens Arnold
|
||||||
|
*
|
||||||
|
* Optimised unsigned integer division for ARMv4
|
||||||
|
*
|
||||||
|
* Based on: libgcc routines for ARM cpu.
|
||||||
|
* Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
|
||||||
|
* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
|
||||||
|
* Free Software Foundation, Inc.
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU General Public License
|
||||||
|
* as published by the Free Software Foundation; either version 2
|
||||||
|
* of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
|
||||||
|
* KIND, either express or implied.
|
||||||
|
*
|
||||||
|
****************************************************************************/
|
||||||
|
|
||||||
|
#include "config.h"
|
||||||
|
/* Codecs should not normally do this, but we need to check a macro, and
|
||||||
|
* codecs.h would confuse the assembler. */
|
||||||
|
|
||||||
|
@ ARM_DIV_BODY dividend, divisor, result, curbit
@
@ Fully unrolled shift-and-subtract unsigned 32 bit division (ARM mode).
@   in:   \dividend, \divisor
@   out:  \result   = quotient
@         \dividend = remainder
@   tmp:  \curbit, condition flags
@ The only expansion in this file is reached with dividend > divisor > 1.
@
@ Every unrolled step must stay exactly 3 instructions long, because the
@ entry point into the unrolled sequence is computed from \curbit.
.macro ARM_DIV_BODY dividend, divisor, result, curbit

    @ Estimate the highest shift worth testing. \result is used as scratch
    @ for the progressively shifted dividend, \curbit counts the instruction
    @ words to skip (3 words per skipped step).
    mov     \result, \dividend
    mov     \curbit, #90                @ 3 * 30, (calculating branch dest)
    cmp     \divisor, \result, lsr #16
    movls   \result, \result, lsr #16
    subls   \curbit, \curbit, #48       @ 16 more steps needed
    cmp     \divisor, \result, lsr #8
    movls   \result, \result, lsr #8
    subls   \curbit, \curbit, #24       @ 8 more steps needed
    cmp     \divisor, \result, lsr #4
    movls   \result, \result, lsr #4
    subls   \curbit, \curbit, #12       @ 4 more steps needed
    cmp     \divisor, \result, lsr #2
    subls   \curbit, \curbit, #6        @ 2 more steps needed
    @ calculation is only done down to shift=2, because the shift=1 step
    @ would need 3 more cycles, but would only gain 1.5 cycles on average
    mov     \result, #0
    @ pc reads as the address of this add plus 8, which is the first
    @ unrolled step below (the nop fills the slot in between)
    add     pc, pc, \curbit, lsl #2
    nop
    @ The estimate above may start one step too high. Comparing the divisor
    @ against the right-shifted dividend (instead of the dividend against
    @ the left-shifted divisor) keeps such a step harmless: nothing can be
    @ truncated to 32 bits, so the step just leaves its quotient bit clear.
    @ When the comparison succeeds, divisor << shift <= dividend holds, so
    @ the left shift in the subtraction cannot overflow either.
    .set    shift, 32
    .rept   31
    .set    shift, shift - 1
    cmp     \divisor, \dividend, lsr #shift
    orrls   \result, \result, #(1 << shift)
    subls   \dividend, \dividend, \divisor, lsl #shift
    .endr
    @ shift == 0 is written out separately to avoid an lsr by zero
    cmp     \divisor, \dividend
    orrls   \result, \result, #1
    subls   \dividend, \dividend, \divisor
.endm
|
||||||
|
|
||||||
|
@ ARM_DIV2_ORDER divisor, order
@
@ For a power-of-two \divisor, leaves log2(divisor) in \order.
@ \divisor is reduced along the way and does not survive; the condition
@ flags are overwritten.
.macro ARM_DIV2_ORDER divisor, order

    mov     \order, #0

    @ fold the upper halfword down
    cmp     \divisor, #(1 << 16)
    movhs   \divisor, \divisor, lsr #16
    movhs   \order, #16

    @ fold the upper byte down
    cmp     \divisor, #(1 << 8)
    movhs   \divisor, \divisor, lsr #8
    addhs   \order, \order, #8

    @ fold the upper nibble down
    cmp     \divisor, #(1 << 4)
    movhs   \divisor, \divisor, lsr #4
    addhs   \order, \order, #4

    @ remaining value is below 16: above 4 contributes 3, otherwise
    @ (divisor >> 1) is the contribution (1 -> 0, 2 -> 1, 4 -> 2)
    cmp     \divisor, #(1 << 2)
    addhi   \order, \order, #3
    addls   \order, \order, \divisor, lsr #1
.endm
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef USE_IRAM
|
||||||
|
.section .icode,"ax",%progbits
|
||||||
|
#else
|
||||||
|
.text
|
||||||
|
#endif
|
||||||
|
.align
|
||||||
|
.global udiv32_arm
|
||||||
|
.type udiv32_arm,%function
|
||||||
|
|
||||||
|
@ unsigned udiv32_arm(unsigned a, unsigned b)
@
@   in:       r0 = a (dividend), r1 = b (divisor)
@   out:      r0 = quotient; b == 0 gives 0
@   scratch:  r1, r2, r3, condition flags
@
@ Trivial cases are peeled off first; only a > b > 1 with b not a power
@ of two reaches the generic division body.
udiv32_arm:
    subs    r2, r1, #1              @ r2 = b - 1
    bxeq    lr                      @ b == 1: a already is the quotient
    bcc     .Ludiv32_zero           @ borrow: b == 0
    cmp     r0, r1
    bls     .Ludiv32_small          @ a <= b: quotient is 1 or 0
    tst     r1, r2
    beq     .Ludiv32_pow2           @ (b & (b - 1)) == 0: shift instead

    ARM_DIV_BODY r0, r1, r2, r3
    mov     r0, r2
    bx      lr

.Ludiv32_small:
    moveq   r0, #1                  @ a == b
.Ludiv32_zero:
    movne   r0, #0                  @ a < b, or b == 0
    bx      lr

.Ludiv32_pow2:
    ARM_DIV2_ORDER r1, r2           @ r2 = log2(b)
    mov     r0, r0, lsr r2
    bx      lr
|
Loading…
Add table
Add a link
Reference in a new issue