Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657
2008-11-05 00:10:05 +00:00 · 2008-11-05 00:10:05 +00:00 · fe04e40be7
commit fe04e40be7
parent 7a835ee0c6
4 changed files with 137 additions and 3 deletions
--- a/apps/codecs/demac/libdemac/rangecoding.h
+++ b/apps/codecs/demac/libdemac/rangecoding.h
@ -49,6 +49,14 @@ removing the rc parameter from each function (and the RNGC macro)).
 */
 #ifdef ROCKBOX
 #include "../lib/codeclib.h"
 /* for UDIV32() */
 #endif
 /* Portable fallback when no optimised UDIV32() has been provided.
  * Arguments and result are fully parenthesised so that expression
  * arguments such as UDIV32(x + y, n << 1) expand as intended. */
 #ifndef UDIV32
 #define UDIV32(a, b)  ((a) / (b))
 #endif
 /* BITSTREAM READING FUNCTIONS */
@ -121,15 +129,15 @@ static inline void range_dec_normalize(void)
 /* Renormalise the decoder, set rc.help to rc.range divided by tot_f, and
  * return rc.low divided by rc.help. Both divisions go through UDIV32().
  * NOTE(review): no check for a zero divisor is visible here - confirm that
  * callers guarantee tot_f != 0 and rc.range >= tot_f. */
 static inline int range_decode_culfreq(int tot_f)
 {
    range_dec_normalize();
-    rc.help = rc.range / tot_f;
+    rc.help = UDIV32(rc.range, tot_f);
-    return rc.low / rc.help;
+    return UDIV32(rc.low, rc.help);
 }
 /* Renormalise the decoder, set rc.help to rc.range shifted right by
  * `shift` bits, and return rc.low divided by rc.help (via UDIV32()).
  * NOTE(review): rc.help becomes 0 if the shift discards every set bit of
  * rc.range - presumably excluded by the normalisation step; confirm. */
 static inline int range_decode_culshift(int shift)
 {
    range_dec_normalize();
    rc.help = rc.range >> shift;
-    return rc.low / rc.help;
+    return UDIV32(rc.low, rc.help);
 }
--- a/apps/codecs/lib/SOURCES
+++ b/apps/codecs/lib/SOURCES
@ -5,6 +5,9 @@ codeclib.c
 mdct2.c
 #ifdef CPU_ARM
 mdct_arm.S
 #if ARM_ARCH == 4
 udiv32_armv4.S
 #endif
 #endif
 #elif defined(SIMULATOR) && defined(__APPLE__)
--- a/apps/codecs/lib/codeclib.h
+++ b/apps/codecs/lib/codeclib.h
@ -57,6 +57,15 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con
 extern void mdct_backward(int n, int32_t *in, int32_t *out);
 /* UDIV32(a, b): unsigned 32 bit division a / b.
  * On ARMv4 targets this maps to the hand-written udiv32_arm() routine;
  * everywhere else it is a plain C division. The default form is fully
  * parenthesised so expression arguments expand correctly. */
 #if defined(CPU_ARM) && (ARM_ARCH == 4)
 /* optimised unsigned integer division for ARMv4, in IRAM */
 unsigned udiv32_arm(unsigned a, unsigned b);
 #define UDIV32(a, b) udiv32_arm(a, b)
 #else
 /* default */
 #define UDIV32(a, b) ((a) / (b))
 #endif
 /* Various codec helper functions */
 int codec_init(void);
--- a/apps/codecs/lib/udiv32_armv4.S
+++ b/apps/codecs/lib/udiv32_armv4.S
@ -0,0 +1,114 @@
 /***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2008 by Jens Arnold
 *
 * Optimised unsigned integer division for ARMv4
 *
 * Based on: libgcc routines for ARM cpu.
 * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
 * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
 * Free Software Foundation, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
 #include "config.h"
 /* Codecs should not normally do this, but we need to check a macro, and
 * codecs.h would confuse the assembler. */
 /* ARM_DIV_BODY dividend, divisor, result, curbit
  *
  * Unrolled shift-and-subtract unsigned division.
  *   dividend  in: numerator        out: remainder
  *   divisor   in: denominator      out: unchanged
  *   result    out: quotient
  *   curbit    scratch (instruction offset for the computed branch)
  * Condition flags are clobbered.
  *
  * The first part searches, in steps of 16/8/4/2 bits, for the largest even
  * t with divisor <= dividend >> t, and then jumps into the unrolled chain
  * at the step for shift t + 1, skipping the steps that cannot produce a
  * quotient bit.
  *
  * NOTE(review): the first executed step shifts the divisor left by t + 1,
  * one bit more than the search has proven to fit into 32 bits. The shifted
  * divisor therefore wraps when dividend >= 2^31 (e.g. 0xFFFFFFFF / 3 enters
  * at shift 31 and compares against 0x80000000), giving a wrong quotient.
  * The macro is only safe when dividend < 2^31 and, if dividend < divisor,
  * divisor < 2^31 as well.
  */
 .macro ARM_DIV_BODY dividend, divisor, result, curbit
    @ result doubles as scratch for the magnitude search below
    mov     \result, \dividend
    mov     \curbit, #90          @ 3 instructions * 30 steps skipped by default
    @ divisor <= scratch >> 16: keep 16 more steps (16 * 3 = 48 instructions)
    cmp     \divisor, \result, lsr #16
    movls   \result,\result, lsr #16
    subls   \curbit, \curbit, #48
    @ same for 8 more steps (24 instructions)
    cmp     \divisor, \result, lsr #8
    movls   \result,\result, lsr #8
    subls   \curbit, \curbit, #24
    @ same for 4 more steps (12 instructions)
    cmp     \divisor, \result, lsr #4
    movls   \result,\result, lsr #4
    subls   \curbit, \curbit, #12
    @ last refinement, 2 steps (6 instructions); scratch is not needed anymore
    cmp     \divisor, \result, lsr #2
    subls   \curbit, \curbit, #6
    @ calculation is only done down to shift=2, because the shift=1 step
    @ would need 3 more cycles, but would only gain 1.5 cycles on average
    mov     \result, #0
    @ pc reads as this instruction + 8, i.e. the first unrolled step after
    @ the nop; curbit counts instructions, hence the scaling by 4 bytes
    add     pc, pc, \curbit, lsl #2
    nop
    .set    shift, 32
    .rept   32
    .set    shift, shift - 1
    cmp     \dividend, \divisor, lsl #shift
    adc     \result, \result, \result
    subcs   \dividend, \dividend, \divisor, lsl #shift
    .endr
 .endm
 /* ARM_DIV2_ORDER divisor, order
  *
  * Computes order = log2(divisor) for a divisor that is a power of two,
  * using a 16/8/4 bit binary search followed by a direct mapping of the
  * remaining value (1, 2, 4 or 8) to 0..3.
  * The result is exact only for powers of two. The divisor register is
  * shifted down in the process (clobbered) and the flags are clobbered.
  */
 .macro ARM_DIV2_ORDER divisor, order
    @ bits 16..31 in use: drop the low half and start counting at 16
    cmp     \divisor, #(1 << 16)
    movhs   \divisor, \divisor, lsr #16
    movhs   \order, #16
    movlo   \order, #0
    cmp     \divisor, #(1 << 8)
    movhs   \divisor, \divisor, lsr #8
    addhs   \order, \order, #8
    cmp     \divisor, #(1 << 4)
    movhs   \divisor, \divisor, lsr #4
    addhs   \order, \order, #4
    @ divisor is now below 16: a value above 4 adds 3, otherwise
    @ 1, 2, 4 add divisor >> 1 = 0, 1, 2
    cmp     \divisor, #(1 << 2)
    addhi   \order, \order, #3
    addls   \order, \order, \divisor, lsr #1
 .endm
 #ifdef USE_IRAM
    .section    .icode,"ax",%progbits
 #else
    .text
 #endif
    .align
    .global udiv32_arm
    .type   udiv32_arm,%function

 /* unsigned udiv32_arm(unsigned a, unsigned b)
  *
  * Unsigned 32 bit division, ARM state.
  *   In:       r0 = a (dividend), r1 = b (divisor)
  *   Out:      r0 = a / b; returns 0 when b == 0
  *   Clobbers: r1, r2, r3, flags
  *   Stack:    8 bytes, only on the a >= 2^31 path
  *
  * Fast exits: b == 1, b == 0, a <= b, and b a power of two (plain shift).
  * Everything else goes through the unrolled ARM_DIV_BODY. That body shifts
  * the divisor one bit further than its magnitude search has proven safe,
  * so it only yields correct results for dividends below 2^31. Dividends
  * with bit 31 set are therefore halved first, divided with the same body,
  * and the lowest quotient bit is recovered afterwards:
  *   a = 2 * h + l,  q1 = h / b   =>   a / b = 2 * q1 + (a - 2 * q1 * b >= b)
  */
 udiv32_arm:
    subs    r2, r1, #1              @ r2 = b - 1
    bxeq    lr                      @ b == 1: quotient is a, already in r0
    bcc     20f                     @ b == 0: borrow with Z clear, return 0
    cmp     r0, r1
    bls     10f                     @ a <= b: quotient is 1 (equal) or 0
    tst     r1, r2
    beq     30f                     @ b is a power of two
    tst     r0, #0x80000000
    bne     40f                     @ a >= 2^31: body would wrap, see above
 50:
    ARM_DIV_BODY r0, r1, r2, r3
    mov     r0, r2
    bx      lr
 10:
    moveq   r0, #1
 20:
    movne   r0, #0
    bx      lr
 30:
    ARM_DIV2_ORDER r1, r2
    mov     r0, r0, lsr r2
    bx      lr
 40:
    @ Here a >= 2^31, a > b, and b is not a power of two.
    tst     r1, #0x80000000
    movne   r0, #1                  @ b >= 2^31: a < 2^32 <= 2 * b, so q = 1
    bxne    lr
    stmfd   sp!, {r0, lr}           @ keep a and the return address
    mov     r0, r0, lsr #1          @ h = a >> 1, below 2^31
    bl      50b                     @ r0 = q1 = h / b, r1 is preserved
    ldmfd   sp!, {r3, lr}           @ r3 = a
    mul     r2, r0, r1              @ r2 = q1 * b <= h, cannot overflow
    sub     r3, r3, r2, lsl #1      @ r3 = a - 2 * q1 * b, in [0, 2 * b)
    cmp     r3, r1                  @ carry set if one more b fits
    adc     r0, r0, r0              @ q = 2 * q1 + carry
    bx      lr
    .size   udiv32_arm, .-udiv32_arm