Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657
2008-11-05 00:10:05 +00:00 · 2008-11-05 00:10:05 +00:00 · fe04e40be7
commit fe04e40be7
parent 7a835ee0c6
4 changed files with 137 additions and 3 deletions
--- a/apps/codecs/demac/libdemac/rangecoding.h
+++ b/apps/codecs/demac/libdemac/rangecoding.h
@ -49,6 +49,14 @@ removing the rc parameter from each function (and the RNGC macro)).

 */

+#ifdef ROCKBOX
+#include "../lib/codeclib.h"
+/* for UDIV32() */
+#endif
+
+/* Fallback when no UDIV32() has been defined at this point: plain C division.
+ * The arguments and the whole expansion are parenthesised so that expression
+ * arguments, e.g. UDIV32(x + 1, y), are divided as a whole. */
+#ifndef UDIV32
+#define UDIV32(a, b)  ((a) / (b))
+#endif

 /* BITSTREAM READING FUNCTIONS */

@ -121,15 +129,15 @@ static inline void range_dec_normalize(void)
 static inline int range_decode_culfreq(int tot_f)
 {   /* Normalise, store rc.range / tot_f in rc.help, return rc.low / rc.help. */
    range_dec_normalize();
-    rc.help = rc.range / tot_f;
-    return rc.low / rc.help;
+    rc.help = UDIV32(rc.range, tot_f);  /* rc.range / tot_f, routed through the UDIV32() macro */
+    return UDIV32(rc.low, rc.help);     /* rc.low / rc.help, using the quotient stored above */
 }

 static inline int range_decode_culshift(int shift)
 {   /* Normalise, store rc.range >> shift in rc.help, return rc.low / rc.help. */
    range_dec_normalize();
    rc.help = rc.range >> shift;  /* shift replaces the first division of range_decode_culfreq() */
-    return rc.low / rc.help;
+    return UDIV32(rc.low, rc.help);  /* rc.low / rc.help, routed through the UDIV32() macro */
 }


--- a/apps/codecs/lib/SOURCES
+++ b/apps/codecs/lib/SOURCES
@ -5,6 +5,9 @@ codeclib.c
 mdct2.c
 #ifdef CPU_ARM
 mdct_arm.S
+#if ARM_ARCH == 4
+udiv32_armv4.S
+#endif
 #endif

 #elif defined(SIMULATOR) && defined(__APPLE__)
--- a/apps/codecs/lib/codeclib.h
+++ b/apps/codecs/lib/codeclib.h
@ -57,6 +57,15 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con

 extern void mdct_backward(int n, int32_t *in, int32_t *out);

+#if defined(CPU_ARM) && (ARM_ARCH == 4)
+/* Optimised unsigned 32 bit division for ARMv4 (udiv32_armv4.S). The routine
+ * is placed in IRAM when USE_IRAM is defined. Both arguments are converted
+ * to unsigned by the prototype. */
+unsigned udiv32_arm(unsigned a, unsigned b);
+#define UDIV32(a, b) udiv32_arm(a, b)
+#else
+/* Default: plain C division. The arguments and the whole expansion are
+ * parenthesised so that expression arguments, e.g. UDIV32(x + 1, y), are
+ * divided as a whole. */
+#define UDIV32(a, b) ((a) / (b))
+#endif
+
 /* Various codec helper functions */

 int codec_init(void);
--- a/apps/codecs/lib/udiv32_armv4.S
+++ b/apps/codecs/lib/udiv32_armv4.S
@ -0,0 +1,114 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2008 by Jens Arnold
+ *
+ * Optimised unsigned integer division for ARMv4
+ *
+ * Based on: libgcc routines for ARM cpu.
+ * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
+ * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+ * Free Software Foundation, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+/* Codecs should not normally do this, but we need to check a macro, and
+ * codecs.h would confuse the assembler. */
+
+/* ARM_DIV_BODY dividend, divisor, result, curbit
+ *
+ * Unrolled shift-and-subtract division: result = dividend / divisor.
+ *   dividend  in: numerator, out: remainder
+ *   divisor   in: denominator (must be non-zero), unchanged
+ *   result    out: quotient (also scratch while the start step is searched)
+ *   curbit    scratch: number of instructions to skip in the unrolled steps
+ * The condition flags are clobbered.
+ *
+ * Every unrolled step must stay exactly 3 instructions long, because the
+ * computed branch below skips whole steps in units of 3 instructions.
+ */
+.macro ARM_DIV_BODY dividend, divisor, result, curbit
+
+    @ Find the largest even s in 0..30 with divisor <= dividend >> s. The
+    @ quotient then fits into s + 2 bits, so the division starts at
+    @ shift = s + 1 and the first 30 - s steps are skipped.
+    mov     \result, \dividend
+    mov     \curbit, #90          @ 3 * 30, (calculating branch dest)
+    cmp     \divisor, \result, lsr #16
+    movls   \result,\result, lsr #16
+    subls   \curbit, \curbit, #48
+    cmp     \divisor, \result, lsr #8
+    movls   \result,\result, lsr #8
+    subls   \curbit, \curbit, #24
+    cmp     \divisor, \result, lsr #4
+    movls   \result,\result, lsr #4
+    subls   \curbit, \curbit, #12
+    cmp     \divisor, \result, lsr #2
+    subls   \curbit, \curbit, #6
+    @ calculation is only done down to shift=2, because the shift=1 step
+    @ would need 3 more cycles, but would only gain 1.5 cycles on average
+    mov     \result, #0
+    add     pc, pc, \curbit, lsl #2   @ pc reads as this instruction + 8, i.e. the first step
+    nop
+    @ Each step compares the divisor with the dividend shifted right instead
+    @ of shifting the divisor left. Because the start shift is only known to
+    @ within 2 bits, a left-shifted divisor can lose its top bits when the
+    @ dividend is 2^31 or larger (e.g. 0xC0000000 / 3 starts at shift 31),
+    @ which produced a wrong quotient. The subtraction only runs when
+    @ divisor << shift <= dividend, so its left shift cannot overflow.
+    .set    shift, 32
+    .rept   31
+    .set    shift, shift - 1
+    cmp     \divisor, \dividend, lsr #shift
+    orrls   \result, \result, #(1 << shift)
+    subls   \dividend, \dividend, \divisor, lsl #shift
+    .endr
+    @ shift == 0 step, written out because a zero lsr has no encoding
+    cmp     \divisor, \dividend
+    orrls   \result, \result, #1
+    subls   \dividend, \dividend, \divisor
+.endm
+
+.macro ARM_DIV2_ORDER divisor, order
+    @ order = log2(divisor) for a power-of-two divisor; divisor is clobbered, flags are changed
+    cmp     \divisor, #(1 << 16)
+    movhs   \divisor, \divisor, lsr #16     @ bit 16 or above set: drop the low 16 bits
+    movhs   \order, #16                     @ ... and account for them
+    movlo   \order, #0                      @ otherwise start counting from 0
+
+    cmp     \divisor, #(1 << 8)
+    movhs   \divisor, \divisor, lsr #8      @ same narrowing with 8 bits
+    addhs   \order, \order, #8
+
+    cmp     \divisor, #(1 << 4)
+    movhs   \divisor, \divisor, lsr #4      @ same narrowing with 4 bits
+    addhs   \order, \order, #4
+
+    cmp     \divisor, #(1 << 2)             @ a power-of-two divisor is now 1, 2, 4 or 8
+    addhi   \order, \order, #3              @ 8 adds 3
+    addls   \order, \order, \divisor, lsr #1    @ 1, 2, 4 add 0, 1, 2 (divisor >> 1)
+.endm
+
+
+#ifdef USE_IRAM
+    .section    .icode,"ax",%progbits       @ .icode section when USE_IRAM is defined
+#else
+    .text
+#endif
+    .align
+    .global udiv32_arm
+    .type   udiv32_arm,%function
+@ unsigned udiv32_arm(unsigned a, unsigned b): in r0 = a, r1 = b; out r0 = a / b (0 when b == 0); changes r1-r3 and flags
+udiv32_arm:
+    subs    r2, r1, #1              @ r2 = b - 1; Z set when b == 1, C clear when b == 0
+    bxeq    lr                      @ b == 1: a is already the result
+    bcc     20f                     @ b == 0: return 0 (flags are still ne for the movne at 20)
+    cmp     r0, r1
+    bls     10f                     @ a <= b: result is 1 (equal) or 0 (lower)
+    tst     r1, r2                  @ b & (b - 1) is zero only for a power of two
+    beq     30f                     @ power of two: divide by shifting
+    @ general case: the macro leaves the quotient in r2 and the remainder in r0
+    ARM_DIV_BODY r0, r1, r2, r3
+    mov     r0, r2                  @ return the quotient
+    bx      lr
+
+10:
+    moveq   r0, #1                  @ a == b (flags from cmp r0, r1)
+20:
+    movne   r0, #0                  @ a < b, or b == 0 (flags from the subs)
+    bx      lr
+
+30:                                 @ b is a power of two: r2 = log2(b), r1 is clobbered
+    ARM_DIV2_ORDER r1, r2
+    mov     r0, r0, lsr r2          @ a >> log2(b)
+    bx      lr