
libdemac: ARMv7 assembler optimisation for the filters, tested on Nokia N900. Speedup is 2.1x for -c5000 compared to the ARMv6 asm. Note that actually compiling it on device requires hand-assembling the 'vadd' and 'vsub' instructions due to a bug in binutils 2.18.50, and making the standalone decoder use it requires Makefile and demac_config.h hacks.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27944 a1c6a512-1295-4272-9138-f99709370657
Jens Arnold 2010-08-30 06:31:47 +00:00
parent dd5e3eb542
commit 811877e5b3
2 changed files with 216 additions and 0 deletions
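
The commit message notes that the standalone decoder needs Makefile and demac_config.h hacks to use this code, but those hacks are not part of the commit. A minimal sketch of the demac_config.h side, assuming it only has to satisfy the CPU_ARM/ARM_ARCH dispatch added in the first hunk below (the Makefile side would pair this with something like -mcpu=cortex-a8 -mfpu=neon in CFLAGS so that gcc and gas accept NEON instructions):

    /* hypothetical demac_config.h addition for the standalone decoder:
       define the symbols the vector-math dispatch below tests for */
    #define CPU_ARM
    #define ARM_ARCH 7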


@@ -41,6 +41,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #ifdef CPU_COLDFIRE
 #include "vector_math16_cf.h"
+#elif defined(CPU_ARM) && (ARM_ARCH >= 7)
+#include "vector_math16_armv7.h"
 #elif defined(CPU_ARM) && (ARM_ARCH >= 6)
 #include "vector_math16_armv6.h"
 #elif defined(CPU_ARM) && (ARM_ARCH >= 5)

vector_math16_armv7.h (new file)

@@ -0,0 +1,214 @@
/*

libdemac - A Monkey's Audio decoder

$Id$

Copyright (C) Dave Chapman 2007
ARMv7 neon vector math copyright (C) 2010 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/

/* Signal to the including filter code that this header provides fused
   scalarproduct-with-add/sub helpers, so it needs no separate vector
   add/sub pass over the coefficients. */
#define FUSED_VECTOR_MATH
#if ORDER > 32
#define REPEAT_BLOCK(x) x x x
#elif ORDER > 16
#define REPEAT_BLOCK(x) x
#else
#define REPEAT_BLOCK(x)
#endif
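
/* Each asm block below loads 16 int16_t elements of v1 (d2-d5), so
   REPEAT_BLOCK sizes the unrolled body to the filter order:
     ORDER <= 16: base block only          (16 samples, no loop)
     ORDER == 32: base block + 1 repeat    (32 samples, no loop)
     ORDER == 64: base block + 3 repeats   (64 samples, no loop)
     ORDER >  64: base block + 3 repeats, looped cnt = ORDER/64 times */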
/* Calculate scalarproduct, then add a 2nd vector (fused for performance) */
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
{
int res;
#if ORDER > 64
int cnt = ORDER>>6;
#endif
asm volatile (
#if ORDER > 64
"vmov.i16 q0, #0 \n"
"1: \n"
"subs %[cnt], %[cnt], #1 \n"
#endif
"vld1.16 {d6-d9}, [%[f2]]! \n"
"vld1.16 {d2-d5}, [%[v1]] \n"
"vld1.16 {d10-d13}, [%[s2]]! \n"
#if ORDER > 64
"vmlal.s16 q0, d2, d6 \n"
#else
"vmull.s16 q0, d2, d6 \n"
#endif
"vmlal.s16 q0, d3, d7 \n"
"vmlal.s16 q0, d4, d8 \n"
"vmlal.s16 q0, d5, d9 \n"
"vadd.i16 q1, q1, q5 \n"
"vadd.i16 q2, q2, q6 \n"
"vst1.16 {d2-d5}, [%[v1]]! \n"
REPEAT_BLOCK(
"vld1.16 {d6-d9}, [%[f2]]! \n"
"vld1.16 {d2-d5}, [%[v1]] \n"
"vld1.16 {d10-d13}, [%[s2]]! \n"
"vmlal.s16 q0, d2, d6 \n"
"vmlal.s16 q0, d3, d7 \n"
"vmlal.s16 q0, d4, d8 \n"
"vmlal.s16 q0, d5, d9 \n"
"vadd.i16 q1, q1, q5 \n"
"vadd.i16 q2, q2, q6 \n"
"vst1.16 {d2-d5}, [%[v1]]! \n"
)
#if ORDER > 64
"bne 1b \n"
#endif
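        /* Horizontal reduction, same tail in all three functions:
           vpadd folds the four int32 partial sums in q0 to two,
           vpaddl widen-adds those to a single value, and vmov
           returns its low 32 bits. */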
"vpadd.i32 d0, d0, d1 \n"
"vpaddl.s32 d0, d0 \n"
"vmov.32 %[res], d0[0] \n"
: /* outputs */
#if ORDER > 64
[cnt]"+r"(cnt),
#endif
[v1] "+r"(v1),
[f2] "+r"(f2),
[s2] "+r"(s2),
[res]"=r"(res)
: /* inputs */
: /* clobbers */
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "memory"
);
return res;
}
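
/* For reference, the asm above computes the equivalent of:

       int32_t res = 0;
       for (int i = 0; i < ORDER; i++) {
           res   += (int32_t)v1[i] * f2[i];  // scalar product first
           v1[i] += s2[i];                   // then in-place add,
       }                                     // wrapping like vadd.i16
       return res;

   vector_sp_sub() below is identical except that the update is
   v1[i] -= s2[i]. */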
/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) */
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
{
int res;
#if ORDER > 64
int cnt = ORDER>>6;
#endif
asm volatile (
#if ORDER > 64
"vmov.i16 q0, #0 \n"
"1: \n"
"subs %[cnt], %[cnt], #1 \n"
#endif
"vld1.16 {d6-d9}, [%[f2]]! \n"
"vld1.16 {d2-d5}, [%[v1]] \n"
"vld1.16 {d10-d13}, [%[s2]]! \n"
#if ORDER > 64
"vmlal.s16 q0, d2, d6 \n"
#else
"vmull.s16 q0, d2, d6 \n"
#endif
"vmlal.s16 q0, d3, d7 \n"
"vmlal.s16 q0, d4, d8 \n"
"vmlal.s16 q0, d5, d9 \n"
"vsub.i16 q1, q1, q5 \n"
"vsub.i16 q2, q2, q6 \n"
"vst1.16 {d2-d5}, [%[v1]]! \n"
REPEAT_BLOCK(
"vld1.16 {d6-d9}, [%[f2]]! \n"
"vld1.16 {d2-d5}, [%[v1]] \n"
"vld1.16 {d10-d13}, [%[s2]]! \n"
"vmlal.s16 q0, d2, d6 \n"
"vmlal.s16 q0, d3, d7 \n"
"vmlal.s16 q0, d4, d8 \n"
"vmlal.s16 q0, d5, d9 \n"
"vsub.i16 q1, q1, q5 \n"
"vsub.i16 q2, q2, q6 \n"
"vst1.16 {d2-d5}, [%[v1]]! \n"
)
#if ORDER > 64
"bne 1b \n"
#endif
"vpadd.i32 d0, d0, d1 \n"
"vpaddl.s32 d0, d0 \n"
"vmov.32 %[res], d0[0] \n"
: /* outputs */
#if ORDER > 64
[cnt]"+r"(cnt),
#endif
[v1] "+r"(v1),
[f2] "+r"(f2),
[s2] "+r"(s2),
[res]"=r"(res)
: /* inputs */
: /* clobbers */
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "memory"
);
return res;
}

static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
int res;
#if ORDER > 64
int cnt = ORDER>>6;
#endif
asm volatile (
#if ORDER > 64
"vmov.i16 q0, #0 \n"
"1: \n"
"subs %[cnt], %[cnt], #1 \n"
#endif
"vld1.16 {d2-d5}, [%[v1]]! \n"
"vld1.16 {d6-d9}, [%[v2]]! \n"
#if ORDER > 64
"vmlal.s16 q0, d2, d6 \n"
#else
"vmull.s16 q0, d2, d6 \n"
#endif
"vmlal.s16 q0, d3, d7 \n"
"vmlal.s16 q0, d4, d8 \n"
"vmlal.s16 q0, d5, d9 \n"
REPEAT_BLOCK(
"vld1.16 {d2-d5}, [%[v1]]! \n"
"vld1.16 {d6-d9}, [%[v2]]! \n"
"vmlal.s16 q0, d2, d6 \n"
"vmlal.s16 q0, d3, d7 \n"
"vmlal.s16 q0, d4, d8 \n"
"vmlal.s16 q0, d5, d9 \n"
)
#if ORDER > 64
"bne 1b \n"
#endif
"vpadd.i32 d0, d0, d1 \n"
"vpaddl.s32 d0, d0 \n"
"vmov.32 %[res], d0[0] \n"
: /* outputs */
#if ORDER > 64
[cnt]"+r"(cnt),
#endif
[v1] "+r"(v1),
[v2] "+r"(v2),
[res]"=r"(res)
: /* inputs */
: /* clobbers */
"d0", "d1", "d2", "d3", "d4",
"d5", "d6", "d7", "d8", "d9"
);
return res;
}
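
For context: this header is parameterized by ORDER, which the including code must define before the #include, and the block structure above supports orders of 16, 32, 64, or any multiple of 64. A minimal, hypothetical usage sketch follows; the function and buffer names are illustrative, not libdemac's own, and every buffer must hold ORDER elements:

    #include <stdint.h>
    #define ORDER 64                 /* set before including the header */
    #include "vector_math16_armv7.h"

    /* one filter update step: dot product of coefficients and history,
       fused with sign-directed in-place adaptation of the coefficients */
    int32_t predict_step(int16_t* coeffs, int16_t* history,
                         int16_t* adapt, int err_sign)
    {
        return (err_sign >= 0) ? vector_sp_add(coeffs, history, adapt)
                               : vector_sp_sub(coeffs, history, adapt);
    }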