
libdemac: ARMv7 assembler optimisation for the filters, tested on Nokia N900. Speedup is 2.1x for -c5000 compared to the ARMv6 asm. Note that actually compiling it on device requires hand-assembling the 'vadd' and 'vsub' instructions due to a bug in binutils 2.18.50, and making the standalone decoder use it requires Makefile and demac_config.h hacks.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27944 a1c6a512-1295-4272-9138-f99709370657
Jens Arnold 2010-08-30 06:31:47 +00:00
parent dd5e3eb542
commit 811877e5b3
2 changed files with 216 additions and 0 deletions
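
The commit message notes that the standalone decoder needs Makefile and demac_config.h hacks to use this code, but those hacks are not part of the commit. A minimal sketch of the demac_config.h side, assuming it only has to satisfy the CPU_ARM/ARM_ARCH dispatch added in the first hunk below (the Makefile side would pair this with something like -mcpu=cortex-a8 -mfpu=neon in CFLAGS so that gcc and gas accept NEON instructions):

    /* hypothetical demac_config.h addition for the standalone decoder:
       define the symbols the vector-math dispatch below tests for */
    #define CPU_ARM
    #define ARM_ARCH 7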


@@ -41,6 +41,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #ifdef CPU_COLDFIRE
 #include "vector_math16_cf.h"
+#elif defined(CPU_ARM) && (ARM_ARCH >= 7)
+#include "vector_math16_armv7.h"
 #elif defined(CPU_ARM) && (ARM_ARCH >= 6)
 #include "vector_math16_armv6.h"
 #elif defined(CPU_ARM) && (ARM_ARCH >= 5)

vector_math16_armv7.h (new file)

@@ -0,0 +1,214 @@
/*

libdemac - A Monkey's Audio decoder

$Id$

Copyright (C) Dave Chapman 2007
ARMv7 neon vector math copyright (C) 2010 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/

/* Signal to the including filter code that this header provides fused
   scalarproduct-with-add/sub helpers, so it needs no separate vector
   add/sub pass over the coefficients. */
#define FUSED_VECTOR_MATH
#if ORDER > 32
#define REPEAT_BLOCK(x) x x x
#elif ORDER > 16
#define REPEAT_BLOCK(x) x
#else
#define REPEAT_BLOCK(x)
#endif
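
/* Each asm block below loads 16 int16_t elements of v1 (d2-d5), so
   REPEAT_BLOCK sizes the unrolled body to the filter order:
     ORDER <= 16: base block only          (16 samples, no loop)
     ORDER == 32: base block + 1 repeat    (32 samples, no loop)
     ORDER == 64: base block + 3 repeats   (64 samples, no loop)
     ORDER >  64: base block + 3 repeats, looped cnt = ORDER/64 times */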
/* Calculate scalarproduct, then add a 2nd vector (fused for performance) */
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
{
int res;
#if ORDER > 64
int cnt = ORDER>>6;
#endif
asm volatile (
#if ORDER > 64
"vmov.i16 q0, #0 \n"
"1: \n"
"subs %[cnt], %[cnt], #1 \n"
#endif
"vld1.16 {d6-d9}, [%[f2]]! \n"
"vld1.16 {d2-d5}, [%[v1]] \n"
"vld1.16 {d10-d13}, [%[s2]]! \n"
#if ORDER > 64
"vmlal.s16 q0, d2, d6 \n"
#else
"vmull.s16 q0, d2, d6 \n"
#endif
"vmlal.s16 q0, d3, d7 \n"
"vmlal.s16 q0, d4, d8 \n"
"vmlal.s16 q0, d5, d9 \n"
"vadd.i16 q1, q1, q5 \n"
"vadd.i16 q2, q2, q6 \n"
"vst1.16 {d2-d5}, [%[v1]]! \n"
REPEAT_BLOCK(
"vld1.16 {d6-d9}, [%[f2]]! \n"
"vld1.16 {d2-d5}, [%[v1]] \n"
"vld1.16 {d10-d13}, [%[s2]]! \n"
"vmlal.s16 q0, d2, d6 \n"
"vmlal.s16 q0, d3, d7 \n"
"vmlal.s16 q0, d4, d8 \n"
"vmlal.s16 q0, d5, d9 \n"
"vadd.i16 q1, q1, q5 \n"
"vadd.i16 q2, q2, q6 \n"
"vst1.16 {d2-d5}, [%[v1]]! \n"
)
#if ORDER > 64
"bne 1b \n"
#endif
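        /* Horizontal reduction, same tail in all three functions:
           vpadd folds the four int32 partial sums in q0 to two,
           vpaddl widen-adds those to a single value, and vmov
           returns its low 32 bits. */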
"vpadd.i32 d0, d0, d1 \n"
"vpaddl.s32 d0, d0 \n"
"vmov.32 %[res], d0[0] \n"
: /* outputs */
#if ORDER > 64
[cnt]"+r"(cnt),
#endif
[v1] "+r"(v1),
[f2] "+r"(f2),
[s2] "+r"(s2),
[res]"=r"(res)
: /* inputs */
: /* clobbers */
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "memory"
);
return res;
}
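
/* For reference, the asm above computes the equivalent of:

       int32_t res = 0;
       for (int i = 0; i < ORDER; i++) {
           res   += (int32_t)v1[i] * f2[i];  // scalar product first
           v1[i] += s2[i];                   // then in-place add,
       }                                     // wrapping like vadd.i16
       return res;

   vector_sp_sub() below is identical except that the update is
   v1[i] -= s2[i]. */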
/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) */
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
{
int res;
#if ORDER > 64
int cnt = ORDER>>6;
#endif
asm volatile (
#if ORDER > 64
"vmov.i16 q0, #0 \n"
"1: \n"
"subs %[cnt], %[cnt], #1 \n"
#endif
"vld1.16 {d6-d9}, [%[f2]]! \n"
"vld1.16 {d2-d5}, [%[v1]] \n"
"vld1.16 {d10-d13}, [%[s2]]! \n"
#if ORDER > 64
"vmlal.s16 q0, d2, d6 \n"
#else
"vmull.s16 q0, d2, d6 \n"
#endif
"vmlal.s16 q0, d3, d7 \n"
"vmlal.s16 q0, d4, d8 \n"
"vmlal.s16 q0, d5, d9 \n"
"vsub.i16 q1, q1, q5 \n"
"vsub.i16 q2, q2, q6 \n"
"vst1.16 {d2-d5}, [%[v1]]! \n"
REPEAT_BLOCK(
"vld1.16 {d6-d9}, [%[f2]]! \n"
"vld1.16 {d2-d5}, [%[v1]] \n"
"vld1.16 {d10-d13}, [%[s2]]! \n"
"vmlal.s16 q0, d2, d6 \n"
"vmlal.s16 q0, d3, d7 \n"
"vmlal.s16 q0, d4, d8 \n"
"vmlal.s16 q0, d5, d9 \n"
"vsub.i16 q1, q1, q5 \n"
"vsub.i16 q2, q2, q6 \n"
"vst1.16 {d2-d5}, [%[v1]]! \n"
)
#if ORDER > 64
"bne 1b \n"
#endif
"vpadd.i32 d0, d0, d1 \n"
"vpaddl.s32 d0, d0 \n"
"vmov.32 %[res], d0[0] \n"
: /* outputs */
#if ORDER > 64
[cnt]"+r"(cnt),
#endif
[v1] "+r"(v1),
[f2] "+r"(f2),
[s2] "+r"(s2),
[res]"=r"(res)
: /* inputs */
: /* clobbers */
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "memory"
);
return res;
}

static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
int res;
#if ORDER > 64
int cnt = ORDER>>6;
#endif
asm volatile (
#if ORDER > 64
"vmov.i16 q0, #0 \n"
"1: \n"
"subs %[cnt], %[cnt], #1 \n"
#endif
"vld1.16 {d2-d5}, [%[v1]]! \n"
"vld1.16 {d6-d9}, [%[v2]]! \n"
#if ORDER > 64
"vmlal.s16 q0, d2, d6 \n"
#else
"vmull.s16 q0, d2, d6 \n"
#endif
"vmlal.s16 q0, d3, d7 \n"
"vmlal.s16 q0, d4, d8 \n"
"vmlal.s16 q0, d5, d9 \n"
REPEAT_BLOCK(
"vld1.16 {d2-d5}, [%[v1]]! \n"
"vld1.16 {d6-d9}, [%[v2]]! \n"
"vmlal.s16 q0, d2, d6 \n"
"vmlal.s16 q0, d3, d7 \n"
"vmlal.s16 q0, d4, d8 \n"
"vmlal.s16 q0, d5, d9 \n"
)
#if ORDER > 64
"bne 1b \n"
#endif
"vpadd.i32 d0, d0, d1 \n"
"vpaddl.s32 d0, d0 \n"
"vmov.32 %[res], d0[0] \n"
: /* outputs */
#if ORDER > 64
[cnt]"+r"(cnt),
#endif
[v1] "+r"(v1),
[v2] "+r"(v2),
[res]"=r"(res)
: /* inputs */
: /* clobbers */
"d0", "d1", "d2", "d3", "d4",
"d5", "d6", "d7", "d8", "d9"
);
return res;
}
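
For context: this header is parameterized by ORDER, which the including code must define before the #include, and the block structure above supports orders of 16, 32, 64, or any multiple of 64. A minimal, hypothetical usage sketch follows; the function and buffer names are illustrative, not libdemac's own, and every buffer must hold ORDER elements:

    #include <stdint.h>
    #define ORDER 64                 /* set before including the header */
    #include "vector_math16_armv7.h"

    /* one filter update step: dot product of coefficients and history,
       fused with sign-directed in-place adaptation of the coefficients */
    int32_t predict_step(int16_t* coeffs, int16_t* history,
                         int16_t* adapt, int err_sign)
    {
        return (err_sign >= 0) ? vector_sp_add(coeffs, history, adapt)
                               : vector_sp_sub(coeffs, history, adapt);
    }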