1
0
Fork 0
forked from len0rd/rockbox

libdemac: Add x86/x86_64 MMX asm for the filters. Not relevant for target but speeds up decoding on x86/x86_64 sims. Average speedup ranges from 25% for -c2000 to 3 times for -c5000; on Intel Atom it's even 45% for -c2000 to 6 times for -c5000.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24663 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Jens Arnold 2010-02-15 01:27:04 +00:00
parent 1bef4c6650
commit b8eb272e48
3 changed files with 238 additions and 6 deletions

View file

@ -46,6 +46,9 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
#elif defined(CPU_ARM) && (ARM_ARCH >= 5)
/* Assume all our ARMv5 targets are ARMv5te(j) */
#include "vector_math16_armv5te.h"
#elif (defined(__i386__) || defined(__i486__)) && defined(__MMX__) \
|| defined(__x86_64__)
#include "vector_math16_mmx.h"
#else
#include "vector_math_generic.h"
#endif

View file

@ -0,0 +1,219 @@
/*
libdemac - A Monkey's Audio decoder
$Id$
Copyright (C) Dave Chapman 2007
MMX vector math copyright (C) 2010 Jens Arnold
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
*/
/* NOTE(review): presumably advertises to the including filter code that the
 * fused vector_sp_add()/vector_sp_sub() primitives are provided by this
 * header -- confirm against the code that tests this macro. */
#define FUSED_VECTOR_MATH
/* Two-level stringification: __S(x) first macro-expands x and then __E turns
 * the result into a string literal. Used below to paste an ORDER-derived
 * repeat count into the assembler text of the ".rept" directives. */
#define __E(__e) #__e
#define __S(__e) __E(__e)
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t *s2)
{
int res, t;
#if ORDER > 256
int cnt = ORDER>>8;
#endif
asm volatile (
#if ORDER > 256
"pxor %%mm2, %%mm2 \n"
".set ofs, 0 \n"
"1: \n"
".rept 64 \n"
#else
"movq (%[v1]), %%mm2 \n"
"movq %%mm2, %%mm0 \n"
"pmaddwd (%[f2]), %%mm2 \n"
"paddw (%[s2]), %%mm0 \n"
"movq %%mm0, (%[v1]) \n"
".set ofs, 8 \n"
".rept " __S(ORDER>>2 - 1) "\n"
#endif
"movq ofs(%[v1]), %%mm1 \n"
"movq %%mm1, %%mm0 \n"
"pmaddwd ofs(%[f2]), %%mm1 \n"
"paddw ofs(%[s2]), %%mm0 \n"
"movq %%mm0, ofs(%[v1]) \n"
"paddd %%mm1, %%mm2 \n"
".set ofs, ofs + 8 \n"
".endr \n"
#if ORDER > 256
"add $512, %[v1] \n"
"add $512, %[s2] \n"
"add $512, %[f2] \n"
"dec %[cnt] \n"
"jne 1b \n"
#endif
"movd %%mm2, %[t] \n"
"psrlq $32, %%mm2 \n"
"movd %%mm2, %[res] \n"
"add %[t], %[res] \n"
: /* outputs */
#if ORDER > 256
[cnt]"+r"(cnt),
[s2] "+r"(s2),
[res]"=r"(res),
[t] "=r"(t)
: /* inputs */
[v1]"2"(v1),
[f2]"3"(f2)
#else
[res]"=r"(res),
[t] "=r"(t)
: /* inputs */
[v1]"r"(v1),
[f2]"r"(f2),
[s2]"r"(s2)
#endif
: /* clobbers */
"mm0", "mm1", "mm2"
);
return res;
}
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t *s2)
{
int res, t;
#if ORDER > 256
int cnt = ORDER>>8;
#endif
asm volatile (
#if ORDER > 256
"pxor %%mm2, %%mm2 \n"
".set ofs, 0 \n"
"1: \n"
".rept 64 \n"
#else
"movq (%[v1]), %%mm2 \n"
"movq %%mm2, %%mm0 \n"
"pmaddwd (%[f2]), %%mm2 \n"
"psubw (%[s2]), %%mm0 \n"
"movq %%mm0, (%[v1]) \n"
".set ofs, 8 \n"
".rept " __S(ORDER>>2 - 1) "\n"
#endif
"movq ofs(%[v1]), %%mm1 \n"
"movq %%mm1, %%mm0 \n"
"pmaddwd ofs(%[f2]), %%mm1 \n"
"psubw ofs(%[s2]), %%mm0 \n"
"movq %%mm0, ofs(%[v1]) \n"
"paddd %%mm1, %%mm2 \n"
".set ofs, ofs + 8 \n"
".endr \n"
#if ORDER > 256
"add $512, %[v1] \n"
"add $512, %[s2] \n"
"add $512, %[f2] \n"
"dec %[cnt] \n"
"jne 1b \n"
#endif
"movd %%mm2, %[t] \n"
"psrlq $32, %%mm2 \n"
"movd %%mm2, %[res] \n"
"add %[t], %[res] \n"
: /* outputs */
#if ORDER > 256
[cnt]"+r"(cnt),
[s2] "+r"(s2),
[res]"=r"(res),
[t] "=r"(t)
: /* inputs */
[v1]"2"(v1),
[f2]"3"(f2)
#else
[res]"=r"(res),
[t] "=r"(t)
: /* inputs */
[v1]"r"(v1),
[f2]"r"(f2),
[s2]"r"(s2)
#endif
: /* clobbers */
"mm0", "mm1", "mm2"
);
return res;
}
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
int res, t;
#if ORDER > 256
int cnt = ORDER>>8;
#endif
asm volatile (
#if ORDER > 256
"pxor %%mm1, %%mm1 \n"
".set ofs, 0 \n"
"1: \n"
".rept 64 \n"
#else
"movq (%[v1]), %%mm1 \n"
"pmaddwd (%[v2]), %%mm1 \n"
".set ofs, 8 \n"
".rept " __S(ORDER>>2 - 1) "\n"
#endif
"movq ofs(%[v1]), %%mm0 \n"
"pmaddwd ofs(%[v2]), %%mm0 \n"
"paddd %%mm0, %%mm1 \n"
".set ofs, ofs + 8 \n"
".endr \n"
#if ORDER > 256
"add $512, %[v1] \n"
"add $512, %[v2] \n"
"dec %[cnt] \n"
"jne 1b \n"
#endif
"movd %%mm1, %[t] \n"
"psrlq $32, %%mm1 \n"
"movd %%mm1, %[res] \n"
"add %[t], %[res] \n"
: /* outputs */
#if ORDER > 256
[cnt]"+r"(cnt),
[res]"=r"(res),
[t] "=r"(t)
: /* inputs */
[v1]"1"(v1),
[v2]"2"(v2)
#else
[res]"=r"(res),
[t] "=r"(t)
: /* inputs */
[v1]"r"(v1),
[v2]"r"(v2)
#endif
: /* clobbers */
"mm0", "mm1"
);
return res;
}

22
tools/configure vendored
View file

@ -171,12 +171,20 @@ simcc () {
GCCOPTS="$GCCOPTS -I\$(SIMDIR)"
if test "X$crosscompile" != "Xyes"; then
if [ "`uname -m`" = "x86_64" ] || [ "`uname -m`" = "amd64" ]; then
# fPIC is needed to make shared objects link
# setting visibility to hidden is necessary to avoid strange crashes
# due to symbol clashing
GCCOPTS="$GCCOPTS -fPIC -fvisibility=hidden"
fi
case `uname -m` in
x86_64|amd64)
# fPIC is needed to make shared objects link
# setting visibility to hidden is necessary to avoid strange crashes
# due to symbol clashing
GCCOPTS="$GCCOPTS -fPIC -fvisibility=hidden"
# x86_64 supports MMX by default
;;
i686)
echo "Enabling MMX support"
GCCOPTS="$GCCOPTS -mmmx"
;;
esac
id=$$
cat >$tmpdir/conftest-$id.c <<EOF
@ -218,6 +226,8 @@ EOF
LDOPTS="-mconsole $sdl_libs"
output="rockboxui.exe" # use this as output binary name
endian="little" # windows is little endian
echo "Enabling MMX support"
GCCOPTS="$GCCOPTS -mmmx"
fi
}