libdemac: Add x86/x86_64 MMX asm for the filters. Not relevant for target but speeds up decoding on x86/x86_64 sims. Average speedup ranges from 25% for -c2000 to 3 times for -c5000; on Intel Atom it's even 45% for -c2000 to 6 times for -c5000.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24663 a1c6a512-1295-4272-9138-f99709370657
2010-02-15 01:27:04 +00:00 · 2010-02-15 01:27:04 +00:00 · b8eb272e48
commit b8eb272e48
parent 1bef4c6650
3 changed files with 238 additions and 6 deletions
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@ -46,6 +46,9 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #elif defined(CPU_ARM) && (ARM_ARCH >= 5)
 /* Assume all our ARMv5 targets are ARMv5te(j) */
 #include "vector_math16_armv5te.h"
+#elif (defined(__i386__) || defined(__i486__))  && defined(__MMX__) \
+    || defined(__x86_64__)
+#include "vector_math16_mmx.h"
 #else
 #include "vector_math_generic.h"
 #endif
--- a/apps/codecs/demac/libdemac/vector_math16_mmx.h
+++ b/apps/codecs/demac/libdemac/vector_math16_mmx.h
@ -0,0 +1,219 @@
+/*
+
+libdemac - A Monkey's Audio decoder
+
+$Id$
+
+Copyright (C) Dave Chapman 2007
+
+MMX vector math copyright (C) 2010 Jens Arnold
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+
+*/
+
+/* Defined for the code that includes this header; presumably it signals that
+ * the fused vector_sp_add()/vector_sp_sub() primitives are provided here --
+ * TODO confirm against filter.c. */
+#define FUSED_VECTOR_MATH
+
+/* Two-level stringification: __S(x) macro-expands x first, then __E() turns
+ * the expanded tokens into a string literal.  Used below to splice
+ * ORDER-derived repeat counts into the inline assembler text. */
+#define __E(__e) #__e
+#define __S(__e) __E(__e)
+
+/*
+ * Fused scalar product and vector add over ORDER 16-bit elements (MMX).
+ *
+ * Returns the sum of v1[i] * f2[i], computed from the values v1 holds on
+ * entry, accumulated in 32 bits.  As a side effect every v1[i] is replaced
+ * by v1[i] + s2[i] (16-bit add without saturation).
+ *
+ * Four elements are handled per quadword.  For ORDER <= 256 the whole vector
+ * is unrolled at assembly time; for ORDER > 256 an unrolled body covering
+ * 256 elements (512 bytes) is executed ORDER>>8 times.  Elements beyond a
+ * multiple of 4 (or of 256 in the looped variant) are not processed.
+ *
+ * NOTE(review): no emms is executed, so the MMX state stays active after
+ * this returns -- confirm that no x87 floating point code depends on it.
+ */
+static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t *s2)
+{
+    int res, t;
+#if ORDER > 256
+    int cnt = ORDER>>8;   /* number of 256-element passes */
+#endif
+
+    asm volatile (
+#if ORDER > 256
+        /* Looped variant: start from a zero accumulator. */
+        "pxor    %%mm2, %%mm2        \n"
+        ".set    ofs, 0              \n"
+    "1:                              \n"
+        ".rept   64                  \n"
+#else
+        /* Unrolled variant: the first quadword initialises the accumulator,
+         * the remaining (ORDER/4 - 1) quadwords are generated by .rept. */
+        "movq    (%[v1]), %%mm2      \n"
+        "movq    %%mm2, %%mm0        \n"
+        "pmaddwd (%[f2]), %%mm2      \n"
+        "paddw   (%[s2]), %%mm0      \n"
+        "movq    %%mm0, (%[v1])      \n"
+        ".set    ofs, 8              \n"
+
+        /* Explicit parentheses: do not rely on the assembler giving >> a
+         * higher precedence than - (C would parse it the other way). */
+        ".rept  " __S((ORDER>>2) - 1) "\n"
+#endif
+        "movq    ofs(%[v1]), %%mm1   \n"
+        "movq    %%mm1, %%mm0        \n"
+        "pmaddwd ofs(%[f2]), %%mm1   \n"  /* products use the old v1 */
+        "paddw   ofs(%[s2]), %%mm0   \n"
+        "movq    %%mm0, ofs(%[v1])   \n"  /* write back v1 + s2 */
+        "paddd   %%mm1, %%mm2        \n"
+        ".set    ofs, ofs + 8        \n"
+        ".endr                       \n"
+#if ORDER > 256
+        "add     $512, %[v1]         \n"
+        "add     $512, %[s2]         \n"
+        "add     $512, %[f2]         \n"
+        "dec     %[cnt]              \n"
+        "jne     1b                  \n"
+#endif
+
+        /* Horizontal add of the two 32-bit halves of the accumulator. */
+        "movd    %%mm2, %[t]         \n"
+        "psrlq   $32, %%mm2          \n"
+        "movd    %%mm2, %[res]       \n"
+        "add     %[t], %[res]        \n"
+        : /* outputs */
+#if ORDER > 256
+        [cnt]"+r"(cnt),
+        [s2] "+r"(s2),
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        /* v1 and f2 share the registers of res and t (matching constraints);
+         * the pointers are advanced in the loop and their registers are
+         * only overwritten by the final reduction. */
+        [v1]"2"(v1),
+        [f2]"3"(f2)
+#else
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        [v1]"r"(v1),
+        [f2]"r"(f2),
+        [s2]"r"(s2)
+#endif
+        : /* clobbers */
+        /* "memory": the asm stores to *v1 and loads from *f2 and *s2 through
+         * register operands, which the compiler cannot see otherwise. */
+        "mm0", "mm1", "mm2", "memory"
+    );
+    return res;
+}
+
+/*
+ * Fused scalar product and vector subtract over ORDER 16-bit elements (MMX).
+ *
+ * Returns the sum of v1[i] * f2[i], computed from the values v1 holds on
+ * entry, accumulated in 32 bits.  As a side effect every v1[i] is replaced
+ * by v1[i] - s2[i] (16-bit subtract without saturation).
+ *
+ * Four elements are handled per quadword.  For ORDER <= 256 the whole vector
+ * is unrolled at assembly time; for ORDER > 256 an unrolled body covering
+ * 256 elements (512 bytes) is executed ORDER>>8 times.  Elements beyond a
+ * multiple of 4 (or of 256 in the looped variant) are not processed.
+ *
+ * NOTE(review): no emms is executed, so the MMX state stays active after
+ * this returns -- confirm that no x87 floating point code depends on it.
+ */
+static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t *s2)
+{
+    int res, t;
+#if ORDER > 256
+    int cnt = ORDER>>8;   /* number of 256-element passes */
+#endif
+
+    asm volatile (
+#if ORDER > 256
+        /* Looped variant: start from a zero accumulator. */
+        "pxor    %%mm2, %%mm2        \n"
+        ".set    ofs, 0              \n"
+    "1:                              \n"
+        ".rept   64                  \n"
+#else
+        /* Unrolled variant: the first quadword initialises the accumulator,
+         * the remaining (ORDER/4 - 1) quadwords are generated by .rept. */
+        "movq    (%[v1]), %%mm2      \n"
+        "movq    %%mm2, %%mm0        \n"
+        "pmaddwd (%[f2]), %%mm2      \n"
+        "psubw   (%[s2]), %%mm0      \n"
+        "movq    %%mm0, (%[v1])      \n"
+        ".set    ofs, 8              \n"
+
+        /* Explicit parentheses: do not rely on the assembler giving >> a
+         * higher precedence than - (C would parse it the other way). */
+        ".rept  " __S((ORDER>>2) - 1) "\n"
+#endif
+        "movq    ofs(%[v1]), %%mm1   \n"
+        "movq    %%mm1, %%mm0        \n"
+        "pmaddwd ofs(%[f2]), %%mm1   \n"  /* products use the old v1 */
+        "psubw   ofs(%[s2]), %%mm0   \n"
+        "movq    %%mm0, ofs(%[v1])   \n"  /* write back v1 - s2 */
+        "paddd   %%mm1, %%mm2        \n"
+        ".set    ofs, ofs + 8        \n"
+        ".endr                       \n"
+#if ORDER > 256
+        "add     $512, %[v1]         \n"
+        "add     $512, %[s2]         \n"
+        "add     $512, %[f2]         \n"
+        "dec     %[cnt]              \n"
+        "jne     1b                  \n"
+#endif
+
+        /* Horizontal add of the two 32-bit halves of the accumulator. */
+        "movd    %%mm2, %[t]         \n"
+        "psrlq   $32, %%mm2          \n"
+        "movd    %%mm2, %[res]       \n"
+        "add     %[t], %[res]        \n"
+        : /* outputs */
+#if ORDER > 256
+        [cnt]"+r"(cnt),
+        [s2] "+r"(s2),
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        /* v1 and f2 share the registers of res and t (matching constraints);
+         * the pointers are advanced in the loop and their registers are
+         * only overwritten by the final reduction. */
+        [v1]"2"(v1),
+        [f2]"3"(f2)
+#else
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        [v1]"r"(v1),
+        [f2]"r"(f2),
+        [s2]"r"(s2)
+#endif
+        : /* clobbers */
+        /* "memory": the asm stores to *v1 and loads from *f2 and *s2 through
+         * register operands, which the compiler cannot see otherwise. */
+        "mm0", "mm1", "mm2", "memory"
+    );
+    return res;
+}
+
+/*
+ * Scalar product of two vectors of ORDER 16-bit elements (MMX).
+ *
+ * Returns the sum of v1[i] * v2[i], accumulated in 32 bits.  Neither vector
+ * is written.
+ *
+ * Four elements are handled per quadword.  For ORDER <= 256 the whole vector
+ * is unrolled at assembly time; for ORDER > 256 an unrolled body covering
+ * 256 elements (512 bytes) is executed ORDER>>8 times.  Elements beyond a
+ * multiple of 4 (or of 256 in the looped variant) are not processed.
+ *
+ * NOTE(review): no emms is executed, so the MMX state stays active after
+ * this returns -- confirm that no x87 floating point code depends on it.
+ */
+static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+{
+    int res, t;
+#if ORDER > 256
+    int cnt = ORDER>>8;   /* number of 256-element passes */
+#endif
+
+    asm volatile (
+#if ORDER > 256
+        /* Looped variant: start from a zero accumulator. */
+        "pxor    %%mm1, %%mm1        \n"
+        ".set    ofs, 0              \n"
+    "1:                              \n"
+        ".rept   64                  \n"
+#else
+        /* Unrolled variant: the first quadword initialises the accumulator,
+         * the remaining (ORDER/4 - 1) quadwords are generated by .rept. */
+        "movq    (%[v1]), %%mm1      \n"
+        "pmaddwd (%[v2]), %%mm1      \n"
+        ".set    ofs, 8              \n"
+
+        /* Explicit parentheses: do not rely on the assembler giving >> a
+         * higher precedence than - (C would parse it the other way). */
+        ".rept  " __S((ORDER>>2) - 1) "\n"
+#endif
+        "movq    ofs(%[v1]), %%mm0   \n"
+        "pmaddwd ofs(%[v2]), %%mm0   \n"
+        "paddd   %%mm0, %%mm1        \n"
+        ".set    ofs, ofs + 8        \n"
+        ".endr                       \n"
+#if ORDER > 256
+        "add     $512, %[v1]         \n"
+        "add     $512, %[v2]         \n"
+        "dec     %[cnt]              \n"
+        "jne     1b                  \n"
+#endif
+
+        /* Horizontal add of the two 32-bit halves of the accumulator. */
+        "movd    %%mm1, %[t]         \n"
+        "psrlq   $32, %%mm1          \n"
+        "movd    %%mm1, %[res]       \n"
+        "add     %[t], %[res]        \n"
+        : /* outputs */
+#if ORDER > 256
+        [cnt]"+r"(cnt),
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        /* v1 and v2 share the registers of res and t (matching constraints);
+         * the pointers are advanced in the loop and their registers are
+         * only overwritten by the final reduction. */
+        [v1]"1"(v1),
+        [v2]"2"(v2)
+#else
+        [res]"=r"(res),
+        [t]  "=r"(t)
+        : /* inputs */
+        [v1]"r"(v1),
+        [v2]"r"(v2)
+#endif
+        : /* clobbers */
+        /* "memory": the asm loads from *v1 and *v2 through register operands,
+         * so earlier stores to them must not be moved past it. */
+        "mm0", "mm1", "memory"
+    );
+    return res;
+}
--- a/tools/configure
+++ b/tools/configure
@ -171,12 +171,20 @@ simcc () {
 GCCOPTS="$GCCOPTS -I\$(SIMDIR)"

 if test "X$crosscompile" != "Xyes"; then
-   if [ "`uname -m`" = "x86_64" ] || [ "`uname -m`" = "amd64" ]; then
-     # fPIC is needed to make shared objects link
-     # setting visibility to hidden is necessary to avoid strange crashes
-     # due to symbol clashing
-     GCCOPTS="$GCCOPTS -fPIC -fvisibility=hidden"
-   fi
+   case `uname -m` in
+    x86_64|amd64)
+      # fPIC is needed to make shared objects link
+      # setting visibility to hidden is necessary to avoid strange crashes
+      # due to symbol clashing
+      GCCOPTS="$GCCOPTS -fPIC -fvisibility=hidden"
+      # x86_64 supports MMX by default
+    ;;
+
+    i686)
+      echo "Enabling MMX support"
+      GCCOPTS="$GCCOPTS -mmmx"
+    ;;
+   esac

   id=$$
   cat >$tmpdir/conftest-$id.c <<EOF
@ -218,6 +226,8 @@ EOF
   LDOPTS="-mconsole $sdl_libs"
   output="rockboxui.exe" # use this as output binary name
   endian="little" # windows is little endian
+   echo "Enabling MMX support"
+   GCCOPTS="$GCCOPTS -mmmx"
 fi
 }