From 6e4aa260d019d94e23fbc8c26b95253ae752f697 Mon Sep 17 00:00:00 2001
From: Thom Johansen
Date: Thu, 18 Oct 2007 10:09:21 +0000
Subject: [PATCH] Add Coldfire and ARM assembler for "reverse multiply and
 copy" function too. Gives big speedup on Coldfire, small on ARM.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15183 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/libwma/wmadeci.c | 65 +++++++++++++++++++++++++++++++++---
 1 file changed, 61 insertions(+), 4 deletions(-)

diff --git a/apps/codecs/libwma/wmadeci.c b/apps/codecs/libwma/wmadeci.c
index 34a0f9f229..33894e1cf2 100644
--- a/apps/codecs/libwma/wmadeci.c
+++ b/apps/codecs/libwma/wmadeci.c
@@ -96,12 +96,33 @@ void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
         "smull r8, r9, r1, r5;"
         "add r1, r4, r9, lsl #1;"
         "stmia %[dst]!, {r0, r1};"
-        "subs %[n], %[n], #2;"
-        "bne 0b;"
+        "subs %[n], %[n], #2;"
+        "bne 0b;"
         : [d] "+r" (data), [w] "+r" (window), [dst] "+r" (dst), [n] "+r" (n)
         : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
 }
 
+static inline
+void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
+                         int len)
+{
+    /* Block sizes are always power of two */
+    asm volatile (
+        "add %[s1], %[s1], %[n], lsl #2;"
+        "0:"
+        "ldmia %[s0]!, {r0, r1};"
+        "ldmdb %[s1]!, {r4, r5};"
+        "smull r8, r9, r0, r5;"
+        "mov r0, r9, lsl #1;"
+        "smull r8, r9, r1, r4;"
+        "mov r1, r9, lsl #1;"
+        "stmia %[dst]!, {r0, r1};"
+        "subs %[n], %[n], #2;"
+        "bne 0b;"
+        : [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len)
+        : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
+}
+
 #elif defined(CPU_COLDFIRE)
 
 static inline
@@ -118,8 +139,8 @@ void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
         "mac.l %%d1, %%d5, %%acc1;"
         "mac.l %%d2, %%a0, %%acc2;"
         "mac.l %%d3, %%a1, %%acc3;"
-        "lea.l (%[d], 16), %[d];"
-        "lea.l (%[w], 16), %[w];"
+        "lea.l (16, %[d]), %[d];"
+        "lea.l (16, %[w]), %[w];"
         "movclr.l %%acc0, %%d0;"
         "movclr.l %%acc1, %%d1;"
         "movclr.l %%acc2, %%d2;"
@@ -134,6 +155,35 @@ void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
         : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
 }
 
+static inline
+void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
+                         int len)
+{
+    /* Block sizes are always power of two. Smallest block is always way bigger
+     * than four too.*/
+    asm volatile (
+        "lea.l (-16, %[s1], %[n]*4), %[s1];"
+        "0:"
+        "movem.l (%[s0]), %%d0-%%d3;"
+        "movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;"
+        "mac.l %%d0, %%a1, %%acc0;"
+        "mac.l %%d1, %%a0, %%acc1;"
+        "mac.l %%d2, %%d5, %%acc2;"
+        "mac.l %%d3, %%d4, %%acc3;"
+        "lea.l (16, %[s0]), %[s0];"
+        "lea.l (-16, %[s1]), %[s1];"
+        "movclr.l %%acc0, %%d0;"
+        "movclr.l %%acc1, %%d1;"
+        "movclr.l %%acc2, %%d2;"
+        "movclr.l %%acc3, %%d3;"
+        "movem.l %%d0-%%d3, (%[dst]);"
+        "lea.l (16, %[dst]), %[dst];"
+        "subq.l #4, %[n];"
+        "jne 0b;"
+        : [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len)
+        : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
+}
+
 #else
 
 static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
@@ -142,6 +192,13 @@ static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const
         dst[i] = fixmul32b(src0[i], src1[i]) + dst[i];
 }
 
+static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
+    int i;
+    src1 += len-1;
+    for(i=0; i<len; i++)
+        dst[i] = fixmul32b(src0[i], src1[-i]);
+}
+
 #endif
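
For reference, both new assembler routines implement the same operation as the plain C fallback at the end of the diff: dst[i] = fixmul32b(src0[i], src1[len - 1 - i]). Below is a minimal standalone sketch of that reference behaviour, assuming fixmul32b keeps the 64-bit product shifted right by 31 as it does elsewhere in wmadeci.c; the ref_ prefix and the local fixed32 typedef exist only for this sketch.

    #include <stdint.h>

    typedef int32_t fixed32;   /* stands in for the codec's fixed32 type */

    /* Hypothetical stand-in for wmadeci.c's fixmul32b: keep (x * y) >> 31 of
     * the 64-bit product. The ARM loop computes the same value (minus the
     * lowest bit) with "smull lo, hi, x, y" followed by "mov res, hi, lsl #1". */
    static inline fixed32 ref_fixmul32b(fixed32 x, fixed32 y)
    {
        return (fixed32)(((int64_t)x * (int64_t)y) >> 31);
    }

    /* Reverse multiply and copy: dst[i] = src0[i] * src1[len - 1 - i]. */
    static void ref_vector_fmul_reverse(fixed32 *dst, const fixed32 *src0,
                                        const fixed32 *src1, int len)
    {
        int i;
        for (i = 0; i < len; i++)
            dst[i] = ref_fixmul32b(src0[i], src1[len - 1 - i]);
    }

The Coldfire version shows no explicit shift because it presumably relies on the EMAC having been configured for fractional multiplies elsewhere in the codec, and it unrolls by four, assuming len is a multiple of four, which the "smallest block" comment in the patch covers; the ARM version unrolls by two under the same power-of-two assumption.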