Add ARM assembler to libwmapro vector_fixmul_scalar(). Speeds up decoding by 1% on PP5022.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27603 a1c6a512-1295-4272-9138-f99709370657
2010-07-28 19:36:15 +00:00 · 2010-07-28 19:36:15 +00:00 · 3bb8020f78
commit 3bb8020f78
parent 2fefcdf31c
1 changed files with 48 additions and 18 deletions
--- a/apps/codecs/libwmapro/wmapro_math.h
+++ b/apps/codecs/libwmapro/wmapro_math.h
@ -180,7 +180,7 @@
    }
 #endif /* CPU_COLDFIRE, CPU_ARM */
-#ifdef CPU_COLDFIRE
+#if defined(CPU_COLDFIRE)
 static inline void vector_fixmul_window(int32_t *dst, const int32_t *src0, 
                                   const int32_t *src1, const int32_t *win, 
                                   int len)
@ -194,8 +194,8 @@ static inline void vector_fixmul_window(int32_t *dst, const int32_t *src0,
            int32_t s1 = src1[j];
            int32_t wi = -win[i];
            int32_t wj = -win[j];
-
+            asm volatile (
-        asm volatile ("mac.l    %[s0], %[wj], %%acc0\n\t"
+                "mac.l    %[s0], %[wj], %%acc0\n\t"
                "msac.l   %[s1], %[wi], %%acc0\n\t"
                "mac.l    %[s0], %[wi], %%acc1\n\t"
                "mac.l    %[s1], %[wj], %%acc1\n\t"
@ -229,6 +229,35 @@ static inline void vector_fixmul_window(int32_t *dst, const int32_t *src0,
 }
 #endif
 #if defined(CPU_ARM)
 /* ARM assembler variant of vector_fixmul_scalar().
  *
  * For each 32-bit element of src, forms the signed 64-bit product with mul
  * and stores bits 24..55 of that product (i.e. the product shifted right by
  * 24 and truncated to 32 bits) to the corresponding element of dst.
  *
  *   dst  output pointer, advanced by the asm as results are stored
  *   src  input pointer, advanced by the asm as values are loaded
  *   mul  multiplier applied to every element
  *   len  element count; the loop consumes four elements per iteration
  *
  * NOTE(review): the non-ARM variant performs the per-element operation via
  * fixmul24(); this is presumably bit-identical to it - confirm against the
  * definition of fixmul24(). */
 static inline void vector_fixmul_scalar(int32_t *dst, const int32_t *src, 
                                        int32_t mul, int len)
 {
    /* len is _always_ a multiple of 4, because len is the difference of sfb's
     * which themselves are always a multiple of 4. */
    int i;
    /* i only counts iterations; the asm block moves dst/src itself through
     * the "!" write-back on ldmia/stmia. */
    for (i=0; i<len; i+=4) {
        asm volatile (
            /* Load four inputs into r1-r4 and advance src by four words. */
            "ldmia %[src]!, {r1-r4}    \n\t"
            /* Each group of three: r5:rN = input * mul (signed 64-bit),
             * then rN = (low word >> 24) | (high word << 8).
             * The result register of each group is only overwritten after
             * the input it held has already been consumed, and r5 is reused
             * as the high word of every product, so the order matters. */
            "smull r0, r5, r1, %[mul] \n\t"
            "mov   r0, r0, lsr #24    \n\t"
            "orr   r0, r0, r5, lsl #8 \n\t"
            "smull r1, r5, r2, %[mul] \n\t"
            "mov   r1, r1, lsr #24    \n\t"
            "orr   r1, r1, r5, lsl #8 \n\t"
            "smull r2, r5, r3, %[mul] \n\t"
            "mov   r2, r2, lsr #24    \n\t"
            "orr   r2, r2, r5, lsl #8 \n\t"
            "smull r3, r5, r4, %[mul] \n\t"
            "mov   r3, r3, lsr #24    \n\t"
            "orr   r3, r3, r5, lsl #8 \n\t"
            /* Store the four results from r0-r3 and advance dst. */
            "stmia %[dst]!, {r0-r3}    \n"
            /* dst and src are read-write operands because of the write-back. */
            : [dst]"+r"(dst), [src]"+r"(src)
            : [mul]"r"(mul)
            /* r0-r5 are used as scratch registers; "memory" because the asm
             * reads and writes through src/dst. */
            : "r0", "r1", "r2", "r3", "r4", "r5", "memory");
    }
 }
 #else
 static inline void vector_fixmul_scalar(int32_t *dst, const int32_t *src, 
                                        int32_t mul, int len)
 {
@ -242,6 +271,7 @@ static inline void vector_fixmul_scalar(int32_t *dst, const int32_t *src,
        dst[i+3] = fixmul24(src[i+3], mul);
    }
 }
 #endif /* CPU_ARM */
 static inline int av_clip(int a, int amin, int amax)
 {