Submit interim version of FS#10565. Performance optimization of the atrac3 decoder for ARM. Introduces ASM routines for multiplications and two synthesis loops, and refactors parts of synthesis and windowing. Speeds up decoding by a factor of 2.4 on PP502x.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@22548 a1c6a512-1295-4272-9138-f99709370657
2009-08-29 19:44:27 +00:00 · 2009-08-29 19:44:27 +00:00 · f4774bf5bf
commit f4774bf5bf
parent eb3cb724e8
5 changed files with 343 additions and 146 deletions
--- a/apps/codecs/libatrac/fixp_math.h
+++ b/apps/codecs/libatrac/fixp_math.h
@ -10,27 +10,77 @@

 /* Fixed point math routines for use in atrac3.c */

-static inline int32_t fixmul16(int32_t x, int32_t y)
-{
-    int64_t temp;
-    temp = x;
-    temp *= y;
-
-    temp >>= 16;
-
-    return (int32_t)temp;
-}
-
-static inline int32_t fixmul31(int32_t x, int32_t y)
-{
-    int64_t temp;
-    temp = x;
-    temp *= y;
-
-    temp >>= 31;        //16+31-16 = 31 bits
-
-    return (int32_t)temp;
-}
+#if defined(CPU_ARM)
+    #define fixmul16(X,Y) /* ARM variant: signed 32x32->64 multiply, yields product bits 47..16 */ \
+     ({ /* GNU statement expression: its value is the last expression inside */ \
+        int32_t low; /* %0: low product word, later reused to assemble the result */ \
+        int32_t high; /* %1: high product word */ \
+        asm volatile (                   /* calculates: result = (X*Y)>>16 */ \
+           "smull  %0,%1,%2,%3 \n\t"     /* %1:%0 = %2 * %3, signed 64-bit product */ \
+           "mov %0, %0, lsr #16 \n\t"    /* %0 = %0 >> 16 (logical shift) */ \
+           "orr %0, %0, %1, lsl #16 \n\t"/* result = %0 OR (%1 << 16) */ \
+           : "=&r"(low), "=&r" (high) /* '&' = early clobber: outputs get registers distinct from the inputs */ \
+           : "r"(X),"r"(Y)); \
+        low; /* value of the whole macro */ \
+     })
+     
+    #define fixmul31(X,Y) /* ARM variant: signed 32x32->64 multiply, yields product bits 62..31 */ \
+     ({ /* GNU statement expression: its value is the last expression inside */ \
+        int32_t low; /* %0: low product word, later reused to assemble the result */ \
+        int32_t high; /* %1: high product word */ \
+        asm volatile (                   /* calculates: result = (X*Y)>>31 */ \
+           "smull  %0,%1,%2,%3 \n\t"     /* %1:%0 = %2 * %3, signed 64-bit product */ \
+           "mov %0, %0, lsr #31 \n\t"    /* %0 = %0 >> 31 (logical shift, leaves only the top bit of the low word) */ \
+           "orr %0, %0, %1, lsl #1 \n\t" /* result = %0 OR (%1 << 1) */ \
+           : "=&r"(low), "=&r" (high) /* '&' = early clobber: outputs get registers distinct from the inputs */ \
+           : "r"(X),"r"(Y)); \
+        low; /* value of the whole macro */ \
+     })
+     
+    #define fixmul32(X,Y) /* ARM variant: signed 32x32->64 multiply, yields the high product word */ \
+     ({ /* GNU statement expression: its value is the last expression inside */ \
+        int32_t low; /* %0: low product word; needed as an smull destination but not used afterwards */ \
+        int32_t high; /* %1: high product word, i.e. the result */ \
+        asm volatile (                   /* calculates: result = (X*Y)>>32 */ \
+           "smull  %0,%1,%2,%3 \n\t"     /* %1:%0 = %2 * %3, signed 64-bit product */ \
+           : "=&r"(low), "=&r" (high) /* '&' = early clobber: outputs get registers distinct from the inputs */ \
+           : "r"(X),"r"(Y)); \
+        high; /* value of the whole macro */ \
+     })
+#else
+    static inline int32_t fixmul16(int32_t x, int32_t y) /* C fallback used when CPU_ARM is not defined: (x*y)>>16 */
+    {
+        int64_t temp; /* 64-bit so the full 32x32 product fits */
+        temp = x; /* widen x first so the multiply below is done in 64 bits */
+        temp *= y;
+    
+        temp >>= 16; /* discard the low 16 bits of the product */
+    
+        return (int32_t)temp; /* narrow back to 32 bits */
+    }
+    
+    static inline int32_t fixmul31(int32_t x, int32_t y) /* C fallback used when CPU_ARM is not defined: (x*y)>>31 */
+    {
+        int64_t temp; /* 64-bit so the full 32x32 product fits */
+        temp = x; /* widen x first so the multiply below is done in 64 bits */
+        temp *= y;
+    
+        temp >>= 31;        // discard the low 31 bits of the 64-bit product
+    
+        return (int32_t)temp; /* narrow back to 32 bits */
+    }
+    
+    static inline int32_t fixmul32(int32_t x, int32_t y) /* C fallback used when CPU_ARM is not defined: (x*y)>>32 */
+    {
+        int64_t temp; /* 64-bit so the full 32x32 product fits */
+        temp = x; /* widen x first so the multiply below is done in 64 bits */
+        temp *= y;
+    
+        temp >>= 32;        // keep only the upper 32 bits of the 64-bit product
+    
+        return (int32_t)temp; /* narrow back to 32 bits */
+    }
+#endif

 static inline int32_t fixdiv16(int32_t x, int32_t y)
 {