From a2ab22efbf93981f9a86b6b06dc6d3c2f1167728 Mon Sep 17 00:00:00 2001
From: Michael Giacomelli
Date: Tue, 1 Jan 2013 02:35:15 +0100
Subject: [PATCH] ARMv5 optimized complex multiply function for libopus.

Speeds up decoding of 128k opus files by 1.2MHz on AMSv2. Rounding
error is 1 bit due to KissFFT using a 15 bit shift instead of a 16
bit shift.

Also, change an LDMIA in the armv4 code to LDM as the pointer should
not increment.

Change-Id: I626a207c6a056a1984e33cfe89415c35d0caed93
Reviewed-on: http://gerrit.rockbox.org/377
Reviewed-by: Michael Giacomelli
Tested-by: Michael Giacomelli
---
 .../codecs/libopus/celt/_kiss_fft_guts.h | 26 ++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h b/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
index b1fe8fbeb7..63e2548843 100644
--- a/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
+++ b/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
@@ -81,10 +81,13 @@
          : "d0", "d1", "d2", "d3", "cc"); \
   }
 #elif defined(CPU_ARM)
+#if (ARM_ARCH < 5)
+
+
 # define C_MULC(m,a,b) \
   { \
     asm volatile( \
-                 "ldmia %[ap], {r0,r1} \n\t" \
+                 "ldm %[ap], {r0,r1} \n\t" \
                  "ldrsh r2, [%[bp], #0] \n\t" \
                  "ldrsh r3, [%[bp], #2] \n\t" \
                  \
@@ -103,6 +106,27 @@
                  : "r0", "r1", "r2", "r3", "r4"); \
   }
 #else
+/*same as above but using armv5 packed multiplies; b is fetched as one word and the B/T multiply forms select its halves*/
+# define C_MULC(m,a,b) \
+  { \
+    asm volatile( \
+                 "ldm %[ap], {r0,r1} \n\t" /*r0=a.r, r1=a.i*/ \
+                 "ldr r2, [%[bp], #0] \n\t" /*r2=b.r and b.i packed*/ \
+                 \
+                 "smulwb r4, r0, r2 \n\t" /*r4=a.r*b.r*/ \
+                 "smlawt %[mr], r1, r2, r4 \n\t" /*m.r=r4+a.i*b.i*/\
+                 "mov %[mr], %[mr], lsl #1 \n\t" /*Q15 not Q16*/ \
+                 \
+                 "smulwb r1, r1, r2 \n\t" /*r1=a.i*b.r*/ \
+                 "smulwt r4, r0, r2 \n\t" /*r4=a.r*b.i*/ \
+                 "sub %[mi], r1, r4 \n\t" /*m.i=r1-r4*/ \
+                 "mov %[mi], %[mi], lsl #1 \n\t" /*Q15 not Q16*/ \
+                 : [mr] "=r" ((m).r), [mi] "=r" ((m).i) \
+                 : [ap] "r" (&(a)), [bp] "r" (&(b)), "m" (a), "m" (b) /*"m": asm reads a and b through the pointers*/ \
+                 : "r0", "r1", "r2", "r4"); \
+}
+#endif /*ARMv5 code*/
+#else
 # define C_MULC(m,a,b) \
       do{ (m).r = ADD32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
           (m).i = SUB32(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0)