forked from len0rd/rockbox
Add Coldfire and ARM assembler versions of the "reverse multiply and copy" function too. This gives a big speedup on Coldfire and a small one on ARM.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15183 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
31245682b3
commit
6e4aa260d0
1 changed files with 61 additions and 4 deletions
|
@ -96,12 +96,33 @@ void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
|
||||||
"smull r8, r9, r1, r5;"
|
"smull r8, r9, r1, r5;"
|
||||||
"add r1, r4, r9, lsl #1;"
|
"add r1, r4, r9, lsl #1;"
|
||||||
"stmia %[dst]!, {r0, r1};"
|
"stmia %[dst]!, {r0, r1};"
|
||||||
"subs %[n], %[n], #2;"
|
"subs %[n], %[n], #2;"
|
||||||
"bne 0b;"
|
"bne 0b;"
|
||||||
: [d] "+r" (data), [w] "+r" (window), [dst] "+r" (dst), [n] "+r" (n)
|
: [d] "+r" (data), [w] "+r" (window), [dst] "+r" (dst), [n] "+r" (n)
|
||||||
: : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
|
: : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ARM assembler version of vector_fmul_reverse.
 *
 * Each loop pass reads two consecutive words forwards from src0 and two
 * words backwards from src1 (starting just below src1 + len), multiplies
 * them pairwise as signed 32x32->64 bit products, and stores two results to
 * dst. Each result is the high word of the product shifted left by one.
 *
 * len is counted down in steps of two and the loop exits only when it
 * reaches exactly zero, so len must be a positive multiple of two.
 *
 * NOTE(review): elements are handled as 4-byte words (lsl #2 and word-sized
 * ldm/stm transfers); presumably fixed32 is a 32-bit type - confirm.
 */
static inline
|
||||||
|
void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
|
||||||
|
int len)
|
||||||
|
{
|
||||||
|
/* Block sizes are always power of two */
|
||||||
|
asm volatile (
|
||||||
|
"add %[s1], %[s1], %[n], lsl #2;" /* s1 += len * 4 bytes: one word past the last src1 word */
|
||||||
|
"0:" /* loop head */
|
||||||
|
"ldmia %[s0]!, {r0, r1};" /* r0, r1 = next two src0 words; s0 advances by two words */
|
||||||
|
"ldmdb %[s1]!, {r4, r5};" /* s1 steps back two words; r4 = lower address, r5 = higher address */
|
||||||
|
"smull r8, r9, r0, r5;" /* r9:r8 = r0 * r5, signed 64-bit product */
|
||||||
|
"mov r0, r9, lsl #1;" /* first result = high word << 1 */
|
||||||
|
"smull r8, r9, r1, r4;" /* r9:r8 = r1 * r4, signed 64-bit product */
|
||||||
|
"mov r1, r9, lsl #1;" /* second result = high word << 1 */
|
||||||
|
"stmia %[dst]!, {r0, r1};" /* store both results; dst advances by two words */
|
||||||
|
"subs %[n], %[n], #2;" /* two elements consumed; updates the flags tested below */
|
||||||
|
"bne 0b;" /* repeat until the counter is exactly zero */
|
||||||
|
: [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len) /* all four operands are read and written */
|
||||||
|
: : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc"); /* scratch registers used above, plus memory and flags */
|
||||||
|
}
|
||||||
|
|
||||||
#elif defined(CPU_COLDFIRE)
|
#elif defined(CPU_COLDFIRE)
|
||||||
|
|
||||||
static inline
|
static inline
|
||||||
|
@ -118,8 +139,8 @@ void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
|
||||||
"mac.l %%d1, %%d5, %%acc1;"
|
"mac.l %%d1, %%d5, %%acc1;"
|
||||||
"mac.l %%d2, %%a0, %%acc2;"
|
"mac.l %%d2, %%a0, %%acc2;"
|
||||||
"mac.l %%d3, %%a1, %%acc3;"
|
"mac.l %%d3, %%a1, %%acc3;"
|
||||||
"lea.l (%[d], 16), %[d];"
|
"lea.l (16, %[d]), %[d];"
|
||||||
"lea.l (%[w], 16), %[w];"
|
"lea.l (16, %[w]), %[w];"
|
||||||
"movclr.l %%acc0, %%d0;"
|
"movclr.l %%acc0, %%d0;"
|
||||||
"movclr.l %%acc1, %%d1;"
|
"movclr.l %%acc1, %%d1;"
|
||||||
"movclr.l %%acc2, %%d2;"
|
"movclr.l %%acc2, %%d2;"
|
||||||
|
@ -134,6 +155,35 @@ void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
|
||||||
: : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
|
: : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Coldfire assembler version of vector_fmul_reverse.
 *
 * Each loop pass loads four longwords from src0 and the four longwords at
 * the current tail position of src1, multiply-accumulates them pairwise in
 * opposite order (first src0 longword with last src1 longword of the group)
 * into acc0-acc3, reads the accumulators back and stores four results to dst.
 * src0 and dst then move forwards by 16 bytes and src1 backwards by 16 bytes.
 *
 * len is counted down in steps of four and the loop exits only when it
 * reaches exactly zero, so len must be a positive multiple of four.
 *
 * NOTE(review): mac.l adds to the accumulators, so acc0-acc3 are presumably
 * expected to be clear on entry; the scaling of the result depends on the
 * MAC unit mode, which is not set here - confirm against the callers.
 */
static inline
|
||||||
|
void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
|
||||||
|
int len)
|
||||||
|
{
|
||||||
|
/* Block sizes are always power of two. Smallest block is always way bigger
|
||||||
|
* than four too.*/
|
||||||
|
asm volatile (
|
||||||
|
"lea.l (-16, %[s1], %[n]*4), %[s1];" /* s1 = src1 + len * 4 - 16 bytes: the last group of four longwords */
|
||||||
|
"0:" /* loop head */
|
||||||
|
"movem.l (%[s0]), %%d0-%%d3;" /* d0..d3 = four longwords at s0 */
|
||||||
|
"movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;" /* d4, d5, a0, a1 = four longwords at s1, in ascending address order */
|
||||||
|
"mac.l %%d0, %%a1, %%acc0;" /* acc0 += 1st src0 longword * 4th src1 longword */
|
||||||
|
"mac.l %%d1, %%a0, %%acc1;" /* acc1 += 2nd src0 longword * 3rd src1 longword */
|
||||||
|
"mac.l %%d2, %%d5, %%acc2;" /* acc2 += 3rd src0 longword * 2nd src1 longword */
|
||||||
|
"mac.l %%d3, %%d4, %%acc3;" /* acc3 += 4th src0 longword * 1st src1 longword */
|
||||||
|
"lea.l (16, %[s0]), %[s0];" /* s0 forwards by four longwords */
|
||||||
|
"lea.l (-16, %[s1]), %[s1];" /* s1 backwards by four longwords */
|
||||||
|
"movclr.l %%acc0, %%d0;" /* d0 = acc0, and acc0 is cleared for the next pass */
|
||||||
|
"movclr.l %%acc1, %%d1;" /* d1 = acc1, and acc1 is cleared */
|
||||||
|
"movclr.l %%acc2, %%d2;" /* d2 = acc2, and acc2 is cleared */
|
||||||
|
"movclr.l %%acc3, %%d3;" /* d3 = acc3, and acc3 is cleared */
|
||||||
|
"movem.l %%d0-%%d3, (%[dst]);" /* store the four results */
|
||||||
|
"lea.l (16, %[dst]), %[dst];" /* dst forwards by four longwords */
|
||||||
|
"subq.l #4, %[n];" /* four elements consumed; updates the flags tested below */
|
||||||
|
"jne 0b;" /* repeat until the counter is exactly zero */
|
||||||
|
: [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len) /* pointers in address registers, counter in a data register; all read and written */
|
||||||
|
: : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc"); /* scratch registers used above, plus memory and flags */
|
||||||
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
|
static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len){
|
||||||
|
@ -142,6 +192,13 @@ static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const
|
||||||
dst[i] = fixmul32b(src0[i], src1[i]) + dst[i];
|
dst[i] = fixmul32b(src0[i], src1[i]) + dst[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Portable C fallback for vector_fmul_reverse.
 *
 * Multiplies src0, read front to back, by src1, read back to front:
 * dst[k] = fixmul32b(src0[k], src1[len - 1 - k]) for every k in [0, len).
 * Nothing is written when len is not positive.
 *
 * @param dst   output array, at least len elements
 * @param src0  first factor, read in ascending order
 * @param src1  second factor, read in descending order
 * @param len   number of elements to process
 */
static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len)
{
    /* Anchor on the final element of src1 and index backwards from it. */
    const fixed32 *last = src1 + (len - 1);
    int k = 0;

    while (k < len) {
        dst[k] = fixmul32b(src0[k], *(last - k));
        ++k;
    }
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* TODO: Adapt the above to work with this */
|
/* TODO: Adapt the above to work with this */
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue