forked from len0rd/rockbox
FS #6848 - fast vector operations for ARM in Tremor.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@12902 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
467651ae76
commit
25046aac17
5 changed files with 168 additions and 59 deletions
|
@ -95,6 +95,112 @@ static inline void XNPROD31(ogg_int32_t a, ogg_int32_t b,
|
||||||
*y = y1 << 1;
|
*y = y1 << 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef _V_VECT_OPS
|
||||||
|
#define _V_VECT_OPS
|
||||||
|
|
||||||
|
/* asm versions of vector operations for block.c, window.c */
|
||||||
|
static inline
|
||||||
|
void vect_add(ogg_int32_t *x, ogg_int32_t *y, int n)
|
||||||
|
{
|
||||||
|
while (n>=4) {
|
||||||
|
asm volatile ("ldmia %[x], {r0, r1, r2, r3};"
|
||||||
|
"ldmia %[y]!, {r4, r5, r6, r7};"
|
||||||
|
"add r0, r0, r4;"
|
||||||
|
"add r1, r1, r5;"
|
||||||
|
"add r2, r2, r6;"
|
||||||
|
"add r3, r3, r7;"
|
||||||
|
"stmia %[x]!, {r0, r1, r2, r3};"
|
||||||
|
: [x] "+r" (x), [y] "+r" (y)
|
||||||
|
: : "r0", "r1", "r2", "r3",
|
||||||
|
"r4", "r5", "r6", "r7",
|
||||||
|
"memory");
|
||||||
|
n -= 4;
|
||||||
|
}
|
||||||
|
/* add final elements */
|
||||||
|
while (n>0) {
|
||||||
|
*x++ += *y++;
|
||||||
|
n--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline
|
||||||
|
void vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n)
|
||||||
|
{
|
||||||
|
while (n>=4) {
|
||||||
|
asm volatile ("ldmia %[y]!, {r0, r1, r2, r3};"
|
||||||
|
"stmia %[x]!, {r0, r1, r2, r3};"
|
||||||
|
: [x] "+r" (x), [y] "+r" (y)
|
||||||
|
: : "r0", "r1", "r2", "r3",
|
||||||
|
"memory");
|
||||||
|
n -= 4;
|
||||||
|
}
|
||||||
|
/* copy final elements */
|
||||||
|
while (n>0) {
|
||||||
|
*x++ = *y++;
|
||||||
|
n--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline
|
||||||
|
void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
|
||||||
|
{
|
||||||
|
while (n>=4) {
|
||||||
|
asm volatile ("ldmia %[d], {r0, r1, r2, r3};"
|
||||||
|
"ldmia %[w]!, {r4, r5, r6, r7};"
|
||||||
|
"smull r8, r9, r0, r4;"
|
||||||
|
"mov r0, r9, lsl #1;"
|
||||||
|
"smull r8, r9, r1, r5;"
|
||||||
|
"mov r1, r9, lsl #1;"
|
||||||
|
"smull r8, r9, r2, r6;"
|
||||||
|
"mov r2, r9, lsl #1;"
|
||||||
|
"smull r8, r9, r3, r7;"
|
||||||
|
"mov r3, r9, lsl #1;"
|
||||||
|
"stmia %[d]!, {r0, r1, r2, r3};"
|
||||||
|
: [d] "+r" (data), [w] "+r" (window)
|
||||||
|
: : "r0", "r1", "r2", "r3",
|
||||||
|
"r4", "r5", "r6", "r7", "r8", "r9",
|
||||||
|
"memory", "cc");
|
||||||
|
n -= 4;
|
||||||
|
}
|
||||||
|
while(n>0) {
|
||||||
|
*data = MULT31(*data, *window);
|
||||||
|
data++;
|
||||||
|
window++;
|
||||||
|
n--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline
|
||||||
|
void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
|
||||||
|
{
|
||||||
|
while (n>=4) {
|
||||||
|
asm volatile ("ldmia %[d], {r0, r1, r2, r3};"
|
||||||
|
"ldmda %[w]!, {r4, r5, r6, r7};"
|
||||||
|
"smull r8, r9, r0, r7;"
|
||||||
|
"mov r0, r9, lsl #1;"
|
||||||
|
"smull r8, r9, r1, r6;"
|
||||||
|
"mov r1, r9, lsl #1;"
|
||||||
|
"smull r8, r9, r2, r5;"
|
||||||
|
"mov r2, r9, lsl #1;"
|
||||||
|
"smull r8, r9, r3, r4;"
|
||||||
|
"mov r3, r9, lsl #1;"
|
||||||
|
"stmia %[d]!, {r0, r1, r2, r3};"
|
||||||
|
: [d] "+r" (data), [w] "+r" (window)
|
||||||
|
: : "r0", "r1", "r2", "r3",
|
||||||
|
"r4", "r5", "r6", "r7", "r8", "r9",
|
||||||
|
"memory", "cc");
|
||||||
|
n -= 4;
|
||||||
|
}
|
||||||
|
while(n>0) {
|
||||||
|
*data = MULT31(*data, *window);
|
||||||
|
data++;
|
||||||
|
window--;
|
||||||
|
n--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef _V_CLIP_MATH
|
#ifndef _V_CLIP_MATH
|
||||||
|
|
|
@ -132,10 +132,13 @@ void XNPROD31(ogg_int32_t a, ogg_int32_t b,
|
||||||
[t] "r" (_t), [v] "r" (_v) \
|
[t] "r" (_t), [v] "r" (_v) \
|
||||||
: "cc");
|
: "cc");
|
||||||
|
|
||||||
|
#ifndef _V_VECT_OPS
|
||||||
|
#define _V_VECT_OPS
|
||||||
|
|
||||||
/* asm versions of vector operations for block.c, window.c */
|
/* asm versions of vector operations for block.c, window.c */
|
||||||
/* assumes MAC is initialized & accumulators cleared */
|
/* assumes MAC is initialized & accumulators cleared */
|
||||||
static inline
|
static inline
|
||||||
void mcf5249_vect_add(ogg_int32_t *x, ogg_int32_t *y, int n)
|
void vect_add(ogg_int32_t *x, ogg_int32_t *y, int n)
|
||||||
{
|
{
|
||||||
/* align to 16 bytes */
|
/* align to 16 bytes */
|
||||||
while(n>0 && (int)x&16) {
|
while(n>0 && (int)x&16) {
|
||||||
|
@ -169,7 +172,7 @@ void mcf5249_vect_add(ogg_int32_t *x, ogg_int32_t *y, int n)
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline
|
static inline
|
||||||
void mcf5249_vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n)
|
void vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n)
|
||||||
{
|
{
|
||||||
/* align to 16 bytes */
|
/* align to 16 bytes */
|
||||||
while(n>0 && (int)x&16) {
|
while(n>0 && (int)x&16) {
|
||||||
|
@ -196,7 +199,7 @@ void mcf5249_vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n)
|
||||||
|
|
||||||
|
|
||||||
static inline
|
static inline
|
||||||
void mcf5249_vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
|
void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
|
||||||
{
|
{
|
||||||
/* ensure data is aligned to 16-bytes */
|
/* ensure data is aligned to 16-bytes */
|
||||||
while(n>0 && (int)data%16) {
|
while(n>0 && (int)data%16) {
|
||||||
|
@ -250,7 +253,7 @@ void mcf5249_vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline
|
static inline
|
||||||
void mcf5249_vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
|
void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
|
||||||
{
|
{
|
||||||
/* ensure at least data is aligned to 16-bytes */
|
/* ensure at least data is aligned to 16-bytes */
|
||||||
while(n>0 && (int)data%16) {
|
while(n>0 && (int)data%16) {
|
||||||
|
@ -338,6 +341,8 @@ void mcf5249_vect_zero(ogg_int32_t *ptr, int n)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef _V_CLIP_MATH
|
#ifndef _V_CLIP_MATH
|
||||||
#define _V_CLIP_MATH
|
#define _V_CLIP_MATH
|
||||||
|
|
||||||
|
|
|
@ -262,11 +262,7 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){
|
||||||
vorbis_info *vi=v->vi;
|
vorbis_info *vi=v->vi;
|
||||||
codec_setup_info *ci=(codec_setup_info *)vi->codec_setup;
|
codec_setup_info *ci=(codec_setup_info *)vi->codec_setup;
|
||||||
private_state *b=v->backend_state;
|
private_state *b=v->backend_state;
|
||||||
#ifdef CPU_COLDFIRE
|
|
||||||
int j;
|
int j;
|
||||||
#else
|
|
||||||
int i,j;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if(v->pcm_current>v->pcm_returned && v->pcm_returned!=-1)return(OV_EINVAL);
|
if(v->pcm_current>v->pcm_returned && v->pcm_returned!=-1)return(OV_EINVAL);
|
||||||
|
|
||||||
|
@ -312,47 +308,25 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){
|
||||||
/* large/large */
|
/* large/large */
|
||||||
ogg_int32_t *pcm=v->pcm[j]+prevCenter;
|
ogg_int32_t *pcm=v->pcm[j]+prevCenter;
|
||||||
ogg_int32_t *p=vb->pcm[j];
|
ogg_int32_t *p=vb->pcm[j];
|
||||||
#ifdef CPU_COLDFIRE
|
vect_add(pcm, p, n1);
|
||||||
mcf5249_vect_add(pcm, p, n1);
|
|
||||||
#else
|
|
||||||
for(i=0;i<n1;i++)
|
|
||||||
pcm[i]+=p[i];
|
|
||||||
#endif
|
|
||||||
}else{
|
}else{
|
||||||
/* large/small */
|
/* large/small */
|
||||||
ogg_int32_t *pcm=v->pcm[j]+prevCenter+n1/2-n0/2;
|
ogg_int32_t *pcm=v->pcm[j]+prevCenter+n1/2-n0/2;
|
||||||
ogg_int32_t *p=vb->pcm[j];
|
ogg_int32_t *p=vb->pcm[j];
|
||||||
#ifdef CPU_COLDFIRE
|
vect_add(pcm, p, n0);
|
||||||
mcf5249_vect_add(pcm, p, n0);
|
|
||||||
#else
|
|
||||||
for(i=0;i<n0;i++)
|
|
||||||
pcm[i]+=p[i];
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}else{
|
}else{
|
||||||
if(v->W){
|
if(v->W){
|
||||||
/* small/large */
|
/* small/large */
|
||||||
ogg_int32_t *pcm=v->pcm[j]+prevCenter;
|
ogg_int32_t *pcm=v->pcm[j]+prevCenter;
|
||||||
ogg_int32_t *p=vb->pcm[j]+n1/2-n0/2;
|
ogg_int32_t *p=vb->pcm[j]+n1/2-n0/2;
|
||||||
#ifdef CPU_COLDFIRE
|
vect_add(pcm, p, n0);
|
||||||
mcf5249_vect_add(pcm, p, n0);
|
vect_copy(&pcm[n0], &p[n0], n1/2-n0/2);
|
||||||
mcf5249_vect_copy(&pcm[n0], &p[n0], n1/2-n0/2);
|
|
||||||
#else
|
|
||||||
for(i=0;i<n0;i++)
|
|
||||||
pcm[i]+=p[i];
|
|
||||||
for(;i<n1/2+n0/2;i++)
|
|
||||||
pcm[i]=p[i];
|
|
||||||
#endif
|
|
||||||
}else{
|
}else{
|
||||||
/* small/small */
|
/* small/small */
|
||||||
ogg_int32_t *pcm=v->pcm[j]+prevCenter;
|
ogg_int32_t *pcm=v->pcm[j]+prevCenter;
|
||||||
ogg_int32_t *p=vb->pcm[j];
|
ogg_int32_t *p=vb->pcm[j];
|
||||||
#ifdef CPU_COLDFIRE
|
vect_add(pcm, p, n0);
|
||||||
mcf5249_vect_add(pcm, p, n0);
|
|
||||||
#else
|
|
||||||
for(i=0;i<n0;i++)
|
|
||||||
pcm[i]+=p[i];
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -360,12 +334,7 @@ int vorbis_synthesis_blockin(vorbis_dsp_state *v,vorbis_block *vb){
|
||||||
{
|
{
|
||||||
ogg_int32_t *pcm=v->pcm[j]+thisCenter;
|
ogg_int32_t *pcm=v->pcm[j]+thisCenter;
|
||||||
ogg_int32_t *p=vb->pcm[j]+n;
|
ogg_int32_t *p=vb->pcm[j]+n;
|
||||||
#ifdef CPU_COLDFIRE
|
vect_copy(pcm, p, n);
|
||||||
mcf5249_vect_copy(pcm, p, n);
|
|
||||||
#else
|
|
||||||
for(i=0;i<n;i++)
|
|
||||||
pcm[i]=p[i];
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -151,6 +151,51 @@ static inline void XNPROD31(ogg_int32_t a, ogg_int32_t b,
|
||||||
*y = MULT31(b, t) + MULT31(a, v);
|
*y = MULT31(b, t) + MULT31(a, v);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef _V_VECT_OPS
|
||||||
|
#define _V_VECT_OPS
|
||||||
|
|
||||||
|
static inline
|
||||||
|
void vect_add(ogg_int32_t *x, ogg_int32_t *y, int n)
|
||||||
|
{
|
||||||
|
while (n>0) {
|
||||||
|
*x++ += *y++;
|
||||||
|
n--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline
|
||||||
|
void vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n)
|
||||||
|
{
|
||||||
|
while (n>0) {
|
||||||
|
*x++ = *y++;
|
||||||
|
n--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline
|
||||||
|
void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
|
||||||
|
{
|
||||||
|
while(n>0) {
|
||||||
|
*data = MULT31(*data, *window);
|
||||||
|
data++;
|
||||||
|
window++;
|
||||||
|
n--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline
|
||||||
|
void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
|
||||||
|
{
|
||||||
|
while(n>0) {
|
||||||
|
*data = MULT31(*data, *window);
|
||||||
|
data++;
|
||||||
|
window--;
|
||||||
|
n--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef _V_CLIP_MATH
|
#ifndef _V_CLIP_MATH
|
||||||
|
|
|
@ -68,27 +68,11 @@ void _vorbis_apply_window(ogg_int32_t *d,const void *window_p[2],
|
||||||
long rightbegin=n/2+n/4-rn/4;
|
long rightbegin=n/2+n/4-rn/4;
|
||||||
long rightend=rightbegin+rn/2;
|
long rightend=rightbegin+rn/2;
|
||||||
|
|
||||||
#ifdef CPU_COLDFIRE
|
|
||||||
memset((void *)&d[0], 0, sizeof(ogg_int32_t)*leftbegin);
|
memset((void *)&d[0], 0, sizeof(ogg_int32_t)*leftbegin);
|
||||||
/* mcf5249_vect_zero(&d[0], leftbegin); */
|
/* mcf5249_vect_zero(&d[0], leftbegin); */
|
||||||
mcf5249_vect_mult_fw(&d[leftbegin], &window[lW][0], leftend-leftbegin);
|
vect_mult_fw(&d[leftbegin], &window[lW][0], leftend-leftbegin);
|
||||||
mcf5249_vect_mult_bw(&d[rightbegin], &window[nW][rn/2-1], rightend-rightbegin);
|
vect_mult_bw(&d[rightbegin], &window[nW][rn/2-1], rightend-rightbegin);
|
||||||
memset((void *)&d[rightend], 0, sizeof(ogg_int32_t)*(n-rightend));
|
memset((void *)&d[rightend], 0, sizeof(ogg_int32_t)*(n-rightend));
|
||||||
/* mcf5249_vect_zero(&d[rightend], n-rightend); */
|
/* mcf5249_vect_zero(&d[rightend], n-rightend); */
|
||||||
#else
|
|
||||||
int i,p;
|
|
||||||
|
|
||||||
for(i=0;i<leftbegin;i++)
|
|
||||||
d[i]=0;
|
|
||||||
|
|
||||||
for(p=0;i<leftend;i++,p++)
|
|
||||||
d[i]=MULT31(d[i],window[lW][p]);
|
|
||||||
|
|
||||||
for(i=rightbegin,p=rn/2-1;i<rightend;i++,p--)
|
|
||||||
d[i]=MULT31(d[i],window[nW][p]);
|
|
||||||
|
|
||||||
for(;i<n;i++)
|
|
||||||
d[i]=0;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue