1
0
Fork 0
forked from len0rd/rockbox

Patch #5219 by Antonius Hellmann. Several optimisations to libmad. Both Coldfire and ARM targets should benefit greatly from this.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@9821 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Thom Johansen 2006-04-27 19:52:09 +00:00
parent 103ebf77ce
commit f004315105
4 changed files with 1292 additions and 1098 deletions

View file

@ -87,9 +87,8 @@ unsigned short const crc_table[256] = {
*/ */
void mad_bit_init(struct mad_bitptr *bitptr, unsigned char const *byte) void mad_bit_init(struct mad_bitptr *bitptr, unsigned char const *byte)
{ {
bitptr->byte = byte; bitptr->ptr = (unsigned long*)((long)byte & ~3);
bitptr->cache = 0; bitptr->readbit = ((unsigned long)byte & 3) << 3;
bitptr->left = CHAR_BIT;
} }
/* /*
@ -99,17 +98,20 @@ void mad_bit_init(struct mad_bitptr *bitptr, unsigned char const *byte)
unsigned int mad_bit_length(struct mad_bitptr const *begin, unsigned int mad_bit_length(struct mad_bitptr const *begin,
struct mad_bitptr const *end) struct mad_bitptr const *end)
{ {
return begin->left + return end->readbit - begin->readbit;
CHAR_BIT * (end->byte - (begin->byte + 1)) + (CHAR_BIT - end->left);
} }
unsigned char mad_bit_bitsleft(struct mad_bitptr const *bitptr)
{
return 8 - (bitptr->readbit & 7);
}
/* /*
* NAME: bit->nextbyte() * NAME: bit->nextbyte()
* DESCRIPTION: return pointer to next unprocessed byte * DESCRIPTION: return pointer to next unprocessed byte
*/ */
unsigned char const *mad_bit_nextbyte(struct mad_bitptr const *bitptr) unsigned char const *mad_bit_nextbyte(struct mad_bitptr const *bitptr)
{ {
return bitptr->left == CHAR_BIT ? bitptr->byte : bitptr->byte + 1; return (unsigned char const*)bitptr->ptr + ((bitptr->readbit + 7) >> 3);
} }
/* /*
@ -118,60 +120,43 @@ unsigned char const *mad_bit_nextbyte(struct mad_bitptr const *bitptr)
*/ */
void mad_bit_skip(struct mad_bitptr *bitptr, unsigned int len) void mad_bit_skip(struct mad_bitptr *bitptr, unsigned int len)
{ {
bitptr->byte += len / CHAR_BIT; bitptr->readbit += len;
bitptr->left -= len % CHAR_BIT;
if (bitptr->left > CHAR_BIT) {
bitptr->byte++;
bitptr->left += CHAR_BIT;
}
if (bitptr->left < CHAR_BIT)
bitptr->cache = *bitptr->byte;
} }
/* /*
* NAME: bit->read() * NAME: bit->read()
* DESCRIPTION: read an arbitrary number of bits and return their UIMSBF value * DESCRIPTION: read an arbitrary number of bits and return their UIMSBF value
*/ */
unsigned long bmask[] ICONST_ATTR =
{ 0x00000000, 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f,
0x0000003f, 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff,
0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff,
0x0003ffff, 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff,
0x00ffffff, 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff,
0x3fffffff, 0x7fffffff, 0xffffffff };
unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) ICODE_ATTR;
unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len)
{ {
register unsigned long value; unsigned long *curr = &bitptr->ptr[bitptr->readbit>>5];
if (bitptr->left == CHAR_BIT) if(len)
bitptr->cache = *bitptr->byte; {
if((bitptr->readbit ^ (bitptr->readbit + len - 1)) < 32)
{
bitptr->readbit += len;
if (len < bitptr->left) { return (betoh32(curr[0]) >> (-bitptr->readbit & 31)) & bmask[len];
value = (bitptr->cache & ((1 << bitptr->left) - 1)) >> }
(bitptr->left - len); else
bitptr->left -= len; {
bitptr->readbit += len;
return value; return ((betoh32(curr[0]) << ( bitptr->readbit & 31))
+ (betoh32(curr[1]) >> (-bitptr->readbit & 31))) & bmask[len];
}
} }
/* remaining bits in current byte */ return 0;
value = bitptr->cache & ((1 << bitptr->left) - 1);
len -= bitptr->left;
bitptr->byte++;
bitptr->left = CHAR_BIT;
/* more bytes */
while (len >= CHAR_BIT) {
value = (value << CHAR_BIT) | *bitptr->byte++;
len -= CHAR_BIT;
}
if (len > 0) {
bitptr->cache = *bitptr->byte;
value = (value << len) | (bitptr->cache >> (CHAR_BIT - len));
bitptr->left -= len;
}
return value;
} }
# if 0 # if 0

View file

@ -23,9 +23,8 @@
# define LIBMAD_BIT_H # define LIBMAD_BIT_H
struct mad_bitptr { struct mad_bitptr {
unsigned char const *byte; unsigned long *ptr;
unsigned short cache; unsigned long readbit;
unsigned short left;
}; };
void mad_bit_init(struct mad_bitptr *, unsigned char const *); void mad_bit_init(struct mad_bitptr *, unsigned char const *);
@ -35,7 +34,7 @@ void mad_bit_init(struct mad_bitptr *, unsigned char const *);
unsigned int mad_bit_length(struct mad_bitptr const *, unsigned int mad_bit_length(struct mad_bitptr const *,
struct mad_bitptr const *); struct mad_bitptr const *);
# define mad_bit_bitsleft(bitptr) ((bitptr)->left) unsigned char mad_bit_bitsleft(struct mad_bitptr const *bitptr);
unsigned char const *mad_bit_nextbyte(struct mad_bitptr const *); unsigned char const *mad_bit_nextbyte(struct mad_bitptr const *);
void mad_bit_skip(struct mad_bitptr *, unsigned int); void mad_bit_skip(struct mad_bitptr *, unsigned int);

File diff suppressed because it is too large Load diff

View file

@ -580,35 +580,138 @@ static
void synth_full(struct mad_synth *synth, struct mad_frame const *frame, void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
unsigned int nch, unsigned int ns) unsigned int nch, unsigned int ns)
{ {
unsigned int phase, ch, s, sb, pe, po; unsigned int phase, ch, s, sb, p;
mad_fixed_t *pcm1, *pcm2, (*filter)[2][2][16][8]; mad_fixed_t *pcm, (*filter)[2][2][16][8];
mad_fixed_t const (*sbsample)[36][32]; mad_fixed_t const (*sbsample)[36][32];
mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8]; mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8];
mad_fixed_t const (*Dptr)[32]; mad_fixed_t const (*D0ptr)[32];
mad_fixed64hi_t hi; mad_fixed_t const (*D1ptr)[32];
mad_fixed64hi_t hi0, hi1;
for (ch = 0; ch < nch; ++ch) { for (ch = 0; ch < nch; ++ch) {
sbsample = &frame->sbsample[ch]; sbsample = &frame->sbsample[ch];
filter = &synth->filter[ch]; filter = &synth->filter[ch];
phase = synth->phase; phase = synth->phase;
pcm1 = synth->pcm.samples[ch]; pcm = synth->pcm.samples[ch];
for (s = 0; s < ns; ++s) { for (s = 0; s < ns; ++s) {
dct32((*sbsample)[s], phase >> 1, dct32((*sbsample)[s], phase >> 1,
(*filter)[0][phase & 1], (*filter)[1][phase & 1]); (*filter)[0][phase & 1], (*filter)[1][phase & 1]);
pe = phase & ~1; p = (phase - 1) & 0xf;
po = ((phase - 1) & 0xf) | 1;
/* calculate 32 samples */ /* calculate 32 samples */
fe = &(*filter)[0][ phase & 1][0]; fe = &(*filter)[0][ phase & 1][0];
fx = &(*filter)[0][~phase & 1][0]; fx = &(*filter)[0][~phase & 1][0];
fo = &(*filter)[1][~phase & 1][0]; fo = &(*filter)[1][~phase & 1][0];
Dptr = &D[0]; D0ptr = (void*)&D[0][ p];
D1ptr = (void*)&D[0][-p];
asm volatile( if(s & 1)
{
asm volatile(
"movem.l (%1), %%d0-%%d7\n\t"
"move.l 4(%2), %%a5\n\t"
"msac.l %%d0, %%a5, 60(%2), %%a5, %%acc0\n\t"
"msac.l %%d1, %%a5, 52(%2), %%a5, %%acc0\n\t"
"msac.l %%d2, %%a5, 44(%2), %%a5, %%acc0\n\t"
"msac.l %%d3, %%a5, 36(%2), %%a5, %%acc0\n\t"
"msac.l %%d4, %%a5, 28(%2), %%a5, %%acc0\n\t"
"msac.l %%d5, %%a5, 20(%2), %%a5, %%acc0\n\t"
"msac.l %%d6, %%a5, 12(%2), %%a5, %%acc0\n\t"
"msac.l %%d7, %%a5, (%2), %%a5, %%acc0\n\t"
"movem.l (%3), %%d0-%%d7\n\t"
"mac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t"
"mac.l %%d1, %%a5, 48(%2), %%a5, %%acc0\n\t"
"mac.l %%d2, %%a5, 40(%2), %%a5, %%acc0\n\t"
"mac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t"
"mac.l %%d4, %%a5, 24(%2), %%a5, %%acc0\n\t"
"mac.l %%d5, %%a5, 16(%2), %%a5, %%acc0\n\t"
"mac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t"
"mac.l %%d7, %%a5, %%acc0\n\t"
"movclr.l %%acc0, %0\n\t"
: "=r" (hi0) : "a" (*fx), "a" (*D0ptr), "a" (*fe)
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5");
pcm[0] = hi0 << 3; /* shift result to libmad's fixed point format */
pcm += 16;
for (sb = 15; sb; sb--, fo++) {
++fe;
++D0ptr;
++D1ptr;
/* D[32 - sb][i] == -D[sb][31 - i] */
asm volatile (
"movem.l (%0), %%d0-%%d7\n\t"
"move.l 4(%2), %%a5\n\t"
"msac.l %%d0, %%a5, 60(%2), %%a5, %%acc0\n\t"
"msac.l %%d1, %%a5, 52(%2), %%a5, %%acc0\n\t"
"msac.l %%d2, %%a5, 44(%2), %%a5, %%acc0\n\t"
"msac.l %%d3, %%a5, 36(%2), %%a5, %%acc0\n\t"
"msac.l %%d4, %%a5, 28(%2), %%a5, %%acc0\n\t"
"msac.l %%d5, %%a5, 20(%2), %%a5, %%acc0\n\t"
"msac.l %%d6, %%a5, 12(%2), %%a5, %%acc0\n\t"
"msac.l %%d7, %%a5, 112(%3), %%a5, %%acc0\n\t"
"mac.l %%d7, %%a5, 104(%3), %%a5, %%acc1\n\t"
"mac.l %%d6, %%a5, 96(%3), %%a5, %%acc1\n\t"
"mac.l %%d5, %%a5, 88(%3), %%a5, %%acc1\n\t"
"mac.l %%d4, %%a5, 80(%3), %%a5, %%acc1\n\t"
"mac.l %%d3, %%a5, 72(%3), %%a5, %%acc1\n\t"
"mac.l %%d2, %%a5, 64(%3), %%a5, %%acc1\n\t"
"mac.l %%d1, %%a5, 120(%3), %%a5, %%acc1\n\t"
"mac.l %%d0, %%a5, 8(%2), %%a5, %%acc1\n\t"
"movem.l (%1), %%d0-%%d7\n\t"
"mac.l %%d7, %%a5, 16(%2), %%a5, %%acc0\n\t"
"mac.l %%d6, %%a5, 24(%2), %%a5, %%acc0\n\t"
"mac.l %%d5, %%a5, 32(%2), %%a5, %%acc0\n\t"
"mac.l %%d4, %%a5, 40(%2), %%a5, %%acc0\n\t"
"mac.l %%d3, %%a5, 48(%2), %%a5, %%acc0\n\t"
"mac.l %%d2, %%a5, 56(%2), %%a5, %%acc0\n\t"
"mac.l %%d1, %%a5, (%2), %%a5, %%acc0\n\t"
"mac.l %%d0, %%a5, 60(%3), %%a5, %%acc0\n\t"
"mac.l %%d0, %%a5, 68(%3), %%a5, %%acc1\n\t"
"mac.l %%d1, %%a5, 76(%3), %%a5, %%acc1\n\t"
"mac.l %%d2, %%a5, 84(%3), %%a5, %%acc1\n\t"
"mac.l %%d3, %%a5, 92(%3), %%a5, %%acc1\n\t"
"mac.l %%d4, %%a5, 100(%3), %%a5, %%acc1\n\t"
"mac.l %%d5, %%a5, 108(%3), %%a5, %%acc1\n\t"
"mac.l %%d6, %%a5, 116(%3), %%a5, %%acc1\n\t"
"mac.l %%d7, %%a5, %%acc1\n\t"
: : "a" (*fo), "a" (*fe), "a" (*D0ptr), "a" (*D1ptr)
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5");
asm volatile(
"movclr.l %%acc0, %0\n\t"
"movclr.l %%acc1, %1\n\t" : "=d" (hi0), "=d" (hi1) );
pcm[-sb] = hi0 << 3;
pcm[ sb] = hi1 << 3;
}
++D0ptr;
asm volatile(
"movem.l (%1), %%d0-%%d7\n\t"
"move.l 4(%2), %%a5\n\t"
"mac.l %%d0, %%a5, 60(%2), %%a5, %%acc0\n\t"
"mac.l %%d1, %%a5, 52(%2), %%a5, %%acc0\n\t"
"mac.l %%d2, %%a5, 44(%2), %%a5, %%acc0\n\t"
"mac.l %%d3, %%a5, 36(%2), %%a5, %%acc0\n\t"
"mac.l %%d4, %%a5, 28(%2), %%a5, %%acc0\n\t"
"mac.l %%d5, %%a5, 20(%2), %%a5, %%acc0\n\t"
"mac.l %%d6, %%a5, 12(%2), %%a5, %%acc0\n\t"
"mac.l %%d7, %%a5, %%acc0\n\t"
"movclr.l %%acc0, %0\n\t"
: "=r" (hi0) : "a" (*fo), "a" (*D0ptr)
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5");
pcm[0] = -(hi0 << 3);
}
else
{
asm volatile(
"movem.l (%1), %%d0-%%d7\n\t" "movem.l (%1), %%d0-%%d7\n\t"
"move.l (%2), %%a5\n\t" "move.l (%2), %%a5\n\t"
"msac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t" "msac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t"
@ -617,127 +720,80 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
"msac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t" "msac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t"
"msac.l %%d4, %%a5, 24(%2), %%a5, %%acc0\n\t" "msac.l %%d4, %%a5, 24(%2), %%a5, %%acc0\n\t"
"msac.l %%d5, %%a5, 16(%2), %%a5, %%acc0\n\t" "msac.l %%d5, %%a5, 16(%2), %%a5, %%acc0\n\t"
"msac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t" "msac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t"
"msac.l %%d7, %%a5, (%4), %%a5, %%acc0\n\t" "msac.l %%d7, %%a5, 4(%2), %%a5, %%acc0\n\t"
"movem.l (%3), %%d0-%%d7\n\t" "movem.l (%3), %%d0-%%d7\n\t"
"mac.l %%d0, %%a5, 56(%4), %%a5, %%acc0\n\t" "mac.l %%d0, %%a5, 60(%2), %%a5, %%acc0\n\t"
"mac.l %%d1, %%a5, 48(%4), %%a5, %%acc0\n\t" "mac.l %%d1, %%a5, 52(%2), %%a5, %%acc0\n\t"
"mac.l %%d2, %%a5, 40(%4), %%a5, %%acc0\n\t" "mac.l %%d2, %%a5, 44(%2), %%a5, %%acc0\n\t"
"mac.l %%d3, %%a5, 32(%4), %%a5, %%acc0\n\t" "mac.l %%d3, %%a5, 36(%2), %%a5, %%acc0\n\t"
"mac.l %%d4, %%a5, 24(%4), %%a5, %%acc0\n\t" "mac.l %%d4, %%a5, 28(%2), %%a5, %%acc0\n\t"
"mac.l %%d5, %%a5, 16(%4), %%a5, %%acc0\n\t" "mac.l %%d5, %%a5, 20(%2), %%a5, %%acc0\n\t"
"mac.l %%d6, %%a5, 8(%4), %%a5, %%acc0\n\t" "mac.l %%d6, %%a5, 12(%2), %%a5, %%acc0\n\t"
"mac.l %%d7, %%a5, %%acc0\n\t" "mac.l %%d7, %%a5, %%acc0\n\t"
"movclr.l %%acc0, %0\n\t" "movclr.l %%acc0, %0\n\t"
: "=r" (hi) : "=r" (hi0) : "a" (*fx), "a" (*D0ptr), "a" (*fe)
: "a" (*fx), "a" (*Dptr + po), "a" (*fe), "a" (*Dptr + pe)
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5"); : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5");
*pcm1++ = hi << 3; /* shift result to libmad's fixed point format */ pcm[0] = hi0 << 3; /* shift result to libmad's fixed point format */
pcm += 16;
pcm2 = pcm1 + 30; for (sb = 15; sb; sb--, fo++) {
++fe;
++D0ptr;
++D1ptr;
for (sb = 1; sb < 16; ++sb) { /* D[32 - sb][i] == -D[sb][31 - i] */
++fe; asm volatile (
++Dptr; "movem.l (%0), %%d0-%%d7\n\t"
/* D[32 - sb][i] == -D[sb][31 - i] */
#if __GNUC__ >= 4
/* GCC 4.0.1 can't find a suitable register here if all of d0-d7
* are clobbered, so use fewer registers. It does mean two extra
* movem instructions, but should have no additional performance
* impact (like not being able to use burst mode for the movem).
*/
asm volatile (
"movem.l (%1), %%d0-%%d3\n\t"
"move.l (%2), %%a5\n\t" "move.l (%2), %%a5\n\t"
"msac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t" "msac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t"
"msac.l %%d1, %%a5, 48(%2), %%a5, %%acc0\n\t" "msac.l %%d1, %%a5, 48(%2), %%a5, %%acc0\n\t"
"msac.l %%d2, %%a5, 40(%2), %%a5, %%acc0\n\t" "msac.l %%d2, %%a5, 40(%2), %%a5, %%acc0\n\t"
"msac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t" "msac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t"
"movem.l 16(%1), %%d0-%%d3\n\t" "msac.l %%d4, %%a5, 24(%2), %%a5, %%acc0\n\t"
"msac.l %%d0, %%a5, 24(%2), %%a5, %%acc0\n\t" "msac.l %%d5, %%a5, 16(%2), %%a5, %%acc0\n\t"
"msac.l %%d1, %%a5, 16(%2), %%a5, %%acc0\n\t" "msac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t"
"msac.l %%d2, %%a5, 8(%2), %%a5, %%acc0\n\t" "msac.l %%d7, %%a5, 116(%3), %%a5, %%acc0\n\t"
"msac.l %%d3, %%a5, 8(%4), %%a5, %%acc0\n\t" "mac.l %%d7, %%a5, 108(%3), %%a5, %%acc1\n\t"
"mac.l %%d6, %%a5, 100(%3), %%a5, %%acc1\n\t"
"movem.l 16(%3), %%d0-%%d3\n\t" "mac.l %%d5, %%a5, 92(%3), %%a5, %%acc1\n\t"
"mac.l %%d3, %%a5, 16(%4), %%a5, %%acc0\n\t" "mac.l %%d4, %%a5, 84(%3), %%a5, %%acc1\n\t"
"mac.l %%d2, %%a5, 24(%4), %%a5, %%acc0\n\t" "mac.l %%d3, %%a5, 76(%3), %%a5, %%acc1\n\t"
"mac.l %%d1, %%a5, 32(%4), %%a5, %%acc0\n\t" "mac.l %%d2, %%a5, 68(%3), %%a5, %%acc1\n\t"
"mac.l %%d0, %%a5, 40(%4), %%a5, %%acc0\n\t" "mac.l %%d1, %%a5, 60(%3), %%a5, %%acc1\n\t"
"movem.l (%3), %%d0-%%d3\n\t" "mac.l %%d0, %%a5, 12(%2), %%a5, %%acc1\n\t"
"mac.l %%d3, %%a5, 48(%4), %%a5, %%acc0\n\t"
"mac.l %%d2, %%a5, 56(%4), %%a5, %%acc0\n\t"
"mac.l %%d1, %%a5, (%4), %%a5, %%acc0\n\t"
"mac.l %%d0, %%a5, %%acc0\n\t"
"movclr.l %%acc0, %0\n\t"
: "=r" (hi)
: "a" (*fo), "a" (*Dptr + po), "a" (*fe), "a" (*Dptr + pe)
: "d0", "d1", "d2", "d3", "a5");
#else
asm volatile (
"movem.l (%1), %%d0-%%d7\n\t" "movem.l (%1), %%d0-%%d7\n\t"
"move.l (%2), %%a5\n\t" "mac.l %%d7, %%a5, 20(%2), %%a5, %%acc0\n\t"
"msac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t" "mac.l %%d6, %%a5, 28(%2), %%a5, %%acc0\n\t"
"msac.l %%d1, %%a5, 48(%2), %%a5, %%acc0\n\t" "mac.l %%d5, %%a5, 36(%2), %%a5, %%acc0\n\t"
"msac.l %%d2, %%a5, 40(%2), %%a5, %%acc0\n\t" "mac.l %%d4, %%a5, 44(%2), %%a5, %%acc0\n\t"
"msac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t" "mac.l %%d3, %%a5, 52(%2), %%a5, %%acc0\n\t"
"msac.l %%d4, %%a5, 24(%2), %%a5, %%acc0\n\t" "mac.l %%d2, %%a5, 60(%2), %%a5, %%acc0\n\t"
"msac.l %%d5, %%a5, 16(%2), %%a5, %%acc0\n\t" "mac.l %%d1, %%a5, 4(%2), %%a5, %%acc0\n\t"
"msac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t" "mac.l %%d0, %%a5, 120(%3), %%a5, %%acc0\n\t"
"msac.l %%d7, %%a5, 8(%4), %%a5, %%acc0\n\t" "mac.l %%d0, %%a5, 64(%3), %%a5, %%acc1\n\t"
"mac.l %%d1, %%a5, 72(%3), %%a5, %%acc1\n\t"
"movem.l (%3), %%d0-%%d7\n\t" "mac.l %%d2, %%a5, 80(%3), %%a5, %%acc1\n\t"
"mac.l %%d7, %%a5, 16(%4), %%a5, %%acc0\n\t" "mac.l %%d3, %%a5, 88(%3), %%a5, %%acc1\n\t"
"mac.l %%d6, %%a5, 24(%4), %%a5, %%acc0\n\t" "mac.l %%d4, %%a5, 96(%3), %%a5, %%acc1\n\t"
"mac.l %%d5, %%a5, 32(%4), %%a5, %%acc0\n\t" "mac.l %%d5, %%a5, 104(%3), %%a5, %%acc1\n\t"
"mac.l %%d4, %%a5, 40(%4), %%a5, %%acc0\n\t" "mac.l %%d6, %%a5, 112(%3), %%a5, %%acc1\n\t"
"mac.l %%d3, %%a5, 48(%4), %%a5, %%acc0\n\t" "mac.l %%d7, %%a5, %%acc1\n\t"
"mac.l %%d2, %%a5, 56(%4), %%a5, %%acc0\n\t" : : "a" (*fo), "a" (*fe), "a" (*D0ptr), "a" (*D1ptr)
"mac.l %%d1, %%a5, (%4), %%a5, %%acc0\n\t"
"mac.l %%d0, %%a5, %%acc0\n\t"
"movclr.l %%acc0, %0\n\t"
: "=r" (hi)
: "a" (*fo), "a" (*Dptr + po), "a" (*fe), "a" (*Dptr + pe)
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5"); : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5");
#endif
*pcm1++ = hi << 3; asm volatile(
"movclr.l %%acc0, %0\n\t"
"movclr.l %%acc1, %1\n\t" : "=d" (hi0), "=d" (hi1) );
pcm[-sb] = hi0 << 3;
pcm[ sb] = hi1 << 3;
}
++D0ptr;
asm volatile( asm volatile(
"movem.l (%1), %%d0-%%d7\n\t"
"move.l 60(%2), %%a5\n\t"
"mac.l %%d0, %%a5, 68(%2), %%a5, %%acc0\n\t"
"mac.l %%d1, %%a5, 76(%2), %%a5, %%acc0\n\t"
"mac.l %%d2, %%a5, 84(%2), %%a5, %%acc0\n\t"
"mac.l %%d3, %%a5, 92(%2), %%a5, %%acc0\n\t"
"mac.l %%d4, %%a5, 100(%2), %%a5, %%acc0\n\t"
"mac.l %%d5, %%a5, 108(%2), %%a5, %%acc0\n\t"
"mac.l %%d6, %%a5, 116(%2), %%a5, %%acc0\n\t"
"mac.l %%d7, %%a5, 116(%4), %%a5, %%acc0\n\t"
"movem.l (%3), %%d0-%%d7\n\t"
"mac.l %%d7, %%a5, 108(%4), %%a5, %%acc0\n\t"
"mac.l %%d6, %%a5, 100(%4), %%a5, %%acc0\n\t"
"mac.l %%d5, %%a5, 92(%4), %%a5, %%acc0\n\t"
"mac.l %%d4, %%a5, 84(%4), %%a5, %%acc0\n\t"
"mac.l %%d3, %%a5, 76(%4), %%a5, %%acc0\n\t"
"mac.l %%d2, %%a5, 68(%4), %%a5, %%acc0\n\t"
"mac.l %%d1, %%a5, 60(%4), %%a5, %%acc0\n\t"
"mac.l %%d0, %%a5, %%acc0\n\t"
"movclr.l %%acc0, %0\n\t"
: "=r" (hi)
: "a" (*fe), "a" (*Dptr - pe), "a" (*fo), "a" (*Dptr - po)
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5");
*pcm2-- = hi << 3;
++fo;
}
++Dptr;
asm volatile(
"movem.l (%1), %%d0-%%d7\n\t" "movem.l (%1), %%d0-%%d7\n\t"
"move.l (%2), %%a5\n\t" "move.l (%2), %%a5\n\t"
"mac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t" "mac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t"
@ -746,15 +802,15 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
"mac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t" "mac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t"
"mac.l %%d4, %%a5, 24(%2), %%a5, %%acc0\n\t" "mac.l %%d4, %%a5, 24(%2), %%a5, %%acc0\n\t"
"mac.l %%d5, %%a5, 16(%2), %%a5, %%acc0\n\t" "mac.l %%d5, %%a5, 16(%2), %%a5, %%acc0\n\t"
"mac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t" "mac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t"
"mac.l %%d7, %%a5, %%acc0\n\t" "mac.l %%d7, %%a5, %%acc0\n\t"
"movclr.l %%acc0, %0\n\t" "movclr.l %%acc0, %0\n\t"
: "=r" (hi) : "a" (*fo), "a" (*Dptr + po) : "=r" (hi0) : "a" (*fo), "a" (*D0ptr)
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5"); : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5");
*pcm1 = -(hi << 3); pcm[0] = -(hi0 << 3);
pcm1 += 16; }
pcm += 16;
phase = (phase + 1) % 16; phase = (phase + 1) % 16;
} }
} }
@ -766,129 +822,200 @@ static
void synth_full(struct mad_synth *synth, struct mad_frame const *frame, void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
unsigned int nch, unsigned int ns) unsigned int nch, unsigned int ns)
{ {
unsigned int phase, ch, s, sb, pe, po; int p;
mad_fixed_t *pcm1, *pcm2, (*filter)[2][2][16][8]; unsigned int phase, ch, s, sb;
mad_fixed_t *pcm, (*filter)[2][2][16][8];
mad_fixed_t const (*sbsample)[36][32]; mad_fixed_t const (*sbsample)[36][32];
register mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8]; mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8];
register mad_fixed_t const (*Dptr)[32], *ptr; mad_fixed_t const (*D0ptr)[32], *ptr;
register mad_fixed64hi_t hi; mad_fixed_t const (*D1ptr)[32];
register mad_fixed64lo_t lo; mad_fixed64hi_t hi;
mad_fixed64lo_t lo;
for (ch = 0; ch < nch; ++ch) { for (ch = 0; ch < nch; ++ch) {
sbsample = &frame->sbsample[ch]; sbsample = &frame->sbsample[ch];
filter = &synth->filter[ch]; filter = &synth->filter[ch];
phase = synth->phase; phase = synth->phase;
pcm1 = synth->pcm.samples[ch]; pcm = synth->pcm.samples[ch];
for (s = 0; s < ns; ++s) { for (s = 0; s < ns; ++s) {
dct32((*sbsample)[s], phase >> 1, dct32((*sbsample)[s], phase >> 1,
(*filter)[0][phase & 1], (*filter)[1][phase & 1]); (*filter)[0][phase & 1], (*filter)[1][phase & 1]);
pe = phase & ~1; p = (phase - 1) & 0xf;
po = ((phase - 1) & 0xf) | 1;
/* calculate 32 samples */ /* calculate 32 samples */
fe = &(*filter)[0][ phase & 1][0]; fe = &(*filter)[0][ phase & 1][0];
fx = &(*filter)[0][~phase & 1][0]; fx = &(*filter)[0][~phase & 1][0];
fo = &(*filter)[1][~phase & 1][0]; fo = &(*filter)[1][~phase & 1][0];
Dptr = &D[0]; D0ptr = (void*)&D[0][ p];
D1ptr = (void*)&D[0][-p];
ptr = *Dptr + po; if(s & 1)
ML0(hi, lo, (*fx)[0], ptr[ 0]); {
MLA(hi, lo, (*fx)[1], ptr[14]); ptr = *D0ptr;
MLA(hi, lo, (*fx)[2], ptr[12]); ML0(hi, lo, (*fx)[0], ptr[ 1]);
MLA(hi, lo, (*fx)[3], ptr[10]); MLA(hi, lo, (*fx)[1], ptr[15]);
MLA(hi, lo, (*fx)[4], ptr[ 8]); MLA(hi, lo, (*fx)[2], ptr[13]);
MLA(hi, lo, (*fx)[5], ptr[ 6]); MLA(hi, lo, (*fx)[3], ptr[11]);
MLA(hi, lo, (*fx)[6], ptr[ 4]); MLA(hi, lo, (*fx)[4], ptr[ 9]);
MLA(hi, lo, (*fx)[7], ptr[ 2]); MLA(hi, lo, (*fx)[5], ptr[ 7]);
MLN(hi, lo); MLA(hi, lo, (*fx)[6], ptr[ 5]);
MLA(hi, lo, (*fx)[7], ptr[ 3]);
MLN(hi, lo);
MLA(hi, lo, (*fe)[0], ptr[ 0]);
MLA(hi, lo, (*fe)[1], ptr[14]);
MLA(hi, lo, (*fe)[2], ptr[12]);
MLA(hi, lo, (*fe)[3], ptr[10]);
MLA(hi, lo, (*fe)[4], ptr[ 8]);
MLA(hi, lo, (*fe)[5], ptr[ 6]);
MLA(hi, lo, (*fe)[6], ptr[ 4]);
MLA(hi, lo, (*fe)[7], ptr[ 2]);
pcm[0] = SHIFT(MLZ(hi, lo));
pcm += 16;
ptr = *Dptr + pe; for (sb = 15; sb; sb--, fo++)
MLA(hi, lo, (*fe)[0], ptr[ 0]); {
MLA(hi, lo, (*fe)[1], ptr[14]); ++fe;
MLA(hi, lo, (*fe)[2], ptr[12]); ++D0ptr;
MLA(hi, lo, (*fe)[3], ptr[10]); ++D1ptr;
MLA(hi, lo, (*fe)[4], ptr[ 8]);
MLA(hi, lo, (*fe)[5], ptr[ 6]);
MLA(hi, lo, (*fe)[6], ptr[ 4]);
MLA(hi, lo, (*fe)[7], ptr[ 2]);
*pcm1++ = SHIFT(MLZ(hi, lo)); /* D[32 - sb][i] == -D[sb][31 - i] */
ptr = *D0ptr;
ML0(hi, lo, (*fo)[0], ptr[ 1]);
MLA(hi, lo, (*fo)[1], ptr[15]);
MLA(hi, lo, (*fo)[2], ptr[13]);
MLA(hi, lo, (*fo)[3], ptr[11]);
MLA(hi, lo, (*fo)[4], ptr[ 9]);
MLA(hi, lo, (*fo)[5], ptr[ 7]);
MLA(hi, lo, (*fo)[6], ptr[ 5]);
MLA(hi, lo, (*fo)[7], ptr[ 3]);
MLN(hi, lo);
MLA(hi, lo, (*fe)[7], ptr[ 2]);
MLA(hi, lo, (*fe)[6], ptr[ 4]);
MLA(hi, lo, (*fe)[5], ptr[ 6]);
MLA(hi, lo, (*fe)[4], ptr[ 8]);
MLA(hi, lo, (*fe)[3], ptr[10]);
MLA(hi, lo, (*fe)[2], ptr[12]);
MLA(hi, lo, (*fe)[1], ptr[14]);
MLA(hi, lo, (*fe)[0], ptr[ 0]);
pcm[-sb] = SHIFT(MLZ(hi, lo));
pcm2 = pcm1 + 30; ptr = *D1ptr;
ML0(hi, lo, (*fe)[0], ptr[31 - 16]);
MLA(hi, lo, (*fe)[1], ptr[31 - 14]);
MLA(hi, lo, (*fe)[2], ptr[31 - 12]);
MLA(hi, lo, (*fe)[3], ptr[31 - 10]);
MLA(hi, lo, (*fe)[4], ptr[31 - 8]);
MLA(hi, lo, (*fe)[5], ptr[31 - 6]);
MLA(hi, lo, (*fe)[6], ptr[31 - 4]);
MLA(hi, lo, (*fe)[7], ptr[31 - 2]);
MLA(hi, lo, (*fo)[7], ptr[31 - 3]);
MLA(hi, lo, (*fo)[6], ptr[31 - 5]);
MLA(hi, lo, (*fo)[5], ptr[31 - 7]);
MLA(hi, lo, (*fo)[4], ptr[31 - 9]);
MLA(hi, lo, (*fo)[3], ptr[31 - 11]);
MLA(hi, lo, (*fo)[2], ptr[31 - 13]);
MLA(hi, lo, (*fo)[1], ptr[31 - 15]);
MLA(hi, lo, (*fo)[0], ptr[31 - 1]);
pcm[sb] = SHIFT(MLZ(hi, lo));
}
for (sb = 1; sb < 16; ++sb) { ptr = *(D0ptr + 1);
++fe; ML0(hi, lo, (*fo)[0], ptr[ 1]);
++Dptr; MLA(hi, lo, (*fo)[1], ptr[15]);
MLA(hi, lo, (*fo)[2], ptr[13]);
MLA(hi, lo, (*fo)[3], ptr[11]);
MLA(hi, lo, (*fo)[4], ptr[ 9]);
MLA(hi, lo, (*fo)[5], ptr[ 7]);
MLA(hi, lo, (*fo)[6], ptr[ 5]);
MLA(hi, lo, (*fo)[7], ptr[ 3]);
pcm[0] = SHIFT(-MLZ(hi, lo));
}
else
{
ptr = *D0ptr;
ML0(hi, lo, (*fx)[0], ptr[ 0]);
MLA(hi, lo, (*fx)[1], ptr[14]);
MLA(hi, lo, (*fx)[2], ptr[12]);
MLA(hi, lo, (*fx)[3], ptr[10]);
MLA(hi, lo, (*fx)[4], ptr[ 8]);
MLA(hi, lo, (*fx)[5], ptr[ 6]);
MLA(hi, lo, (*fx)[6], ptr[ 4]);
MLA(hi, lo, (*fx)[7], ptr[ 2]);
MLN(hi, lo);
MLA(hi, lo, (*fe)[0], ptr[ 1]);
MLA(hi, lo, (*fe)[1], ptr[15]);
MLA(hi, lo, (*fe)[2], ptr[13]);
MLA(hi, lo, (*fe)[3], ptr[11]);
MLA(hi, lo, (*fe)[4], ptr[ 9]);
MLA(hi, lo, (*fe)[5], ptr[ 7]);
MLA(hi, lo, (*fe)[6], ptr[ 5]);
MLA(hi, lo, (*fe)[7], ptr[ 3]);
pcm[0] = SHIFT(MLZ(hi, lo));
pcm += 16;
/* D[32 - sb][i] == -D[sb][31 - i] */ for (sb = 15; sb; sb--, fo++)
{
++fe;
++D0ptr;
++D1ptr;
ptr = *Dptr + po; /* D[32 - sb][i] == -D[sb][31 - i] */
ML0(hi, lo, (*fo)[0], ptr[ 0]); ptr = *D0ptr;
MLA(hi, lo, (*fo)[1], ptr[14]); ML0(hi, lo, (*fo)[0], ptr[ 0]);
MLA(hi, lo, (*fo)[2], ptr[12]); MLA(hi, lo, (*fo)[1], ptr[14]);
MLA(hi, lo, (*fo)[3], ptr[10]); MLA(hi, lo, (*fo)[2], ptr[12]);
MLA(hi, lo, (*fo)[4], ptr[ 8]); MLA(hi, lo, (*fo)[3], ptr[10]);
MLA(hi, lo, (*fo)[5], ptr[ 6]); MLA(hi, lo, (*fo)[4], ptr[ 8]);
MLA(hi, lo, (*fo)[6], ptr[ 4]); MLA(hi, lo, (*fo)[5], ptr[ 6]);
MLA(hi, lo, (*fo)[7], ptr[ 2]); MLA(hi, lo, (*fo)[6], ptr[ 4]);
MLN(hi, lo); MLA(hi, lo, (*fo)[7], ptr[ 2]);
MLN(hi, lo);
MLA(hi, lo, (*fe)[7], ptr[ 3]);
MLA(hi, lo, (*fe)[6], ptr[ 5]);
MLA(hi, lo, (*fe)[5], ptr[ 7]);
MLA(hi, lo, (*fe)[4], ptr[ 9]);
MLA(hi, lo, (*fe)[3], ptr[11]);
MLA(hi, lo, (*fe)[2], ptr[13]);
MLA(hi, lo, (*fe)[1], ptr[15]);
MLA(hi, lo, (*fe)[0], ptr[ 1]);
pcm[-sb] = SHIFT(MLZ(hi, lo));
ptr = *Dptr + pe; ptr = *D1ptr;
MLA(hi, lo, (*fe)[7], ptr[ 2]); ML0(hi, lo, (*fe)[0], ptr[31 - 1]);
MLA(hi, lo, (*fe)[6], ptr[ 4]); MLA(hi, lo, (*fe)[1], ptr[31 - 15]);
MLA(hi, lo, (*fe)[5], ptr[ 6]); MLA(hi, lo, (*fe)[2], ptr[31 - 13]);
MLA(hi, lo, (*fe)[4], ptr[ 8]); MLA(hi, lo, (*fe)[3], ptr[31 - 11]);
MLA(hi, lo, (*fe)[3], ptr[10]); MLA(hi, lo, (*fe)[4], ptr[31 - 9]);
MLA(hi, lo, (*fe)[2], ptr[12]); MLA(hi, lo, (*fe)[5], ptr[31 - 7]);
MLA(hi, lo, (*fe)[1], ptr[14]); MLA(hi, lo, (*fe)[6], ptr[31 - 5]);
MLA(hi, lo, (*fe)[0], ptr[ 0]); MLA(hi, lo, (*fe)[7], ptr[31 - 3]);
MLA(hi, lo, (*fo)[7], ptr[31 - 2]);
MLA(hi, lo, (*fo)[6], ptr[31 - 4]);
MLA(hi, lo, (*fo)[5], ptr[31 - 6]);
MLA(hi, lo, (*fo)[4], ptr[31 - 8]);
MLA(hi, lo, (*fo)[3], ptr[31 - 10]);
MLA(hi, lo, (*fo)[2], ptr[31 - 12]);
MLA(hi, lo, (*fo)[1], ptr[31 - 14]);
MLA(hi, lo, (*fo)[0], ptr[31 - 16]);
pcm[sb] = SHIFT(MLZ(hi, lo));
}
*pcm1++ = SHIFT(MLZ(hi, lo)); ptr = *(D0ptr + 1);
ML0(hi, lo, (*fo)[0], ptr[ 0]);
ptr = *Dptr - pe; MLA(hi, lo, (*fo)[1], ptr[14]);
ML0(hi, lo, (*fe)[0], ptr[31 - 16]); MLA(hi, lo, (*fo)[2], ptr[12]);
MLA(hi, lo, (*fe)[1], ptr[31 - 14]); MLA(hi, lo, (*fo)[3], ptr[10]);
MLA(hi, lo, (*fe)[2], ptr[31 - 12]); MLA(hi, lo, (*fo)[4], ptr[ 8]);
MLA(hi, lo, (*fe)[3], ptr[31 - 10]); MLA(hi, lo, (*fo)[5], ptr[ 6]);
MLA(hi, lo, (*fe)[4], ptr[31 - 8]); MLA(hi, lo, (*fo)[6], ptr[ 4]);
MLA(hi, lo, (*fe)[5], ptr[31 - 6]); MLA(hi, lo, (*fo)[7], ptr[ 2]);
MLA(hi, lo, (*fe)[6], ptr[31 - 4]); pcm[0] = SHIFT(-MLZ(hi, lo));
MLA(hi, lo, (*fe)[7], ptr[31 - 2]);
ptr = *Dptr - po;
MLA(hi, lo, (*fo)[7], ptr[31 - 2]);
MLA(hi, lo, (*fo)[6], ptr[31 - 4]);
MLA(hi, lo, (*fo)[5], ptr[31 - 6]);
MLA(hi, lo, (*fo)[4], ptr[31 - 8]);
MLA(hi, lo, (*fo)[3], ptr[31 - 10]);
MLA(hi, lo, (*fo)[2], ptr[31 - 12]);
MLA(hi, lo, (*fo)[1], ptr[31 - 14]);
MLA(hi, lo, (*fo)[0], ptr[31 - 16]);
*pcm2-- = SHIFT(MLZ(hi, lo));
++fo;
} }
++Dptr; pcm += 16;
ptr = *Dptr + po;
ML0(hi, lo, (*fo)[0], ptr[ 0]);
MLA(hi, lo, (*fo)[1], ptr[14]);
MLA(hi, lo, (*fo)[2], ptr[12]);
MLA(hi, lo, (*fo)[3], ptr[10]);
MLA(hi, lo, (*fo)[4], ptr[ 8]);
MLA(hi, lo, (*fo)[5], ptr[ 6]);
MLA(hi, lo, (*fo)[6], ptr[ 4]);
MLA(hi, lo, (*fo)[7], ptr[ 2]);
*pcm1 = SHIFT(-MLZ(hi, lo));
pcm1 += 16;
phase = (phase + 1) % 16; phase = (phase + 1) % 16;
} }
} }