diff --git a/apps/codecs/libmad/bit.c b/apps/codecs/libmad/bit.c index ec1b64541e..626eb7a642 100644 --- a/apps/codecs/libmad/bit.c +++ b/apps/codecs/libmad/bit.c @@ -87,9 +87,8 @@ unsigned short const crc_table[256] = { */ void mad_bit_init(struct mad_bitptr *bitptr, unsigned char const *byte) { - bitptr->byte = byte; - bitptr->cache = 0; - bitptr->left = CHAR_BIT; + bitptr->ptr = (unsigned long*)((long)byte & ~3); + bitptr->readbit = ((unsigned long)byte & 3) << 3; } /* @@ -99,17 +98,20 @@ void mad_bit_init(struct mad_bitptr *bitptr, unsigned char const *byte) unsigned int mad_bit_length(struct mad_bitptr const *begin, struct mad_bitptr const *end) { - return begin->left + - CHAR_BIT * (end->byte - (begin->byte + 1)) + (CHAR_BIT - end->left); + return end->readbit - begin->readbit; } +unsigned char mad_bit_bitsleft(struct mad_bitptr const *bitptr) +{ + return 8 - (bitptr->readbit & 7); +} /* * NAME: bit->nextbyte() * DESCRIPTION: return pointer to next unprocessed byte */ unsigned char const *mad_bit_nextbyte(struct mad_bitptr const *bitptr) { - return bitptr->left == CHAR_BIT ? bitptr->byte : bitptr->byte + 1; + return (unsigned char const*)bitptr->ptr + ((bitptr->readbit + 7) >> 3); } /* @@ -118,60 +120,43 @@ unsigned char const *mad_bit_nextbyte(struct mad_bitptr const *bitptr) */ void mad_bit_skip(struct mad_bitptr *bitptr, unsigned int len) { - bitptr->byte += len / CHAR_BIT; - bitptr->left -= len % CHAR_BIT; - - if (bitptr->left > CHAR_BIT) { - bitptr->byte++; - bitptr->left += CHAR_BIT; - } - - if (bitptr->left < CHAR_BIT) - bitptr->cache = *bitptr->byte; + bitptr->readbit += len; } /* * NAME: bit->read() * DESCRIPTION: read an arbitrary number of bits and return their UIMSBF value */ +unsigned long bmask[] ICONST_ATTR = +{ 0x00000000, 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, + 0x0000003f, 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, + 0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, + 0x0003ffff, 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, + 0x00ffffff, 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, + 0x3fffffff, 0x7fffffff, 0xffffffff }; +unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) ICODE_ATTR; unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) { - register unsigned long value; + unsigned long *curr = &bitptr->ptr[bitptr->readbit>>5]; - if (bitptr->left == CHAR_BIT) - bitptr->cache = *bitptr->byte; + if(len) + { + if((bitptr->readbit ^ (bitptr->readbit + len - 1)) < 32) + { + bitptr->readbit += len; - if (len < bitptr->left) { - value = (bitptr->cache & ((1 << bitptr->left) - 1)) >> - (bitptr->left - len); - bitptr->left -= len; + return (betoh32(curr[0]) >> (-bitptr->readbit & 31)) & bmask[len]; + } + else + { + bitptr->readbit += len; - return value; + return ((betoh32(curr[0]) << ( bitptr->readbit & 31)) + + (betoh32(curr[1]) >> (-bitptr->readbit & 31))) & bmask[len]; + } } - /* remaining bits in current byte */ - - value = bitptr->cache & ((1 << bitptr->left) - 1); - len -= bitptr->left; - - bitptr->byte++; - bitptr->left = CHAR_BIT; - - /* more bytes */ - - while (len >= CHAR_BIT) { - value = (value << CHAR_BIT) | *bitptr->byte++; - len -= CHAR_BIT; - } - - if (len > 0) { - bitptr->cache = *bitptr->byte; - - value = (value << len) | (bitptr->cache >> (CHAR_BIT - len)); - bitptr->left -= len; - } - - return value; + return 0; } # if 0 diff --git a/apps/codecs/libmad/bit.h b/apps/codecs/libmad/bit.h index 22ae66cb1d..026b501b57 100644 --- a/apps/codecs/libmad/bit.h +++ b/apps/codecs/libmad/bit.h @@ -23,9 +23,8 @@ # define LIBMAD_BIT_H struct mad_bitptr { - unsigned char const *byte; - unsigned short cache; - unsigned short left; + unsigned long *ptr; + unsigned long readbit; }; void mad_bit_init(struct mad_bitptr *, unsigned char const *); @@ -35,7 +34,7 @@ void mad_bit_init(struct mad_bitptr *, unsigned char const *); unsigned int mad_bit_length(struct mad_bitptr const *, struct mad_bitptr const *); -# define mad_bit_bitsleft(bitptr) ((bitptr)->left) +unsigned char mad_bit_bitsleft(struct mad_bitptr const *bitptr); unsigned char const *mad_bit_nextbyte(struct mad_bitptr const *); void mad_bit_skip(struct mad_bitptr *, unsigned int); diff --git a/apps/codecs/libmad/layer3.c b/apps/codecs/libmad/layer3.c index 2b66775502..5ff3a04824 100644 --- a/apps/codecs/libmad/layer3.c +++ b/apps/codecs/libmad/layer3.c @@ -44,6 +44,13 @@ # include "huffman.h" # include "layer3.h" +/* depending on the cpu "leftshift32" may be supported or not */ +# if defined(CPU_COLDFIRE) && !defined(SIMULATOR) +#define MAXLSHIFT 32 +#else +#define MAXLSHIFT 31 +#endif + /* --- Layer III ----------------------------------------------------------- */ enum { @@ -924,16 +931,17 @@ mad_fixed_t III_requantize(unsigned int value, signed int exp) * DESCRIPTION: decode Huffman code words of one channel of one granule */ static -enum mad_error III_huffdecode(struct mad_bitptr *ptr, mad_fixed_t xr[576], +enum mad_error III_huffdecode(struct mad_bitptr *ptr, mad_fixed_t xrarr[576], struct channel *channel, unsigned char const *sfbwidth, unsigned int part2_length) { + unsigned int bits; signed int exponents[39], exp; signed int const *expptr; struct mad_bitptr peek; signed int bits_left, cachesz; - register mad_fixed_t *xrptr; + register mad_fixed_t *xr; mad_fixed_t const *sfbound; register unsigned long bitcache; @@ -943,207 +951,240 @@ enum mad_error III_huffdecode(struct mad_bitptr *ptr, mad_fixed_t xr[576], III_exponents(channel, sfbwidth, exponents); - peek = *ptr; + peek = *ptr; + cachesz = 0; + sfbound = xr = xrarr; mad_bit_skip(ptr, bits_left); - /* align bit reads to byte boundaries */ - cachesz = mad_bit_bitsleft(&peek); - cachesz += ((32 - 1 - 24) + (24 - cachesz)) & ~7; - - bitcache = mad_bit_read(&peek, cachesz); - bits_left -= cachesz; - - xrptr = &xr[0]; - /* big_values */ { - unsigned int region, rcount; + int region; struct hufftable const *entry; - union huffpair const *table; - unsigned int linbits, startbits, big_values; - mad_fixed_t reqcache[16]; + union huffpair const *table; + unsigned int linbits, startbits, rcount; + mad_fixed_t reqcache[16]; + mad_fixed_t const *xr_end, *xr_big_val; - sfbound = xrptr + *sfbwidth++; - rcount = channel->region0_count + 1; + rcount = 1; + expptr = &exponents[0]; + region = -1; + exp = 0x3210; /* start value */ + bitcache = 0; + linbits = startbits = 0; + table = NULL; + xr_big_val = xr + 2 * channel->big_values; - entry = &mad_huff_pair_table[channel->table_select[region = 0]]; - table = entry->table; - linbits = entry->linbits; - startbits = entry->startbits; + while(xr < xr_big_val) + { + sfbound += *sfbwidth++; + xr_end = sfbound > xr_big_val ? xr_big_val : sfbound; - if (table == 0) - return MAD_ERROR_BADHUFFTABLE; - - expptr = &exponents[0]; - exp = *expptr++; - - /* clear cache */ - memset(reqcache, 0, sizeof(reqcache)); - - big_values = channel->big_values; - - while (big_values-- && cachesz + bits_left > 0) { - union huffpair const *pair; - unsigned int clumpsz, value; - register mad_fixed_t requantized; - - if (xrptr == sfbound) { - sfbound += *sfbwidth++; - - /* change table if region boundary */ - - if (--rcount == 0) { - if (region == 0) - rcount = channel->region1_count + 1; + /* change table if region boundary */ + if(--rcount == 0) + { + if(exp == 0x3210) + rcount = channel->region0_count + 1; + else + if(region == 0) + rcount = channel->region1_count + 1; else - rcount = 0; /* all remaining */ + rcount = 0; /* all remaining */ - entry = &mad_huff_pair_table[channel->table_select[++region]]; - table = entry->table; - linbits = entry->linbits; - startbits = entry->startbits; + entry = &mad_huff_pair_table[channel->table_select[++region]]; + table = entry->table; + linbits = entry->linbits; + startbits = entry->startbits; - if (table == 0) - return MAD_ERROR_BADHUFFTABLE; - } - - if (exp != *expptr) { - exp = *expptr; - memset(reqcache, 0, sizeof(reqcache)); - } - - ++expptr; + if(table == 0) + return MAD_ERROR_BADHUFFTABLE; } - if (cachesz < 21) { - unsigned int bits; - - bits = ((32 - 1 - 21) + (21 - cachesz)) & ~7; - bitcache = (bitcache << bits) | mad_bit_read(&peek, bits); - cachesz += bits; - bits_left -= bits; + if(exp != *expptr) + { + exp = *expptr; + /* clear cache */ + memset(reqcache, 0, sizeof(reqcache)); } - /* hcod (0..19) */ + ++expptr; - clumpsz = startbits; - pair = &table[MASK(bitcache, cachesz, clumpsz)]; + if(linbits) + { + for( ; xrfinal) { - cachesz -= clumpsz; + /* maxhuffcode(hufftab16,hufftab24)=17bit + sign(x,y)=2bit */ + if(cachesz < 19) + { + if(cachesz < 0) + return MAD_ERROR_BADHUFFDATA; /* cache underrun */ - clumpsz = pair->ptr.bits; - pair = &table[pair->ptr.offset + MASK(bitcache, cachesz, clumpsz)]; + bits = MAXLSHIFT - cachesz; + bitcache = (bitcache << bits) | mad_bit_read(&peek, bits); + cachesz += bits; + } + + /* hcod (0..19) */ + clumpsz = startbits; + pair = &table[MASK(bitcache, cachesz, clumpsz)]; + + while(!pair->final) + { + cachesz -= clumpsz; + clumpsz = pair->ptr.bits; + pair = &table[pair->ptr.offset + MASK(bitcache, cachesz, clumpsz)]; + } + + cachesz -= pair->value.hlen; + + /* x (0..14) */ + value = pair->value.x; + if(value == 0) + xr[0] = 0; + else + { + if(value == 15) + { + /* maxlinbits=13bit + sign(x,y)=2bit */ + if(cachesz < 15) + { + if(cachesz < 0) + return MAD_ERROR_BADHUFFDATA; /* cache underrun */ + + bits = MAXLSHIFT - cachesz; + bitcache = (bitcache << bits) | mad_bit_read(&peek, bits); + cachesz += bits; + } + + requantized = III_requantize(15+MASK(bitcache, cachesz, linbits), exp); + cachesz -= linbits; + } + else + { + if(reqcache[value]) + requantized = reqcache[value]; + else + requantized = reqcache[value] = III_requantize(value, exp); + } + + xr[0] = MASK1BIT(bitcache, cachesz--) ? -requantized : requantized; + } + + /* y (0..14) */ + value = pair->value.y; + if(value == 0) + xr[1] = 0; + else + { + if(value == 15) + { + /* maxlinbits=13bit + sign(y)=1bit */ + if(cachesz < 14) + { + if(cachesz < 0) + return MAD_ERROR_BADHUFFDATA; /* cache underrun */ + + bits = MAXLSHIFT - cachesz; + bitcache = (bitcache << bits) | mad_bit_read(&peek, bits); + cachesz += bits; + } + + requantized = III_requantize(15+MASK(bitcache, cachesz, linbits), exp); + cachesz -= linbits; + } + else + { + if(reqcache[value]) + requantized = reqcache[value]; + else + requantized = reqcache[value] = III_requantize(value, exp); + } + xr[1] = MASK1BIT(bitcache, cachesz--) ? -requantized : requantized; + } + } } + else + { + for( ; xrvalue.hlen; + /* maxlookup=4bit + sign(x,y)=2bit */ + if(cachesz < 6) + { + if(cachesz < 0) + return MAD_ERROR_BADHUFFDATA; /* cache underrun */ - if (linbits) { - /* x (0..14) */ + bits = MAXLSHIFT - cachesz; + bitcache = (bitcache << bits) | mad_bit_read(&peek, bits); + cachesz += bits; + } - value = pair->value.x; + /* hcod (0..19) */ + clumpsz = startbits; + pair = &table[MASK(bitcache, cachesz, clumpsz)]; - switch (value) { - case 0: - xrptr[0] = 0; - break; + while(!pair->final) + { + cachesz -= clumpsz; - case 15: - if ((unsigned int)cachesz < linbits + 2) { - bitcache = (bitcache << 16) | mad_bit_read(&peek, 16); - cachesz += 16; - bits_left -= 16; - } + /* maxlookup=4bit + sign(x,y)=2bit */ + if(cachesz < 6) + { + if(cachesz < 0) + return MAD_ERROR_BADHUFFDATA; /* cache underrun */ - value += MASK(bitcache, cachesz, linbits); - cachesz -= linbits; + bits = MAXLSHIFT - cachesz; + bitcache = (bitcache << bits) | mad_bit_read(&peek, bits); + cachesz += bits; + } - requantized = III_requantize(value, exp); - goto x_final; + clumpsz = pair->ptr.bits; + pair = &table[pair->ptr.offset + MASK(bitcache, cachesz, clumpsz)]; + } - default: - if (reqcache[value]) - requantized = reqcache[value]; + cachesz -= pair->value.hlen; + + /* x (0..1) */ + value = pair->value.x; + if(value == 0) + xr[0] = 0; else - requantized = reqcache[value] = III_requantize(value, exp); + { + if(reqcache[value]) + requantized = reqcache[value]; + else + requantized = reqcache[value] = III_requantize(value, exp); - x_final: - xrptr[0] = MASK1BIT(bitcache, cachesz--) ? - -requantized : requantized; - } + xr[0] = MASK1BIT(bitcache, cachesz--) ? -requantized : requantized; + } - /* y (0..14) */ - - value = pair->value.y; - - switch (value) { - case 0: - xrptr[1] = 0; - break; - - case 15: - if ((unsigned int)cachesz < linbits + 1) { - bitcache = (bitcache << 16) | mad_bit_read(&peek, 16); - cachesz += 16; - bits_left -= 16; - } - - value += MASK(bitcache, cachesz, linbits); - cachesz -= linbits; - - requantized = III_requantize(value, exp); - goto y_final; - - default: - if (reqcache[value]) - requantized = reqcache[value]; + /* y (0..1) */ + value = pair->value.y; + if(value == 0) + xr[1] = 0; else - requantized = reqcache[value] = III_requantize(value, exp); + { + if(reqcache[value]) + requantized = reqcache[value]; + else + requantized = reqcache[value] = III_requantize(value, exp); - y_final: - xrptr[1] = MASK1BIT(bitcache, cachesz--) ? - -requantized : requantized; - } + xr[1] = MASK1BIT(bitcache, cachesz--) ? -requantized : requantized; + } + } } - else { - /* x (0..1) */ - - value = pair->value.x; - - if (value == 0) - xrptr[0] = 0; - else { - if (reqcache[value]) - requantized = reqcache[value]; - else - requantized = reqcache[value] = III_requantize(value, exp); - - xrptr[0] = MASK1BIT(bitcache, cachesz--) ? - -requantized : requantized; - } - - /* y (0..1) */ - - value = pair->value.y; - - if (value == 0) - xrptr[1] = 0; - else { - if (reqcache[value]) - requantized = reqcache[value]; - else - requantized = reqcache[value] = III_requantize(value, exp); - - xrptr[1] = MASK1BIT(bitcache, cachesz--) ? - -requantized : requantized; - } - } - - xrptr += 2; } } - if (cachesz + bits_left < 0) + bits_left = ptr->readbit - peek.readbit; + + if(bits_left + cachesz < 0) return MAD_ERROR_BADHUFFDATA; /* big_values overrun */ /* count1 */ @@ -1155,15 +1196,20 @@ enum mad_error III_huffdecode(struct mad_bitptr *ptr, mad_fixed_t xr[576], requantized = III_requantize(1, exp); - while (cachesz + bits_left > 0 && xrptr <= &xr[572]) { + while(xr <= &xrarr[572] && bits_left + cachesz > 0) + { union huffquad const *quad; /* hcod (1..6) */ + if(cachesz < 10) + { + if(cachesz < 0) + return MAD_ERROR_BADHUFFDATA; /* cache underrun */ - if (cachesz < 10) { - bitcache = (bitcache << 16) | mad_bit_read(&peek, 16); - cachesz += 16; - bits_left -= 16; + bits = MAXLSHIFT - cachesz; + bitcache = (bitcache << bits) | mad_bit_read(&peek, bits); + cachesz += bits; + bits_left -= bits; } quad = &table[MASK(bitcache, cachesz, 4)]; @@ -1178,7 +1224,7 @@ enum mad_error III_huffdecode(struct mad_bitptr *ptr, mad_fixed_t xr[576], cachesz -= quad->value.hlen; - if (xrptr == sfbound) { + if (xr == sfbound) { sfbound += *sfbwidth++; if (exp != *expptr) { @@ -1190,18 +1236,16 @@ enum mad_error III_huffdecode(struct mad_bitptr *ptr, mad_fixed_t xr[576], } /* v (0..1) */ - - xrptr[0] = quad->value.v ? + xr[0] = quad->value.v ? (MASK1BIT(bitcache, cachesz--) ? -requantized : requantized) : 0; /* w (0..1) */ - - xrptr[1] = quad->value.w ? + xr[1] = quad->value.w ? (MASK1BIT(bitcache, cachesz--) ? -requantized : requantized) : 0; - xrptr += 2; + xr += 2; - if (xrptr == sfbound) { + if (xr == sfbound) { sfbound += *sfbwidth++; if (exp != *expptr) { @@ -1213,42 +1257,26 @@ enum mad_error III_huffdecode(struct mad_bitptr *ptr, mad_fixed_t xr[576], } /* x (0..1) */ - - xrptr[0] = quad->value.x ? + xr[0] = quad->value.x ? (MASK1BIT(bitcache, cachesz--) ? -requantized : requantized) : 0; /* y (0..1) */ - - xrptr[1] = quad->value.y ? + xr[1] = quad->value.y ? (MASK1BIT(bitcache, cachesz--) ? -requantized : requantized) : 0; - xrptr += 2; + xr += 2; } - if (cachesz + bits_left < 0) { -# if 0 && defined(DEBUG) - fprintf(stderr, "huffman count1 overrun (%d bits)\n", - -(cachesz + bits_left)); -# endif - + if(bits_left + cachesz < 0) + { /* technically the bitstream is misformatted, but apparently some encoders are just a bit sloppy with stuffing bits */ - - xrptr -= 4; + xr -= 4; } } - assert(-bits_left <= MAD_BUFFER_GUARD * CHAR_BIT); - -# if 0 && defined(DEBUG) - if (bits_left < 0) - fprintf(stderr, "read %d bits too many\n", -bits_left); - else if (cachesz + bits_left > 0) - fprintf(stderr, "%d stuffing bits\n", cachesz + bits_left); -# endif - /* rzero */ - memset(xrptr, 0, (char*)&xr[576] - (char*)xrptr); + memset(xr, 0, (char*)&xrarr[576] - (char*)xr); return MAD_ERROR_NONE; } @@ -1777,569 +1805,656 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) mad_fixed_t t[16]; /* assumes FRACBITS = 28 */ asm volatile ( - "move.l (4*4, %[X]), %%d0\n\t" - "move.l #0x0ec835e8, %%d1\n\t" - "mac.l %%d0, %%d1, (13*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x061f78aa, %%d1\n\t" - "mac.l %%d0, %%d1, (1*4, %[X]), %%d0, %%acc0\n\t" - "move.l %%acc0, %%d7\n\t" - "asl.l #3, %%d7\n\t" - "move.l %%d7, (6*4, %[t])\n\t" + /* MAD_F_ML0(hi, lo, X[4], MAD_F(0x0ec835e8)); */ + /* MAD_F_MLA(hi, lo, X[13], MAD_F(0x061f78aa)); */ + /* t6 = MAD_F_MLZ(hi, lo); */ + "move.l (4*4, %[X]), %%d0\n" + "move.l #0x0ec835e8, %%d1\n" + "move.l #0x061f78aa, %%d2\n" + "mac.l %%d1, %%d0, (13*4, %[X]), %%d0, %%acc0\n" + "mac.l %%d2, %%d0, ( 1*4, %[X]), %%d0, %%acc0\n" + "move.l %%acc0, %%d7\n" + "asl.l #3, %%d7\n" + "move.l %%d7, (6*4, %[t])\n" + + /* MAD_F_ML0(hi, lo, (t14 = X[1] - X[10]), -MAD_F(0x0ec835e8)); */ + /* MAD_F_MLA(hi, lo, (t15 = X[7] + X[16]), MAD_F(0x061f78aa)); */ + /* t4 = MAD_F_MLZ(hi, lo); */ + /* MAD_F_MLA(hi, lo, t14, -MAD_F(0x061f78aa)); */ + /* MAD_F_MLA(hi, lo, t15, -MAD_F(0x0ec835e8)); */ + /* t0 = MAD_F_MLZ(hi, lo); */ + "sub.l (10*4, %[X]), %%d0\n" /* t14 */ + "msac.l %%d0, %%d1, %%acc1\n" + "msac.l %%d0, %%d2, (7*4, %[X]), %%d5, %%acc0\n" + "add.l (16*4, %[X]), %%d5\n" /* t15 */ + "mac.l %%d5, %%d2, %%acc1\n" + "msac.l %%d5, %%d1, ( %[X]), %%d5, %%acc0\n" + "movclr.l %%acc1, %%d6\n" + "asl.l #3, %%d6\n" /* t4 */ + "move.l %%d6, (4*4, %[t])\n" + "move.l %%acc0, %%d0\n" + "asl.l #3, %%d0\n" /* t0 */ + "move.l %%d0, (0*4, %[t])\n" - "sub.l (10*4, %[X]), %%d0\n\t" - "move.l %%d0, (14*4, %[t])\n\t" - "move.l #0x061f78aa, %%d1\n\t" - "msac.l %%d0, %%d1, (7*4, %[X]), %%d0, %%acc0\n\t" - "add.l (16*4, %[X]), %%d0\n\t" - "move.l %%d0, (15*4, %[t])\n\t" - "move.l #0x0ec835e8, %%d1\n\t" - "msac.l %%d0, %%d1, (%[X]), %%d2, %%acc0\n\t" - "move.l %%acc0, %%d6\n\t" - "asl.l #3, %%d6\n\t" - "move.l %%d6, (%[t])\n\t" + /* MAD_F_MLA(hi, lo, (t8 =X[0]-X[11]-X[12]), MAD_F(0x0216a2a2)); */ + /* MAD_F_MLA(hi, lo, (t9 =X[2]-X[ 9]-X[14]), MAD_F(0x09bd7ca0)); */ + /* MAD_F_MLA(hi, lo, (t10=X[3]-X[ 8]-X[15]), -MAD_F(0x0cb19346)); */ + /* MAD_F_MLA(hi, lo, (t11=X[5]-X[ 6]-X[17]), -MAD_F(0x0fdcf549)); */ + /* x[10] = -(x[7] = MAD_F_MLZ(hi, lo)); */ + + /* MAD_F_ML0(hi, lo, t8, -MAD_F(0x0cb19346)); */ + /* MAD_F_MLA(hi, lo, t9, MAD_F(0x0fdcf549)); */ + /* MAD_F_MLA(hi, lo, t10, MAD_F(0x0216a2a2)); */ + /* MAD_F_MLA(hi, lo, t11, -MAD_F(0x09bd7ca0)); */ + /* x[19] = x[34] = MAD_F_MLZ(hi, lo) - t0; */ + + /* MAD_F_ML0(hi, lo, t8, MAD_F(0x09bd7ca0)); */ + /* MAD_F_MLA(hi, lo, t9, -MAD_F(0x0216a2a2)); */ + /* MAD_F_MLA(hi, lo, t10, MAD_F(0x0fdcf549)); */ + /* MAD_F_MLA(hi, lo, t11, -MAD_F(0x0cb19346)); */ + /* x[ 1] = MAD_F_MLZ(hi, lo); */ + + /* MAD_F_ML0(hi, lo, t8, -MAD_F(0x0fdcf549)); */ + /* MAD_F_MLA(hi, lo, t9, -MAD_F(0x0cb19346)); */ + /* MAD_F_MLA(hi, lo, t10, -MAD_F(0x09bd7ca0)); */ + /* MAD_F_MLA(hi, lo, t11, -MAD_F(0x0216a2a2)); */ + /* x[25] = MAD_F_MLZ(hi, lo); */ + + /* t12 = t8 - t10; */ + /* t13 = t9 + t11; */ + "move.l #0x0216a2a2, %%d1\n" + "move.l #0x0cb19346, %%d2\n" + "move.l #0x09bd7ca0, %%d3\n" + "move.l #0x0fdcf549, %%d4\n" + "sub.l (11*4, %[X]), %%d5\n" + "sub.l (12*4, %[X]), %%d5\n" + "mac.l %%d1, %%d5, %%acc0\n" + "msac.l %%d2, %%d5, %%acc1\n" + "mac.l %%d3, %%d5, %%acc2\n" + "msac.l %%d4, %%d5, (2*4, %[X]), %%d6, %%acc3\n" + "sub.l ( 9*4, %[X]), %%d6\n" + "sub.l (14*4, %[X]), %%d6\n" + "mac.l %%d3, %%d6, %%acc0\n" + "mac.l %%d4, %%d6, %%acc1\n" + "msac.l %%d1, %%d6, %%acc2\n" + "msac.l %%d2, %%d6, (3*4, %[X]), %%d7, %%acc3\n" + "sub.l ( 8*4, %[X]), %%d7\n" + "sub.l (15*4, %[X]), %%d7\n" + "sub.l %%d7, %%d5\n" /* d5: t12 */ + "move.l %%d5, (12*4, %[t])\n" + "msac.l %%d2, %%d7, %%acc0\n" + "mac.l %%d1, %%d7, %%acc1\n" + "mac.l %%d4, %%d7, %%acc2\n" + "msac.l %%d3, %%d7, (5*4, %[X]), %%d7, %%acc3\n" + "sub.l ( 6*4, %[X]), %%d7\n" + "sub.l (17*4, %[X]), %%d7\n" + "add.l %%d7, %%d6\n" /* d6: t13 */ + "move.l %%d6, (13*4, %[t])\n" + "msac.l %%d4, %%d7, %%acc0\n" + "msac.l %%d3, %%d7, %%acc1\n" + "msac.l %%d2, %%d7, %%acc2\n" + "msac.l %%d1, %%d7, (1*4, %[X]), %%d5, %%acc3\n" - "sub.l (11*4, %[X]), %%d2\n\t" /* store t8-t11 in d2-d5, will need them soon */ - "sub.l (12*4, %[X]), %%d2\n\t" - "move.l %%d2, (8*4, %[t])\n\t" - "move.l #0x0216a2a2, %%d1\n\t" - "mac.l %%d2, %%d1, (2*4, %[X]), %%d3, %%acc0\n\t" + "movclr.l %%acc0, %%d7\n" + "asl.l #3, %%d7\n" + "move.l %%d7, (7*4, %[x])\n" + "neg.l %%d7\n" + "move.l %%d7, (10*4, %[x])\n" + + "movclr.l %%acc1, %%d7\n" + "asl.l #3, %%d7\n" + "sub.l %%d0, %%d7\n" + "move.l %%d7, (19*4, %[x])\n" + "move.l %%d7, (34*4, %[x])\n" - "sub.l (9*4, %[X]), %%d3\n\t" - "sub.l (14*4, %[X]), %%d3\n\t" - "move.l %%d3, (9*4, %[t])\n\t" - "move.l #0x09bd7ca0, %%d1\n\t" - "mac.l %%d3, %%d1, (3*4, %[X]), %%d4, %%acc0\n\t" + "movclr.l %%acc2, %%d7\n" + "asl.l #3, %%d7\n" + "move.l %%d7, ( 1*4, %[x])\n" - "sub.l (8*4, %[X]), %%d4\n\t" - "sub.l (15*4, %[X]), %%d4\n\t" - "move.l %%d4, (10*4, %[t])\n\t" - "move.l #0x0cb19346, %%d1\n\t" - "msac.l %%d4, %%d1, (5*4, %[X]), %%d5, %%acc0\n\t" + "movclr.l %%acc3, %%d7\n" + "asl.l #3, %%d7\n" + "move.l %%d7, (25*4, %[x])\n" - "sub.l (6*4, %[X]), %%d5\n\t" - "sub.l (17*4, %[X]), %%d5\n\t" - "move.l %%d5, (11*4, %[t])\n\t" - "move.l #0x0fdcf549, %%d1\n\t" - "msac.l %%d5, %%d1, (%[X]), %%d0, %%acc0\n\t" + /* MAD_F_ML0(hi, lo, X[1], -MAD_F(0x09bd7ca0)); */ + /* MAD_F_MLA(hi, lo, X[7], MAD_F(0x0216a2a2)); */ + /* MAD_F_MLA(hi, lo, X[10], -MAD_F(0x0fdcf549)); */ + /* MAD_F_MLA(hi, lo, X[16], MAD_F(0x0cb19346)); */ + /* t1 = MAD_F_MLZ(hi, lo) + t6; */ + + /* MAD_F_ML0(hi, lo, X[1], -MAD_F(0x0216a2a2)); */ + /* MAD_F_MLA(hi, lo, X[7], -MAD_F(0x09bd7ca0)); */ + /* MAD_F_MLA(hi, lo, X[10], MAD_F(0x0cb19346)); */ + /* MAD_F_MLA(hi, lo, X[16], MAD_F(0x0fdcf549)); */ + /* t3 = MAD_F_MLZ(hi, lo); */ + + /* MAD_F_ML0(hi, lo, X[1], -MAD_F(0x0fdcf549)); */ + /* MAD_F_MLA(hi, lo, X[7], -MAD_F(0x0cb19346)); */ + /* MAD_F_MLA(hi, lo, X[10], -MAD_F(0x09bd7ca0)); */ + /* MAD_F_MLA(hi, lo, X[16], -MAD_F(0x0216a2a2)); */ + /* t5 = MAD_F_MLZ(hi, lo) - t6; */ + "msac.l %%d3, %%d5, %%acc0\n" + "msac.l %%d1, %%d5, %%acc1\n" + "msac.l %%d4, %%d5, ( 7*4, %[X]), %%d5, %%acc2\n" + "mac.l %%d1, %%d5, %%acc0\n" + "msac.l %%d3, %%d5, %%acc1\n" + "msac.l %%d2, %%d5, (10*4, %[X]), %%d5, %%acc2\n" + "msac.l %%d4, %%d5, %%acc0\n" + "mac.l %%d2, %%d5, %%acc1\n" + "msac.l %%d3, %%d5, (16*4, %[X]), %%d5, %%acc2\n" + "mac.l %%d2, %%d5, %%acc0\n" + "mac.l %%d4, %%d5, %%acc1\n" + "msac.l %%d1, %%d5, ( 0*4, %[X]), %%d0, %%acc2\n" + + "movclr.l %%acc0, %%d7\n" + "asl.l #3, %%d7\n" + "add.l (6*4, %[t]), %%d7\n" /* t1 */ - "movclr.l %%acc0, %%d7\n\t" - "asl.l #3, %%d7\n\t" - "move.l %%d7, (7*4, %[x])\n\t" - "neg.l %%d7\n\t" - "move.l %%d7, (10*4, %[x])\n\t" + "movclr.l %%acc1, %%d5\n" + "asl.l #3, %%d5\n" /* t3 */ - "move.l #0x0cb19346, %%d1\n\t" - "msac.l %%d2, %%d1, (3*4, %[X]), %%d2, %%acc0\n\t" /* preload for t12 statement */ - "move.l #0x0fdcf549, %%d1\n\t" - "mac.l %%d3, %%d1, (8*4, %[X]), %%d3, %%acc0\n\t" - "move.l #0x0216a2a2, %%d1\n\t" - "mac.l %%d4, %%d1, (11*4, %[X]), %%d4, %%acc0\n\t" - "move.l #0x09bd7ca0, %%d1\n\t" - "msac.l %%d5, %%d1, (12*4, %[X]), %%d5, %%acc0\n\t" - "movclr.l %%acc0, %%d7\n\t" - "asl.l #3, %%d7\n\t" - "sub.l %%d6, %%d7\n\t" /* t0 is still in d6 */ - "move.l %%d7, (19*4, %[x])\n\t" - "move.l %%d7, (34*4, %[x])\n\t" + "movclr.l %%acc2, %%d6\n" + "asl.l #3, %%d6\n" + "sub.l (6*4, %[t]), %%d6\n" /* t5 */ + "move.l %%d6, (5*4, %[t])\n" + + /* MAD_F_ML0(hi, lo, X[0], MAD_F(0x03768962)); */ + /* MAD_F_MLA(hi, lo, X[2], MAD_F(0x0e313245)); */ + /* MAD_F_MLA(hi, lo, X[3], -MAD_F(0x0ffc19fd)); */ + /* MAD_F_MLA(hi, lo, X[5], -MAD_F(0x0acf37ad)); */ + /* MAD_F_MLA(hi, lo, X[6], MAD_F(0x04cfb0e2)); */ + /* MAD_F_MLA(hi, lo, X[8], -MAD_F(0x0898c779)); */ + /* MAD_F_MLA(hi, lo, X[9], MAD_F(0x0d7e8807)); */ + /* MAD_F_MLA(hi, lo, X[11], MAD_F(0x0f426cb5)); */ + /* MAD_F_MLA(hi, lo, X[12], -MAD_F(0x0bcbe352)); */ + /* MAD_F_MLA(hi, lo, X[14], MAD_F(0x00b2aa3e)); */ + /* MAD_F_MLA(hi, lo, X[15], -MAD_F(0x07635284)); */ + /* MAD_F_MLA(hi, lo, X[17], -MAD_F(0x0f9ee890)); */ + /* x[11] = -(x[6] = MAD_F_MLZ(hi, lo) + t1); */ + /* MAD_F_ML0(hi, lo, X[0], -MAD_F(0x0f426cb5)); */ + /* MAD_F_MLA(hi, lo, X[2], -MAD_F(0x00b2aa3e)); */ + /* MAD_F_MLA(hi, lo, X[3], MAD_F(0x0898c779)); */ + /* MAD_F_MLA(hi, lo, X[5], MAD_F(0x0f9ee890)); */ + /* MAD_F_MLA(hi, lo, X[6], MAD_F(0x0acf37ad)); */ + /* MAD_F_MLA(hi, lo, X[8], -MAD_F(0x07635284)); */ + /* MAD_F_MLA(hi, lo, X[9], -MAD_F(0x0e313245)); */ + /* MAD_F_MLA(hi, lo, X[11], -MAD_F(0x0bcbe352)); */ + /* MAD_F_MLA(hi, lo, X[12], -MAD_F(0x03768962)); */ + /* MAD_F_MLA(hi, lo, X[14], MAD_F(0x0d7e8807)); */ + /* MAD_F_MLA(hi, lo, X[15], MAD_F(0x0ffc19fd)); */ + /* MAD_F_MLA(hi, lo, X[17], MAD_F(0x04cfb0e2)); */ + /* x[23] = x[30] = MAD_F_MLZ(hi, lo) + t1; */ + /* MAD_F_ML0(hi, lo, X[0], -MAD_F(0x0bcbe352)); */ + /* MAD_F_MLA(hi, lo, X[2], MAD_F(0x0d7e8807)); */ + /* MAD_F_MLA(hi, lo, X[3], -MAD_F(0x07635284)); */ + /* MAD_F_MLA(hi, lo, X[5], MAD_F(0x04cfb0e2)); */ + /* MAD_F_MLA(hi, lo, X[6], MAD_F(0x0f9ee890)); */ + /* MAD_F_MLA(hi, lo, X[8], -MAD_F(0x0ffc19fd)); */ + /* MAD_F_MLA(hi, lo, X[9], -MAD_F(0x00b2aa3e)); */ + /* MAD_F_MLA(hi, lo, X[11], MAD_F(0x03768962)); */ + /* MAD_F_MLA(hi, lo, X[12], -MAD_F(0x0f426cb5)); */ + /* MAD_F_MLA(hi, lo, X[14], MAD_F(0x0e313245)); */ + /* MAD_F_MLA(hi, lo, X[15], MAD_F(0x0898c779)); */ + /* MAD_F_MLA(hi, lo, X[17], -MAD_F(0x0acf37ad)); */ + /* x[18] = x[35] = MAD_F_MLZ(hi, lo) - t1; */ + "move.l #0x03768962, %%d1\n" + "move.l #0x0f426cb5, %%d2\n" + "move.l #0x0bcbe352, %%d3\n" + "mac.l %%d1, %%d0, %%acc0\n" + "msac.l %%d2, %%d0, %%acc1\n" + "msac.l %%d3, %%d0, (11*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d2, %%d0, %%acc0\n" + "msac.l %%d3, %%d0, %%acc1\n" + "mac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d3, %%d0, %%acc0\n" + "msac.l %%d1, %%d0, %%acc1\n" + "msac.l %%d2, %%d0, ( 2*4, %[X]), %%d0, %%acc2\n" + "move.l #0x0e313245, %%d1\n" + "move.l #0x00b2aa3e, %%d2\n" + "move.l #0x0d7e8807, %%d3\n" + "mac.l %%d1, %%d0, %%acc0\n" + "msac.l %%d2, %%d0, %%acc1\n" + "mac.l %%d3, %%d0, ( 9*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d3, %%d0, %%acc0\n" + "msac.l %%d1, %%d0, %%acc1\n" + "msac.l %%d2, %%d0, (14*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d2, %%d0, %%acc0\n" + "mac.l %%d3, %%d0, %%acc1\n" + "mac.l %%d1, %%d0, ( 3*4, %[X]), %%d0, %%acc2\n" + "move.l #0x0ffc19fd, %%d1\n" + "move.l #0x0898c779, %%d2\n" + "move.l #0x07635284, %%d3\n" + "msac.l %%d1, %%d0, %%acc0\n" + "mac.l %%d2, %%d0, %%acc1\n" + "msac.l %%d3, %%d0, ( 8*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d2, %%d0, %%acc0\n" + "msac.l %%d3, %%d0, %%acc1\n" + "msac.l %%d1, %%d0, (15*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d3, %%d0, %%acc0\n" + "mac.l %%d1, %%d0, %%acc1\n" + "mac.l %%d2, %%d0, ( 5*4, %[X]), %%d0, %%acc2\n" + "move.l #0x0acf37ad, %%d1\n" + "move.l #0x0f9ee890, %%d2\n" + "move.l #0x04cfb0e2, %%d3\n" + "msac.l %%d1, %%d0, %%acc0\n" + "mac.l %%d2, %%d0, %%acc1\n" + "mac.l %%d3, %%d0, ( 6*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d3, %%d0, %%acc0\n" + "mac.l %%d1, %%d0, %%acc1\n" + "mac.l %%d2, %%d0, (17*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d2, %%d0, %%acc0\n" + "mac.l %%d3, %%d0, %%acc1\n" + "msac.l %%d1, %%d0, ( 4*4, %[X]), %%d0, %%acc2\n" + + "movclr.l %%acc0, %%d6\n" + "asl.l #3, %%d6\n" + "add.l %%d7, %%d6\n" + "move.l %%d6, (6*4, %[x])\n" + "neg.l %%d6\n" + "move.l %%d6, (11*4, %[x])\n" - "sub.l %%d2, %%d0\n\t" - "add.l %%d3, %%d0\n\t" - "sub.l %%d4, %%d0\n\t" - "sub.l %%d5, %%d0\n\t" - "add.l (15*4, %[X]), %%d0\n\t" + "movclr.l %%acc1, %%d6\n" + "asl.l #3, %%d6\n" + "add.l %%d7, %%d6\n" + "move.l %%d6, (23*4, %[x])\n" + "move.l %%d6, (30*4, %[x])\n" - "move.l (2*4, %[X]), %%d3\n\t" - "add.l (5*4, %[X]), %%d3\n\t" - "sub.l (6*4, %[X]), %%d3\n\t" - "sub.l (9*4, %[X]), %%d3\n\t" - "sub.l (14*4, %[X]), %%d3\n\t" - "sub.l (17*4, %[X]), %%d3\n\t" + "movclr.l %%acc2, %%d6\n" + "asl.l #3, %%d6\n" + "sub.l %%d7, %%d6\n" + "move.l %%d6, (18*4, %[x])\n" + "move.l %%d6, (35*4, %[x])\n" - "move.l %%d0, (12*4, %[t])\n\t" - "move.l %%d3, (13*4, %[t])\n\t" - - "move.l #0x0ec835e8, %%d1\n\t" - "msac.l %%d0, %%d1, (1*4, %[X]), %%d2, %%acc0\n\t" - "move.l #0x061f78aa, %%d1\n\t" - "mac.l %%d3, %%d1, (7*4, %[X]), %%d3, %%acc0\n\t" - "movclr.l %%acc0, %%d7\n\t" - "asl.l #3, %%d7\n\t" - "add.l %%d6, %%d7\n\t" - "move.l %%d7, (22*4, %[x])\n\t" - "move.l %%d7, (31*4, %[x])\n\t" + /* MAD_F_ML0(hi, lo, X[4], MAD_F(0x061f78aa)); */ + /* MAD_F_MLA(hi, lo, X[13], -MAD_F(0x0ec835e8)); */ + /* t3 += (t7 = MAD_F_MLZ(hi, lo)); */ + /* t4 -= t7; */ + "move.l #0x061f78aa, %%d1\n" + "mac.l %%d1, %%d0, (13*4, %[X]), %%d0, %%acc0\n" + "move.l #0x0ec835e8, %%d1\n" + "msac.l %%d1, %%d0, (1*4, %[X]), %%d0, %%acc0\n" + "move.l %%acc0, %%d6\n" + "asl.l #3, %%d6\n" /* t7 */ + "add.l %%d6, %%d5\n" /* t3 */ + "move.l (4*4, %[t]), %%d1\n" + "sub.l %%d6, %%d1\n" /* t4 */ + "move.l %%d1, (4*4, %[t])\n" - "move.l #0x09bd7ca0, %%d1\n\t" - "msac.l %%d1, %%d2, (10*4, %[X]), %%d2, %%acc0\n\t" - "move.l #0x0216a2a2, %%d1\n\t" - "mac.l %%d1, %%d3, (16*4, %[X]), %%d3, %%acc0\n\t" - "move.l #0x0fdcf549, %%d1\n\t" - "msac.l %%d1, %%d2, (6*4, %[t]), %%d2, %%acc0\n\t" - "move.l #0x0cb19346, %%d1\n\t" - "mac.l %%d1, %%d3, (%[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d7\n\t" - "asl.l #3, %%d7\n\t" - "add.l %%d2, %%d7\n\t" - "move.l %%d7, (1*4, %[t])\n\t" + /* MAD_F_MLA(hi, lo, X[1], -MAD_F(0x0cb19346)); */ + /* MAD_F_MLA(hi, lo, X[7], MAD_F(0x0fdcf549)); */ + /* MAD_F_MLA(hi, lo, X[10], MAD_F(0x0216a2a2)); */ + /* MAD_F_MLA(hi, lo, X[16], -MAD_F(0x09bd7ca0)); */ + /* t2 = MAD_F_MLZ(hi, lo); */ + "move.l #0x0cb19346, %%d1\n" + "msac.l %%d1, %%d0, ( 7*4, %[X]), %%d0, %%acc0\n" + "move.l #0x0fdcf549, %%d1\n" + "mac.l %%d1, %%d0, (10*4, %[X]), %%d0, %%acc0\n" + "move.l #0x0216a2a2, %%d1\n" + "mac.l %%d1, %%d0, (16*4, %[X]), %%d0, %%acc0\n" + "move.l #0x09bd7ca0, %%d1\n" + "msac.l %%d1, %%d0, ( %[X]), %%d0, %%acc0\n" + "move.l %%acc0, %%d7\n" + "asl.l #3, %%d7\n" /* t2 */ - "move.l #0x03768962, %%d1\n\t" - "mac.l %%d1, %%d0, (2*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0e313245, %%d1\n\t" - "mac.l %%d1, %%d0, (3*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0ffc19fd, %%d1\n\t" - "msac.l %%d1, %%d0, (5*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0acf37ad, %%d1\n\t" - "msac.l %%d1, %%d0, (6*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x04cfb0e2, %%d1\n\t" - "mac.l %%d1, %%d0, (8*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0898c779, %%d1\n\t" - "msac.l %%d1, %%d0, (9*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0d7e8807, %%d1\n\t" - "mac.l %%d1, %%d0, (11*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f426cb5, %%d1\n\t" - "mac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0bcbe352, %%d1\n\t" - "msac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x00b2aa3e, %%d1\n\t" - "mac.l %%d1, %%d0, (15*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x07635284, %%d1\n\t" - "msac.l %%d1, %%d0, (17*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f9ee890, %%d1\n\t" - "msac.l %%d1, %%d0, (%[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d6\n\t" - "asl.l #3, %%d6\n\t" - "add.l %%d7, %%d6\n\t" - "move.l %%d6, (6*4, %[x])\n\t" - "neg.l %%d6\n\t" - "move.l %%d6, (11*4, %[x])\n\t" + /* MAD_F_MLA(hi, lo, X[0], MAD_F(0x04cfb0e2)); */ + /* MAD_F_MLA(hi, lo, X[2], MAD_F(0x0ffc19fd)); */ + /* MAD_F_MLA(hi, lo, X[3], -MAD_F(0x0d7e8807)); */ + /* MAD_F_MLA(hi, lo, X[5], MAD_F(0x03768962)); */ + /* MAD_F_MLA(hi, lo, X[6], -MAD_F(0x0bcbe352)); */ + /* MAD_F_MLA(hi, lo, X[8], -MAD_F(0x0e313245)); */ + /* MAD_F_MLA(hi, lo, X[9], MAD_F(0x07635284)); */ + /* MAD_F_MLA(hi, lo, X[11], -MAD_F(0x0acf37ad)); */ + /* MAD_F_MLA(hi, lo, X[12], MAD_F(0x0f9ee890)); */ + /* MAD_F_MLA(hi, lo, X[14], MAD_F(0x0898c779)); */ + /* MAD_F_MLA(hi, lo, X[15], MAD_F(0x00b2aa3e)); */ + /* MAD_F_MLA(hi, lo, X[17], MAD_F(0x0f426cb5)); */ + /* x[12] = -(x[5] = MAD_F_MLZ(hi, lo)); */ + /* MAD_F_ML0(hi, lo, X[0], MAD_F(0x0acf37ad)); */ + /* MAD_F_MLA(hi, lo, X[2], -MAD_F(0x0898c779)); */ + /* MAD_F_MLA(hi, lo, X[3], MAD_F(0x0e313245)); */ + /* MAD_F_MLA(hi, lo, X[5], -MAD_F(0x0f426cb5)); */ + /* MAD_F_MLA(hi, lo, X[6], -MAD_F(0x03768962)); */ + /* MAD_F_MLA(hi, lo, X[8], MAD_F(0x00b2aa3e)); */ + /* MAD_F_MLA(hi, lo, X[9], -MAD_F(0x0ffc19fd)); */ + /* MAD_F_MLA(hi, lo, X[11], MAD_F(0x0f9ee890)); */ + /* MAD_F_MLA(hi, lo, X[12], -MAD_F(0x04cfb0e2)); */ + /* MAD_F_MLA(hi, lo, X[14], MAD_F(0x07635284)); */ + /* MAD_F_MLA(hi, lo, X[15], MAD_F(0x0d7e8807)); */ + /* MAD_F_MLA(hi, lo, X[17], -MAD_F(0x0bcbe352)); */ + /* x[17] = -(x[0] = MAD_F_MLZ(hi, lo) + t2); */ + /* MAD_F_ML0(hi, lo, X[0], -MAD_F(0x0f9ee890)); */ + /* MAD_F_MLA(hi, lo, X[2], -MAD_F(0x07635284)); */ + /* MAD_F_MLA(hi, lo, X[3], -MAD_F(0x00b2aa3e)); */ + /* MAD_F_MLA(hi, lo, X[5], MAD_F(0x0bcbe352)); */ + /* MAD_F_MLA(hi, lo, X[6], MAD_F(0x0f426cb5)); */ + /* MAD_F_MLA(hi, lo, X[8], MAD_F(0x0d7e8807)); */ + /* MAD_F_MLA(hi, lo, X[9], MAD_F(0x0898c779)); */ + /* MAD_F_MLA(hi, lo, X[11], -MAD_F(0x04cfb0e2)); */ + /* MAD_F_MLA(hi, lo, X[12], -MAD_F(0x0acf37ad)); */ + /* MAD_F_MLA(hi, lo, X[14], -MAD_F(0x0ffc19fd)); */ + /* MAD_F_MLA(hi, lo, X[15], -MAD_F(0x0e313245)); */ + /* MAD_F_MLA(hi, lo, X[17], -MAD_F(0x03768962)); */ + /* x[24] = x[29] = MAD_F_MLZ(hi, lo) + t2; */ + "move.l #0x0acf37ad, %%d1\n" + "move.l #0x0f9ee890, %%d2\n" + "move.l #0x04cfb0e2, %%d3\n" + "mac.l %%d3, %%d0, %%acc0\n" + "mac.l %%d1, %%d0, %%acc1\n" + "msac.l %%d2, %%d0, (11*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d1, %%d0, %%acc0\n" + "mac.l %%d2, %%d0, %%acc1\n" + "msac.l %%d3, %%d0, (12*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d2, %%d0, %%acc0\n" + "msac.l %%d3, %%d0, %%acc1\n" + "msac.l %%d1, %%d0, ( 2*4, %[X]), %%d0, %%acc2\n" + "move.l #0x0ffc19fd, %%d1\n" + "move.l #0x0898c779, %%d2\n" + "move.l #0x07635284, %%d3\n" + "mac.l %%d1, %%d0, %%acc0\n" + "msac.l %%d2, %%d0, %%acc1\n" + "msac.l %%d3, %%d0, ( 9*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d3, %%d0, %%acc0\n" + "msac.l %%d1, %%d0, %%acc1\n" + "mac.l %%d2, %%d0, (14*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d2, %%d0, %%acc0\n" + "mac.l %%d3, %%d0, %%acc1\n" + "msac.l %%d1, %%d0, ( 3*4, %[X]), %%d0, %%acc2\n" + "move.l #0x0e313245, %%d1\n" + "move.l #0x00b2aa3e, %%d2\n" + "move.l #0x0d7e8807, %%d3\n" + "msac.l %%d3, %%d0, %%acc0\n" + "mac.l %%d1, %%d0, %%acc1\n" + "msac.l %%d2, %%d0, ( 8*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d1, %%d0, %%acc0\n" + "mac.l %%d2, %%d0, %%acc1\n" + "mac.l %%d3, %%d0, (15*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d2, %%d0, %%acc0\n" + "mac.l %%d3, %%d0, %%acc1\n" + "msac.l %%d1, %%d0, ( 5*4, %[X]), %%d0, %%acc2\n" + "move.l #0x03768962, %%d1\n" + "move.l #0x0f426cb5, %%d2\n" + "move.l #0x0bcbe352, %%d3\n" + "mac.l %%d1, %%d0, %%acc0\n" + "msac.l %%d2, %%d0, %%acc1\n" + "mac.l %%d3, %%d0, ( 6*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d3, %%d0, %%acc0\n" + "msac.l %%d1, %%d0, %%acc1\n" + "mac.l %%d2, %%d0, (17*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d2, %%d0, %%acc0\n" + "msac.l %%d3, %%d0, %%acc1\n" + "msac.l %%d1, %%d0, ( %[X]), %%d0, %%acc2\n" + + "movclr.l %%acc0, %%d6\n" + "asl.l #3, %%d6\n" + "move.l %%d6, ( 5*4, %[x])\n" + "neg.l %%d6\n" + "move.l %%d6, (12*4, %[x])\n" - "move.l #0x0f426cb5, %%d1\n\t" - "msac.l %%d1, %%d0, (2*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x00b2aa3e, %%d1\n\t" - "msac.l %%d1, %%d0, (3*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0898c779, %%d1\n\t" - "mac.l %%d1, %%d0, (5*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f9ee890, %%d1\n\t" - "mac.l %%d1, %%d0, (6*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0acf37ad, %%d1\n\t" - "mac.l %%d1, %%d0, (8*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x07635284, %%d1\n\t" - "msac.l %%d1, %%d0, (9*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0e313245, %%d1\n\t" - "msac.l %%d1, %%d0, (11*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0bcbe352, %%d1\n\t" - "msac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x03768962, %%d1\n\t" - "msac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0d7e8807, %%d1\n\t" - "mac.l %%d1, %%d0, (15*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0ffc19fd, %%d1\n\t" - "mac.l %%d1, %%d0, (17*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x04cfb0e2, %%d1\n\t" - "mac.l %%d1, %%d0, (%[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d6\n\t" - "asl.l #3, %%d6\n\t" - "add.l %%d7, %%d6\n\t" - "move.l %%d6, (23*4, %[x])\n\t" - "move.l %%d6, (30*4, %[x])\n\t" + "movclr.l %%acc1, %%d6\n" + "asl.l #3, %%d6\n" + "add.l %%d7, %%d6\n" + "move.l %%d6, ( %[x])\n" + "neg.l %%d6\n" + "move.l %%d6, (17*4, %[x])\n" - "move.l #0x0bcbe352, %%d1\n\t" - "msac.l %%d1, %%d0, (2*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0d7e8807, %%d1\n\t" - "mac.l %%d1, %%d0, (3*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x07635284, %%d1\n\t" - "msac.l %%d1, %%d0, (5*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x04cfb0e2, %%d1\n\t" - "mac.l %%d1, %%d0, (6*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f9ee890, %%d1\n\t" - "mac.l %%d1, %%d0, (8*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0ffc19fd, %%d1\n\t" - "msac.l %%d1, %%d0, (9*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x00b2aa3e, %%d1\n\t" - "msac.l %%d1, %%d0, (11*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x03768962, %%d1\n\t" - "mac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f426cb5, %%d1\n\t" - "msac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0e313245, %%d1\n\t" - "mac.l %%d1, %%d0, (15*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0898c779, %%d1\n\t" - "mac.l %%d1, %%d0, (17*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0acf37ad, %%d1\n\t" - "msac.l %%d1, %%d0, (4*4, %[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d6\n\t" - "asl.l #3, %%d6\n\t" - "sub.l %%d7, %%d6\n\t" - "move.l %%d6, (18*4, %[x])\n\t" - "move.l %%d6, (35*4, %[x])\n\t" + "movclr.l %%acc2, %%d6\n" + "asl.l #3, %%d6\n" + "add.l %%d7, %%d6\n" + "move.l %%d6, (24*4, %[x])\n" + "move.l %%d6, (29*4, %[x])\n" + + /* MAD_F_ML0(hi, lo, X[0], MAD_F(0x00b2aa3e)); */ + /* MAD_F_MLA(hi, lo, X[2], MAD_F(0x03768962)); */ + /* MAD_F_MLA(hi, lo, X[3], -MAD_F(0x04cfb0e2)); */ + /* MAD_F_MLA(hi, lo, X[5], -MAD_F(0x07635284)); */ + /* MAD_F_MLA(hi, lo, X[6], MAD_F(0x0898c779)); */ + /* MAD_F_MLA(hi, lo, X[8], MAD_F(0x0acf37ad)); */ + /* MAD_F_MLA(hi, lo, X[9], -MAD_F(0x0bcbe352)); */ + /* MAD_F_MLA(hi, lo, X[11], -MAD_F(0x0d7e8807)); */ + /* MAD_F_MLA(hi, lo, X[12], MAD_F(0x0e313245)); */ + /* MAD_F_MLA(hi, lo, X[14], MAD_F(0x0f426cb5)); */ + /* MAD_F_MLA(hi, lo, X[15], -MAD_F(0x0f9ee890)); */ + /* MAD_F_MLA(hi, lo, X[17], -MAD_F(0x0ffc19fd)); */ + /* x[9] = -(x[8] = MAD_F_MLZ(hi, lo) + t3); */ + + /* MAD_F_ML0(hi, lo, X[0], -MAD_F(0x0e313245)); */ + /* MAD_F_MLA(hi, lo, X[2], MAD_F(0x0bcbe352)); */ + /* MAD_F_MLA(hi, lo, X[3], MAD_F(0x0f9ee890)); */ + /* MAD_F_MLA(hi, lo, X[5], -MAD_F(0x0898c779)); */ + /* MAD_F_MLA(hi, lo, X[6], -MAD_F(0x0ffc19fd)); */ + /* MAD_F_MLA(hi, lo, X[8], MAD_F(0x04cfb0e2)); */ + /* MAD_F_MLA(hi, lo, X[9], MAD_F(0x0f426cb5)); */ + /* MAD_F_MLA(hi, lo, X[11], -MAD_F(0x00b2aa3e)); */ + /* MAD_F_MLA(hi, lo, X[12], -MAD_F(0x0d7e8807)); */ + /* MAD_F_MLA(hi, lo, X[14], -MAD_F(0x03768962)); */ + /* MAD_F_MLA(hi, lo, X[15], MAD_F(0x0acf37ad)); */ + /* MAD_F_MLA(hi, lo, X[17], MAD_F(0x07635284)); */ + /* x[21] = x[32] = MAD_F_MLZ(hi, lo) + t3; */ + + /* MAD_F_ML0(hi, lo, X[0], -MAD_F(0x0d7e8807)); */ + /* MAD_F_MLA(hi, lo, X[2], MAD_F(0x0f426cb5)); */ + /* MAD_F_MLA(hi, lo, X[3], MAD_F(0x0acf37ad)); */ + /* MAD_F_MLA(hi, lo, X[5], -MAD_F(0x0ffc19fd)); */ + /* MAD_F_MLA(hi, lo, X[6], -MAD_F(0x07635284)); */ + /* MAD_F_MLA(hi, lo, X[8], MAD_F(0x0f9ee890)); */ + /* MAD_F_MLA(hi, lo, X[9], MAD_F(0x03768962)); */ + /* MAD_F_MLA(hi, lo, X[11], -MAD_F(0x0e313245)); */ + /* MAD_F_MLA(hi, lo, X[12], MAD_F(0x00b2aa3e)); */ + /* MAD_F_MLA(hi, lo, X[14], MAD_F(0x0bcbe352)); */ + /* MAD_F_MLA(hi, lo, X[15], -MAD_F(0x04cfb0e2)); */ + /* MAD_F_MLA(hi, lo, X[17], -MAD_F(0x0898c779)); */ + /* x[20] = x[33] = MAD_F_MLZ(hi, lo) - t3; */ + "move.l #0x0e313245, %%d1\n" + "move.l #0x00b2aa3e, %%d2\n" + "move.l #0x0d7e8807, %%d3\n" + "mac.l %%d2, %%d0, %%acc0\n" + "msac.l %%d1, %%d0, %%acc1\n" + "msac.l %%d3, %%d0, (11*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d3, %%d0, %%acc0\n" + "msac.l %%d2, %%d0, %%acc1\n" + "msac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d1, %%d0, %%acc0\n" + "msac.l %%d3, %%d0, %%acc1\n" + "mac.l %%d2, %%d0, ( 2*4, %[X]), %%d0, %%acc2\n" + "move.l #0x03768962, %%d1\n" + "move.l #0x0f426cb5, %%d2\n" + "move.l #0x0bcbe352, %%d3\n" + "mac.l %%d1, %%d0, %%acc0\n" + "mac.l %%d3, %%d0, %%acc1\n" + "mac.l %%d2, %%d0, ( 9*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d3, %%d0, %%acc0\n" + "mac.l %%d2, %%d0, %%acc1\n" + "mac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d2, %%d0, %%acc0\n" + "msac.l %%d1, %%d0, %%acc1\n" + "mac.l %%d3, %%d0, ( 3*4, %[X]), %%d0, %%acc2\n" + "move.l #0x0acf37ad, %%d1\n" + "move.l #0x0f9ee890, %%d2\n" + "move.l #0x04cfb0e2, %%d3\n" + "msac.l %%d3, %%d0, %%acc0\n" + "mac.l %%d2, %%d0, %%acc1\n" + "mac.l %%d1, %%d0, ( 8*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d1, %%d0, %%acc0\n" + "mac.l %%d3, %%d0, %%acc1\n" + "mac.l %%d2, %%d0, (15*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d2, %%d0, %%acc0\n" + "mac.l %%d1, %%d0, %%acc1\n" + "msac.l %%d3, %%d0, ( 5*4, %[X]), %%d0, %%acc2\n" + "move.l #0x0ffc19fd, %%d1\n" + "move.l #0x0898c779, %%d2\n" + "move.l #0x07635284, %%d3\n" + "msac.l %%d3, %%d0, %%acc0\n" + "msac.l %%d2, %%d0, %%acc1\n" + "msac.l %%d1, %%d0, ( 6*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d2, %%d0, %%acc0\n" + "msac.l %%d1, %%d0, %%acc1\n" + "msac.l %%d3, %%d0, (17*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d1, %%d0, %%acc0\n" + "mac.l %%d3, %%d0, %%acc1\n" + "msac.l %%d2, %%d0, (12*4, %[t]), %%d0, %%acc2\n" + + "movclr.l %%acc0, %%d6\n" + "asl.l #3, %%d6\n" + "add.l %%d5, %%d6\n" + "move.l %%d6, (8*4, %[x])\n" + "neg.l %%d6\n" + "move.l %%d6, (9*4, %[x])\n" - "move.l #0x061f78aa, %%d1\n\t" - "mac.l %%d1, %%d0, (13*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0ec835e8, %%d1\n\t" - "msac.l %%d1, %%d0, (1*4, %[X]), %%d0, %%acc0\n\t" - "move.l %%acc0, %%d5\n\t" - "asl.l #3, %%d5\n\t" - "move.l %%d5, (7*4, %[t])\n\t" + "movclr.l %%acc1, %%d6\n" + "asl.l #3, %%d6\n" + "add.l %%d5, %%d6\n" + "move.l %%d6, (21*4, %[x])\n" + "move.l %%d6, (32*4, %[x])\n" - "move.l #0x0cb19346, %%d1\n\t" - "msac.l %%d1, %%d0, (7*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0fdcf549, %%d1\n\t" - "mac.l %%d1, %%d0, (10*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0216a2a2, %%d1\n\t" - "mac.l %%d1, %%d0, (16*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x09bd7ca0, %%d1\n\t" - "msac.l %%d1, %%d0, (%[X]), %%d0, %%acc0\n\t" - "move.l %%acc0, %%d7\n\t" - "asl.l #3, %%d7\n\t" - "move.l %%d7, (2*4, %[t])\n\t" + "movclr.l %%acc2, %%d6\n" + "asl.l #3, %%d6\n" + "sub.l %%d5, %%d6\n" + "move.l %%d6, (20*4, %[x])\n" + "move.l %%d6, (33*4, %[x])\n" - "move.l #0x04cfb0e2, %%d1\n\t" - "mac.l %%d1, %%d0, (2*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0ffc19fd, %%d1\n\t" - "mac.l %%d1, %%d0, (3*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0d7e8807, %%d1\n\t" - "msac.l %%d1, %%d0, (5*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x03768962, %%d1\n\t" - "mac.l %%d1, %%d0, (6*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0bcbe352, %%d1\n\t" - "msac.l %%d1, %%d0, (8*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0e313245, %%d1\n\t" - "msac.l %%d1, %%d0, (9*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x07635284, %%d1\n\t" - "mac.l %%d1, %%d0, (11*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0acf37ad, %%d1\n\t" - "msac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f9ee890, %%d1\n\t" - "mac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0898c779, %%d1\n\t" - "mac.l %%d1, %%d0, (15*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x00b2aa3e, %%d1\n\t" - "mac.l %%d1, %%d0, (17*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f426cb5, %%d1\n\t" - "mac.l %%d1, %%d0, (%[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d6\n\t" - "asl.l #3, %%d6\n\t" - "move.l %%d6, (5*4, %[x])\n\t" - "neg.l %%d6\n\t" - "move.l %%d6, (12*4, %[x])\n\t" + /* MAD_F_ML0(hi, lo, t12, -MAD_F(0x0ec835e8)); */ + /* MAD_F_MLA(hi, lo, t13, MAD_F(0x061f78aa)); */ + /* x[22] = x[31] = MAD_F_MLZ(hi, lo) + t0; */ + "move.l #0x0ec835e8, %%d1\n" + "move.l #0x061f78aa, %%d2\n" + "msac.l %%d1, %%d0, (13*4, %[t]), %%d3, %%acc0\n" + "mac.l %%d2, %%d3, ( 1*4, %[x]), %%d4, %%acc0\n" + "movclr.l %%acc0, %%d6\n" + "asl.l #3, %%d6\n" + "add.l (0*4, %[t]), %%d6\n" + "move.l %%d6, (22*4, %[x])\n" + "move.l %%d6, (31*4, %[x])\n" - "move.l #0x0acf37ad, %%d1\n\t" - "mac.l %%d1, %%d0, (2*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0898c779, %%d1\n\t" - "msac.l %%d1, %%d0, (3*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0e313245, %%d1\n\t" - "mac.l %%d1, %%d0, (5*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f426cb5, %%d1\n\t" - "msac.l %%d1, %%d0, (6*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x03768962, %%d1\n\t" - "msac.l %%d1, %%d0, (8*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x00b2aa3e, %%d1\n\t" - "mac.l %%d1, %%d0, (9*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0ffc19fd, %%d1\n\t" - "msac.l %%d1, %%d0, (11*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f9ee890, %%d1\n\t" - "mac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x04cfb0e2, %%d1\n\t" - "msac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x07635284, %%d1\n\t" - "mac.l %%d1, %%d0, (15*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0d7e8807, %%d1\n\t" - "mac.l %%d1, %%d0, (17*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0bcbe352, %%d1\n\t" - "msac.l %%d1, %%d0, (%[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d6\n\t" - "asl.l #3, %%d6\n\t" - "add.l %%d7, %%d6\n\t" - "move.l %%d6, (%[x])\n\t" - "neg.l %%d6\n\t" - "move.l %%d6, (17*4, %[x])\n\t" + /* MAD_F_ML0(hi, lo, t12, MAD_F(0x061f78aa)); */ + /* MAD_F_MLA(hi, lo, t13, MAD_F(0x0ec835e8)); */ + /* x[13] = -(x[4] = MAD_F_MLZ(hi, lo) + t4); */ + /* x[16] = -(x[1] = x[1] + t4); */ + /* x[25] = x[28] = x[25] + t4; */ + "mac.l %%d2, %%d0, (4*4, %[t]), %%d2, %%acc0\n" + "mac.l %%d1, %%d3, ( %[X]), %%d0, %%acc0\n" + "movclr.l %%acc0, %%d6\n" + "asl.l #3, %%d6\n" + "add.l %%d2, %%d6\n" + "move.l %%d6, ( 4*4, %[x])\n" + "neg.l %%d6\n" + "move.l %%d6, (13*4, %[x])\n" + + "add.l %%d2, %%d4\n" + "move.l %%d4, ( 1*4, %[x])\n" + "neg.l %%d4\n" + "move.l %%d4, (16*4, %[x])\n" + + "move.l (25*4, %[x]), %%d4\n" + "add.l %%d2, %%d4\n" + "move.l %%d4, (25*4, %[x])\n" + "move.l %%d4, (28*4, %[x])\n" + + /* MAD_F_ML0(hi, lo, X[0], MAD_F(0x0898c779)); */ + /* MAD_F_MLA(hi, lo, X[2], MAD_F(0x04cfb0e2)); */ + /* MAD_F_MLA(hi, lo, X[3], MAD_F(0x0bcbe352)); */ + /* MAD_F_MLA(hi, lo, X[5], MAD_F(0x00b2aa3e)); */ + /* MAD_F_MLA(hi, lo, X[6], MAD_F(0x0e313245)); */ + /* MAD_F_MLA(hi, lo, X[8], -MAD_F(0x03768962)); */ + /* MAD_F_MLA(hi, lo, X[9], MAD_F(0x0f9ee890)); */ + /* MAD_F_MLA(hi, lo, X[11], -MAD_F(0x07635284)); */ + /* MAD_F_MLA(hi, lo, X[12], MAD_F(0x0ffc19fd)); */ + /* MAD_F_MLA(hi, lo, X[14], -MAD_F(0x0acf37ad)); */ + /* MAD_F_MLA(hi, lo, X[15], MAD_F(0x0f426cb5)); */ + /* MAD_F_MLA(hi, lo, X[17], -MAD_F(0x0d7e8807)); */ + /* x[15] = -(x[2] = MAD_F_MLZ(hi, lo) + t5); */ + /* MAD_F_ML0(hi, lo, X[0], MAD_F(0x07635284)); */ + /* MAD_F_MLA(hi, lo, X[2], MAD_F(0x0acf37ad)); */ + /* MAD_F_MLA(hi, lo, X[3], MAD_F(0x03768962)); */ + /* MAD_F_MLA(hi, lo, X[5], MAD_F(0x0d7e8807)); */ + /* MAD_F_MLA(hi, lo, X[6], -MAD_F(0x00b2aa3e)); */ + /* MAD_F_MLA(hi, lo, X[8], MAD_F(0x0f426cb5)); */ + /* MAD_F_MLA(hi, lo, X[9], -MAD_F(0x04cfb0e2)); */ + /* MAD_F_MLA(hi, lo, X[11], MAD_F(0x0ffc19fd)); */ + /* MAD_F_MLA(hi, lo, X[12], -MAD_F(0x0898c779)); */ + /* MAD_F_MLA(hi, lo, X[14], MAD_F(0x0f9ee890)); */ + /* MAD_F_MLA(hi, lo, X[15], -MAD_F(0x0bcbe352)); */ + /* MAD_F_MLA(hi, lo, X[17], MAD_F(0x0e313245)); */ + /* x[14] = -(x[3] = MAD_F_MLZ(hi, lo) + t5); */ + /* MAD_F_ML0(hi, lo, X[0], -MAD_F(0x0ffc19fd)); */ + /* MAD_F_MLA(hi, lo, X[2], -MAD_F(0x0f9ee890)); */ + /* MAD_F_MLA(hi, lo, X[3], -MAD_F(0x0f426cb5)); */ + /* MAD_F_MLA(hi, lo, X[5], -MAD_F(0x0e313245)); */ + /* MAD_F_MLA(hi, lo, X[6], -MAD_F(0x0d7e8807)); */ + /* MAD_F_MLA(hi, lo, X[8], -MAD_F(0x0bcbe352)); */ + /* MAD_F_MLA(hi, lo, X[9], -MAD_F(0x0acf37ad)); */ + /* MAD_F_MLA(hi, lo, X[11], -MAD_F(0x0898c779)); */ + /* MAD_F_MLA(hi, lo, X[12], -MAD_F(0x07635284)); */ + /* MAD_F_MLA(hi, lo, X[14], -MAD_F(0x04cfb0e2)); */ + /* MAD_F_MLA(hi, lo, X[15], -MAD_F(0x03768962)); */ + /* MAD_F_MLA(hi, lo, X[17], -MAD_F(0x00b2aa3e)); */ + /* x[26] = x[27] = MAD_F_MLZ(hi, lo) + t5; */ + "move.l #0x0ffc19fd, %%d1\n" + "move.l #0x0898c779, %%d2\n" + "move.l #0x07635284, %%d3\n" + "mac.l %%d2, %%d0, %%acc0\n" + "mac.l %%d3, %%d0, %%acc1\n" + "msac.l %%d1, %%d0, (11*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d3, %%d0, %%acc0\n" + "mac.l %%d1, %%d0, %%acc1\n" + "msac.l %%d2, %%d0, (12*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d1, %%d0, %%acc0\n" + "msac.l %%d2, %%d0, %%acc1\n" + "msac.l %%d3, %%d0, ( 2*4, %[X]), %%d0, %%acc2\n" + "move.l #0x0acf37ad, %%d1\n" + "move.l #0x0f9ee890, %%d2\n" + "move.l #0x04cfb0e2, %%d3\n" + "mac.l %%d3, %%d0, %%acc0\n" + "mac.l %%d1, %%d0, %%acc1\n" + "msac.l %%d2, %%d0, ( 9*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d2, %%d0, %%acc0\n" + "msac.l %%d3, %%d0, %%acc1\n" + "msac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d1, %%d0, %%acc0\n" + "mac.l %%d2, %%d0, %%acc1\n" + "msac.l %%d3, %%d0, ( 3*4, %[X]), %%d0, %%acc2\n" + "move.l #0x03768962, %%d1\n" + "move.l #0x0f426cb5, %%d2\n" + "move.l #0x0bcbe352, %%d3\n" + "mac.l %%d3, %%d0, %%acc0\n" + "mac.l %%d1, %%d0, %%acc1\n" + "msac.l %%d2, %%d0, ( 8*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d1, %%d0, %%acc0\n" + "mac.l %%d2, %%d0, %%acc1\n" + "msac.l %%d3, %%d0, (15*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d2, %%d0, %%acc0\n" + "msac.l %%d3, %%d0, %%acc1\n" + "msac.l %%d1, %%d0, ( 5*4, %[X]), %%d0, %%acc2\n" + "move.l #0x0e313245, %%d1\n" + "move.l #0x00b2aa3e, %%d2\n" + "move.l #0x0d7e8807, %%d3\n" + "mac.l %%d2, %%d0, %%acc0\n" + "mac.l %%d3, %%d0, %%acc1\n" + "msac.l %%d1, %%d0, ( 6*4, %[X]), %%d0, %%acc2\n" + "mac.l %%d1, %%d0, %%acc0\n" + "msac.l %%d2, %%d0, %%acc1\n" + "msac.l %%d3, %%d0, (17*4, %[X]), %%d0, %%acc2\n" + "msac.l %%d3, %%d0, %%acc0\n" + "mac.l %%d1, %%d0, %%acc1\n" + "msac.l %%d2, %%d0, ( 5*4, %[t]), %%d6, %%acc2\n" + "movclr.l %%acc0, %%d7\n" + "asl.l #3, %%d7\n" + "add.l %%d6, %%d7\n" + "move.l %%d7, (2*4, %[x])\n" + "neg.l %%d7\n" + "move.l %%d7, (15*4, %[x])\n" - "move.l #0x0f9ee890, %%d1\n\t" - "msac.l %%d1, %%d0, (2*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x07635284, %%d1\n\t" - "msac.l %%d1, %%d0, (3*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x00b2aa3e, %%d1\n\t" - "msac.l %%d1, %%d0, (5*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0bcbe352, %%d1\n\t" - "mac.l %%d1, %%d0, (6*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f426cb5, %%d1\n\t" - "mac.l %%d1, %%d0, (8*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0d7e8807, %%d1\n\t" - "mac.l %%d1, %%d0, (9*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0898c779, %%d1\n\t" - "mac.l %%d1, %%d0, (11*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x04cfb0e2, %%d1\n\t" - "msac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0acf37ad, %%d1\n\t" - "msac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0ffc19fd, %%d1\n\t" - "msac.l %%d1, %%d0, (15*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0e313245, %%d1\n\t" - "msac.l %%d1, %%d0, (17*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x03768962, %%d1\n\t" - "msac.l %%d1, %%d0, (1*4, %[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d6\n\t" - "asl.l #3, %%d6\n\t" - "add.l %%d7, %%d6\n\t" - "move.l %%d6, (24*4, %[x])\n\t" - "move.l %%d6, (29*4, %[x])\n\t" + "movclr.l %%acc1, %%d7\n" + "asl.l #3, %%d7\n" + "add.l %%d6, %%d7\n" + "move.l %%d7, (3*4, %[x])\n" + "neg.l %%d7\n" + "move.l %%d7, (14*4, %[x])\n" - "move.l #0x0216a2a2, %%d1\n\t" - "msac.l %%d1, %%d0, (7*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x09bd7ca0, %%d1\n\t" - "msac.l %%d1, %%d0, (10*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0cb19346, %%d1\n\t" - "mac.l %%d1, %%d0, (16*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0fdcf549, %%d1\n\t" - "mac.l %%d1, %%d0, (%[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d7\n\t" - "asl.l #3, %%d7\n\t" - "add.l %%d5, %%d7\n\t" - "move.l %%d7, (3*4, %[t])\n\t" - - "move.l #0x00b2aa3e, %%d1\n\t" - "mac.l %%d1, %%d0, (2*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x03768962, %%d1\n\t" - "mac.l %%d1, %%d0, (3*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x04cfb0e2, %%d1\n\t" - "msac.l %%d1, %%d0, (5*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x07635284, %%d1\n\t" - "msac.l %%d1, %%d0, (6*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0898c779, %%d1\n\t" - "mac.l %%d1, %%d0, (8*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0acf37ad, %%d1\n\t" - "mac.l %%d1, %%d0, (9*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0bcbe352, %%d1\n\t" - "msac.l %%d1, %%d0, (11*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0d7e8807, %%d1\n\t" - "msac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0e313245, %%d1\n\t" - "mac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f426cb5, %%d1\n\t" - "mac.l %%d1, %%d0, (15*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f9ee890, %%d1\n\t" - "msac.l %%d1, %%d0, (17*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0ffc19fd, %%d1\n\t" - "msac.l %%d1, %%d0, (%[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d6\n\t" - "asl.l #3, %%d6\n\t" - "add.l %%d7, %%d6\n\t" - "move.l %%d6, (8*4, %[x])\n\t" - "neg.l %%d6\n\t" - "move.l %%d6, (9*4, %[x])\n\t" - - "move.l #0x0e313245, %%d1\n\t" - "msac.l %%d1, %%d0, (2*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0bcbe352, %%d1\n\t" - "mac.l %%d1, %%d0, (3*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f9ee890, %%d1\n\t" - "mac.l %%d1, %%d0, (5*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0898c779, %%d1\n\t" - "msac.l %%d1, %%d0, (6*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0ffc19fd, %%d1\n\t" - "msac.l %%d1, %%d0, (8*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x04cfb0e2, %%d1\n\t" - "mac.l %%d1, %%d0, (9*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f426cb5, %%d1\n\t" - "mac.l %%d1, %%d0, (11*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x00b2aa3e, %%d1\n\t" - "msac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0d7e8807, %%d1\n\t" - "msac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x03768962, %%d1\n\t" - "msac.l %%d1, %%d0, (15*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0acf37ad, %%d1\n\t" - "mac.l %%d1, %%d0, (17*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x07635284, %%d1\n\t" - "mac.l %%d1, %%d0, (%[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d6\n\t" - "asl.l #3, %%d6\n\t" - "add.l %%d7, %%d6\n\t" - "move.l %%d6, (21*4, %[x])\n\t" - "move.l %%d6, (32*4, %[x])\n\t" - - "move.l #0x0d7e8807, %%d1\n\t" - "msac.l %%d1, %%d0, (2*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f426cb5, %%d1\n\t" - "mac.l %%d1, %%d0, (3*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0acf37ad, %%d1\n\t" - "mac.l %%d1, %%d0, (5*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0ffc19fd, %%d1\n\t" - "msac.l %%d1, %%d0, (6*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x07635284, %%d1\n\t" - "msac.l %%d1, %%d0, (8*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f9ee890, %%d1\n\t" - "mac.l %%d1, %%d0, (9*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x03768962, %%d1\n\t" - "mac.l %%d1, %%d0, (11*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0e313245, %%d1\n\t" - "msac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x00b2aa3e, %%d1\n\t" - "mac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0bcbe352, %%d1\n\t" - "mac.l %%d1, %%d0, (15*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x04cfb0e2, %%d1\n\t" - "msac.l %%d1, %%d0, (17*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0898c779, %%d1\n\t" - "msac.l %%d1, %%d0, (14*4, %[t]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d6\n\t" - "asl.l #3, %%d6\n\t" - "sub.l %%d7, %%d6\n\t" - "move.l %%d6, (20*4, %[x])\n\t" - "move.l %%d6, (33*4, %[x])\n\t" - - "move.l #0x0ec835e8, %%d1\n\t" - "msac.l %%d1, %%d0, (15*4, %[t]), %%d0, %%acc0\n\t" - "move.l #0x061f78aa, %%d1\n\t" - "mac.l %%d1, %%d0, (12*4, %[t]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d6\n\t" - "asl.l #3, %%d6\n\t" - "sub.l %%d5, %%d6\n\t" - "move.l %%d6, (4*4, %[t])\n\t" - - "move.l #0x061f78aa, %%d1\n\t" - "mac.l %%d1, %%d0, (13*4, %[t]), %%d0, %%acc0\n\t" - "move.l #0x0ec835e8, %%d1\n\t" - "mac.l %%d1, %%d0, (8*4, %[t]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d7\n\t" /* don't need t7 anymore */ - "asl.l #3, %%d7\n\t" - "add.l %%d6, %%d7\n\t" - "move.l %%d7, (4*4, %[x])\n\t" - "neg.l %%d7\n\t" - "move.l %%d7, (13*4, %[x])\n\t" - - "move.l #0x09bd7ca0, %%d1\n\t" - "mac.l %%d1, %%d0, (9*4, %[t]), %%d0, %%acc0\n\t" - "move.l #0x0216a2a2, %%d1\n\t" - "msac.l %%d1, %%d0, (10*4, %[t]), %%d0, %%acc0\n\t" - "move.l #0x0fdcf549, %%d1\n\t" - "mac.l %%d1, %%d0, (11*4, %[t]), %%d0, %%acc0\n\t" - "move.l #0x0cb19346, %%d1\n\t" - "msac.l %%d1, %%d0, (8*4, %[t]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d7\n\t" - "asl.l #3, %%d7\n\t" - "add.l %%d6, %%d7\n\t" - "move.l %%d7, (1*4, %[x])\n\t" - "neg.l %%d7\n\t" - "move.l %%d7, (16*4, %[x])\n\t" - - "move.l #0x0fdcf549, %%d1\n\t" - "msac.l %%d1, %%d0, (9*4, %[t]), %%d0, %%acc0\n\t" - "move.l #0x0cb19346, %%d1\n\t" - "msac.l %%d1, %%d0, (10*4, %[t]), %%d0, %%acc0\n\t" - "move.l #0x09bd7ca0, %%d1\n\t" - "msac.l %%d1, %%d0, (11*4, %[t]), %%d0, %%acc0\n\t" - "move.l #0x0216a2a2, %%d1\n\t" - "msac.l %%d1, %%d0, (1*4, %[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d7\n\t" - "asl.l #3, %%d7\n\t" - "add.l %%d6, %%d7\n\t" - "move.l %%d7, (25*4, %[x])\n\t" - "move.l %%d7, (28*4, %[x])\n\t" - - "move.l #0x0fdcf549, %%d1\n\t" - "msac.l %%d1, %%d0, (7*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0cb19346, %%d1\n\t" - "msac.l %%d1, %%d0, (10*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x09bd7ca0, %%d1\n\t" - "msac.l %%d1, %%d0, (16*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0216a2a2, %%d1\n\t" - "msac.l %%d1, %%d0, (%[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d6\n\t" - "asl.l #3, %%d6\n\t" - "sub.l (6*4, %[t]), %%d6\n\t" - "move.l %%d6, (5*4, %[t])\n\t" - - "move.l #0x0898c779, %%d1\n\t" - "mac.l %%d1, %%d0, (2*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x04cfb0e2, %%d1\n\t" - "mac.l %%d1, %%d0, (3*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0bcbe352, %%d1\n\t" - "mac.l %%d1, %%d0, (5*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x00b2aa3e, %%d1\n\t" - "mac.l %%d1, %%d0, (6*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0e313245, %%d1\n\t" - "mac.l %%d1, %%d0, (8*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x03768962, %%d1\n\t" - "msac.l %%d1, %%d0, (9*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f9ee890, %%d1\n\t" - "mac.l %%d1, %%d0, (11*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x07635284, %%d1\n\t" - "msac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0ffc19fd, %%d1\n\t" - "mac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0acf37ad, %%d1\n\t" - "msac.l %%d1, %%d0, (15*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f426cb5, %%d1\n\t" - "mac.l %%d1, %%d0, (17*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0d7e8807, %%d1\n\t" - "msac.l %%d1, %%d0, (%[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d7\n\t" - "asl.l #3, %%d7\n\t" - "add.l %%d6, %%d7\n\t" - "move.l %%d7, (2*4, %[x])\n\t" - "neg.l %%d7\n\t" - "move.l %%d7, (15*4, %[x])\n\t" - - "move.l #0x07635284, %%d1\n\t" - "mac.l %%d1, %%d0, (2*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0acf37ad, %%d1\n\t" - "mac.l %%d1, %%d0, (3*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x03768962, %%d1\n\t" - "mac.l %%d1, %%d0, (5*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0d7e8807, %%d1\n\t" - "mac.l %%d1, %%d0, (6*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x00b2aa3e, %%d1\n\t" - "msac.l %%d1, %%d0, (8*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f426cb5, %%d1\n\t" - "mac.l %%d1, %%d0, (9*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x04cfb0e2, %%d1\n\t" - "msac.l %%d1, %%d0, (11*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0ffc19fd, %%d1\n\t" - "mac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0898c779, %%d1\n\t" - "msac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f9ee890, %%d1\n\t" - "mac.l %%d1, %%d0, (15*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0bcbe352, %%d1\n\t" - "msac.l %%d1, %%d0, (17*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0e313245, %%d1\n\t" - "mac.l %%d1, %%d0, (%[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d7\n\t" - "asl.l #3, %%d7\n\t" - "add.l %%d6, %%d7\n\t" - "move.l %%d7, (3*4, %[x])\n\t" - "neg.l %%d7\n\t" - "move.l %%d7, (14*4, %[x])\n\t" - - "move.l #0x0ffc19fd, %%d1\n\t" - "msac.l %%d1, %%d0, (2*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f9ee890, %%d1\n\t" - "msac.l %%d1, %%d0, (3*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0f426cb5, %%d1\n\t" - "msac.l %%d1, %%d0, (5*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0e313245, %%d1\n\t" - "msac.l %%d1, %%d0, (6*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0d7e8807, %%d1\n\t" - "msac.l %%d1, %%d0, (8*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0bcbe352, %%d1\n\t" - "msac.l %%d1, %%d0, (9*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0acf37ad, %%d1\n\t" - "msac.l %%d1, %%d0, (11*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x0898c779, %%d1\n\t" - "msac.l %%d1, %%d0, (12*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x07635284, %%d1\n\t" - "msac.l %%d1, %%d0, (14*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x04cfb0e2, %%d1\n\t" - "msac.l %%d1, %%d0, (15*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x03768962, %%d1\n\t" - "msac.l %%d1, %%d0, (17*4, %[X]), %%d0, %%acc0\n\t" - "move.l #0x00b2aa3e, %%d1\n\t" - "msac.l %%d1, %%d0, (%[X]), %%d0, %%acc0\n\t" - "movclr.l %%acc0, %%d7\n\t" - "asl.l #3, %%d7\n\t" - "add.l %%d6, %%d7\n\t" - "move.l %%d7, (26*4, %[x])\n\t" - "move.l %%d7, (27*4, %[x])\n\t" + "movclr.l %%acc2, %%d7\n" + "asl.l #3, %%d7\n" + "add.l %%d6, %%d7\n" + "move.l %%d7, (26*4, %[x])\n" + "move.l %%d7, (27*4, %[x])\n" + : : [X] "a" (X), [x] "a" (x), [t] "a" (t) : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"); /* pfew */ @@ -2355,46 +2470,63 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) register mad_fixed64hi_t hi; register mad_fixed64lo_t lo; + MAD_F_ML0(hi, lo, (t14 = X[1] - X[10]), -MAD_F(0x0ec835e8)); + MAD_F_MLA(hi, lo, (t15 = X[7] + X[16]), MAD_F(0x061f78aa)); + t4 = MAD_F_MLZ(hi, lo); + MAD_F_ML0(hi, lo, X[4], MAD_F(0x0ec835e8)); MAD_F_MLA(hi, lo, X[13], MAD_F(0x061f78aa)); - t6 = MAD_F_MLZ(hi, lo); - MAD_F_MLA(hi, lo, (t14 = X[1] - X[10]), -MAD_F(0x061f78aa)); - MAD_F_MLA(hi, lo, (t15 = X[7] + X[16]), -MAD_F(0x0ec835e8)); - + MAD_F_MLA(hi, lo, t14, -MAD_F(0x061f78aa)); + MAD_F_MLA(hi, lo, t15, -MAD_F(0x0ec835e8)); t0 = MAD_F_MLZ(hi, lo); - MAD_F_MLA(hi, lo, (t8 = X[0] - X[11] - X[12]), MAD_F(0x0216a2a2)); - MAD_F_MLA(hi, lo, (t9 = X[2] - X[9] - X[14]), MAD_F(0x09bd7ca0)); - MAD_F_MLA(hi, lo, (t10 = X[3] - X[8] - X[15]), -MAD_F(0x0cb19346)); - MAD_F_MLA(hi, lo, (t11 = X[5] - X[6] - X[17]), -MAD_F(0x0fdcf549)); - - x[7] = MAD_F_MLZ(hi, lo); - x[10] = -x[7]; + MAD_F_MLA(hi, lo, (t8 =X[0]-X[11]-X[12]), MAD_F(0x0216a2a2)); + MAD_F_MLA(hi, lo, (t9 =X[2]-X[ 9]-X[14]), MAD_F(0x09bd7ca0)); + MAD_F_MLA(hi, lo, (t10=X[3]-X[ 8]-X[15]), -MAD_F(0x0cb19346)); + MAD_F_MLA(hi, lo, (t11=X[5]-X[ 6]-X[17]), -MAD_F(0x0fdcf549)); + x[10] = -(x[7] = MAD_F_MLZ(hi, lo)); MAD_F_ML0(hi, lo, t8, -MAD_F(0x0cb19346)); MAD_F_MLA(hi, lo, t9, MAD_F(0x0fdcf549)); MAD_F_MLA(hi, lo, t10, MAD_F(0x0216a2a2)); MAD_F_MLA(hi, lo, t11, -MAD_F(0x09bd7ca0)); - x[19] = x[34] = MAD_F_MLZ(hi, lo) - t0; - t12 = X[0] - X[3] + X[8] - X[11] - X[12] + X[15]; - t13 = X[2] + X[5] - X[6] - X[9] - X[14] - X[17]; + MAD_F_ML0(hi, lo, t8, MAD_F(0x09bd7ca0)); + MAD_F_MLA(hi, lo, t9, -MAD_F(0x0216a2a2)); + MAD_F_MLA(hi, lo, t10, MAD_F(0x0fdcf549)); + MAD_F_MLA(hi, lo, t11, -MAD_F(0x0cb19346)); + x[ 1] = MAD_F_MLZ(hi, lo); - MAD_F_ML0(hi, lo, t12, -MAD_F(0x0ec835e8)); - MAD_F_MLA(hi, lo, t13, MAD_F(0x061f78aa)); + MAD_F_ML0(hi, lo, t8, -MAD_F(0x0fdcf549)); + MAD_F_MLA(hi, lo, t9, -MAD_F(0x0cb19346)); + MAD_F_MLA(hi, lo, t10, -MAD_F(0x09bd7ca0)); + MAD_F_MLA(hi, lo, t11, -MAD_F(0x0216a2a2)); + x[25] = MAD_F_MLZ(hi, lo); - x[22] = x[31] = MAD_F_MLZ(hi, lo) + t0; + t12 = t8 - t10; + t13 = t9 + t11; MAD_F_ML0(hi, lo, X[1], -MAD_F(0x09bd7ca0)); MAD_F_MLA(hi, lo, X[7], MAD_F(0x0216a2a2)); MAD_F_MLA(hi, lo, X[10], -MAD_F(0x0fdcf549)); MAD_F_MLA(hi, lo, X[16], MAD_F(0x0cb19346)); - t1 = MAD_F_MLZ(hi, lo) + t6; + MAD_F_ML0(hi, lo, X[1], -MAD_F(0x0216a2a2)); + MAD_F_MLA(hi, lo, X[7], -MAD_F(0x09bd7ca0)); + MAD_F_MLA(hi, lo, X[10], MAD_F(0x0cb19346)); + MAD_F_MLA(hi, lo, X[16], MAD_F(0x0fdcf549)); + t3 = MAD_F_MLZ(hi, lo); + + MAD_F_ML0(hi, lo, X[1], -MAD_F(0x0fdcf549)); + MAD_F_MLA(hi, lo, X[7], -MAD_F(0x0cb19346)); + MAD_F_MLA(hi, lo, X[10], -MAD_F(0x09bd7ca0)); + MAD_F_MLA(hi, lo, X[16], -MAD_F(0x0216a2a2)); + t5 = MAD_F_MLZ(hi, lo) - t6; + MAD_F_ML0(hi, lo, X[0], MAD_F(0x03768962)); MAD_F_MLA(hi, lo, X[2], MAD_F(0x0e313245)); MAD_F_MLA(hi, lo, X[3], -MAD_F(0x0ffc19fd)); @@ -2407,9 +2539,7 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) MAD_F_MLA(hi, lo, X[14], MAD_F(0x00b2aa3e)); MAD_F_MLA(hi, lo, X[15], -MAD_F(0x07635284)); MAD_F_MLA(hi, lo, X[17], -MAD_F(0x0f9ee890)); - - x[6] = MAD_F_MLZ(hi, lo) + t1; - x[11] = -x[6]; + x[11] = -(x[6] = MAD_F_MLZ(hi, lo) + t1); MAD_F_ML0(hi, lo, X[0], -MAD_F(0x0f426cb5)); MAD_F_MLA(hi, lo, X[2], -MAD_F(0x00b2aa3e)); @@ -2423,7 +2553,6 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) MAD_F_MLA(hi, lo, X[14], MAD_F(0x0d7e8807)); MAD_F_MLA(hi, lo, X[15], MAD_F(0x0ffc19fd)); MAD_F_MLA(hi, lo, X[17], MAD_F(0x04cfb0e2)); - x[23] = x[30] = MAD_F_MLZ(hi, lo) + t1; MAD_F_ML0(hi, lo, X[0], -MAD_F(0x0bcbe352)); @@ -2438,19 +2567,17 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) MAD_F_MLA(hi, lo, X[14], MAD_F(0x0e313245)); MAD_F_MLA(hi, lo, X[15], MAD_F(0x0898c779)); MAD_F_MLA(hi, lo, X[17], -MAD_F(0x0acf37ad)); - x[18] = x[35] = MAD_F_MLZ(hi, lo) - t1; MAD_F_ML0(hi, lo, X[4], MAD_F(0x061f78aa)); MAD_F_MLA(hi, lo, X[13], -MAD_F(0x0ec835e8)); - - t7 = MAD_F_MLZ(hi, lo); + t3+= (t7 = MAD_F_MLZ(hi, lo)); + t4-= t7; MAD_F_MLA(hi, lo, X[1], -MAD_F(0x0cb19346)); MAD_F_MLA(hi, lo, X[7], MAD_F(0x0fdcf549)); MAD_F_MLA(hi, lo, X[10], MAD_F(0x0216a2a2)); MAD_F_MLA(hi, lo, X[16], -MAD_F(0x09bd7ca0)); - t2 = MAD_F_MLZ(hi, lo); MAD_F_MLA(hi, lo, X[0], MAD_F(0x04cfb0e2)); @@ -2465,9 +2592,7 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) MAD_F_MLA(hi, lo, X[14], MAD_F(0x0898c779)); MAD_F_MLA(hi, lo, X[15], MAD_F(0x00b2aa3e)); MAD_F_MLA(hi, lo, X[17], MAD_F(0x0f426cb5)); - - x[5] = MAD_F_MLZ(hi, lo); - x[12] = -x[5]; + x[12] = -(x[5] = MAD_F_MLZ(hi, lo)); MAD_F_ML0(hi, lo, X[0], MAD_F(0x0acf37ad)); MAD_F_MLA(hi, lo, X[2], -MAD_F(0x0898c779)); @@ -2481,9 +2606,7 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) MAD_F_MLA(hi, lo, X[14], MAD_F(0x07635284)); MAD_F_MLA(hi, lo, X[15], MAD_F(0x0d7e8807)); MAD_F_MLA(hi, lo, X[17], -MAD_F(0x0bcbe352)); - - x[0] = MAD_F_MLZ(hi, lo) + t2; - x[17] = -x[0]; + x[17] = -(x[0] = MAD_F_MLZ(hi, lo) + t2); MAD_F_ML0(hi, lo, X[0], -MAD_F(0x0f9ee890)); MAD_F_MLA(hi, lo, X[2], -MAD_F(0x07635284)); @@ -2497,16 +2620,8 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) MAD_F_MLA(hi, lo, X[14], -MAD_F(0x0ffc19fd)); MAD_F_MLA(hi, lo, X[15], -MAD_F(0x0e313245)); MAD_F_MLA(hi, lo, X[17], -MAD_F(0x03768962)); - x[24] = x[29] = MAD_F_MLZ(hi, lo) + t2; - MAD_F_ML0(hi, lo, X[1], -MAD_F(0x0216a2a2)); - MAD_F_MLA(hi, lo, X[7], -MAD_F(0x09bd7ca0)); - MAD_F_MLA(hi, lo, X[10], MAD_F(0x0cb19346)); - MAD_F_MLA(hi, lo, X[16], MAD_F(0x0fdcf549)); - - t3 = MAD_F_MLZ(hi, lo) + t7; - MAD_F_ML0(hi, lo, X[0], MAD_F(0x00b2aa3e)); MAD_F_MLA(hi, lo, X[2], MAD_F(0x03768962)); MAD_F_MLA(hi, lo, X[3], -MAD_F(0x04cfb0e2)); @@ -2519,9 +2634,7 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) MAD_F_MLA(hi, lo, X[14], MAD_F(0x0f426cb5)); MAD_F_MLA(hi, lo, X[15], -MAD_F(0x0f9ee890)); MAD_F_MLA(hi, lo, X[17], -MAD_F(0x0ffc19fd)); - - x[8] = MAD_F_MLZ(hi, lo) + t3; - x[9] = -x[8]; + x[9] = -(x[8] = MAD_F_MLZ(hi, lo) + t3); MAD_F_ML0(hi, lo, X[0], -MAD_F(0x0e313245)); MAD_F_MLA(hi, lo, X[2], MAD_F(0x0bcbe352)); @@ -2535,7 +2648,6 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) MAD_F_MLA(hi, lo, X[14], -MAD_F(0x03768962)); MAD_F_MLA(hi, lo, X[15], MAD_F(0x0acf37ad)); MAD_F_MLA(hi, lo, X[17], MAD_F(0x07635284)); - x[21] = x[32] = MAD_F_MLZ(hi, lo) + t3; MAD_F_ML0(hi, lo, X[0], -MAD_F(0x0d7e8807)); @@ -2550,41 +2662,17 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) MAD_F_MLA(hi, lo, X[14], MAD_F(0x0bcbe352)); MAD_F_MLA(hi, lo, X[15], -MAD_F(0x04cfb0e2)); MAD_F_MLA(hi, lo, X[17], -MAD_F(0x0898c779)); - x[20] = x[33] = MAD_F_MLZ(hi, lo) - t3; - MAD_F_ML0(hi, lo, t14, -MAD_F(0x0ec835e8)); - MAD_F_MLA(hi, lo, t15, MAD_F(0x061f78aa)); - - t4 = MAD_F_MLZ(hi, lo) - t7; + MAD_F_ML0(hi, lo, t12, -MAD_F(0x0ec835e8)); + MAD_F_MLA(hi, lo, t13, MAD_F(0x061f78aa)); + x[22] = x[31] = MAD_F_MLZ(hi, lo) + t0; MAD_F_ML0(hi, lo, t12, MAD_F(0x061f78aa)); MAD_F_MLA(hi, lo, t13, MAD_F(0x0ec835e8)); - - x[4] = MAD_F_MLZ(hi, lo) + t4; - x[13] = -x[4]; - - MAD_F_ML0(hi, lo, t8, MAD_F(0x09bd7ca0)); - MAD_F_MLA(hi, lo, t9, -MAD_F(0x0216a2a2)); - MAD_F_MLA(hi, lo, t10, MAD_F(0x0fdcf549)); - MAD_F_MLA(hi, lo, t11, -MAD_F(0x0cb19346)); - - x[1] = MAD_F_MLZ(hi, lo) + t4; - x[16] = -x[1]; - - MAD_F_ML0(hi, lo, t8, -MAD_F(0x0fdcf549)); - MAD_F_MLA(hi, lo, t9, -MAD_F(0x0cb19346)); - MAD_F_MLA(hi, lo, t10, -MAD_F(0x09bd7ca0)); - MAD_F_MLA(hi, lo, t11, -MAD_F(0x0216a2a2)); - - x[25] = x[28] = MAD_F_MLZ(hi, lo) + t4; - - MAD_F_ML0(hi, lo, X[1], -MAD_F(0x0fdcf549)); - MAD_F_MLA(hi, lo, X[7], -MAD_F(0x0cb19346)); - MAD_F_MLA(hi, lo, X[10], -MAD_F(0x09bd7ca0)); - MAD_F_MLA(hi, lo, X[16], -MAD_F(0x0216a2a2)); - - t5 = MAD_F_MLZ(hi, lo) - t6; + x[13] = -(x[4] = MAD_F_MLZ(hi, lo) + t4); + x[16] = -(x[1] = x[1] + t4); + x[25] = x[28] = x[25] + t4; MAD_F_ML0(hi, lo, X[0], MAD_F(0x0898c779)); MAD_F_MLA(hi, lo, X[2], MAD_F(0x04cfb0e2)); @@ -2598,9 +2686,7 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) MAD_F_MLA(hi, lo, X[14], -MAD_F(0x0acf37ad)); MAD_F_MLA(hi, lo, X[15], MAD_F(0x0f426cb5)); MAD_F_MLA(hi, lo, X[17], -MAD_F(0x0d7e8807)); - - x[2] = MAD_F_MLZ(hi, lo) + t5; - x[15] = -x[2]; + x[15] = -(x[2] = MAD_F_MLZ(hi, lo) + t5); MAD_F_ML0(hi, lo, X[0], MAD_F(0x07635284)); MAD_F_MLA(hi, lo, X[2], MAD_F(0x0acf37ad)); @@ -2614,9 +2700,7 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) MAD_F_MLA(hi, lo, X[14], MAD_F(0x0f9ee890)); MAD_F_MLA(hi, lo, X[15], -MAD_F(0x0bcbe352)); MAD_F_MLA(hi, lo, X[17], MAD_F(0x0e313245)); - - x[3] = MAD_F_MLZ(hi, lo) + t5; - x[14] = -x[3]; + x[14] = -(x[3] = MAD_F_MLZ(hi, lo) + t5); MAD_F_ML0(hi, lo, X[0], -MAD_F(0x0ffc19fd)); MAD_F_MLA(hi, lo, X[2], -MAD_F(0x0f9ee890)); @@ -2630,7 +2714,6 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36]) MAD_F_MLA(hi, lo, X[14], -MAD_F(0x04cfb0e2)); MAD_F_MLA(hi, lo, X[15], -MAD_F(0x03768962)); MAD_F_MLA(hi, lo, X[17], -MAD_F(0x00b2aa3e)); - x[26] = x[27] = MAD_F_MLZ(hi, lo) + t5; } #endif /* CPU_COLDFIRE */ diff --git a/apps/codecs/libmad/synth.c b/apps/codecs/libmad/synth.c index e6955d6af9..2da8f64db7 100644 --- a/apps/codecs/libmad/synth.c +++ b/apps/codecs/libmad/synth.c @@ -580,35 +580,138 @@ static void synth_full(struct mad_synth *synth, struct mad_frame const *frame, unsigned int nch, unsigned int ns) { - unsigned int phase, ch, s, sb, pe, po; - mad_fixed_t *pcm1, *pcm2, (*filter)[2][2][16][8]; + unsigned int phase, ch, s, sb, p; + mad_fixed_t *pcm, (*filter)[2][2][16][8]; mad_fixed_t const (*sbsample)[36][32]; mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8]; - mad_fixed_t const (*Dptr)[32]; - mad_fixed64hi_t hi; + mad_fixed_t const (*D0ptr)[32]; + mad_fixed_t const (*D1ptr)[32]; + mad_fixed64hi_t hi0, hi1; for (ch = 0; ch < nch; ++ch) { sbsample = &frame->sbsample[ch]; filter = &synth->filter[ch]; phase = synth->phase; - pcm1 = synth->pcm.samples[ch]; + pcm = synth->pcm.samples[ch]; for (s = 0; s < ns; ++s) { dct32((*sbsample)[s], phase >> 1, (*filter)[0][phase & 1], (*filter)[1][phase & 1]); - pe = phase & ~1; - po = ((phase - 1) & 0xf) | 1; + p = (phase - 1) & 0xf; /* calculate 32 samples */ - fe = &(*filter)[0][ phase & 1][0]; fx = &(*filter)[0][~phase & 1][0]; fo = &(*filter)[1][~phase & 1][0]; - Dptr = &D[0]; + D0ptr = (void*)&D[0][ p]; + D1ptr = (void*)&D[0][-p]; - asm volatile( + if(s & 1) + { + asm volatile( + "movem.l (%1), %%d0-%%d7\n\t" + "move.l 4(%2), %%a5\n\t" + "msac.l %%d0, %%a5, 60(%2), %%a5, %%acc0\n\t" + "msac.l %%d1, %%a5, 52(%2), %%a5, %%acc0\n\t" + "msac.l %%d2, %%a5, 44(%2), %%a5, %%acc0\n\t" + "msac.l %%d3, %%a5, 36(%2), %%a5, %%acc0\n\t" + "msac.l %%d4, %%a5, 28(%2), %%a5, %%acc0\n\t" + "msac.l %%d5, %%a5, 20(%2), %%a5, %%acc0\n\t" + "msac.l %%d6, %%a5, 12(%2), %%a5, %%acc0\n\t" + "msac.l %%d7, %%a5, (%2), %%a5, %%acc0\n\t" + + "movem.l (%3), %%d0-%%d7\n\t" + "mac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t" + "mac.l %%d1, %%a5, 48(%2), %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, 40(%2), %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t" + "mac.l %%d4, %%a5, 24(%2), %%a5, %%acc0\n\t" + "mac.l %%d5, %%a5, 16(%2), %%a5, %%acc0\n\t" + "mac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t" + "mac.l %%d7, %%a5, %%acc0\n\t" + "movclr.l %%acc0, %0\n\t" + : "=r" (hi0) : "a" (*fx), "a" (*D0ptr), "a" (*fe) + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5"); + + pcm[0] = hi0 << 3; /* shift result to libmad's fixed point format */ + pcm += 16; + + for (sb = 15; sb; sb--, fo++) { + ++fe; + ++D0ptr; + ++D1ptr; + + /* D[32 - sb][i] == -D[sb][31 - i] */ + asm volatile ( + "movem.l (%0), %%d0-%%d7\n\t" + "move.l 4(%2), %%a5\n\t" + "msac.l %%d0, %%a5, 60(%2), %%a5, %%acc0\n\t" + "msac.l %%d1, %%a5, 52(%2), %%a5, %%acc0\n\t" + "msac.l %%d2, %%a5, 44(%2), %%a5, %%acc0\n\t" + "msac.l %%d3, %%a5, 36(%2), %%a5, %%acc0\n\t" + "msac.l %%d4, %%a5, 28(%2), %%a5, %%acc0\n\t" + "msac.l %%d5, %%a5, 20(%2), %%a5, %%acc0\n\t" + "msac.l %%d6, %%a5, 12(%2), %%a5, %%acc0\n\t" + "msac.l %%d7, %%a5, 112(%3), %%a5, %%acc0\n\t" + "mac.l %%d7, %%a5, 104(%3), %%a5, %%acc1\n\t" + "mac.l %%d6, %%a5, 96(%3), %%a5, %%acc1\n\t" + "mac.l %%d5, %%a5, 88(%3), %%a5, %%acc1\n\t" + "mac.l %%d4, %%a5, 80(%3), %%a5, %%acc1\n\t" + "mac.l %%d3, %%a5, 72(%3), %%a5, %%acc1\n\t" + "mac.l %%d2, %%a5, 64(%3), %%a5, %%acc1\n\t" + "mac.l %%d1, %%a5, 120(%3), %%a5, %%acc1\n\t" + "mac.l %%d0, %%a5, 8(%2), %%a5, %%acc1\n\t" + "movem.l (%1), %%d0-%%d7\n\t" + "mac.l %%d7, %%a5, 16(%2), %%a5, %%acc0\n\t" + "mac.l %%d6, %%a5, 24(%2), %%a5, %%acc0\n\t" + "mac.l %%d5, %%a5, 32(%2), %%a5, %%acc0\n\t" + "mac.l %%d4, %%a5, 40(%2), %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, 48(%2), %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, 56(%2), %%a5, %%acc0\n\t" + "mac.l %%d1, %%a5, (%2), %%a5, %%acc0\n\t" + "mac.l %%d0, %%a5, 60(%3), %%a5, %%acc0\n\t" + "mac.l %%d0, %%a5, 68(%3), %%a5, %%acc1\n\t" + "mac.l %%d1, %%a5, 76(%3), %%a5, %%acc1\n\t" + "mac.l %%d2, %%a5, 84(%3), %%a5, %%acc1\n\t" + "mac.l %%d3, %%a5, 92(%3), %%a5, %%acc1\n\t" + "mac.l %%d4, %%a5, 100(%3), %%a5, %%acc1\n\t" + "mac.l %%d5, %%a5, 108(%3), %%a5, %%acc1\n\t" + "mac.l %%d6, %%a5, 116(%3), %%a5, %%acc1\n\t" + "mac.l %%d7, %%a5, %%acc1\n\t" + : : "a" (*fo), "a" (*fe), "a" (*D0ptr), "a" (*D1ptr) + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5"); + + asm volatile( + "movclr.l %%acc0, %0\n\t" + "movclr.l %%acc1, %1\n\t" : "=d" (hi0), "=d" (hi1) ); + + pcm[-sb] = hi0 << 3; + pcm[ sb] = hi1 << 3; + } + + ++D0ptr; + asm volatile( + "movem.l (%1), %%d0-%%d7\n\t" + "move.l 4(%2), %%a5\n\t" + "mac.l %%d0, %%a5, 60(%2), %%a5, %%acc0\n\t" + "mac.l %%d1, %%a5, 52(%2), %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, 44(%2), %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, 36(%2), %%a5, %%acc0\n\t" + "mac.l %%d4, %%a5, 28(%2), %%a5, %%acc0\n\t" + "mac.l %%d5, %%a5, 20(%2), %%a5, %%acc0\n\t" + "mac.l %%d6, %%a5, 12(%2), %%a5, %%acc0\n\t" + "mac.l %%d7, %%a5, %%acc0\n\t" + "movclr.l %%acc0, %0\n\t" + : "=r" (hi0) : "a" (*fo), "a" (*D0ptr) + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5"); + + pcm[0] = -(hi0 << 3); + } + else + { + asm volatile( "movem.l (%1), %%d0-%%d7\n\t" "move.l (%2), %%a5\n\t" "msac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t" @@ -617,127 +720,80 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame, "msac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t" "msac.l %%d4, %%a5, 24(%2), %%a5, %%acc0\n\t" "msac.l %%d5, %%a5, 16(%2), %%a5, %%acc0\n\t" - "msac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t" - "msac.l %%d7, %%a5, (%4), %%a5, %%acc0\n\t" + "msac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t" + "msac.l %%d7, %%a5, 4(%2), %%a5, %%acc0\n\t" "movem.l (%3), %%d0-%%d7\n\t" - "mac.l %%d0, %%a5, 56(%4), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, 48(%4), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, 40(%4), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, 32(%4), %%a5, %%acc0\n\t" - "mac.l %%d4, %%a5, 24(%4), %%a5, %%acc0\n\t" - "mac.l %%d5, %%a5, 16(%4), %%a5, %%acc0\n\t" - "mac.l %%d6, %%a5, 8(%4), %%a5, %%acc0\n\t" - "mac.l %%d7, %%a5, %%acc0\n\t" + "mac.l %%d0, %%a5, 60(%2), %%a5, %%acc0\n\t" + "mac.l %%d1, %%a5, 52(%2), %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, 44(%2), %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, 36(%2), %%a5, %%acc0\n\t" + "mac.l %%d4, %%a5, 28(%2), %%a5, %%acc0\n\t" + "mac.l %%d5, %%a5, 20(%2), %%a5, %%acc0\n\t" + "mac.l %%d6, %%a5, 12(%2), %%a5, %%acc0\n\t" + "mac.l %%d7, %%a5, %%acc0\n\t" "movclr.l %%acc0, %0\n\t" - : "=r" (hi) - : "a" (*fx), "a" (*Dptr + po), "a" (*fe), "a" (*Dptr + pe) + : "=r" (hi0) : "a" (*fx), "a" (*D0ptr), "a" (*fe) : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5"); - *pcm1++ = hi << 3; /* shift result to libmad's fixed point format */ + pcm[0] = hi0 << 3; /* shift result to libmad's fixed point format */ + pcm += 16; - pcm2 = pcm1 + 30; + for (sb = 15; sb; sb--, fo++) { + ++fe; + ++D0ptr; + ++D1ptr; - for (sb = 1; sb < 16; ++sb) { - ++fe; - ++Dptr; - - /* D[32 - sb][i] == -D[sb][31 - i] */ -#if __GNUC__ >= 4 - /* GCC 4.0.1 can't find a suitable register here if all of d0-d7 - * are clobbered, so use fewer registers. It does mean two extra - * movem instructions, but should have no additional performance - * impact (like not being able to use burst mode for the movem). - */ - asm volatile ( - "movem.l (%1), %%d0-%%d3\n\t" + /* D[32 - sb][i] == -D[sb][31 - i] */ + asm volatile ( + "movem.l (%0), %%d0-%%d7\n\t" "move.l (%2), %%a5\n\t" - "msac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t" - "msac.l %%d1, %%a5, 48(%2), %%a5, %%acc0\n\t" - "msac.l %%d2, %%a5, 40(%2), %%a5, %%acc0\n\t" - "msac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t" - "movem.l 16(%1), %%d0-%%d3\n\t" - "msac.l %%d0, %%a5, 24(%2), %%a5, %%acc0\n\t" - "msac.l %%d1, %%a5, 16(%2), %%a5, %%acc0\n\t" - "msac.l %%d2, %%a5, 8(%2), %%a5, %%acc0\n\t" - "msac.l %%d3, %%a5, 8(%4), %%a5, %%acc0\n\t" - - "movem.l 16(%3), %%d0-%%d3\n\t" - "mac.l %%d3, %%a5, 16(%4), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, 24(%4), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, 32(%4), %%a5, %%acc0\n\t" - "mac.l %%d0, %%a5, 40(%4), %%a5, %%acc0\n\t" - "movem.l (%3), %%d0-%%d3\n\t" - "mac.l %%d3, %%a5, 48(%4), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, 56(%4), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, (%4), %%a5, %%acc0\n\t" - "mac.l %%d0, %%a5, %%acc0\n\t" - "movclr.l %%acc0, %0\n\t" - : "=r" (hi) - : "a" (*fo), "a" (*Dptr + po), "a" (*fe), "a" (*Dptr + pe) - : "d0", "d1", "d2", "d3", "a5"); -#else - asm volatile ( + "msac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t" + "msac.l %%d1, %%a5, 48(%2), %%a5, %%acc0\n\t" + "msac.l %%d2, %%a5, 40(%2), %%a5, %%acc0\n\t" + "msac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t" + "msac.l %%d4, %%a5, 24(%2), %%a5, %%acc0\n\t" + "msac.l %%d5, %%a5, 16(%2), %%a5, %%acc0\n\t" + "msac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t" + "msac.l %%d7, %%a5, 116(%3), %%a5, %%acc0\n\t" + "mac.l %%d7, %%a5, 108(%3), %%a5, %%acc1\n\t" + "mac.l %%d6, %%a5, 100(%3), %%a5, %%acc1\n\t" + "mac.l %%d5, %%a5, 92(%3), %%a5, %%acc1\n\t" + "mac.l %%d4, %%a5, 84(%3), %%a5, %%acc1\n\t" + "mac.l %%d3, %%a5, 76(%3), %%a5, %%acc1\n\t" + "mac.l %%d2, %%a5, 68(%3), %%a5, %%acc1\n\t" + "mac.l %%d1, %%a5, 60(%3), %%a5, %%acc1\n\t" + "mac.l %%d0, %%a5, 12(%2), %%a5, %%acc1\n\t" "movem.l (%1), %%d0-%%d7\n\t" - "move.l (%2), %%a5\n\t" - "msac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t" - "msac.l %%d1, %%a5, 48(%2), %%a5, %%acc0\n\t" - "msac.l %%d2, %%a5, 40(%2), %%a5, %%acc0\n\t" - "msac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t" - "msac.l %%d4, %%a5, 24(%2), %%a5, %%acc0\n\t" - "msac.l %%d5, %%a5, 16(%2), %%a5, %%acc0\n\t" - "msac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t" - "msac.l %%d7, %%a5, 8(%4), %%a5, %%acc0\n\t" - - "movem.l (%3), %%d0-%%d7\n\t" - "mac.l %%d7, %%a5, 16(%4), %%a5, %%acc0\n\t" - "mac.l %%d6, %%a5, 24(%4), %%a5, %%acc0\n\t" - "mac.l %%d5, %%a5, 32(%4), %%a5, %%acc0\n\t" - "mac.l %%d4, %%a5, 40(%4), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, 48(%4), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, 56(%4), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, (%4), %%a5, %%acc0\n\t" - "mac.l %%d0, %%a5, %%acc0\n\t" - "movclr.l %%acc0, %0\n\t" - : "=r" (hi) - : "a" (*fo), "a" (*Dptr + po), "a" (*fe), "a" (*Dptr + pe) + "mac.l %%d7, %%a5, 20(%2), %%a5, %%acc0\n\t" + "mac.l %%d6, %%a5, 28(%2), %%a5, %%acc0\n\t" + "mac.l %%d5, %%a5, 36(%2), %%a5, %%acc0\n\t" + "mac.l %%d4, %%a5, 44(%2), %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, 52(%2), %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, 60(%2), %%a5, %%acc0\n\t" + "mac.l %%d1, %%a5, 4(%2), %%a5, %%acc0\n\t" + "mac.l %%d0, %%a5, 120(%3), %%a5, %%acc0\n\t" + "mac.l %%d0, %%a5, 64(%3), %%a5, %%acc1\n\t" + "mac.l %%d1, %%a5, 72(%3), %%a5, %%acc1\n\t" + "mac.l %%d2, %%a5, 80(%3), %%a5, %%acc1\n\t" + "mac.l %%d3, %%a5, 88(%3), %%a5, %%acc1\n\t" + "mac.l %%d4, %%a5, 96(%3), %%a5, %%acc1\n\t" + "mac.l %%d5, %%a5, 104(%3), %%a5, %%acc1\n\t" + "mac.l %%d6, %%a5, 112(%3), %%a5, %%acc1\n\t" + "mac.l %%d7, %%a5, %%acc1\n\t" + : : "a" (*fo), "a" (*fe), "a" (*D0ptr), "a" (*D1ptr) : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5"); -#endif - *pcm1++ = hi << 3; - + + asm volatile( + "movclr.l %%acc0, %0\n\t" + "movclr.l %%acc1, %1\n\t" : "=d" (hi0), "=d" (hi1) ); + + pcm[-sb] = hi0 << 3; + pcm[ sb] = hi1 << 3; + } + + ++D0ptr; asm volatile( - "movem.l (%1), %%d0-%%d7\n\t" - "move.l 60(%2), %%a5\n\t" - "mac.l %%d0, %%a5, 68(%2), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, 76(%2), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, 84(%2), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, 92(%2), %%a5, %%acc0\n\t" - "mac.l %%d4, %%a5, 100(%2), %%a5, %%acc0\n\t" - "mac.l %%d5, %%a5, 108(%2), %%a5, %%acc0\n\t" - "mac.l %%d6, %%a5, 116(%2), %%a5, %%acc0\n\t" - "mac.l %%d7, %%a5, 116(%4), %%a5, %%acc0\n\t" - - "movem.l (%3), %%d0-%%d7\n\t" - "mac.l %%d7, %%a5, 108(%4), %%a5, %%acc0\n\t" - "mac.l %%d6, %%a5, 100(%4), %%a5, %%acc0\n\t" - "mac.l %%d5, %%a5, 92(%4), %%a5, %%acc0\n\t" - "mac.l %%d4, %%a5, 84(%4), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, 76(%4), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, 68(%4), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, 60(%4), %%a5, %%acc0\n\t" - "mac.l %%d0, %%a5, %%acc0\n\t" - "movclr.l %%acc0, %0\n\t" - : "=r" (hi) - : "a" (*fe), "a" (*Dptr - pe), "a" (*fo), "a" (*Dptr - po) - : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5"); - - *pcm2-- = hi << 3; - - ++fo; - } - - ++Dptr; - asm volatile( "movem.l (%1), %%d0-%%d7\n\t" "move.l (%2), %%a5\n\t" "mac.l %%d0, %%a5, 56(%2), %%a5, %%acc0\n\t" @@ -746,15 +802,15 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame, "mac.l %%d3, %%a5, 32(%2), %%a5, %%acc0\n\t" "mac.l %%d4, %%a5, 24(%2), %%a5, %%acc0\n\t" "mac.l %%d5, %%a5, 16(%2), %%a5, %%acc0\n\t" - "mac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t" - "mac.l %%d7, %%a5, %%acc0\n\t" + "mac.l %%d6, %%a5, 8(%2), %%a5, %%acc0\n\t" + "mac.l %%d7, %%a5, %%acc0\n\t" "movclr.l %%acc0, %0\n\t" - : "=r" (hi) : "a" (*fo), "a" (*Dptr + po) + : "=r" (hi0) : "a" (*fo), "a" (*D0ptr) : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a5"); - *pcm1 = -(hi << 3); - pcm1 += 16; - + pcm[0] = -(hi0 << 3); + } + pcm += 16; phase = (phase + 1) % 16; } } @@ -766,129 +822,200 @@ static void synth_full(struct mad_synth *synth, struct mad_frame const *frame, unsigned int nch, unsigned int ns) { - unsigned int phase, ch, s, sb, pe, po; - mad_fixed_t *pcm1, *pcm2, (*filter)[2][2][16][8]; + int p; + unsigned int phase, ch, s, sb; + mad_fixed_t *pcm, (*filter)[2][2][16][8]; mad_fixed_t const (*sbsample)[36][32]; - register mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8]; - register mad_fixed_t const (*Dptr)[32], *ptr; - register mad_fixed64hi_t hi; - register mad_fixed64lo_t lo; + mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8]; + mad_fixed_t const (*D0ptr)[32], *ptr; + mad_fixed_t const (*D1ptr)[32]; + mad_fixed64hi_t hi; + mad_fixed64lo_t lo; for (ch = 0; ch < nch; ++ch) { sbsample = &frame->sbsample[ch]; filter = &synth->filter[ch]; phase = synth->phase; - pcm1 = synth->pcm.samples[ch]; + pcm = synth->pcm.samples[ch]; for (s = 0; s < ns; ++s) { dct32((*sbsample)[s], phase >> 1, (*filter)[0][phase & 1], (*filter)[1][phase & 1]); - pe = phase & ~1; - po = ((phase - 1) & 0xf) | 1; + p = (phase - 1) & 0xf; /* calculate 32 samples */ - fe = &(*filter)[0][ phase & 1][0]; fx = &(*filter)[0][~phase & 1][0]; fo = &(*filter)[1][~phase & 1][0]; - Dptr = &D[0]; + D0ptr = (void*)&D[0][ p]; + D1ptr = (void*)&D[0][-p]; - ptr = *Dptr + po; - ML0(hi, lo, (*fx)[0], ptr[ 0]); - MLA(hi, lo, (*fx)[1], ptr[14]); - MLA(hi, lo, (*fx)[2], ptr[12]); - MLA(hi, lo, (*fx)[3], ptr[10]); - MLA(hi, lo, (*fx)[4], ptr[ 8]); - MLA(hi, lo, (*fx)[5], ptr[ 6]); - MLA(hi, lo, (*fx)[6], ptr[ 4]); - MLA(hi, lo, (*fx)[7], ptr[ 2]); - MLN(hi, lo); + if(s & 1) + { + ptr = *D0ptr; + ML0(hi, lo, (*fx)[0], ptr[ 1]); + MLA(hi, lo, (*fx)[1], ptr[15]); + MLA(hi, lo, (*fx)[2], ptr[13]); + MLA(hi, lo, (*fx)[3], ptr[11]); + MLA(hi, lo, (*fx)[4], ptr[ 9]); + MLA(hi, lo, (*fx)[5], ptr[ 7]); + MLA(hi, lo, (*fx)[6], ptr[ 5]); + MLA(hi, lo, (*fx)[7], ptr[ 3]); + MLN(hi, lo); + MLA(hi, lo, (*fe)[0], ptr[ 0]); + MLA(hi, lo, (*fe)[1], ptr[14]); + MLA(hi, lo, (*fe)[2], ptr[12]); + MLA(hi, lo, (*fe)[3], ptr[10]); + MLA(hi, lo, (*fe)[4], ptr[ 8]); + MLA(hi, lo, (*fe)[5], ptr[ 6]); + MLA(hi, lo, (*fe)[6], ptr[ 4]); + MLA(hi, lo, (*fe)[7], ptr[ 2]); + pcm[0] = SHIFT(MLZ(hi, lo)); + pcm += 16; - ptr = *Dptr + pe; - MLA(hi, lo, (*fe)[0], ptr[ 0]); - MLA(hi, lo, (*fe)[1], ptr[14]); - MLA(hi, lo, (*fe)[2], ptr[12]); - MLA(hi, lo, (*fe)[3], ptr[10]); - MLA(hi, lo, (*fe)[4], ptr[ 8]); - MLA(hi, lo, (*fe)[5], ptr[ 6]); - MLA(hi, lo, (*fe)[6], ptr[ 4]); - MLA(hi, lo, (*fe)[7], ptr[ 2]); + for (sb = 15; sb; sb--, fo++) + { + ++fe; + ++D0ptr; + ++D1ptr; - *pcm1++ = SHIFT(MLZ(hi, lo)); + /* D[32 - sb][i] == -D[sb][31 - i] */ + ptr = *D0ptr; + ML0(hi, lo, (*fo)[0], ptr[ 1]); + MLA(hi, lo, (*fo)[1], ptr[15]); + MLA(hi, lo, (*fo)[2], ptr[13]); + MLA(hi, lo, (*fo)[3], ptr[11]); + MLA(hi, lo, (*fo)[4], ptr[ 9]); + MLA(hi, lo, (*fo)[5], ptr[ 7]); + MLA(hi, lo, (*fo)[6], ptr[ 5]); + MLA(hi, lo, (*fo)[7], ptr[ 3]); + MLN(hi, lo); + MLA(hi, lo, (*fe)[7], ptr[ 2]); + MLA(hi, lo, (*fe)[6], ptr[ 4]); + MLA(hi, lo, (*fe)[5], ptr[ 6]); + MLA(hi, lo, (*fe)[4], ptr[ 8]); + MLA(hi, lo, (*fe)[3], ptr[10]); + MLA(hi, lo, (*fe)[2], ptr[12]); + MLA(hi, lo, (*fe)[1], ptr[14]); + MLA(hi, lo, (*fe)[0], ptr[ 0]); + pcm[-sb] = SHIFT(MLZ(hi, lo)); - pcm2 = pcm1 + 30; + ptr = *D1ptr; + ML0(hi, lo, (*fe)[0], ptr[31 - 16]); + MLA(hi, lo, (*fe)[1], ptr[31 - 14]); + MLA(hi, lo, (*fe)[2], ptr[31 - 12]); + MLA(hi, lo, (*fe)[3], ptr[31 - 10]); + MLA(hi, lo, (*fe)[4], ptr[31 - 8]); + MLA(hi, lo, (*fe)[5], ptr[31 - 6]); + MLA(hi, lo, (*fe)[6], ptr[31 - 4]); + MLA(hi, lo, (*fe)[7], ptr[31 - 2]); + MLA(hi, lo, (*fo)[7], ptr[31 - 3]); + MLA(hi, lo, (*fo)[6], ptr[31 - 5]); + MLA(hi, lo, (*fo)[5], ptr[31 - 7]); + MLA(hi, lo, (*fo)[4], ptr[31 - 9]); + MLA(hi, lo, (*fo)[3], ptr[31 - 11]); + MLA(hi, lo, (*fo)[2], ptr[31 - 13]); + MLA(hi, lo, (*fo)[1], ptr[31 - 15]); + MLA(hi, lo, (*fo)[0], ptr[31 - 1]); + pcm[sb] = SHIFT(MLZ(hi, lo)); + } - for (sb = 1; sb < 16; ++sb) { - ++fe; - ++Dptr; + ptr = *(D0ptr + 1); + ML0(hi, lo, (*fo)[0], ptr[ 1]); + MLA(hi, lo, (*fo)[1], ptr[15]); + MLA(hi, lo, (*fo)[2], ptr[13]); + MLA(hi, lo, (*fo)[3], ptr[11]); + MLA(hi, lo, (*fo)[4], ptr[ 9]); + MLA(hi, lo, (*fo)[5], ptr[ 7]); + MLA(hi, lo, (*fo)[6], ptr[ 5]); + MLA(hi, lo, (*fo)[7], ptr[ 3]); + pcm[0] = SHIFT(-MLZ(hi, lo)); + } + else + { + ptr = *D0ptr; + ML0(hi, lo, (*fx)[0], ptr[ 0]); + MLA(hi, lo, (*fx)[1], ptr[14]); + MLA(hi, lo, (*fx)[2], ptr[12]); + MLA(hi, lo, (*fx)[3], ptr[10]); + MLA(hi, lo, (*fx)[4], ptr[ 8]); + MLA(hi, lo, (*fx)[5], ptr[ 6]); + MLA(hi, lo, (*fx)[6], ptr[ 4]); + MLA(hi, lo, (*fx)[7], ptr[ 2]); + MLN(hi, lo); + MLA(hi, lo, (*fe)[0], ptr[ 1]); + MLA(hi, lo, (*fe)[1], ptr[15]); + MLA(hi, lo, (*fe)[2], ptr[13]); + MLA(hi, lo, (*fe)[3], ptr[11]); + MLA(hi, lo, (*fe)[4], ptr[ 9]); + MLA(hi, lo, (*fe)[5], ptr[ 7]); + MLA(hi, lo, (*fe)[6], ptr[ 5]); + MLA(hi, lo, (*fe)[7], ptr[ 3]); + pcm[0] = SHIFT(MLZ(hi, lo)); + pcm += 16; - /* D[32 - sb][i] == -D[sb][31 - i] */ + for (sb = 15; sb; sb--, fo++) + { + ++fe; + ++D0ptr; + ++D1ptr; - ptr = *Dptr + po; - ML0(hi, lo, (*fo)[0], ptr[ 0]); - MLA(hi, lo, (*fo)[1], ptr[14]); - MLA(hi, lo, (*fo)[2], ptr[12]); - MLA(hi, lo, (*fo)[3], ptr[10]); - MLA(hi, lo, (*fo)[4], ptr[ 8]); - MLA(hi, lo, (*fo)[5], ptr[ 6]); - MLA(hi, lo, (*fo)[6], ptr[ 4]); - MLA(hi, lo, (*fo)[7], ptr[ 2]); - MLN(hi, lo); + /* D[32 - sb][i] == -D[sb][31 - i] */ + ptr = *D0ptr; + ML0(hi, lo, (*fo)[0], ptr[ 0]); + MLA(hi, lo, (*fo)[1], ptr[14]); + MLA(hi, lo, (*fo)[2], ptr[12]); + MLA(hi, lo, (*fo)[3], ptr[10]); + MLA(hi, lo, (*fo)[4], ptr[ 8]); + MLA(hi, lo, (*fo)[5], ptr[ 6]); + MLA(hi, lo, (*fo)[6], ptr[ 4]); + MLA(hi, lo, (*fo)[7], ptr[ 2]); + MLN(hi, lo); + MLA(hi, lo, (*fe)[7], ptr[ 3]); + MLA(hi, lo, (*fe)[6], ptr[ 5]); + MLA(hi, lo, (*fe)[5], ptr[ 7]); + MLA(hi, lo, (*fe)[4], ptr[ 9]); + MLA(hi, lo, (*fe)[3], ptr[11]); + MLA(hi, lo, (*fe)[2], ptr[13]); + MLA(hi, lo, (*fe)[1], ptr[15]); + MLA(hi, lo, (*fe)[0], ptr[ 1]); + pcm[-sb] = SHIFT(MLZ(hi, lo)); - ptr = *Dptr + pe; - MLA(hi, lo, (*fe)[7], ptr[ 2]); - MLA(hi, lo, (*fe)[6], ptr[ 4]); - MLA(hi, lo, (*fe)[5], ptr[ 6]); - MLA(hi, lo, (*fe)[4], ptr[ 8]); - MLA(hi, lo, (*fe)[3], ptr[10]); - MLA(hi, lo, (*fe)[2], ptr[12]); - MLA(hi, lo, (*fe)[1], ptr[14]); - MLA(hi, lo, (*fe)[0], ptr[ 0]); + ptr = *D1ptr; + ML0(hi, lo, (*fe)[0], ptr[31 - 1]); + MLA(hi, lo, (*fe)[1], ptr[31 - 15]); + MLA(hi, lo, (*fe)[2], ptr[31 - 13]); + MLA(hi, lo, (*fe)[3], ptr[31 - 11]); + MLA(hi, lo, (*fe)[4], ptr[31 - 9]); + MLA(hi, lo, (*fe)[5], ptr[31 - 7]); + MLA(hi, lo, (*fe)[6], ptr[31 - 5]); + MLA(hi, lo, (*fe)[7], ptr[31 - 3]); + MLA(hi, lo, (*fo)[7], ptr[31 - 2]); + MLA(hi, lo, (*fo)[6], ptr[31 - 4]); + MLA(hi, lo, (*fo)[5], ptr[31 - 6]); + MLA(hi, lo, (*fo)[4], ptr[31 - 8]); + MLA(hi, lo, (*fo)[3], ptr[31 - 10]); + MLA(hi, lo, (*fo)[2], ptr[31 - 12]); + MLA(hi, lo, (*fo)[1], ptr[31 - 14]); + MLA(hi, lo, (*fo)[0], ptr[31 - 16]); + pcm[sb] = SHIFT(MLZ(hi, lo)); + } - *pcm1++ = SHIFT(MLZ(hi, lo)); - - ptr = *Dptr - pe; - ML0(hi, lo, (*fe)[0], ptr[31 - 16]); - MLA(hi, lo, (*fe)[1], ptr[31 - 14]); - MLA(hi, lo, (*fe)[2], ptr[31 - 12]); - MLA(hi, lo, (*fe)[3], ptr[31 - 10]); - MLA(hi, lo, (*fe)[4], ptr[31 - 8]); - MLA(hi, lo, (*fe)[5], ptr[31 - 6]); - MLA(hi, lo, (*fe)[6], ptr[31 - 4]); - MLA(hi, lo, (*fe)[7], ptr[31 - 2]); - - ptr = *Dptr - po; - MLA(hi, lo, (*fo)[7], ptr[31 - 2]); - MLA(hi, lo, (*fo)[6], ptr[31 - 4]); - MLA(hi, lo, (*fo)[5], ptr[31 - 6]); - MLA(hi, lo, (*fo)[4], ptr[31 - 8]); - MLA(hi, lo, (*fo)[3], ptr[31 - 10]); - MLA(hi, lo, (*fo)[2], ptr[31 - 12]); - MLA(hi, lo, (*fo)[1], ptr[31 - 14]); - MLA(hi, lo, (*fo)[0], ptr[31 - 16]); - - *pcm2-- = SHIFT(MLZ(hi, lo)); - - ++fo; + ptr = *(D0ptr + 1); + ML0(hi, lo, (*fo)[0], ptr[ 0]); + MLA(hi, lo, (*fo)[1], ptr[14]); + MLA(hi, lo, (*fo)[2], ptr[12]); + MLA(hi, lo, (*fo)[3], ptr[10]); + MLA(hi, lo, (*fo)[4], ptr[ 8]); + MLA(hi, lo, (*fo)[5], ptr[ 6]); + MLA(hi, lo, (*fo)[6], ptr[ 4]); + MLA(hi, lo, (*fo)[7], ptr[ 2]); + pcm[0] = SHIFT(-MLZ(hi, lo)); } - ++Dptr; - - ptr = *Dptr + po; - ML0(hi, lo, (*fo)[0], ptr[ 0]); - MLA(hi, lo, (*fo)[1], ptr[14]); - MLA(hi, lo, (*fo)[2], ptr[12]); - MLA(hi, lo, (*fo)[3], ptr[10]); - MLA(hi, lo, (*fo)[4], ptr[ 8]); - MLA(hi, lo, (*fo)[5], ptr[ 6]); - MLA(hi, lo, (*fo)[6], ptr[ 4]); - MLA(hi, lo, (*fo)[7], ptr[ 2]); - - *pcm1 = SHIFT(-MLZ(hi, lo)); - pcm1 += 16; - + pcm += 16; phase = (phase + 1) % 16; } }