forked from len0rd/rockbox
Further performance optimization of the atrac3 decoder. Rework the internal sample representation and the usage of the DSP routines. For now, a quick-and-dirty solution is used to add a fractional part of 2 bits. As a result, several buffers and functions, as well as copy loops, could be removed. Furthermore, add some ASM for Coldfire and place some additional data in IRAM on PP5022/24 and X5/M5. Speedup on ARM: +3%, speedup on Coldfire: +639%. Both ARM and Coldfire can now decode in realtime.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@22561 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
2a49ff672e
commit
2dbb424eb9
4 changed files with 64 additions and 60 deletions
|
|
@ -41,7 +41,6 @@ enum codec_status codec_main(void)
|
|||
static size_t buff_size;
|
||||
int datasize, res, consumed, i, time_offset;
|
||||
uint8_t *bit_buffer;
|
||||
int16_t outbuf[2048] __attribute__((aligned(32)));
|
||||
uint16_t fs,sps,h;
|
||||
uint32_t packet_count;
|
||||
int scrambling_unit_size, num_units, elapsed = 0;
|
||||
|
|
@ -62,9 +61,9 @@ next_track:
|
|||
init_rm(&rmctx);
|
||||
|
||||
ci->configure(DSP_SET_FREQUENCY, ci->id3->frequency);
|
||||
ci->configure(DSP_SET_SAMPLE_DEPTH, 16);
|
||||
ci->configure(DSP_SET_SAMPLE_DEPTH, 17); /* Remark: atrac3 uses s15.0 by default, s15.2 was hacked. */
|
||||
ci->configure(DSP_SET_STEREO_MODE, rmctx.nb_channels == 1 ?
|
||||
STEREO_MONO : STEREO_INTERLEAVED);
|
||||
STEREO_MONO : STEREO_NONINTERLEAVED);
|
||||
|
||||
packet_count = rmctx.nb_packets;
|
||||
rmctx.audio_framesize = rmctx.block_align;
|
||||
|
|
@ -145,7 +144,7 @@ seek_start :
|
|||
ci->seek_complete();
|
||||
}
|
||||
if(pkt.length)
|
||||
res = atrac3_decode_frame(&rmctx,&q, outbuf, &datasize, pkt.frames[i], rmctx.block_align);
|
||||
res = atrac3_decode_frame(&rmctx, &q, &datasize, pkt.frames[i], rmctx.block_align);
|
||||
else /* indicates that there are no remaining frames */
|
||||
goto done;
|
||||
|
||||
|
|
@ -155,7 +154,7 @@ seek_start :
|
|||
}
|
||||
|
||||
if(datasize)
|
||||
ci->pcmbuf_insert(outbuf, NULL, q.samples_per_frame / rmctx.nb_channels);
|
||||
ci->pcmbuf_insert(q.outSamples, q.outSamples + 1024, q.samples_per_frame / rmctx.nb_channels);
|
||||
elapsed = rmctx.audiotimestamp+(1000*8*sps/rmctx.bit_rate)*i;
|
||||
ci->set_elapsed(elapsed);
|
||||
rmctx.frame_number++;
|
||||
|
|
|
|||
|
|
@ -55,18 +55,9 @@
|
|||
#define FFMIN(a,b) ((a) > (b) ? (b) : (a))
|
||||
#define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0)
|
||||
|
||||
/**
|
||||
* Clips a signed integer value into the -32768,32767 range.
|
||||
*/
|
||||
static inline int16_t av_clip_int16(int a)
|
||||
{
|
||||
if ((a+32768) & ~65535) return (a>>31) ^ 32767;
|
||||
else return a;
|
||||
}
|
||||
|
||||
static int32_t qmf_window[48] IBSS_ATTR;
|
||||
static VLC spectral_coeff_tab[7];
|
||||
static channel_unit channel_units[2];
|
||||
static channel_unit channel_units[2] IBSS_ATTR_LARGE_IRAM;
|
||||
|
||||
/**
|
||||
* Matrixing within quadrature mirror synthesis filter.
|
||||
|
|
@ -516,7 +507,6 @@ static void gainCompensateAndOverlap (int32_t *pIn, int32_t *pPrev, int32_t *pOu
|
|||
int32_t gain1, gain2, gain_inc;
|
||||
int cnt, numdata, nsample, startLoc, endLoc;
|
||||
|
||||
|
||||
if (pGain2->num_gain_data == 0)
|
||||
gain1 = ONE_16;
|
||||
else
|
||||
|
|
@ -735,7 +725,16 @@ static int decodeChannelSoundUnit (GetBitContext *gb, channel_unit *pSnd, int32_
|
|||
numBands = (subbandTab[numSubbands] - 1) >> 8;
|
||||
if (lastTonal >= 0)
|
||||
numBands = FFMAX((lastTonal + 256) >> 8, numBands);
|
||||
|
||||
|
||||
/* Remark: Hardcoded hack to add a 2-bit (empty) fractional part to the internal
|
||||
* sample representation. Needed for higher accuracy in internal calculations
|
||||
* as well as for DSP configuration. See also: ../atrac3_rm.c, DSP_SET_SAMPLE_DEPTH
|
||||
* Todo: Check whether spectral requantisation can use and output samples with a
|
||||
* fractional part. */
|
||||
int32_t i;
|
||||
for (i=0; i<1024; ++i) {
|
||||
pSnd->spectrum[i] <<= 2;
|
||||
}
|
||||
|
||||
/* Reconstruct time domain samples. */
|
||||
for (band=0; band<4; band++) {
|
||||
|
|
@ -863,11 +862,9 @@ static int decodeFrame(ATRAC3Context *q, const uint8_t* databuf, int off)
|
|||
*/
|
||||
|
||||
int atrac3_decode_frame(RMContext *rmctx, ATRAC3Context *q,
|
||||
void *data, int *data_size,
|
||||
const uint8_t *buf, int buf_size) {
|
||||
int result = 0, off = 0, i;
|
||||
int *data_size, const uint8_t *buf, int buf_size) {
|
||||
int result = 0, off = 0;
|
||||
const uint8_t* databuf;
|
||||
int16_t* samples = data;
|
||||
|
||||
if (buf_size < rmctx->block_align)
|
||||
return buf_size;
|
||||
|
|
@ -887,19 +884,10 @@ int atrac3_decode_frame(RMContext *rmctx, ATRAC3Context *q,
|
|||
return -1;
|
||||
}
|
||||
|
||||
if (q->channels == 1) {
|
||||
/* mono */
|
||||
for (i = 0; i<1024; i++)
|
||||
samples[i] = av_clip_int16(q->outSamples[i]);
|
||||
*data_size = 1024 * sizeof(int16_t);
|
||||
} else {
|
||||
/* stereo */
|
||||
for (i = 0; i < 1024; i++) {
|
||||
samples[i*2] = av_clip_int16(q->outSamples[i]);
|
||||
samples[i*2+1] = av_clip_int16(q->outSamples[1024+i]);
|
||||
}
|
||||
*data_size = 2048 * sizeof(int16_t);
|
||||
}
|
||||
if (q->channels == 1)
|
||||
*data_size = 1024 * sizeof(int32_t);
|
||||
else
|
||||
*data_size = 2048 * sizeof(int32_t);
|
||||
|
||||
return rmctx->block_align;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,14 @@
|
|||
#include "ffmpeg_bitstream.h"
|
||||
#include "../librm/rm.h"
|
||||
|
||||
#if (CONFIG_CPU == PP5022) || (CONFIG_CPU == PP5024) || (CONFIG_CPU == MCF5250)
|
||||
/* PP5022/24 and MCF5250 have larger IRAM */
|
||||
#define IBSS_ATTR_LARGE_IRAM IBSS_ATTR
|
||||
#else
|
||||
/* on other CPUs the IRAM is not large enough */
|
||||
#define IBSS_ATTR_LARGE_IRAM
|
||||
#endif
|
||||
|
||||
/* These structures are needed to store the parsed gain control data. */
|
||||
typedef struct {
|
||||
int num_gain_data;
|
||||
|
|
@ -75,6 +83,5 @@ typedef struct {
|
|||
int atrac3_decode_init(ATRAC3Context *q, RMContext *rmctx);
|
||||
|
||||
int atrac3_decode_frame(RMContext *rmctx, ATRAC3Context *q,
|
||||
void *data, int *data_size,
|
||||
const uint8_t *buf, int buf_size);
|
||||
int *data_size, const uint8_t *buf, int buf_size);
|
||||
|
||||
|
|
|
|||
|
|
@ -36,17 +36,38 @@
|
|||
: "r"(X),"r"(Y)); \
|
||||
low; \
|
||||
})
|
||||
|
||||
#define fixmul32(X,Y) \
|
||||
({ \
|
||||
int32_t low; \
|
||||
int32_t high; \
|
||||
asm volatile ( /* calculates: result = (X*Y)>>32 */ \
|
||||
"smull %0,%1,%2,%3 \n\t" /* 64 = 32x32 multiply */ \
|
||||
: "=&r"(low), "=&r" (high) \
|
||||
: "r"(X),"r"(Y)); \
|
||||
high; \
|
||||
})
|
||||
#elif defined(CPU_COLDFIRE)
|
||||
#define fixmul16(X,Y) \
|
||||
({ \
|
||||
int32_t t1, t2; \
|
||||
asm volatile ( \
|
||||
"mac.l %[x],%[y],%%acc0\n\t" /* multiply */ \
|
||||
"mulu.l %[y],%[x] \n\t" /* get lower half, avoid emac stall */ \
|
||||
"movclr.l %%acc0,%[t1] \n\t" /* get higher half */ \
|
||||
"moveq.l #15,%[t2] \n\t" \
|
||||
"asl.l %[t2],%[t1] \n\t" /* hi <<= 15, plus one free */ \
|
||||
"moveq.l #16,%[t2] \n\t" \
|
||||
"lsr.l %[t2],%[x] \n\t" /* (unsigned)lo >>= 16 */ \
|
||||
"or.l %[x],%[t1] \n\t" /* combine result */ \
|
||||
: /* outputs */ \
|
||||
[t1]"=&d"(t1), \
|
||||
[t2]"=&d"(t2) \
|
||||
: /* inputs */ \
|
||||
[x] "d" ((X)), \
|
||||
[y] "d" ((Y))); \
|
||||
t1; \
|
||||
})
|
||||
|
||||
#define fixmul31(X,Y) \
|
||||
({ \
|
||||
int32_t t; \
|
||||
asm volatile ( \
|
||||
"mac.l %[x], %[y], %%acc0\n\t" /* multiply */ \
|
||||
"movclr.l %%acc0, %[t]\n\t" /* get higher half as result */ \
|
||||
: [t] "=d" (t) \
|
||||
: [x] "r" ((X)), [y] "r" ((Y))); \
|
||||
t; \
|
||||
})
|
||||
#else
|
||||
static inline int32_t fixmul16(int32_t x, int32_t y)
|
||||
{
|
||||
|
|
@ -69,17 +90,6 @@
|
|||
|
||||
return (int32_t)temp;
|
||||
}
|
||||
|
||||
static inline int32_t fixmul32(int32_t x, int32_t y)
|
||||
{
|
||||
int64_t temp;
|
||||
temp = x;
|
||||
temp *= y;
|
||||
|
||||
temp >>= 32; //16+31-16 = 31 bits
|
||||
|
||||
return (int32_t)temp;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int32_t fixdiv16(int32_t x, int32_t y)
|
||||
|
|
@ -104,13 +114,13 @@ static inline int32_t fastSqrt(int32_t n)
|
|||
/*
|
||||
* Logically, these are unsigned.
|
||||
* We need the sign bit to test
|
||||
* whether (op - res - one) underflowed.
|
||||
* whether (op - res - one) underflowed.
|
||||
*/
|
||||
int32_t op, res, one;
|
||||
op = n;
|
||||
res = 0;
|
||||
/* "one" starts at the highest power of four <= the argument. */
|
||||
one = 1 << 30; /* second-to-top bit set */
|
||||
one = 1 << 30; /* second-to-top bit set */
|
||||
while (one > op) one >>= 2;
|
||||
while (one != 0)
|
||||
{
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue