1
0
Fork 0
forked from len0rd/rockbox

C optimisations to the predictor decoding - create a single function for decoding stereo streams, and reorganise to minimise the number of variables used. My -c1000 test track now decodes at 93% realtime on PortalPlayer (was 78%), 187% on Coldfire (was 170%) and 447% on Gigabeat (was 408%).

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@13608 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Dave Chapman 2007-06-10 08:55:16 +00:00
parent 57440d5292
commit 601ede7f9c
4 changed files with 206 additions and 133 deletions

View file

@ -47,7 +47,7 @@ void init_frame_decoder(struct ape_ctx_t* ape_ctx,
//printf("CRC=0x%08x\n",ape_ctx->CRC); //printf("CRC=0x%08x\n",ape_ctx->CRC);
//printf("Flags=0x%08x\n",ape_ctx->frameflags); //printf("Flags=0x%08x\n",ape_ctx->frameflags);
init_predictor_decoder(ape_ctx); init_predictor_decoder(&ape_ctx->predictor);
switch (ape_ctx->compressiontype) switch (ape_ctx->compressiontype)
{ {
@ -117,7 +117,7 @@ int decode_chunk(struct ape_ctx_t* ape_ctx,
} }
/* Now apply the predictor decoding */ /* Now apply the predictor decoding */
predictor_decode_mono(ape_ctx,decoded0,count); predictor_decode_mono(&ape_ctx->predictor,decoded0,count);
if (ape_ctx->channels==2) { if (ape_ctx->channels==2) {
/* Pseudo-stereo - just copy left channel to right channel */ /* Pseudo-stereo - just copy left channel to right channel */
@ -163,7 +163,7 @@ int decode_chunk(struct ape_ctx_t* ape_ctx,
} }
/* Now apply the predictor decoding */ /* Now apply the predictor decoding */
predictor_decode_stereo(ape_ctx,decoded0,decoded1,count); predictor_decode_stereo(&ape_ctx->predictor,decoded0,decoded1,count);
if (ape_ctx->bps == 8) { if (ape_ctx->bps == 8) {
/* TODO: Handle 8-bit streams */ /* TODO: Handle 8-bit streams */

View file

@ -68,24 +68,28 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
#define HISTORY_SIZE 512 #define HISTORY_SIZE 512
#define PREDICTOR_ORDER 8 #define PREDICTOR_ORDER 8
/* Total size of all predictor histories - 50 * sizeof(int32_t) */
#define PREDICTOR_SIZE 50
struct predictor_t struct predictor_t
{ {
/* Adaption co-efficients */
int32_t coeffsA[4];
int32_t coeffsB[5];
/* Filter histories */ /* Filter histories */
int32_t historybuffer[HISTORY_SIZE + PREDICTOR_ORDER * 4]; int32_t* buf;
int32_t* delayA;
int32_t* delayB;
int32_t* adaptcoeffsA;
int32_t* adaptcoeffsB;
int32_t lastA; int32_t YlastA;
int32_t XlastA;
int32_t filterA; int32_t YfilterA;
int32_t filterB; int32_t XfilterA;
int32_t YfilterB;
int32_t XfilterB;
/* Adaption co-efficients */
int32_t YcoeffsA[4];
int32_t XcoeffsA[4];
int32_t YcoeffsB[5];
int32_t XcoeffsB[5];
int32_t historybuffer[HISTORY_SIZE + PREDICTOR_SIZE];
}; };
struct ape_ctx_t struct ape_ctx_t
@ -129,8 +133,7 @@ struct ape_ctx_t
int frameflags; int frameflags;
int currentframeblocks; int currentframeblocks;
int blocksdecoded; int blocksdecoded;
struct predictor_t predictorY; struct predictor_t predictor;
struct predictor_t predictorX;
}; };
int ape_parseheader(int fd, struct ape_ctx_t* ape_ctx); int ape_parseheader(int fd, struct ape_ctx_t* ape_ctx);

View file

@ -37,160 +37,230 @@ static const int32_t initial_coeffs[4] = {
360, 317, -109, 98 360, 317, -109, 98
}; };
static void init_predictor(struct predictor_t* p) #define YDELAYA (18 + PREDICTOR_ORDER*4)
#define YDELAYB (18 + PREDICTOR_ORDER*3)
#define XDELAYA (18 + PREDICTOR_ORDER*2)
#define XDELAYB (18 + PREDICTOR_ORDER)
#define YADAPTCOEFFSA (18)
#define XADAPTCOEFFSA (14)
#define YADAPTCOEFFSB (10)
#define XADAPTCOEFFSB (5)
void init_predictor_decoder(struct predictor_t* p)
{ {
/* Zero the history buffers */ /* Zero the history buffers */
memset(p->historybuffer, 0, (PREDICTOR_ORDER*4) * sizeof(int32_t)); memset(p->historybuffer, 0, PREDICTOR_SIZE * sizeof(int32_t));
p->delayA = p->historybuffer + PREDICTOR_ORDER*4; p->buf = p->historybuffer;
p->delayB = p->historybuffer + PREDICTOR_ORDER*3;
p->adaptcoeffsA = p->historybuffer + PREDICTOR_ORDER*2;
p->adaptcoeffsB = p->historybuffer + PREDICTOR_ORDER;
/* Initialise and zero the co-efficients */ /* Initialise and zero the co-efficients */
memcpy(p->coeffsA, initial_coeffs, sizeof(initial_coeffs)); memcpy(p->YcoeffsA, initial_coeffs, sizeof(initial_coeffs));
memset(p->coeffsB, 0, sizeof(p->coeffsB)); memcpy(p->XcoeffsA, initial_coeffs, sizeof(initial_coeffs));
memset(p->YcoeffsB, 0, sizeof(p->YcoeffsB));
memset(p->XcoeffsB, 0, sizeof(p->XcoeffsB));
p->filterA = 0; p->YfilterA = 0;
p->filterB = 0; p->YfilterB = 0;
p->YlastA = 0;
p->lastA = 0; p->XfilterA = 0;
p->XfilterB = 0;
p->XlastA = 0;
} }
static int do_predictor_decode(struct predictor_t* p, int32_t A, int32_t B) #ifdef CPU_COLDFIRE
/* Putting this in IRAM gives a small speedup (e.g. 186% -> 187%
realtime for a -c1000 file on Coldfire), but is slower on PP. */
int predictor_decode_stereo(struct predictor_t* p, int32_t* decoded0, int32_t* decoded1, int count) ICODE_ATTR;
#endif
int predictor_decode_stereo(struct predictor_t* p, int32_t* decoded0, int32_t* decoded1, int count)
{ {
int32_t predictionA, predictionB, currentA; int32_t predictionA, predictionB;
p->delayA[0] = p->lastA;
p->delayA[-1] = p->delayA[0] - p->delayA[-1];
predictionA = scalarproduct4_rev32(p->coeffsA,p->delayA);
/* Apply a scaled first-order filter compression */
p->delayB[0] = B - ((p->filterB * 31) >> 5);
p->filterB = B;
p->delayB[-1] = p->delayB[0] - p->delayB[-1];
predictionB = scalarproduct5_rev32(p->coeffsB,p->delayB);
currentA = A + ((predictionA + (predictionB >> 1)) >> 10);
p->adaptcoeffsA[0] = SIGN(p->delayA[0]);
p->adaptcoeffsA[-1] = SIGN(p->delayA[-1]);
p->adaptcoeffsB[0] = SIGN(p->delayB[0]);
p->adaptcoeffsB[-1] = SIGN(p->delayB[-1]);
if (A > 0)
{
vector_sub4_rev32(p->coeffsA, p->adaptcoeffsA);
vector_sub5_rev32(p->coeffsB, p->adaptcoeffsB);
}
else if (A < 0)
{
vector_add4_rev32(p->coeffsA, p->adaptcoeffsA);
vector_add5_rev32(p->coeffsB, p->adaptcoeffsB);
}
p->delayA++;
p->delayB++;
p->adaptcoeffsA++;
p->adaptcoeffsB++;
/* Have we filled the history buffer? */
if (p->delayA == p->historybuffer + HISTORY_SIZE + (PREDICTOR_ORDER*4)) {
memmove(p->historybuffer, p->delayA - (PREDICTOR_ORDER*4),
(PREDICTOR_ORDER*4) * sizeof(int32_t));
p->delayA = p->historybuffer + PREDICTOR_ORDER*4;
p->delayB = p->historybuffer + PREDICTOR_ORDER*3;
p->adaptcoeffsA = p->historybuffer + PREDICTOR_ORDER*2;
p->adaptcoeffsB = p->historybuffer + PREDICTOR_ORDER;
}
p->lastA = currentA;
p->filterA = currentA + ((p->filterA * 31) >> 5);
return p->filterA;
}
static int32_t X;
void init_predictor_decoder(struct ape_ctx_t* ape_ctx)
{
X = 0;
init_predictor(&ape_ctx->predictorY);
init_predictor(&ape_ctx->predictorX);
}
int predictor_decode_stereo(struct ape_ctx_t* ape_ctx, int32_t* decoded0, int32_t* decoded1, int count) ICODE_ATTR;
int predictor_decode_stereo(struct ape_ctx_t* ape_ctx, int32_t* decoded0, int32_t* decoded1, int count)
{
while (count--) while (count--)
{ {
*decoded0 = do_predictor_decode(&ape_ctx->predictorY, *decoded0, X); /* Predictor Y */
X = do_predictor_decode(&ape_ctx->predictorX, *decoded1, *(decoded0)++); p->buf[YDELAYA] = p->YlastA;
*(decoded1++) = X; p->buf[YADAPTCOEFFSA] = SIGN(p->buf[YDELAYA]);
p->buf[YDELAYA-1] = p->buf[YDELAYA] - p->buf[YDELAYA-1];
p->buf[YADAPTCOEFFSA-1] = SIGN(p->buf[YDELAYA-1]);
predictionA = (p->buf[YDELAYA] * p->YcoeffsA[0]) +
(p->buf[YDELAYA-1] * p->YcoeffsA[1]) +
(p->buf[YDELAYA-2] * p->YcoeffsA[2]) +
(p->buf[YDELAYA-3] * p->YcoeffsA[3]);
/* Apply a scaled first-order filter compression */
p->buf[YDELAYB] = p->XfilterA - ((p->YfilterB * 31) >> 5);
p->buf[YADAPTCOEFFSB] = SIGN(p->buf[YDELAYB]);
p->YfilterB = p->XfilterA;
p->buf[YDELAYB-1] = p->buf[YDELAYB] - p->buf[YDELAYB-1];
p->buf[YADAPTCOEFFSB-1] = SIGN(p->buf[YDELAYB-1]);
predictionB = (p->buf[YDELAYB] * p->YcoeffsB[0]) +
(p->buf[YDELAYB-1] * p->YcoeffsB[1]) +
(p->buf[YDELAYB-2] * p->YcoeffsB[2]) +
(p->buf[YDELAYB-3] * p->YcoeffsB[3]) +
(p->buf[YDELAYB-4] * p->YcoeffsB[4]);
p->YlastA = *decoded0 + ((predictionA + (predictionB >> 1)) >> 10);
p->YfilterA = p->YlastA + ((p->YfilterA * 31) >> 5);
/* Predictor X */
p->buf[XDELAYA] = p->XlastA;
p->buf[XADAPTCOEFFSA] = SIGN(p->buf[XDELAYA]);
p->buf[XDELAYA-1] = p->buf[XDELAYA] - p->buf[XDELAYA-1];
p->buf[XADAPTCOEFFSA-1] = SIGN(p->buf[XDELAYA-1]);
predictionA = (p->buf[XDELAYA] * p->XcoeffsA[0]) +
(p->buf[XDELAYA-1] * p->XcoeffsA[1]) +
(p->buf[XDELAYA-2] * p->XcoeffsA[2]) +
(p->buf[XDELAYA-3] * p->XcoeffsA[3]);
/* Apply a scaled first-order filter compression */
p->buf[XDELAYB] = p->YfilterA - ((p->XfilterB * 31) >> 5);
p->buf[XADAPTCOEFFSB] = SIGN(p->buf[XDELAYB]);
p->XfilterB = p->YfilterA;
p->buf[XDELAYB-1] = p->buf[XDELAYB] - p->buf[XDELAYB-1];
p->buf[XADAPTCOEFFSB-1] = SIGN(p->buf[XDELAYB-1]);
predictionB = (p->buf[XDELAYB] * p->XcoeffsB[0]) +
(p->buf[XDELAYB-1] * p->XcoeffsB[1]) +
(p->buf[XDELAYB-2] * p->XcoeffsB[2]) +
(p->buf[XDELAYB-3] * p->XcoeffsB[3]) +
(p->buf[XDELAYB-4] * p->XcoeffsB[4]);
p->XlastA = *decoded1 + ((predictionA + (predictionB >> 1)) >> 10);
p->XfilterA = p->XlastA + ((p->XfilterA * 31) >> 5);
if (*decoded0 > 0)
{
p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA];
p->YcoeffsA[1] -= p->buf[YADAPTCOEFFSA-1];
p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2];
p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3];
p->YcoeffsB[0] -= p->buf[YADAPTCOEFFSB];
p->YcoeffsB[1] -= p->buf[YADAPTCOEFFSB-1];
p->YcoeffsB[2] -= p->buf[YADAPTCOEFFSB-2];
p->YcoeffsB[3] -= p->buf[YADAPTCOEFFSB-3];
p->YcoeffsB[4] -= p->buf[YADAPTCOEFFSB-4];
}
else if (*decoded0 < 0)
{
p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA];
p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1];
p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2];
p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3];
p->YcoeffsB[0] += p->buf[YADAPTCOEFFSB];
p->YcoeffsB[1] += p->buf[YADAPTCOEFFSB-1];
p->YcoeffsB[2] += p->buf[YADAPTCOEFFSB-2];
p->YcoeffsB[3] += p->buf[YADAPTCOEFFSB-3];
p->YcoeffsB[4] += p->buf[YADAPTCOEFFSB-4];
}
*(decoded0++) = p->YfilterA;
if (*decoded1 > 0)
{
p->XcoeffsA[0] -= p->buf[XADAPTCOEFFSA];
p->XcoeffsA[1] -= p->buf[XADAPTCOEFFSA-1];
p->XcoeffsA[2] -= p->buf[XADAPTCOEFFSA-2];
p->XcoeffsA[3] -= p->buf[XADAPTCOEFFSA-3];
p->XcoeffsB[0] -= p->buf[XADAPTCOEFFSB];
p->XcoeffsB[1] -= p->buf[XADAPTCOEFFSB-1];
p->XcoeffsB[2] -= p->buf[XADAPTCOEFFSB-2];
p->XcoeffsB[3] -= p->buf[XADAPTCOEFFSB-3];
p->XcoeffsB[4] -= p->buf[XADAPTCOEFFSB-4];
}
else if (*decoded1 < 0)
{
p->XcoeffsA[0] += p->buf[XADAPTCOEFFSA];
p->XcoeffsA[1] += p->buf[XADAPTCOEFFSA-1];
p->XcoeffsA[2] += p->buf[XADAPTCOEFFSA-2];
p->XcoeffsA[3] += p->buf[XADAPTCOEFFSA-3];
p->XcoeffsB[0] += p->buf[XADAPTCOEFFSB];
p->XcoeffsB[1] += p->buf[XADAPTCOEFFSB-1];
p->XcoeffsB[2] += p->buf[XADAPTCOEFFSB-2];
p->XcoeffsB[3] += p->buf[XADAPTCOEFFSB-3];
p->XcoeffsB[4] += p->buf[XADAPTCOEFFSB-4];
}
*(decoded1++) = p->XfilterA;
/* Combined */
p->buf++;
/* Have we filled the history buffer? */
if (p->buf == p->historybuffer + HISTORY_SIZE) {
memmove(p->historybuffer, p->buf,
PREDICTOR_SIZE * sizeof(int32_t));
p->buf = p->historybuffer;
}
} }
return 0; return 0;
} }
int predictor_decode_mono(struct ape_ctx_t* ape_ctx, int32_t* decoded0, int count) int predictor_decode_mono(struct predictor_t* p, int32_t* decoded0, int count)
{ {
struct predictor_t* p = &ape_ctx->predictorY;
int32_t predictionA, currentA, A; int32_t predictionA, currentA, A;
currentA = p->lastA; currentA = p->YlastA;
while (count--) while (count--)
{ {
A = *decoded0; A = *decoded0;
p->delayA[0] = currentA; p->buf[YDELAYA] = currentA;
p->delayA[-1] = p->delayA[0] - p->delayA[-1]; p->buf[YDELAYA-1] = p->buf[YDELAYA] - p->buf[YDELAYA-1];
predictionA = (p->delayA[0] * p->coeffsA[0]) + predictionA = (p->buf[YDELAYA] * p->YcoeffsA[0]) +
(p->delayA[-1] * p->coeffsA[1]) + (p->buf[YDELAYA-1] * p->YcoeffsA[1]) +
(p->delayA[-2] * p->coeffsA[2]) + (p->buf[YDELAYA-2] * p->YcoeffsA[2]) +
(p->delayA[-3] * p->coeffsA[3]); (p->buf[YDELAYA-3] * p->YcoeffsA[3]);
currentA = A + (predictionA >> 10); currentA = A + (predictionA >> 10);
p->adaptcoeffsA[0] = SIGN(p->delayA[0]); p->buf[YADAPTCOEFFSA] = SIGN(p->buf[YDELAYA]);
p->adaptcoeffsA[-1] = SIGN(p->delayA[-1]); p->buf[YADAPTCOEFFSA-1] = SIGN(p->buf[YDELAYA-1]);
if (A > 0) if (A > 0)
{ {
p->coeffsA[0] -= p->adaptcoeffsA[0]; p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA];
p->coeffsA[1] -= p->adaptcoeffsA[-1]; p->YcoeffsA[1] -= p->buf[YADAPTCOEFFSA-1];
p->coeffsA[2] -= p->adaptcoeffsA[-2]; p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2];
p->coeffsA[3] -= p->adaptcoeffsA[-3]; p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3];
} }
else if (A < 0) else if (A < 0)
{ {
p->coeffsA[0] += p->adaptcoeffsA[0]; p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA];
p->coeffsA[1] += p->adaptcoeffsA[-1]; p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1];
p->coeffsA[2] += p->adaptcoeffsA[-2]; p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2];
p->coeffsA[3] += p->adaptcoeffsA[-3]; p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3];
} }
p->delayA++; p->buf++;
p->adaptcoeffsA++;
/* Have we filled the history buffer? */ /* Have we filled the history buffer? */
if (p->delayA == p->historybuffer + HISTORY_SIZE + (PREDICTOR_ORDER*4)) { if (p->buf == p->historybuffer + HISTORY_SIZE) {
memmove(p->historybuffer, p->delayA - (PREDICTOR_ORDER*4), memmove(p->historybuffer, p->buf,
(PREDICTOR_ORDER*4) * sizeof(int32_t)); PREDICTOR_SIZE * sizeof(int32_t));
p->delayA = p->historybuffer + PREDICTOR_ORDER*4; p->buf = p->historybuffer;
p->adaptcoeffsA = p->historybuffer + PREDICTOR_ORDER*2;
} }
p->filterA = currentA + ((p->filterA * 31) >> 5); p->YfilterA = currentA + ((p->YfilterA * 31) >> 5);
*(decoded0++) = p->filterA; *(decoded0++) = p->YfilterA;
} }
p->lastA = currentA; p->YlastA = currentA;
return 0; return 0;
} }

View file

@ -2,7 +2,7 @@
libdemac - A Monkey's Audio decoder libdemac - A Monkey's Audio decoder
$Id:$ $Id$
Copyright (C) Dave Chapman 2007 Copyright (C) Dave Chapman 2007
@ -29,8 +29,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
#include "parser.h" #include "parser.h"
#include "filter.h" #include "filter.h"
void init_predictor_decoder(struct ape_ctx_t* ape_ctx); void init_predictor_decoder(struct predictor_t* p);
int predictor_decode_stereo(struct ape_ctx_t* ape_ctx, int32_t* decoded0, int32_t* decoded1, int count); int predictor_decode_stereo(struct predictor_t* p, int32_t* decoded0, int32_t* decoded1, int count);
int predictor_decode_mono(struct ape_ctx_t* ape_ctx, int32_t* decoded0, int count); int predictor_decode_mono(struct predictor_t* p, int32_t* decoded0, int count);
#endif #endif