From c1cd0469ca9f084b39d747ccca5d64442c3833ca Mon Sep 17 00:00:00 2001 From: Jens Arnold Date: Tue, 2 Dec 2008 02:26:04 +0000 Subject: [PATCH] Implement mono predictor in assembler for coldfire, yielding a ~6% speedup for mono -c1000. Apply ideas gained from it back to the stereo predictor, saving 4 instructions. No speed increase for stereo, probably due to cache aliasing effects. * 80-column police. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19296 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/demac/libdemac/predictor-cf.S | 435 ++++++++++++++-------- apps/codecs/demac/libdemac/predictor.c | 2 + 2 files changed, 291 insertions(+), 146 deletions(-) diff --git a/apps/codecs/demac/libdemac/predictor-cf.S b/apps/codecs/demac/libdemac/predictor-cf.S index cd2e07fd5e..c76d7f629a 100644 --- a/apps/codecs/demac/libdemac/predictor-cf.S +++ b/apps/codecs/demac/libdemac/predictor-cf.S @@ -25,13 +25,6 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA */ #include "demac_config.h" - .text - - .align 2 - - .global predictor_decode_stereo - .type predictor_decode_stereo,@function - /* NOTE: The following need to be kept in sync with parser.h */ #define YDELAYA 200 @@ -63,6 +56,13 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA #define historybuffer 100 /* int32_t historybuffer[] */ + .text + + .align 2 + + .global predictor_decode_stereo + .type predictor_decode_stereo,@function + | void predictor_decode_stereo(struct predictor_t* p, | int32_t* decoded0, | int32_t* decoded1, @@ -92,6 +92,8 @@ predictor_decode_stereo: | %d1 = p->buf[YDELAYA-2] | %d2 = p->buf[YDELAYA-1] + move.l %d3, (YDELAYA,%a5) | p->buf[YDELAYA] = %d3 + sub.l %d3, %d2 neg.l %d2 | %d2 = %d3 - %d2 @@ -102,12 +104,10 @@ predictor_decode_stereo: | %d6 = p->YcoeffsA[2] | %d7 = p->YcoeffsA[3] - mac.l %d3, %d4, %acc0 | %acc0 = p->buf[YDELAYA] * p->YcoeffsA[0] - mac.l %d2, %d5, %acc0 | %acc0 += p->buf[YDELAYA-1] * p->YcoeffsA[1] - mac.l %d1, %d6, %acc0 | %acc0 += 
p->buf[YDELAYA-2] * p->YcoeffsA[2] - mac.l %d0, %d7, %acc0 | %acc0 += p->buf[YDELAYA-3] * p->YcoeffsA[3] - - move.l %d3, (YDELAYA,%a5) | p->buf[YDELAYA] = %d3 + mac.l %d3, %d4, %acc0 | %acc0 = p->buf[YDELAYA] * p->YcoeffsA[0] + mac.l %d2, %d5, %acc0 | %acc0 += p->buf[YDELAYA-1] * p->YcoeffsA[1] + mac.l %d1, %d6, %acc0 | %acc0 += p->buf[YDELAYA-2] * p->YcoeffsA[2] + mac.l %d0, %d7, %acc0 | %acc0 += p->buf[YDELAYA-3] * p->YcoeffsA[3] tst.l %d2 beq.s 1f @@ -125,10 +125,6 @@ predictor_decode_stereo: 1: | %d3 = SIGN(%d3) move.l %d3, (YADAPTCOEFFSA,%a5) | p->buf[YADAPTCOEFFSA] = %d3 - movclr.l %acc0, %d0 - - | NOTE: %d0 now contains predictionA - don't overwrite. - | Predictor Y, Filter B movem.l (YfilterB,%a6), %d2-%d3 | %d2 = p->YfilterB @@ -156,11 +152,11 @@ predictor_decode_stereo: | %a1 = p->YcoeffsB[3] | %a2 = p->YcoeffsB[4] - mac.l %d3, %d1, %acc0 | %acc0 = p->buf[YDELAYB] * p->YcoeffsB[0] - mac.l %d7, %d2, %acc0 | %acc0 += p->buf[YDELAYB-1] * p->YcoeffsB[1] - mac.l %d6, %a0, %acc0 | %acc0 += p->buf[YDELAYB-2] * p->YcoeffsB[2] - mac.l %d5, %a1, %acc0 | %acc0 += p->buf[YDELAYB-3] * p->YcoeffsB[3] - mac.l %d4, %a2, %acc0 | %acc0 += p->buf[YDELAYB-4] * p->YcoeffsB[4] + mac.l %d3, %d1, %acc1 | %acc1 = p->buf[YDELAYB] * p->YcoeffsB[0] + mac.l %d7, %d2, %acc1 | %acc1 += p->buf[YDELAYB-1] * p->YcoeffsB[1] + mac.l %d6, %a0, %acc1 | %acc1 += p->buf[YDELAYB-2] * p->YcoeffsB[2] + mac.l %d5, %a1, %acc1 | %acc1 += p->buf[YDELAYB-3] * p->YcoeffsB[3] + mac.l %d4, %a2, %acc1 | %acc1 += p->buf[YDELAYB-4] * p->YcoeffsB[4] move.l %d3, (YDELAYB, %a5) | p->buf[YDELAYB] = %d3 @@ -179,38 +175,10 @@ predictor_decode_stereo: 1: | %d3 = SIGN(%d3) move.l %d3, (YADAPTCOEFFSB, %a5) | p->buf[YADAPTCOEFFSB] = %d3 - movclr.l %acc0, %d4 - - | %d0 still contains predictionA - | %d4 contains predictionB - - | Finish Predictor Y - - asr.l #1, %d4 - add.l %d4, %d0 | %d0 += (%d1 >> 1) - move.l (%a3), %d5 | %d5 = *decoded0 - move.l %d5, %d4 | %d4 = %d5 - asr.l #8, %d0 - asr.l #2, %d0 | %d0 >>= 10 - 
add.l %d0, %d4 | %d4 += %d0 - move.l %d4, (YlastA,%a6) | p->YlastA = %d4 - - move.l (YfilterA,%a6), %d6 | %d6 = p->YfilterA - move.l %d6, %d0 - lsl.l #5, %d6 - sub.l %d0, %d6 | %d6 = 31 * %d6 - asr.l #5, %d6 | %d6 >>= 5 - add.l %d6, %d4 - move.l %d4, (YfilterA,%a6) | p->YfilterA = %d4 - - | %d4 contains p->YfilterA - | %d5 contains *decoded0 - | %d1, %d2, %a0, %a1, %a2 contain p->YcoeffsB[0..4] | %d7, %d3 contain p->buf[YADAPTCOEFFSB-1] and p->buf[YADAPTCOEFFSB] - move.l %d4, (%a3)+ | *(decoded0++) = %d1 (p->YfilterA) - tst.l %d5 + move.l (%a3), %d0 | %d0 = *decoded0 beq.s 3f movem.l (YADAPTCOEFFSB-16,%a5), %d4-%d6 | %d4 = p->buf[YADAPTCOEFFSB-4] @@ -221,11 +189,11 @@ predictor_decode_stereo: | *decoded0 > 0 - sub.l %d3, %d1 | %d1 = p->YcoeffsB[0] - p->buf[YADAPTCOEFFSB] - sub.l %d7, %d2 | %d2 = p->YcoeffsB[1] - p->buf[YADAPTCOEFFSB-1] - sub.l %d6, %a0 | %a0 = p->YcoeffsB[2] - p->buf[YADAPTCOEFFSB-2] - sub.l %d5, %a1 | %a1 = p->YcoeffsB[3] - p->buf[YADAPTCOEFFSB-3] - sub.l %d4, %a2 | %a2 = p->YcoeffsB[4] - p->buf[YADAPTCOEFFSB-4] + sub.l %d3, %d1 | %d1 = p->YcoeffsB[0] - p->buf[YADAPTCOEFFSB] + sub.l %d7, %d2 | %d2 = p->YcoeffsB[1] - p->buf[YADAPTCOEFFSB-1] + sub.l %d6, %a0 | %a0 = p->YcoeffsB[2] - p->buf[YADAPTCOEFFSB-2] + sub.l %d5, %a1 | %a1 = p->YcoeffsB[3] - p->buf[YADAPTCOEFFSB-3] + sub.l %d4, %a2 | %a2 = p->YcoeffsB[4] - p->buf[YADAPTCOEFFSB-4] movem.l %d1-%d2/%a0-%a2, (YcoeffsB,%a6) | Save p->YcoeffsB[] @@ -234,47 +202,69 @@ predictor_decode_stereo: | %d6 = p->YcoeffsA[2] | %d7 = p->YcoeffsA[3] - movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 | %d2 = p->buf[YADAPTCOEFFSA-3] + movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 + | %d2 = p->buf[YADAPTCOEFFSA-3] | %a0 = p->buf[YADAPTCOEFFSA-2] | %a1 = p->buf[YADAPTCOEFFSA-1] | %a2 = p->buf[YADAPTCOEFFSA] - sub.l %a2, %d4 | %d4 = p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA] - sub.l %a1, %d5 | %d5 = p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1] - sub.l %a0, %d6 | %d6 = p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2] - sub.l %d2, %d7 | %d7 
= p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3] + sub.l %a2, %d4 | %d4 = p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA] + sub.l %a1, %d5 | %d5 = p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1] + sub.l %a0, %d6 | %d6 = p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2] + sub.l %d2, %d7 | %d7 = p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3] bra.s 2f 1: | *decoded0 < 0 - add.l %d3, %d1 | %d1 = p->YcoeffsB[0] + p->buf[YADAPTCOEFFSB] - add.l %d7, %d2 | %d2 = p->YcoeffsB[1] + p->buf[YADAPTCOEFFSB-1] - add.l %d6, %a0 | %a0 = p->YcoeffsB[2] + p->buf[YADAPTCOEFFSB-2] - add.l %d5, %a1 | %a1 = p->YcoeffsB[3] + p->buf[YADAPTCOEFFSB-3] - add.l %d4, %a2 | %a2 = p->YcoeffsB[4] + p->buf[YADAPTCOEFFSB-4] + add.l %d3, %d1 | %d1 = p->YcoeffsB[0] + p->buf[YADAPTCOEFFSB] + add.l %d7, %d2 | %d2 = p->YcoeffsB[1] + p->buf[YADAPTCOEFFSB-1] + add.l %d6, %a0 | %a0 = p->YcoeffsB[2] + p->buf[YADAPTCOEFFSB-2] + add.l %d5, %a1 | %a1 = p->YcoeffsB[3] + p->buf[YADAPTCOEFFSB-3] + add.l %d4, %a2 | %a2 = p->YcoeffsB[4] + p->buf[YADAPTCOEFFSB-4] movem.l %d1-%d2/%a0-%a2, (YcoeffsB,%a6) | Save p->YcoeffsB[] - + movem.l (YcoeffsA,%a6), %d4-%d7 | %d4 = p->YcoeffsA[0] | %d5 = p->YcoeffsA[1] | %d6 = p->YcoeffsA[2] | %d7 = p->YcoeffsA[3] - - movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 | %d2 = p->buf[YADAPTCOEFFSA-3] + + movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 + | %d2 = p->buf[YADAPTCOEFFSA-3] | %a0 = p->buf[YADAPTCOEFFSA-2] | %a1 = p->buf[YADAPTCOEFFSA-1] | %a2 = p->buf[YADAPTCOEFFSA] - - add.l %a2, %d4 | %d4 = p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA] - add.l %a1, %d5 | %d5 = p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1] - add.l %a0, %d6 | %d6 = p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2] - add.l %d2, %d7 | %d7 = p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3] + + add.l %a2, %d4 | %d4 = p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA] + add.l %a1, %d5 | %d5 = p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1] + add.l %a0, %d6 | %d6 = p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2] + add.l %d2, %d7 | %d7 = p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3] 2: movem.l %d4-%d7, (YcoeffsA,%a6) | 
Save p->YcoeffsA[] 3: + | Finish Predictor Y + + movclr.l %acc0, %d1 | %d1 = predictionA + movclr.l %acc1, %d2 | %d2 = predictionB + asr.l #1, %d2 + add.l %d2, %d1 | %d1 += (%d2 >> 1) + asr.l #8, %d1 + asr.l #2, %d1 | %d1 >>= 10 + add.l %d0, %d1 | %d1 += %d0 + move.l %d1, (YlastA,%a6) | p->YlastA = %d1 + + move.l (YfilterA,%a6), %d2 | %d2 = p->YfilterA + move.l %d2, %d0 + lsl.l #5, %d2 + sub.l %d0, %d2 | %d2 = 31 * %d2 + asr.l #5, %d2 | %d2 >>= 5 + add.l %d1, %d2 + move.l %d2, (YfilterA,%a6) | p->YfilterA = %d2 + + | *decoded0 stored 2 instructions down, avoiding pipeline stall | ***** PREDICTOR X ***** @@ -282,11 +272,15 @@ predictor_decode_stereo: move.l (XlastA,%a6), %d3 | %d3 = p->XlastA + move.l %d2, (%a3)+ | *(decoded0++) = %d2 (p->YfilterA) + movem.l (XDELAYA-12,%a5), %d0-%d2 | %d0 = p->buf[XDELAYA-3] | %d1 = p->buf[XDELAYA-2] | %d2 = p->buf[XDELAYA-1] - sub.l %d3, %d2 + move.l %d3, (XDELAYA,%a5) | p->buf[XDELAYA] = %d3 + + sub.l %d3, %d2 neg.l %d2 | %d2 = %d3 -%d2 move.l %d2, (XDELAYA-4,%a5) | p->buf[XDELAYA-1] = %d2 @@ -296,13 +290,11 @@ predictor_decode_stereo: | %d6 = p->XcoeffsA[2] | %d7 = p->XcoeffsA[3] - mac.l %d3, %d4, %acc0 | %acc0 = p->buf[XDELAYA] * p->XcoeffsA[0] - mac.l %d2, %d5, %acc0 | %acc0 += p->buf[XDELAYA-1] * p->XcoeffsA[1] - mac.l %d1, %d6, %acc0 | %acc0 += p->buf[XDELAYA-2] * p->XcoeffsA[2] - mac.l %d0, %d7, %acc0 | %acc0 += p->buf[XDELAYA-3] * p->XcoeffsA[3] + mac.l %d3, %d4, %acc0 | %acc0 = p->buf[XDELAYA] * p->XcoeffsA[0] + mac.l %d2, %d5, %acc0 | %acc0 += p->buf[XDELAYA-1] * p->XcoeffsA[1] + mac.l %d1, %d6, %acc0 | %acc0 += p->buf[XDELAYA-2] * p->XcoeffsA[2] + mac.l %d0, %d7, %acc0 | %acc0 += p->buf[XDELAYA-3] * p->XcoeffsA[3] - move.l %d3, (XDELAYA,%a5) | p->buf[XDELAYA] = %d3 - tst.l %d2 beq.s 1f spl.b %d2 | pos: 0x??????ff, neg: 0x??????00 @@ -319,10 +311,6 @@ predictor_decode_stereo: 1: | %d3 = SIGN(%d3) move.l %d3, (XADAPTCOEFFSA,%a5) | p->buf[XADAPTCOEFFSA] = %d3 - movclr.l %acc0, %d0 - - | NOTE: %d0 now contains predictionA - 
don't overwrite. - | Predictor X, Filter B movem.l (XfilterB,%a6), %d2-%d3 | %d2 = p->XfilterB @@ -350,11 +338,11 @@ predictor_decode_stereo: | %a1 = p->XcoeffsB[3] | %a2 = p->XcoeffsB[4] - mac.l %d3, %d1, %acc0 | %acc0 = p->buf[XDELAYB] * p->XcoeffsB[0] - mac.l %d7, %d2, %acc0 | %acc0 += p->buf[XDELAYB-1] * p->XcoeffsB[1] - mac.l %d6, %a0, %acc0 | %acc0 += p->buf[XDELAYB-2] * p->XcoeffsB[2] - mac.l %d5, %a1, %acc0 | %acc0 += p->buf[XDELAYB-3] * p->XcoeffsB[3] - mac.l %d4, %a2, %acc0 | %acc0 += p->buf[XDELAYB-4] * p->XcoeffsB[4] + mac.l %d3, %d1, %acc1 | %acc1 = p->buf[XDELAYB] * p->XcoeffsB[0] + mac.l %d7, %d2, %acc1 | %acc1 += p->buf[XDELAYB-1] * p->XcoeffsB[1] + mac.l %d6, %a0, %acc1 | %acc1 += p->buf[XDELAYB-2] * p->XcoeffsB[2] + mac.l %d5, %a1, %acc1 | %acc1 += p->buf[XDELAYB-3] * p->XcoeffsB[3] + mac.l %d4, %a2, %acc1 | %acc1 += p->buf[XDELAYB-4] * p->XcoeffsB[4] move.l %d3, (XDELAYB, %a5) | p->buf[XDELAYB] = %d3 @@ -374,38 +362,10 @@ predictor_decode_stereo: 1: | %d3 = SIGN(%d3) move.l %d3, (XADAPTCOEFFSB, %a5) | p->buf[XADAPTCOEFFSB] = %d3 - movclr.l %acc0, %d4 - - | %d0 still contains predictionA - | %d4 contains predictionB - - | Finish Predictor X - - asr.l #1, %d4 - add.l %d4, %d0 | %d0 += (%d1 >> 1) - move.l (%a4), %d5 | %d5 = *decoded1 - move.l %d5, %d4 | %d4 = %d5 - asr.l #8, %d0 - asr.l #2, %d0 | %d0 >>= 10 - add.l %d0, %d4 | %d4 += %d0 - move.l %d4, (XlastA,%a6) | p->XlastA = %d1 - - move.l (XfilterA,%a6), %d6 | %d6 = p->XfilterA - move.l %d6, %d0 - lsl.l #5, %d6 - sub.l %d0, %d6 | %d6 = 31 * %d6 - asr.l #5, %d6 | %d6 >>= 5 - add.l %d6, %d4 - move.l %d4, (XfilterA,%a6) | p->XfilterA = %d6 - - | %d4 contains p->XfilterA - | %d5 contains *decoded1 - | %d1, %d2, %a0, %a1, %a2 contain p->XcoeffsB[0..4] | %d7, %d3 contain p->buf[XADAPTCOEFFSB-1] and p->buf[XADAPTCOEFFSB] - move.l %d4, (%a4)+ | *(decoded1++) = %d1 (p->XfilterA) - tst.l %d5 + move.l (%a4), %d0 | %d0 = *decoded1 beq.s 3f movem.l (XADAPTCOEFFSB-16,%a5), %d4-%d6 | %d4 = 
p->buf[XADAPTCOEFFSB-4] @@ -416,38 +376,39 @@ predictor_decode_stereo: | *decoded1 > 0 - sub.l %d3, %d1 | %d1 = p->XcoeffsB[0] - p->buf[XADAPTCOEFFSB] - sub.l %d7, %d2 | %d2 = p->XcoeffsB[1] - p->buf[XADAPTCOEFFSB-1] - sub.l %d6, %a0 | %a0 = p->XcoeffsB[2] - p->buf[XADAPTCOEFFSB-2] - sub.l %d5, %a1 | %a1 = p->XcoeffsB[3] - p->buf[XADAPTCOEFFSB-3] - sub.l %d4, %a2 | %a2 = p->XcoeffsB[4] - p->buf[XADAPTCOEFFSB-4] - + sub.l %d3, %d1 | %d1 = p->XcoeffsB[0] - p->buf[XADAPTCOEFFSB] + sub.l %d7, %d2 | %d2 = p->XcoeffsB[1] - p->buf[XADAPTCOEFFSB-1] + sub.l %d6, %a0 | %a0 = p->XcoeffsB[2] - p->buf[XADAPTCOEFFSB-2] + sub.l %d5, %a1 | %a1 = p->XcoeffsB[3] - p->buf[XADAPTCOEFFSB-3] + sub.l %d4, %a2 | %a2 = p->XcoeffsB[4] - p->buf[XADAPTCOEFFSB-4] + movem.l %d1-%d2/%a0-%a2, (XcoeffsB,%a6) | Save p->XcoeffsB[] - + movem.l (XcoeffsA,%a6), %d4-%d7 | %d4 = p->XcoeffsA[0] | %d5 = p->XcoeffsA[1] | %d6 = p->XcoeffsA[2] | %d7 = p->XcoeffsA[3] - movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 | %d2 = p->buf[XADAPTCOEFFSA-3] + movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 + | %d2 = p->buf[XADAPTCOEFFSA-3] | %a0 = p->buf[XADAPTCOEFFSA-2] | %a1 = p->buf[XADAPTCOEFFSA-1] | %a2 = p->buf[XADAPTCOEFFSA] - sub.l %a2, %d4 | %d4 = p->XcoeffsA[0] - p->buf[XADAPTCOEFFSA] - sub.l %a1, %d5 | %d5 = p->XcoeffsA[1] - p->buf[XADAPTCOEFFSA-1] - sub.l %a0, %d6 | %d6 = p->XcoeffsA[2] - p->buf[XADAPTCOEFFSA-2] - sub.l %d2, %d7 | %d7 = p->XcoeffsA[3] - p->buf[XADAPTCOEFFSA-3] - + sub.l %a2, %d4 | %d4 = p->XcoeffsA[0] - p->buf[XADAPTCOEFFSA] + sub.l %a1, %d5 | %d5 = p->XcoeffsA[1] - p->buf[XADAPTCOEFFSA-1] + sub.l %a0, %d6 | %d6 = p->XcoeffsA[2] - p->buf[XADAPTCOEFFSA-2] + sub.l %d2, %d7 | %d7 = p->XcoeffsA[3] - p->buf[XADAPTCOEFFSA-3] + bra.s 2f 1: | *decoded1 < 0 - add.l %d3, %d1 | %d1 = p->XcoeffsB[0] + p->buf[XADAPTCOEFFSB] - add.l %d7, %d2 | %d2 = p->XcoeffsB[1] + p->buf[XADAPTCOEFFSB-1] - add.l %d6, %a0 | %a0 = p->XcoeffsB[2] + p->buf[XADAPTCOEFFSB-2] - add.l %d5, %a1 | %a1 = p->XcoeffsB[3] + 
p->buf[XADAPTCOEFFSB-3] - add.l %d4, %a2 | %a2 = p->XcoeffsB[4] + p->buf[XADAPTCOEFFSB-4] + add.l %d3, %d1 | %d1 = p->XcoeffsB[0] + p->buf[XADAPTCOEFFSB] + add.l %d7, %d2 | %d2 = p->XcoeffsB[1] + p->buf[XADAPTCOEFFSB-1] + add.l %d6, %a0 | %a0 = p->XcoeffsB[2] + p->buf[XADAPTCOEFFSB-2] + add.l %d5, %a1 | %a1 = p->XcoeffsB[3] + p->buf[XADAPTCOEFFSB-3] + add.l %d4, %a2 | %a2 = p->XcoeffsB[4] + p->buf[XADAPTCOEFFSB-4] movem.l %d1-%d2/%a0-%a2, (XcoeffsB,%a6) | Save p->XcoeffsB[] @@ -456,31 +417,53 @@ predictor_decode_stereo: | %d6 = p->XcoeffsA[2] | %d7 = p->XcoeffsA[3] - movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 | %d2 = p->buf[XADAPTCOEFFSA-3] + movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 + | %d2 = p->buf[XADAPTCOEFFSA-3] | %a0 = p->buf[XADAPTCOEFFSA-2] | %a1 = p->buf[XADAPTCOEFFSA-1] | %a2 = p->buf[XADAPTCOEFFSA] - add.l %a2, %d4 | %d4 = p->XcoeffsA[0] + p->buf[XADAPTCOEFFSA] - add.l %a1, %d5 | %d5 = p->XcoeffsA[1] + p->buf[XADAPTCOEFFSA-1] - add.l %a0, %d6 | %d6 = p->XcoeffsA[2] + p->buf[XADAPTCOEFFSA-2] - add.l %d2, %d7 | %d7 = p->XcoeffsA[3] + p->buf[XADAPTCOEFFSA-3] + add.l %a2, %d4 | %d4 = p->XcoeffsA[0] + p->buf[XADAPTCOEFFSA] + add.l %a1, %d5 | %d5 = p->XcoeffsA[1] + p->buf[XADAPTCOEFFSA-1] + add.l %a0, %d6 | %d6 = p->XcoeffsA[2] + p->buf[XADAPTCOEFFSA-2] + add.l %d2, %d7 | %d7 = p->XcoeffsA[3] + p->buf[XADAPTCOEFFSA-3] 2: movem.l %d4-%d7, (XcoeffsA,%a6) | Save p->XcoeffsA[] 3: + | Finish Predictor X + + movclr.l %acc0, %d1 | %d1 = predictionA + movclr.l %acc1, %d2 | %d2 = predictionB + asr.l #1, %d2 + add.l %d2, %d1 | %d1 += (%d2 >> 1) + asr.l #8, %d1 + asr.l #2, %d1 | %d1 >>= 10 + add.l %d0, %d1 | %d1 += %d0 + move.l %d1, (XlastA,%a6) | p->XlastA = %d1 + + move.l (XfilterA,%a6), %d2 | %d2 = p->XfilterA + move.l %d2, %d0 + lsl.l #5, %d2 + sub.l %d0, %d2 | %d2 = 31 * %d2 + asr.l #5, %d2 | %d2 >>= 5 + add.l %d1, %d2 + move.l %d2, (XfilterA,%a6) | p->XfilterA = %d2 + + | *decoded1 stored 3 instructions down, avoiding pipeline stall | ***** COMMON ***** addq.l
#4, %a5 | p->buf++ - lea.l (historybuffer+PREDICTOR_HISTORY_SIZE*4,%a6), %a2 - | %a2 = &p->historybuffer[PREDICTOR_HISTORY_SIZE] + | %a2 = &p->historybuffer[PREDICTOR_HISTORY_SIZE] + move.l %d2, (%a4)+ | *(decoded1++) = %d2 (p->XfilterA) + cmp.l %a2, %a5 - beq.s .move_hist | The history buffer is full, we need to do a memmove - + beq.s .move_hist | History buffer is full, we need to do a memmove + subq.l #1, (%sp) | decrease loop count bne.w .loop @@ -514,3 +497,163 @@ predictor_decode_stereo: bne.w .loop bra.s .done + .size predictor_decode_stereo, .-predictor_decode_stereo + + + .global predictor_decode_mono + .type predictor_decode_mono,@function + +| void predictor_decode_mono(struct predictor_t* p, +| int32_t* decoded0, +| int count) + +predictor_decode_mono: + lea.l (-11*4,%sp), %sp + movem.l %d2-%d7/%a2-%a6, (%sp) + + move.l #0, %macsr | signed integer mode + + move.l (11*4+4,%sp), %a6 | %a6 = p + move.l (11*4+8,%sp), %a4 | %a4 = decoded0 + move.l (11*4+12,%sp), %d7 | %d7 = count + move.l (%a6), %a5 | %a5 = p->buf + + move.l (YlastA,%a6), %d3 | %d3 = p->YlastA + +.loopm: + + | ***** PREDICTOR ***** + + movem.l (YDELAYA-12,%a5), %d0-%d2 | %d0 = p->buf[YDELAYA-3] + | %d1 = p->buf[YDELAYA-2] + | %d2 = p->buf[YDELAYA-1] + + move.l %d3, (YDELAYA,%a5) | p->buf[YDELAYA] = %d3 + + sub.l %d3, %d2 + neg.l %d2 | %d2 = %d3 - %d2 + + move.l %d2, (YDELAYA-4,%a5) | p->buf[YDELAYA-1] = %d2 + + movem.l (YcoeffsA,%a6), %a0-%a3 | %a0 = p->YcoeffsA[0] + | %a1 = p->YcoeffsA[1] + | %a2 = p->YcoeffsA[2] + | %a3 = p->YcoeffsA[3] + + mac.l %d3, %a0, %acc0 | %acc0 = p->buf[YDELAYA] * p->YcoeffsA[0] + mac.l %d2, %a1, %acc0 | %acc0 += p->buf[YDELAYA-1] * p->YcoeffsA[1] + mac.l %d1, %a2, %acc0 | %acc0 += p->buf[YDELAYA-2] * p->YcoeffsA[2] + mac.l %d0, %a3, %acc0 | %acc0 += p->buf[YDELAYA-3] * p->YcoeffsA[3] + + tst.l %d2 + beq.s 1f + spl.b %d2 | pos: 0x??????ff, neg: 0x??????00 + extb.l %d2 | pos: 0xffffffff, neg: 0x00000000 + or.l #1, %d2 | pos: 0xffffffff, neg: 0x00000001 +1: | %d2 = 
SIGN(%d2) + move.l %d2, (YADAPTCOEFFSA-4,%a5) | p->buf[YADAPTCOEFFSA-1] = %d2 + + tst.l %d3 + beq.s 1f + spl.b %d3 + extb.l %d3 + or.l #1, %d3 +1: | %d3 = SIGN(%d3) + move.l %d3, (YADAPTCOEFFSA,%a5) | p->buf[YADAPTCOEFFSA] = %d3 + + move.l (%a4), %d0 | %d0 = *decoded0 + beq.s 3f + + movem.l (YADAPTCOEFFSA-12,%a5),%d4-%d5 | %d4 = p->buf[YADAPTCOEFFSA-3] + | %d5 = p->buf[YADAPTCOEFFSA-2] + + bmi.s 1f | flags still valid here + + | *decoded0 > 0 + + sub.l %d3, %a0 | %a0 = p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA] + sub.l %d2, %a1 | %a1 = p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1] + sub.l %d5, %a2 | %a2 = p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2] + sub.l %d4, %a3 | %a3 = p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3] + + bra.s 2f + +1: | *decoded0 < 0 + + add.l %d3, %a0 | %a0 = p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA] + add.l %d2, %a1 | %a1 = p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1] + add.l %d5, %a2 | %a2 = p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2] + add.l %d4, %a3 | %a3 = p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3] + +2: + movem.l %a0-%a3, (YcoeffsA,%a6) | save p->YcoeffsA[] + +3: + | Finish Predictor + + movclr.l %acc0, %d3 | %d3 = predictionA + asr.l #8, %d3 + asr.l #2, %d3 | %d3 >>= 10 + add.l %d0, %d3 | %d3 += %d0 + + move.l (YfilterA,%a6), %d2 | %d2 = p->YfilterA + move.l %d2, %d0 + lsl.l #5, %d2 + sub.l %d0, %d2 | %d2 = 31 * %d2 + asr.l #5, %d2 | %d2 >>= 5 + add.l %d3, %d2 + move.l %d2, (YfilterA,%a6) | p->YfilterA = %d2 + + | *decoded0 stored 3 instructions down, avoiding pipeline stall + + | ***** COMMON ***** + + addq.l #4, %a5 | p->buf++ + lea.l (historybuffer+PREDICTOR_HISTORY_SIZE*4,%a6), %a3 + | %a3 = &p->historybuffer[PREDICTOR_HISTORY_SIZE] + + move.l %d2, (%a4)+ | *(decoded0++) = %d2 (p->YfilterA) + + cmp.l %a3, %a5 + beq.s .move_histm | History buffer is full, we need to do a memmove + + subq.l #1, %d7 | decrease loop count + bne.w .loopm + + move.l %d3, (YlastA,%a6) | p->YlastA = %d3 + +.donem: + move.l %a5, (%a6) | Save value of p->buf + movem.l (%sp),
%d2-%d7/%a2-%a6 + lea.l (11*4,%sp), %sp + rts + +.move_histm: + move.l %d3, (YlastA,%a6) | p->YlastA = %d3 + + lea.l (historybuffer,%a6), %a3 + + | dest = %a3 (p->historybuffer) + | src = %a5 (p->buf) + | n = 200 + + movem.l (%a5), %d0-%d6/%a0-%a2 | 40 bytes + movem.l %d0-%d6/%a0-%a2, (%a3) + movem.l (40,%a5), %d0-%d6/%a0-%a2 | 40 bytes + movem.l %d0-%d6/%a0-%a2, (40,%a3) + movem.l (80,%a5), %d0-%d6/%a0-%a2 | 40 bytes + movem.l %d0-%d6/%a0-%a2, (80,%a3) + movem.l (120,%a5), %d0-%d6/%a0-%a2 | 40 bytes + movem.l %d0-%d6/%a0-%a2, (120,%a3) + movem.l (160,%a5), %d0-%d6/%a0-%a2 | 40 bytes + movem.l %d0-%d6/%a0-%a2, (160,%a3) + + move.l %a3, %a5 | p->buf = &p->historybuffer[0] + + move.l (YlastA,%a6), %d3 | %d3 = p->YlastA + + subq.l #1, %d7 | decrease loop count + bne.w .loopm + + bra.s .donem + .size predictor_decode_mono, .-predictor_decode_mono diff --git a/apps/codecs/demac/libdemac/predictor.c b/apps/codecs/demac/libdemac/predictor.c index d4f886fb8c..0d03d1d2fb 100644 --- a/apps/codecs/demac/libdemac/predictor.c +++ b/apps/codecs/demac/libdemac/predictor.c @@ -211,6 +211,7 @@ void ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p, } #endif +#if !defined(CPU_COLDFIRE) void ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p, int32_t* decoded0, int count) @@ -269,3 +270,4 @@ void ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p, p->YlastA = currentA; } +#endif