1
0
Fork 0
forked from len0rd/rockbox

Mpegplayer: Assembler optimised IDCT for coldfire, based on FS #5995 by Karim Boucher. Put the IDCT block buffer in IRAM for better performance. The whole libmpeg2 decoder struct doesn't fit without throwing some libmad buffers out of IRAM, but then doesn't change performance significantly. Mpegplayer is quite usable now on X5; H300 is sort-of usable for widescreen.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15156 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Jens Arnold 2007-10-16 22:55:40 +00:00
parent 84f5c5c3e3
commit fc43b9df82
5 changed files with 611 additions and 2 deletions

View file

@ -13,6 +13,10 @@ idct.c
motion_comp_c.c
#endif /* CPU_* */
#ifdef CPU_COLDFIRE
idct_coldfire.S
#endif
slice.c
video_out_rockbox.c
mpeg_settings.c

View file

@ -401,6 +401,12 @@ void mpeg2_reset (mpeg2dec_t * mpeg2dec, int full_reset)
}
#ifdef CPU_COLDFIRE
/* twice as large as on other targets because coldfire uses
* a secondary, transposed buffer for optimisation */
static int16_t static_dct_block[128] IBSS_ATTR ATTR_ALIGN(16);
#endif
mpeg2dec_t * mpeg2_init (void)
{
mpeg2dec_t * mpeg2dec;
@ -410,7 +416,11 @@ mpeg2dec_t * mpeg2_init (void)
mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),
MPEG2_ALLOC_MPEG2DEC);
if (mpeg2dec == NULL)
return NULL;
return NULL;
#ifdef CPU_COLDFIRE
mpeg2dec->decoder.DCTblock = static_dct_block;
#endif
rb->memset (mpeg2dec->decoder.DCTblock, 0, 64 * sizeof (int16_t));
rb->memset (mpeg2dec->quantizer_matrix, 0, 4 * 64 * sizeof (uint8_t));

View file

@ -76,6 +76,14 @@ uint8_t mpeg2_clip[3840 * 2 + 256] IBSS_ATTR;
#define CLIP(i) ((mpeg2_clip + 3840)[i])
#endif
#ifdef CPU_COLDFIRE
/* assembler functions */
extern void mpeg2_idct_copy_coldfire(int16_t * block, uint8_t * dest,
const int stride);
extern void mpeg2_idct_add_coldfire(const int last, int16_t * block,
uint8_t * dest, const int stride);
#else /* !CPU_COLDFIRE */
#if 0
#define BUTTERFLY(t0,t1,W0,W1,d0,d1) \
do { \
@ -258,6 +266,8 @@ static void mpeg2_idct_add_c (const int last, int16_t * block,
}
}
#endif /* !CPU_COLDFIRE */
void mpeg2_idct_init (void)
{
extern uint8_t default_mpeg2_scan_norm[64];
@ -266,8 +276,13 @@ void mpeg2_idct_init (void)
extern uint8_t mpeg2_scan_alt[64];
int i, j;
#ifdef CPU_COLDFIRE
mpeg2_idct_copy = mpeg2_idct_copy_coldfire;
mpeg2_idct_add = mpeg2_idct_add_coldfire;
#else
mpeg2_idct_copy = mpeg2_idct_copy_c;
mpeg2_idct_add = mpeg2_idct_add_c;
mpeg2_idct_add = mpeg2_idct_add_c;
#endif
#if !defined(CPU_COLDFIRE) && !defined(CPU_ARM)
for (i = -3840; i < 3840 + 256; i++)

View file

@ -0,0 +1,574 @@
/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id $
*
* Copyright (C) 2007 Jens Arnold
* Based on the work of Karim Boucher and Rani Hod
*
* All files in this archive are subject to the GNU General Public License.
* See the file COPYING in the source tree root for full license agreement.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
.global mpeg2_idct_copy_coldfire
.type mpeg2_idct_copy_coldfire, @function
.global mpeg2_idct_add_coldfire
.type mpeg2_idct_add_coldfire, @function
/* The IDCT itself.
* Input: %a0: block pointer
* All registers are preserved. */
.align 2
/* Two-pass 8x8 IDCT.
 * Pass 1 runs a 1-D IDCT over the 8 rows of the input block, writing the
 * results transposed into a temp buffer at block+128 (the 16-byte store
 * stride turns rows into columns).  Pass 2 runs the same 1-D IDCT over the
 * temp buffer rows (i.e. the original columns) and writes back, transposed
 * again, into the input block.
 * Each data register fetched by movem.l holds two packed 16-bit
 * coefficients, hence the (f0,f2,f4,f6,f1,f3,f5,f7) layout noted below. */
.idct:
lea.l (-15*4,%sp), %sp
movem.l %d0-%d7/%a0-%a6, (%sp) | save all registers
move.l %a0, %a6
move.l #0, %macsr | signed integer mode
move.l #((2048<<16)+2841), %a0 | W0, W1
move.l #((2676<<16)+2408), %a1 | W2, W3
move.l #((2048<<16)+1609), %a2 | W4, W5
move.l #((1108<<16)+ 565), %a3 | W6, W7
lea.l (128,%a6), %a4 | secondary, transposed temp buffer
moveq.l #8, %d3 | loop counter
| Pass 1: rows -> transposed temp buffer, >>12 scaling
.row_loop:
movem.l (%a6), %d0-%d2/%a5 | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
mac.w %a0l, %d2u, %acc0 | %acc0 = W1 * f1
mac.w %a1l, %d2l, %acc0 | + W3 * f3
mac.w %a2l, %a5u, %acc0 | + W5 * f5
mac.w %a3l, %a5l, %acc0 | + W7 * f7
mac.w %a1l, %d2u, %acc1 | %acc1 = W3 * f1
msac.w %a3l, %d2l, %acc1 | - W7 * f3
msac.w %a0l, %a5u, %acc1 | - W1 * f5
msac.w %a2l, %a5l, %acc1 | - W5 * f7
mac.w %a2l, %d2u, %acc2 | %acc2 = W5 * f1
msac.w %a0l, %d2l, %acc2 | - W1 * f3
mac.w %a3l, %a5u, %acc2 | + W7 * f5
mac.w %a1l, %a5l, %acc2 | + W3 * f7
mac.w %a3l, %d2u, %acc3 | %acc3 = W7 * f1
msac.w %a2l, %d2l, %acc3 | - W5 * f3
mac.w %a1l, %a5u, %acc3 | + W3 * f5
msac.w %a0l, %a5l, %acc3 | - W1 * f7
lea.l (16,%a6), %a6 | Advance to next row; put here to fill EMAC latency
add.l #(1<<16), %d0 | f0 += 1; (rounding bias for the >>12 below)
movclr.l %acc0, %d4 | b0
movclr.l %acc1, %d5 | b1
movclr.l %acc2, %d6 | b2
movclr.l %acc3, %d7 | b3
mac.w %a0u, %d0u, %acc0 | %acc0 = W0 * f0
mac.w %a2u, %d1u, %acc0 | + W4 * f4
move.l %acc0, %acc3
mac.w %a1u, %d0l, %acc0 | + W2 * f2
mac.w %a3u, %d1l, %acc0 | + W6 * f6
mac.w %a0u, %d0u, %acc1 | %acc1 = W0 * f0
msac.w %a2u, %d1u, %acc1 | - W4 * f4
move.l %acc1, %acc2
mac.w %a3u, %d0l, %acc1 | + W6 * f2
msac.w %a1u, %d1l, %acc1 | - W2 * f6
| ^ move.l %acc1, %acc2 %acc2 = W0 * f0 - W4 * f4
msac.w %a3u, %d0l, %acc2 | - W6 * f2
mac.w %a1u, %d1l, %acc2 | + W2 * f6
| ^ move.l %acc0, %acc3 %acc3 = W0 * f0 + W4 * f4
msac.w %a1u, %d0l, %acc3 | - W2 * f2
msac.w %a3u, %d1l, %acc3 | - W6 * f6
moveq.l #12, %d1 | shift amount
move.l %acc0, %d0 | block[7] = (a0
sub.l %d4,%d0 | - b0)
asr.l %d1, %d0 | >> 12
move.w %d0, (7*16,%a4)
move.l %acc1, %d0 | block[6] = (a1
sub.l %d5,%d0 | - b1)
asr.l %d1, %d0 | >> 12
move.w %d0, (6*16,%a4)
move.l %acc2, %d0 | block[5] = (a2
sub.l %d6,%d0 | - b2)
asr.l %d1, %d0 | >> 12
move.w %d0, (5*16,%a4)
move.l %acc3, %d0 | block[4] = (a3
sub.l %d7,%d0 | - b3)
asr.l %d1, %d0 | >> 12
move.w %d0, (4*16,%a4)
movclr.l %acc3, %d0 | block[3] = (a3
add.l %d7, %d0 | + b3)
asr.l %d1, %d0 | >> 12
move.w %d0, (3*16,%a4)
movclr.l %acc2, %d0 | block[2] = (a2
add.l %d6, %d0 | + b2)
asr.l %d1, %d0 | >> 12
move.w %d0, (2*16,%a4)
movclr.l %acc1, %d0 | block[1] = (a1
add.l %d5, %d0 | + b1)
asr.l %d1, %d0 | >> 12
move.w %d0, (1*16,%a4)
movclr.l %acc0, %d0 | block[0] = (a0
add.l %d4, %d0 | + b0)
asr.l %d1, %d0 | >> 12
move.w %d0, (%a4)+ | advance to next temp column
subq.l #1, %d3 | loop 8 times
bne.w .row_loop
| %a6 now points to the temp buffer, where we need it.
lea.l (-16-128,%a4), %a4 | point %a4 back to the input block
moveq.l #8, %d3 | loop counter
| Pass 2: temp-buffer rows (= original columns) -> input block, >>17 scaling
.col_loop:
movem.l (%a6), %d0-%d2/%a5 | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
mac.w %a0l, %d2u, %acc0 | %acc0 = W1 * f1
mac.w %a1l, %d2l, %acc0 | + W3 * f3
mac.w %a2l, %a5u, %acc0 | + W5 * f5
mac.w %a3l, %a5l, %acc0 | + W7 * f7
mac.w %a1l, %d2u, %acc1 | %acc1 = W3 * f1
msac.w %a3l, %d2l, %acc1 | - W7 * f3
msac.w %a0l, %a5u, %acc1 | - W1 * f5
msac.w %a2l, %a5l, %acc1 | - W5 * f7
mac.w %a2l, %d2u, %acc2 | %acc2 = W5 * f1
msac.w %a0l, %d2l, %acc2 | - W1 * f3
mac.w %a3l, %a5u, %acc2 | + W7 * f5
mac.w %a1l, %a5l, %acc2 | + W3 * f7
mac.w %a3l, %d2u, %acc3 | %acc3 = W7 * f1
msac.w %a2l, %d2l, %acc3 | - W5 * f3
mac.w %a1l, %a5u, %acc3 | + W3 * f5
msac.w %a0l, %a5l, %acc3 | - W1 * f7
lea.l (16,%a6), %a6 | Advance to next row; put here to fill EMAC latency
add.l #(32<<16), %d0 | DC offset: 0.5
movclr.l %acc0, %d4 | b0
movclr.l %acc1, %d5 | b1
movclr.l %acc2, %d6 | b2
movclr.l %acc3, %d7 | b3
mac.w %a0u, %d0u, %acc0 | %acc0 = W0 * f0
mac.w %a2u, %d1u, %acc0 | + W4 * f4
move.l %acc0, %acc3
mac.w %a1u, %d0l, %acc0 | + W2 * f2
mac.w %a3u, %d1l, %acc0 | + W6 * f6
mac.w %a0u, %d0u, %acc1 | %acc1 = W0 * f0
msac.w %a2u, %d1u, %acc1 | - W4 * f4
move.l %acc1, %acc2
mac.w %a3u, %d0l, %acc1 | + W6 * f2
msac.w %a1u, %d1l, %acc1 | - W2 * f6
| ^ move.l %acc1, %acc2 %acc2 = W0 * f0 - W4 * f4
msac.w %a3u, %d0l, %acc2 | - W6 * f2
mac.w %a1u, %d1l, %acc2 | + W2 * f6
| ^ move.l %acc0, %acc3 %acc3 = W0 * f0 + W4 * f4
msac.w %a1u, %d0l, %acc3 | - W2 * f2
msac.w %a3u, %d1l, %acc3 | - W6 * f6
moveq.l #17, %d1 | shift amount
move.l %acc0, %d0 | block[7] = (a0
sub.l %d4,%d0 | - b0)
asr.l %d1, %d0 | >> 17
move.w %d0, (7*16,%a4)
move.l %acc1, %d0 | block[6] = (a1
sub.l %d5,%d0 | - b1)
asr.l %d1, %d0 | >> 17
move.w %d0, (6*16,%a4)
move.l %acc2, %d0 | block[5] = (a2
sub.l %d6,%d0 | - b2)
asr.l %d1, %d0 | >> 17
move.w %d0, (5*16,%a4)
move.l %acc3, %d0 | block[4] = (a3
sub.l %d7,%d0 | - b3)
asr.l %d1, %d0 | >> 17
move.w %d0, (4*16,%a4)
movclr.l %acc3, %d0 | block[3] = (a3
add.l %d7, %d0 | + b3)
asr.l %d1, %d0 | >> 17
move.w %d0, (3*16,%a4)
movclr.l %acc2, %d0 | block[2] = (a2
add.l %d6, %d0 | + b2)
asr.l %d1, %d0 | >> 17
move.w %d0, (2*16,%a4)
movclr.l %acc1, %d0 | block[1] = (a1
add.l %d5, %d0 | + b1)
asr.l %d1, %d0 | >> 17
move.w %d0, (1*16,%a4)
movclr.l %acc0, %d0 | block[0] = (a0
add.l %d4, %d0 | + b0)
asr.l %d1, %d0 | >> 17
move.w %d0, (%a4)+ | advance to next column
subq.l #1, %d3 | loop 8 times
bne.w .col_loop
movem.l (%sp), %d0-%d7/%a0-%a6 | restore all registers
lea.l (15*4,%sp), %sp
rts
.align 2
/* void mpeg2_idct_copy_coldfire(int16_t *block, uint8_t *dest,
 *                               const int stride)
 * Applies the IDCT to 'block', clips each of the 64 results to 0..255 and
 * stores them as bytes into the 8x8 area at 'dest' (rows 'stride' bytes
 * apart).  The block is cleared to zero while being read out. */
mpeg2_idct_copy_coldfire:
lea.l (-4*4,%sp), %sp
movem.l %d2-%d4/%a2, (%sp) | save some registers
movem.l (4*4+4,%sp), %a0-%a2| %a0 - block pointer
| %a1 - destination pointer
| %a2 - stride
bsr.w .idct | apply idct to block
move.l #255, %d1 | preload constant for clipping
moveq.l #8, %d4 | loop counter
.copy_clip_loop:
| Unsigned compare catches both out-of-range cases at once: negative
| values look like huge unsigned numbers.  spl.b then sets the low byte
| to 0xff if the value was positive (clip to 255), 0x00 if negative.
move.w (%a0), %d0 | load block[0]
ext.l %d0 | sign extend
cmp.l %d1, %d0 | overflow?
bls.b 1f
spl.b %d0 | yes: set appropriate limit value in low byte
1:
move.b %d0, %d2 | collect output bytes 0..3 in %d2
lsl.l #8, %d2
move.w (2,%a0), %d0 | load block[1]
ext.l %d0 | sign extend
cmp.l %d1, %d0 | overflow?
bls.b 1f
spl.b %d0 | yes: set appropriate limit value in low byte
1:
move.b %d0, %d2 | collect output bytes 0..3 in %d2
lsl.l #8, %d2
clr.l (%a0)+ | clear block[0] and block[1],
| %a0 now pointing to block[2]
move.w (%a0), %d0 | do b2 and b3
ext.l %d0
cmp.l %d1, %d0
bls.b 1f
spl.b %d0
1:
move.b %d0, %d2
lsl.l #8, %d2
move.w (2,%a0), %d0
ext.l %d0
cmp.l %d1, %d0
bls.b 1f
spl.b %d0
1:
move.b %d0, %d2
clr.l (%a0)+
move.w (%a0), %d0 | do b4 and b5
ext.l %d0
cmp.l %d1, %d0
bls.b 1f
spl.b %d0
1:
move.b %d0, %d3 | collect output bytes 4..7 in %d3
lsl.l #8, %d3
move.w (2,%a0), %d0
ext.l %d0
cmp.l %d1, %d0
bls.b 1f
spl.b %d0
1:
move.b %d0, %d3
lsl.l #8, %d3
clr.l (%a0)+
move.w (%a0), %d0 | do b6 and b7
ext.l %d0
cmp.l %d1, %d0
bls.b 1f
spl.b %d0
1:
move.b %d0, %d3
lsl.l #8, %d3
move.w (2,%a0), %d0
ext.l %d0
cmp.l %d1, %d0
bls.b 1f
spl.b %d0
1:
move.b %d0, %d3
clr.l (%a0)+
movem.l %d2-%d3, (%a1) | write all 8 output bytes at once
lea.l (%a2,%a1), %a1 | advance output pointer
subq.l #1, %d4 | loop 8 times
bne.w .copy_clip_loop
movem.l (%sp), %d2-%d4/%a2 | restore registers
lea.l (4*4,%sp), %sp
rts
.align 2
/* void mpeg2_idct_add_coldfire(const int last, int16_t *block,
 *                              uint8_t *dest, const int stride)
 * Applies the IDCT to 'block', adds the results to the existing 8x8 byte
 * area at 'dest' (rows 'stride' bytes apart), clipping each sum to 0..255,
 * and clears the block.  Shortcut: when last == 129 and
 * ((block[0] >> 4) & 7) != 4, the block is treated as DC-only and a single
 * precomputed DC value is added to all 64 destination bytes instead of
 * running the full IDCT. */
mpeg2_idct_add_coldfire:
lea.l (-7*4,%sp), %sp
movem.l %d2-%d7/%a2, (%sp) | save some registers
movem.l (7*4+4,%sp), %d0/%a0-%a2| %d0 - last value
| %a0 - block pointer
| %a1 - destination pointer
| %a2 - stride
cmp.l #129, %d0 | last == 129 ?
bne.b .idct_add | no: perform idct + addition
move.w (%a0), %d0
ext.l %d0 | ((block[0]
asr.l #4, %d0 | >> 4)
and.l #7, %d0 | & 7)
subq.l #4, %d0 | - 4 == 0 ?
bne.w .dc_add | no: just perform addition
.idct_add:
bsr.w .idct | apply idct
move.l #255, %d2 | preload constant for clipping
clr.l %d3 | used for splitting input words into bytes
moveq.l #8, %d4 | loop counter
.add_clip_loop:
| One destination row per iteration: fetch 8 bytes, add the IDCT results,
| clip each sum to 0..255 (spl.b: 0xff if positive, 0x00 if negative),
| repack and store.  Bytes are processed high-to-low within each pair so
| they can be shifted into the output registers.
movem.l (%a1), %d6-%d7 | fetch (b0 b1 b2 b3) (b4 b5 b6 b7)
swap %d6 | (b2 b3 b0 b1)
swap %d7 | (b6 b7 b4 b5)
move.w (2,%a0), %d0 | load block[1]
ext.l %d0 | sign extend
move.b %d6, %d3 | copy b1
lsr.l #8, %d6 | prepare 1st buffer for next byte
add.l %d3, %d0 | add b1
cmp.l %d2, %d0 | overflow ?
bls.b 1f
spl.b %d0 | yes: set appropriate limit value in low byte
1:
move.w (%a0), %d1 | load block[0]
ext.l %d1 | sign extend
move.b %d6, %d3 | copy b0
lsr.l #8, %d6 | prepare 1st buffer for next byte
add.l %d3, %d1 | add b0
cmp.l %d2, %d1 | overflow ?
bls.b 1f
spl.b %d1 | yes: set appropriate limit value in low byte
1:
move.b %d1, %d5 | collect output bytes 0..3 in %d5
lsl.l #8, %d5
move.b %d0, %d5
lsl.l #8, %d5
clr.l (%a0)+ | clear block[0] and block[1]
| %a0 now pointing to block[2]
move.w (2,%a0), %d0 | do b3 and b2
ext.l %d0
move.b %d6, %d3
lsr.l #8, %d6
add.l %d3, %d0
cmp.l %d2, %d0
bls.b 1f
spl.b %d0
1:
move.w (%a0), %d1
ext.l %d1
add.l %d6, %d1
cmp.l %d2, %d1
bls.b 1f
spl.b %d1
1:
move.b %d1, %d5
lsl.l #8, %d5
move.b %d0, %d5
clr.l (%a0)+
move.w (2,%a0), %d0 | do b5 and b4
ext.l %d0
move.b %d7, %d3
lsr.l #8, %d7
add.l %d3, %d0
cmp.l %d2, %d0
bls.b 1f
spl.b %d0
1:
move.w (%a0), %d1
ext.l %d1
move.b %d7, %d3
lsr.l #8, %d7
add.l %d3, %d1
cmp.l %d2, %d1
bls.b 1f
spl.b %d1
1:
move.b %d1, %d6 | collect output bytes 4..7 in %d6
lsl.l #8, %d6
move.b %d0, %d6
lsl.l #8, %d6
clr.l (%a0)+
move.w (2,%a0), %d0 | do b7 and b6
ext.l %d0
move.b %d7, %d3
lsr.l #8, %d7
add.l %d3, %d0
cmp.l %d2, %d0
bls.b 1f
spl.b %d0
1:
move.w (%a0), %d1
ext.l %d1
add.l %d7, %d1
cmp.l %d2, %d1
bls.b 1f
spl.b %d1
1:
move.b %d1, %d6
lsl.l #8, %d6
move.b %d0, %d6
clr.l (%a0)+
movem.l %d5-%d6, (%a1) | write all 8 output bytes at once
lea.l (%a2,%a1), %a1 | advance output pointer
subq.l #1, %d4 | loop 8 times
bne.w .add_clip_loop
bra.w .idct_add_end
.dc_add:
move.w (%a0), %d0
ext.l %d0 | %d0 = (block[0]
add.l #64, %d0 | + 64)
asr.l #7, %d0 | >> 7
clr.w (%a0) | clear block[0]
clr.w (63*2,%a0) | and block[63]
move.l %d0, %a0 | DC value in %a0
move.l #255, %d2 | preload constant for clipping
clr.l %d3 | for splitting input words into bytes
moveq.l #8, %d4 | loop counter
.dc_clip_loop:
| Same add/clip/repack scheme as .add_clip_loop, but the addend is the
| single DC value kept in %a0 instead of per-coefficient IDCT results.
movem.l (%a1), %d6-%d7 | (b0 b1 b2 b3) (b4 b5 b6 b7)
swap %d6 | (b2 b3 b0 b1)
swap %d7 | (b6 b7 b4 b5)
move.l %a0, %d0 | copy DC
move.b %d6, %d3 | copy b1
lsr.l #8, %d6 | prepare 1st buffer for next byte
add.l %d3, %d0 | add b1
cmp.l %d2, %d0 | overflow ?
bls.b 1f
spl.b %d0 | yes: set appropriate limit value in low byte
1:
move.l %a0, %d1 | copy DC
move.b %d6, %d3 | copy b0
lsr.l #8, %d6 | prepare 1st buffer for next byte
add.l %d3, %d1 | add b0
cmp.l %d2, %d1 | overflow ?
bls.b 1f
spl.b %d1 | yes: set appropriate limit value in low byte
1:
move.b %d1, %d5 | collect output bytes 0..3 in %d5
lsl.l #8, %d5
move.b %d0, %d5
lsl.l #8, %d5
move.l %a0, %d0 | do b3 and b2
move.b %d6, %d3
lsr.l #8, %d6
add.l %d3, %d0
cmp.l %d2, %d0
bls.b 1f
spl.b %d0
1:
move.l %a0, %d1
add.l %d6, %d1
cmp.l %d2, %d1
bls.b 1f
spl.b %d1
1:
move.b %d1, %d5
lsl.l #8, %d5
move.b %d0, %d5
move.l %a0, %d0 | do b5 and b4
move.b %d7, %d3
lsr.l #8, %d7
add.l %d3, %d0
cmp.l %d2, %d0
bls.b 1f
spl.b %d0
1:
move.l %a0, %d1
move.b %d7, %d3
lsr.l #8, %d7
add.l %d3, %d1
cmp.l %d2, %d1
bls.b 1f
spl.b %d1
1:
move.b %d1, %d6 | collect output bytes 4..7 in %d6
lsl.l #8, %d6
move.b %d0, %d6
lsl.l #8, %d6
move.l %a0, %d0 | do b7 and b6
move.b %d7, %d3
lsr.l #8, %d7
add.l %d3, %d0
cmp.l %d2, %d0
bls.b 1f
spl.b %d0
1:
move.l %a0, %d1
add.l %d7, %d1
cmp.l %d2, %d1
bls.b 1f
spl.b %d1
1:
move.b %d1, %d6
lsl.l #8, %d6
move.b %d0, %d6
movem.l %d5-%d6, (%a1) | write all 8 output bytes at once
lea.l (%a2,%a1), %a1 | advance output pointer
subq.l #1, %d4 | loop 8 times
bne.w .dc_clip_loop
.idct_add_end:
movem.l (%sp), %d2-%d7/%a2 | restore registers
lea.l (7*4,%sp), %sp
rts

View file

@ -20,6 +20,8 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "config.h" /* for Rockbox CPU_ #defines */
/* macroblock modes */
#define MACROBLOCK_INTRA 1
@ -92,7 +94,11 @@ struct mpeg2_decoder_s {
int16_t dc_dct_pred[3];
/* DCT coefficients */
#ifdef CPU_COLDFIRE
int16_t *DCTblock; /* put buffer separately to have it in IRAM */
#else
int16_t DCTblock[64] ATTR_ALIGN(64);
#endif
uint8_t * picture_dest[3];
void (* convert) (void * convert_id, uint8_t * const * src,