1
0
Fork 0
forked from len0rd/rockbox

Mpegplayer: Assembler optimised IDCT for coldfire, based on FS #5995 by Karim Boucher. Put the IDCT block buffer in IRAM for better performance. The whole libmpeg2 decoder struct doesn't fit without throwing some libmad buffers out of IRAM, but then doesn't change performance significantly. Mpegplayer is quite usable now on X5; H300 is sort-of usable for widescreen.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15156 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Jens Arnold 2007-10-16 22:55:40 +00:00
parent 84f5c5c3e3
commit fc43b9df82
5 changed files with 611 additions and 2 deletions

View file

@ -13,6 +13,10 @@ idct.c
motion_comp_c.c
#endif /* CPU_* */
#ifdef CPU_COLDFIRE
idct_coldfire.S
#endif
slice.c
video_out_rockbox.c
mpeg_settings.c

View file

@ -401,6 +401,12 @@ void mpeg2_reset (mpeg2dec_t * mpeg2dec, int full_reset)
}
#ifdef CPU_COLDFIRE
/* twice as large as on other targets because coldfire uses
* a secondary, transposed buffer for optimisation */
static int16_t static_dct_block[128] IBSS_ATTR ATTR_ALIGN(16);
#endif
mpeg2dec_t * mpeg2_init (void)
{
mpeg2dec_t * mpeg2dec;
@ -410,7 +416,11 @@ mpeg2dec_t * mpeg2_init (void)
mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),
MPEG2_ALLOC_MPEG2DEC);
if (mpeg2dec == NULL)
return NULL;
return NULL;
#ifdef CPU_COLDFIRE
mpeg2dec->decoder.DCTblock = static_dct_block;
#endif
rb->memset (mpeg2dec->decoder.DCTblock, 0, 64 * sizeof (int16_t));
rb->memset (mpeg2dec->quantizer_matrix, 0, 4 * 64 * sizeof (uint8_t));

View file

@ -76,6 +76,14 @@ uint8_t mpeg2_clip[3840 * 2 + 256] IBSS_ATTR;
#define CLIP(i) ((mpeg2_clip + 3840)[i])
#endif
#ifdef CPU_COLDFIRE
/* assembler functions */
extern void mpeg2_idct_copy_coldfire(int16_t * block, uint8_t * dest,
const int stride);
extern void mpeg2_idct_add_coldfire(const int last, int16_t * block,
uint8_t * dest, const int stride);
#else /* !CPU_COLDFIRE */
#if 0
#define BUTTERFLY(t0,t1,W0,W1,d0,d1) \
do { \
@ -258,6 +266,8 @@ static void mpeg2_idct_add_c (const int last, int16_t * block,
}
}
#endif /* !CPU_COLDFIRE */
void mpeg2_idct_init (void)
{
extern uint8_t default_mpeg2_scan_norm[64];
@ -266,8 +276,13 @@ void mpeg2_idct_init (void)
extern uint8_t mpeg2_scan_alt[64];
int i, j;
#ifdef CPU_COLDFIRE
mpeg2_idct_copy = mpeg2_idct_copy_coldfire;
mpeg2_idct_add = mpeg2_idct_add_coldfire;
#else
mpeg2_idct_copy = mpeg2_idct_copy_c;
mpeg2_idct_add = mpeg2_idct_add_c;
mpeg2_idct_add = mpeg2_idct_add_c;
#endif
#if !defined(CPU_COLDFIRE) && !defined(CPU_ARM)
for (i = -3840; i < 3840 + 256; i++)

View file

@ -0,0 +1,574 @@
/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id $
*
* Copyright (C) 2007 Jens Arnold
* Based on the work of Karim Boucher and Rani Hod
*
* All files in this archive are subject to the GNU General Public License.
* See the file COPYING in the source tree root for full license agreement.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
.global mpeg2_idct_copy_coldfire
.type mpeg2_idct_copy_coldfire, @function
.global mpeg2_idct_add_coldfire
.type mpeg2_idct_add_coldfire, @function
/* The IDCT itself.
* Input: %a0: block pointer
* All registers are preserved. */
.align 2
/* Two-pass 8x8 IDCT.
 * Pass 1 runs a 1-D IDCT over the 8 rows of the input block, writing the
 * results transposed into a temp buffer at block+128 (the 16-byte store
 * stride turns rows into columns).  Pass 2 runs the same 1-D IDCT over the
 * temp buffer rows (i.e. the original columns) and writes back, transposed
 * again, into the input block.
 * Each data register fetched by movem.l holds two packed 16-bit
 * coefficients, hence the (f0,f2,f4,f6,f1,f3,f5,f7) layout noted below. */
.idct:
lea.l (-15*4,%sp), %sp
movem.l %d0-%d7/%a0-%a6, (%sp) | save all registers
move.l %a0, %a6
move.l #0, %macsr | signed integer mode
move.l #((2048<<16)+2841), %a0 | W0, W1
move.l #((2676<<16)+2408), %a1 | W2, W3
move.l #((2048<<16)+1609), %a2 | W4, W5
move.l #((1108<<16)+ 565), %a3 | W6, W7
lea.l (128,%a6), %a4 | secondary, transposed temp buffer
moveq.l #8, %d3 | loop counter
| Pass 1: rows -> transposed temp buffer, >>12 scaling
.row_loop:
movem.l (%a6), %d0-%d2/%a5 | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
mac.w %a0l, %d2u, %acc0 | %acc0 = W1 * f1
mac.w %a1l, %d2l, %acc0 | + W3 * f3
mac.w %a2l, %a5u, %acc0 | + W5 * f5
mac.w %a3l, %a5l, %acc0 | + W7 * f7
mac.w %a1l, %d2u, %acc1 | %acc1 = W3 * f1
msac.w %a3l, %d2l, %acc1 | - W7 * f3
msac.w %a0l, %a5u, %acc1 | - W1 * f5
msac.w %a2l, %a5l, %acc1 | - W5 * f7
mac.w %a2l, %d2u, %acc2 | %acc2 = W5 * f1
msac.w %a0l, %d2l, %acc2 | - W1 * f3
mac.w %a3l, %a5u, %acc2 | + W7 * f5
mac.w %a1l, %a5l, %acc2 | + W3 * f7
mac.w %a3l, %d2u, %acc3 | %acc3 = W7 * f1
msac.w %a2l, %d2l, %acc3 | - W5 * f3
mac.w %a1l, %a5u, %acc3 | + W3 * f5
msac.w %a0l, %a5l, %acc3 | - W1 * f7
lea.l (16,%a6), %a6 | Advance to next row; put here to fill EMAC latency
add.l #(1<<16), %d0 | f0 += 1; (rounding bias for the >>12 below)
movclr.l %acc0, %d4 | b0
movclr.l %acc1, %d5 | b1
movclr.l %acc2, %d6 | b2
movclr.l %acc3, %d7 | b3
mac.w %a0u, %d0u, %acc0 | %acc0 = W0 * f0
mac.w %a2u, %d1u, %acc0 | + W4 * f4
move.l %acc0, %acc3
mac.w %a1u, %d0l, %acc0 | + W2 * f2
mac.w %a3u, %d1l, %acc0 | + W6 * f6
mac.w %a0u, %d0u, %acc1 | %acc1 = W0 * f0
msac.w %a2u, %d1u, %acc1 | - W4 * f4
move.l %acc1, %acc2
mac.w %a3u, %d0l, %acc1 | + W6 * f2
msac.w %a1u, %d1l, %acc1 | - W2 * f6
| ^ move.l %acc1, %acc2 %acc2 = W0 * f0 - W4 * f4
msac.w %a3u, %d0l, %acc2 | - W6 * f2
mac.w %a1u, %d1l, %acc2 | + W2 * f6
| ^ move.l %acc0, %acc3 %acc3 = W0 * f0 + W4 * f4
msac.w %a1u, %d0l, %acc3 | - W2 * f2
msac.w %a3u, %d1l, %acc3 | - W6 * f6
moveq.l #12, %d1 | shift amount
move.l %acc0, %d0 | block[7] = (a0
sub.l %d4,%d0 | - b0)
asr.l %d1, %d0 | >> 12
move.w %d0, (7*16,%a4)
move.l %acc1, %d0 | block[6] = (a1
sub.l %d5,%d0 | - b1)
asr.l %d1, %d0 | >> 12
move.w %d0, (6*16,%a4)
move.l %acc2, %d0 | block[5] = (a2
sub.l %d6,%d0 | - b2)
asr.l %d1, %d0 | >> 12
move.w %d0, (5*16,%a4)
move.l %acc3, %d0 | block[4] = (a3
sub.l %d7,%d0 | - b3)
asr.l %d1, %d0 | >> 12
move.w %d0, (4*16,%a4)
movclr.l %acc3, %d0 | block[3] = (a3
add.l %d7, %d0 | + b3)
asr.l %d1, %d0 | >> 12
move.w %d0, (3*16,%a4)
movclr.l %acc2, %d0 | block[2] = (a2
add.l %d6, %d0 | + b2)
asr.l %d1, %d0 | >> 12
move.w %d0, (2*16,%a4)
movclr.l %acc1, %d0 | block[1] = (a1
add.l %d5, %d0 | + b1)
asr.l %d1, %d0 | >> 12
move.w %d0, (1*16,%a4)
movclr.l %acc0, %d0 | block[0] = (a0
add.l %d4, %d0 | + b0)
asr.l %d1, %d0 | >> 12
move.w %d0, (%a4)+ | advance to next temp column
subq.l #1, %d3 | loop 8 times
bne.w .row_loop
| %a6 now points to the temp buffer, where we need it.
lea.l (-16-128,%a4), %a4 | point %a4 back to the input block
moveq.l #8, %d3 | loop counter
| Pass 2: temp-buffer rows (= original columns) -> input block, >>17 scaling
.col_loop:
movem.l (%a6), %d0-%d2/%a5 | fetch (f0, f2, f4, f6, f1, f3, f5, f7)
mac.w %a0l, %d2u, %acc0 | %acc0 = W1 * f1
mac.w %a1l, %d2l, %acc0 | + W3 * f3
mac.w %a2l, %a5u, %acc0 | + W5 * f5
mac.w %a3l, %a5l, %acc0 | + W7 * f7
mac.w %a1l, %d2u, %acc1 | %acc1 = W3 * f1
msac.w %a3l, %d2l, %acc1 | - W7 * f3
msac.w %a0l, %a5u, %acc1 | - W1 * f5
msac.w %a2l, %a5l, %acc1 | - W5 * f7
mac.w %a2l, %d2u, %acc2 | %acc2 = W5 * f1
msac.w %a0l, %d2l, %acc2 | - W1 * f3
mac.w %a3l, %a5u, %acc2 | + W7 * f5
mac.w %a1l, %a5l, %acc2 | + W3 * f7
mac.w %a3l, %d2u, %acc3 | %acc3 = W7 * f1
msac.w %a2l, %d2l, %acc3 | - W5 * f3
mac.w %a1l, %a5u, %acc3 | + W3 * f5
msac.w %a0l, %a5l, %acc3 | - W1 * f7
lea.l (16,%a6), %a6 | Advance to next row; put here to fill EMAC latency
add.l #(32<<16), %d0 | DC offset: 0.5
movclr.l %acc0, %d4 | b0
movclr.l %acc1, %d5 | b1
movclr.l %acc2, %d6 | b2
movclr.l %acc3, %d7 | b3
mac.w %a0u, %d0u, %acc0 | %acc0 = W0 * f0
mac.w %a2u, %d1u, %acc0 | + W4 * f4
move.l %acc0, %acc3
mac.w %a1u, %d0l, %acc0 | + W2 * f2
mac.w %a3u, %d1l, %acc0 | + W6 * f6
mac.w %a0u, %d0u, %acc1 | %acc1 = W0 * f0
msac.w %a2u, %d1u, %acc1 | - W4 * f4
move.l %acc1, %acc2
mac.w %a3u, %d0l, %acc1 | + W6 * f2
msac.w %a1u, %d1l, %acc1 | - W2 * f6
| ^ move.l %acc1, %acc2 %acc2 = W0 * f0 - W4 * f4
msac.w %a3u, %d0l, %acc2 | - W6 * f2
mac.w %a1u, %d1l, %acc2 | + W2 * f6
| ^ move.l %acc0, %acc3 %acc3 = W0 * f0 + W4 * f4
msac.w %a1u, %d0l, %acc3 | - W2 * f2
msac.w %a3u, %d1l, %acc3 | - W6 * f6
moveq.l #17, %d1 | shift amount
move.l %acc0, %d0 | block[7] = (a0
sub.l %d4,%d0 | - b0)
asr.l %d1, %d0 | >> 17
move.w %d0, (7*16,%a4)
move.l %acc1, %d0 | block[6] = (a1
sub.l %d5,%d0 | - b1)
asr.l %d1, %d0 | >> 17
move.w %d0, (6*16,%a4)
move.l %acc2, %d0 | block[5] = (a2
sub.l %d6,%d0 | - b2)
asr.l %d1, %d0 | >> 17
move.w %d0, (5*16,%a4)
move.l %acc3, %d0 | block[4] = (a3
sub.l %d7,%d0 | - b3)
asr.l %d1, %d0 | >> 17
move.w %d0, (4*16,%a4)
movclr.l %acc3, %d0 | block[3] = (a3
add.l %d7, %d0 | + b3)
asr.l %d1, %d0 | >> 17
move.w %d0, (3*16,%a4)
movclr.l %acc2, %d0 | block[2] = (a2
add.l %d6, %d0 | + b2)
asr.l %d1, %d0 | >> 17
move.w %d0, (2*16,%a4)
movclr.l %acc1, %d0 | block[1] = (a1
add.l %d5, %d0 | + b1)
asr.l %d1, %d0 | >> 17
move.w %d0, (1*16,%a4)
movclr.l %acc0, %d0 | block[0] = (a0
add.l %d4, %d0 | + b0)
asr.l %d1, %d0 | >> 17
move.w %d0, (%a4)+ | advance to next column
subq.l #1, %d3 | loop 8 times
bne.w .col_loop
movem.l (%sp), %d0-%d7/%a0-%a6 | restore all registers
lea.l (15*4,%sp), %sp
rts
.align 2
/* void mpeg2_idct_copy_coldfire(int16_t *block, uint8_t *dest,
 *                               const int stride)
 * Applies the IDCT to 'block', clips each of the 64 results to 0..255 and
 * stores them as bytes into the 8x8 area at 'dest' (rows 'stride' bytes
 * apart).  The block is cleared to zero while being read out. */
mpeg2_idct_copy_coldfire:
lea.l (-4*4,%sp), %sp
movem.l %d2-%d4/%a2, (%sp) | save some registers
movem.l (4*4+4,%sp), %a0-%a2| %a0 - block pointer
| %a1 - destination pointer
| %a2 - stride
bsr.w .idct | apply idct to block
move.l #255, %d1 | preload constant for clipping
moveq.l #8, %d4 | loop counter
.copy_clip_loop:
| Unsigned compare catches both out-of-range cases at once: negative
| values look like huge unsigned numbers.  spl.b then sets the low byte
| to 0xff if the value was positive (clip to 255), 0x00 if negative.
move.w (%a0), %d0 | load block[0]
ext.l %d0 | sign extend
cmp.l %d1, %d0 | overflow?
bls.b 1f
spl.b %d0 | yes: set appropriate limit value in low byte
1:
move.b %d0, %d2 | collect output bytes 0..3 in %d2
lsl.l #8, %d2
move.w (2,%a0), %d0 | load block[1]
ext.l %d0 | sign extend
cmp.l %d1, %d0 | overflow?
bls.b 1f
spl.b %d0 | yes: set appropriate limit value in low byte
1:
move.b %d0, %d2 | collect output bytes 0..3 in %d2
lsl.l #8, %d2
clr.l (%a0)+ | clear block[0] and block[1],
| %a0 now pointing to block[2]
move.w (%a0), %d0 | do b2 and b3
ext.l %d0
cmp.l %d1, %d0
bls.b 1f
spl.b %d0
1:
move.b %d0, %d2
lsl.l #8, %d2
move.w (2,%a0), %d0
ext.l %d0
cmp.l %d1, %d0
bls.b 1f
spl.b %d0
1:
move.b %d0, %d2
clr.l (%a0)+
move.w (%a0), %d0 | do b4 and b5
ext.l %d0
cmp.l %d1, %d0
bls.b 1f
spl.b %d0
1:
move.b %d0, %d3 | collect output bytes 4..7 in %d3
lsl.l #8, %d3
move.w (2,%a0), %d0
ext.l %d0
cmp.l %d1, %d0
bls.b 1f
spl.b %d0
1:
move.b %d0, %d3
lsl.l #8, %d3
clr.l (%a0)+
move.w (%a0), %d0 | do b6 and b7
ext.l %d0
cmp.l %d1, %d0
bls.b 1f
spl.b %d0
1:
move.b %d0, %d3
lsl.l #8, %d3
move.w (2,%a0), %d0
ext.l %d0
cmp.l %d1, %d0
bls.b 1f
spl.b %d0
1:
move.b %d0, %d3
clr.l (%a0)+
movem.l %d2-%d3, (%a1) | write all 8 output bytes at once
lea.l (%a2,%a1), %a1 | advance output pointer
subq.l #1, %d4 | loop 8 times
bne.w .copy_clip_loop
movem.l (%sp), %d2-%d4/%a2 | restore registers
lea.l (4*4,%sp), %sp
rts
.align 2
/* void mpeg2_idct_add_coldfire(const int last, int16_t *block,
 *                              uint8_t *dest, const int stride)
 * Applies the IDCT to 'block', adds the results to the existing 8x8 byte
 * area at 'dest' (rows 'stride' bytes apart), clipping each sum to 0..255,
 * and clears the block.  Shortcut: when last == 129 and
 * ((block[0] >> 4) & 7) != 4, the block is treated as DC-only and a single
 * precomputed DC value is added to all 64 destination bytes instead of
 * running the full IDCT. */
mpeg2_idct_add_coldfire:
lea.l (-7*4,%sp), %sp
movem.l %d2-%d7/%a2, (%sp) | save some registers
movem.l (7*4+4,%sp), %d0/%a0-%a2| %d0 - last value
| %a0 - block pointer
| %a1 - destination pointer
| %a2 - stride
cmp.l #129, %d0 | last == 129 ?
bne.b .idct_add | no: perform idct + addition
move.w (%a0), %d0
ext.l %d0 | ((block[0]
asr.l #4, %d0 | >> 4)
and.l #7, %d0 | & 7)
subq.l #4, %d0 | - 4 == 0 ?
bne.w .dc_add | no: just perform addition
.idct_add:
bsr.w .idct | apply idct
move.l #255, %d2 | preload constant for clipping
clr.l %d3 | used for splitting input words into bytes
moveq.l #8, %d4 | loop counter
.add_clip_loop:
| One destination row per iteration: fetch 8 bytes, add the IDCT results,
| clip each sum to 0..255 (spl.b: 0xff if positive, 0x00 if negative),
| repack and store.  Bytes are processed high-to-low within each pair so
| they can be shifted into the output registers.
movem.l (%a1), %d6-%d7 | fetch (b0 b1 b2 b3) (b4 b5 b6 b7)
swap %d6 | (b2 b3 b0 b1)
swap %d7 | (b6 b7 b4 b5)
move.w (2,%a0), %d0 | load block[1]
ext.l %d0 | sign extend
move.b %d6, %d3 | copy b1
lsr.l #8, %d6 | prepare 1st buffer for next byte
add.l %d3, %d0 | add b1
cmp.l %d2, %d0 | overflow ?
bls.b 1f
spl.b %d0 | yes: set appropriate limit value in low byte
1:
move.w (%a0), %d1 | load block[0]
ext.l %d1 | sign extend
move.b %d6, %d3 | copy b0
lsr.l #8, %d6 | prepare 1st buffer for next byte
add.l %d3, %d1 | add b0
cmp.l %d2, %d1 | overflow ?
bls.b 1f
spl.b %d1 | yes: set appropriate limit value in low byte
1:
move.b %d1, %d5 | collect output bytes 0..3 in %d5
lsl.l #8, %d5
move.b %d0, %d5
lsl.l #8, %d5
clr.l (%a0)+ | clear block[0] and block[1]
| %a0 now pointing to block[2]
move.w (2,%a0), %d0 | do b3 and b2
ext.l %d0
move.b %d6, %d3
lsr.l #8, %d6
add.l %d3, %d0
cmp.l %d2, %d0
bls.b 1f
spl.b %d0
1:
move.w (%a0), %d1
ext.l %d1
add.l %d6, %d1
cmp.l %d2, %d1
bls.b 1f
spl.b %d1
1:
move.b %d1, %d5
lsl.l #8, %d5
move.b %d0, %d5
clr.l (%a0)+
move.w (2,%a0), %d0 | do b5 and b4
ext.l %d0
move.b %d7, %d3
lsr.l #8, %d7
add.l %d3, %d0
cmp.l %d2, %d0
bls.b 1f
spl.b %d0
1:
move.w (%a0), %d1
ext.l %d1
move.b %d7, %d3
lsr.l #8, %d7
add.l %d3, %d1
cmp.l %d2, %d1
bls.b 1f
spl.b %d1
1:
move.b %d1, %d6 | collect output bytes 4..7 in %d6
lsl.l #8, %d6
move.b %d0, %d6
lsl.l #8, %d6
clr.l (%a0)+
move.w (2,%a0), %d0 | do b7 and b6
ext.l %d0
move.b %d7, %d3
lsr.l #8, %d7
add.l %d3, %d0
cmp.l %d2, %d0
bls.b 1f
spl.b %d0
1:
move.w (%a0), %d1
ext.l %d1
add.l %d7, %d1
cmp.l %d2, %d1
bls.b 1f
spl.b %d1
1:
move.b %d1, %d6
lsl.l #8, %d6
move.b %d0, %d6
clr.l (%a0)+
movem.l %d5-%d6, (%a1) | write all 8 output bytes at once
lea.l (%a2,%a1), %a1 | advance output pointer
subq.l #1, %d4 | loop 8 times
bne.w .add_clip_loop
bra.w .idct_add_end
.dc_add:
move.w (%a0), %d0
ext.l %d0 | %d0 = (block[0]
add.l #64, %d0 | + 64)
asr.l #7, %d0 | >> 7
clr.w (%a0) | clear block[0]
clr.w (63*2,%a0) | and block[63]
move.l %d0, %a0 | DC value in %a0
move.l #255, %d2 | preload constant for clipping
clr.l %d3 | for splitting input words into bytes
moveq.l #8, %d4 | loop counter
.dc_clip_loop:
| Same add/clip/repack scheme as .add_clip_loop, but the addend is the
| single DC value kept in %a0 instead of per-coefficient IDCT results.
movem.l (%a1), %d6-%d7 | (b0 b1 b2 b3) (b4 b5 b6 b7)
swap %d6 | (b2 b3 b0 b1)
swap %d7 | (b6 b7 b4 b5)
move.l %a0, %d0 | copy DC
move.b %d6, %d3 | copy b1
lsr.l #8, %d6 | prepare 1st buffer for next byte
add.l %d3, %d0 | add b1
cmp.l %d2, %d0 | overflow ?
bls.b 1f
spl.b %d0 | yes: set appropriate limit value in low byte
1:
move.l %a0, %d1 | copy DC
move.b %d6, %d3 | copy b0
lsr.l #8, %d6 | prepare 1st buffer for next byte
add.l %d3, %d1 | add b0
cmp.l %d2, %d1 | overflow ?
bls.b 1f
spl.b %d1 | yes: set appropriate limit value in low byte
1:
move.b %d1, %d5 | collect output bytes 0..3 in %d5
lsl.l #8, %d5
move.b %d0, %d5
lsl.l #8, %d5
move.l %a0, %d0 | do b3 and b2
move.b %d6, %d3
lsr.l #8, %d6
add.l %d3, %d0
cmp.l %d2, %d0
bls.b 1f
spl.b %d0
1:
move.l %a0, %d1
add.l %d6, %d1
cmp.l %d2, %d1
bls.b 1f
spl.b %d1
1:
move.b %d1, %d5
lsl.l #8, %d5
move.b %d0, %d5
move.l %a0, %d0 | do b5 and b4
move.b %d7, %d3
lsr.l #8, %d7
add.l %d3, %d0
cmp.l %d2, %d0
bls.b 1f
spl.b %d0
1:
move.l %a0, %d1
move.b %d7, %d3
lsr.l #8, %d7
add.l %d3, %d1
cmp.l %d2, %d1
bls.b 1f
spl.b %d1
1:
move.b %d1, %d6 | collect output bytes 4..7 in %d6
lsl.l #8, %d6
move.b %d0, %d6
lsl.l #8, %d6
move.l %a0, %d0 | do b7 and b6
move.b %d7, %d3
lsr.l #8, %d7
add.l %d3, %d0
cmp.l %d2, %d0
bls.b 1f
spl.b %d0
1:
move.l %a0, %d1
add.l %d7, %d1
cmp.l %d2, %d1
bls.b 1f
spl.b %d1
1:
move.b %d1, %d6
lsl.l #8, %d6
move.b %d0, %d6
movem.l %d5-%d6, (%a1) | write all 8 output bytes at once
lea.l (%a2,%a1), %a1 | advance output pointer
subq.l #1, %d4 | loop 8 times
bne.w .dc_clip_loop
.idct_add_end:
movem.l (%sp), %d2-%d7/%a2 | restore registers
lea.l (7*4,%sp), %sp
rts

View file

@ -20,6 +20,8 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "config.h" /* for Rockbox CPU_ #defines */
/* macroblock modes */
#define MACROBLOCK_INTRA 1
@ -92,7 +94,11 @@ struct mpeg2_decoder_s {
int16_t dc_dct_pred[3];
/* DCT coefficients */
#ifdef CPU_COLDFIRE
int16_t *DCTblock; /* put buffer separately to have it in IRAM */
#else
int16_t DCTblock[64] ATTR_ALIGN(64);
#endif
uint8_t * picture_dest[3];
void (* convert) (void * convert_id, uint8_t * const * src,