H300, X5: Faster lcd_yuv_blit() using EMAC. Speedup of the function itself at 124MHz: 10.5% on X5, 16.5% on H300. mpegplayer speedup 3..4%

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@11429 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Jens Arnold 2006-11-04 00:42:18 +00:00
parent 0d8781e2f9
commit f8b1da2f7b
4 changed files with 273 additions and 479 deletions

View file

@ -304,10 +304,11 @@ void lcd_blit(const fb_data* data, int x, int by, int width,
/* Line write helper function for lcd_yuv_blit. Write two lines of yuv420.
* y should have two lines of Y back to back.
* bu and rv should contain the Cb and Cr data for the two lines of Y.
* Stores bu, guv and rv in respective buffers for use in second line.
* Needs EMAC set to saturated, signed integer mode.
*/
extern void lcd_write_yuv420_lines(const unsigned char *y,
unsigned char *bu, unsigned char *guv, unsigned char *rv, int width);
const unsigned char *bu,
const unsigned char *rv, int width);
/* Performance function to blit a YUV bitmap directly to the LCD
* src_x, src_y, width and height should be even
@ -317,10 +318,9 @@ void lcd_yuv_blit(unsigned char * const src[3],
int src_x, int src_y, int stride,
int x, int y, int width, int height)
{
/* IRAM Y, Cb/bu, guv and Cb/rv buffers. */
/* IRAM Y, Cb and Cr buffers. */
unsigned char y_ibuf[LCD_WIDTH*2];
unsigned char bu_ibuf[LCD_WIDTH/2];
unsigned char guv_ibuf[LCD_WIDTH/2];
unsigned char rv_ibuf[LCD_WIDTH/2];
const unsigned char *ysrc, *usrc, *vsrc;
const unsigned char *ysrc_max;
@ -342,13 +342,14 @@ void lcd_yuv_blit(unsigned char * const src[3],
vsrc = src[2] + (src_y * stride >> 2) + (src_x >> 1);
ysrc_max = ysrc + height * stride;
coldfire_set_macsr(EMAC_SATURATE);
do
{
memcpy(y_ibuf, ysrc, width);
memcpy(y_ibuf + width, ysrc + stride, width);
memcpy(bu_ibuf, usrc, width >> 1);
memcpy(rv_ibuf, vsrc, width >> 1);
lcd_write_yuv420_lines(y_ibuf, bu_ibuf, guv_ibuf, rv_ibuf, width);
lcd_write_yuv420_lines(y_ibuf, bu_ibuf, rv_ibuf, width);
ysrc += 2 * stride;
usrc += stride >> 1;
vsrc += stride >> 1;
@ -381,6 +382,7 @@ void lcd_update(void)
}
}
/* Update a fraction of the display. */
void lcd_update_rect(int, int, int, int) ICODE_ATTR;
void lcd_update_rect(int x, int y, int width, int height)

View file

@ -40,260 +40,158 @@
* |G| = |1.000000 -0.334136 -0.714136| |Pb|
* |B| |1.000000 1.772000 0.000000| |Pr|
* Scaled, normalized, rounded and tweaked to yield RGB 666:
* |R| |74 0 101| |Y' - 16| / 256
* |G| = |74 -24 -51| |Cb - 128| / 256
* |B| |74 128 0| |Cr - 128| / 256
* |R| |19611723 0 26881894| |Y' - 16| >> 26
* |G| = |19611723 -6406711 -13692816| |Cb - 128| >> 26
* |B| |19611723 33976259 0| |Cr - 128| >> 26
*
* Needs EMAC set to saturated, signed integer mode.
*/
.align 2
.global lcd_write_yuv420_lines
.type lcd_write_yuv420_lines,@function
.type lcd_write_yuv420_lines, @function
lcd_write_yuv420_lines:
lea.l (-36,%sp),%sp /* free up some registers */
movem.l %d2-%d6/%a2-%a5,(%sp)
lea.l (-44, %sp), %sp /* free up some registers */
movem.l %d2-%d7/%a2-%a6, (%sp)
lea.l 0xf0008002,%a0 /* LCD data port */
movem.l (36+4,%sp),%a1-%a5 /* Y data, Cb data, guv storage, Cr data, width */
lea.l (%a1,%a5),%a5 /* end address */
lea.l 0xf0008002, %a0 /* LCD data port */
movem.l (44+4, %sp), %a1-%a4 /* Y data, Cb data, Cr data, width */
lea.l (%a1, %a4), %a4 /* end address */
.yuv_line_loop1:
/** Write first pixel **/
clr.l %d1 /* get bu component */
move.b (%a2),%d1
clr.l %d3 /* get rv component */
move.b (%a4),%d3
moveq.l #-128,%d0
add.l %d0,%d1
add.l %d0,%d3
move.l #19611723, %a5 /* y factor */
move.l #33976259, %a6 /* bu factor */
move.l #-6406711, %d5 /* gu factor */
move.l #-13692816, %d6 /* gv factor */
move.l #0x01040820, %d7 /* bitmask for signed->unsigned conversion
* of R, G and B within RGGB6666 at once */
move.l %d1,%d2 /* %d2 = cb component for guv */
asr.l #1,%d1 /* %d1 = 128 * (Cb - 128) / 256 */
move.b %d1,(%a2)+ /* save bu for next line */
moveq.l #-24,%d0 /* multiply first term of guv */
muls.w %d0,%d2
moveq.l #-51,%d0 /* multiply second term of guv */
muls.w %d3,%d0
add.l %d0,%d2
asr.l #8,%d2
move.b %d2,(%a3)+ /* save guv for next line */
moveq.l #101,%d0
muls.w %d0,%d3
asr.l #8,%d3
move.b %d3,(%a4)+ /* save rv for next line */
/* chroma for (very) first & second pixel */
clr.l %d2 /* load u component */
move.b (%a2)+, %d2
clr.l %d3 /* load v component */
move.b (%a3)+, %d3
moveq.l #-128, %d0
add.l %d0, %d2
add.l %d0, %d3
clr.l %d4 /* get y component */
move.b (%a1)+,%d4
moveq.l #74,%d0
muls.w %d0,%d4
asr.l #8,%d4
subq.l #4,%d4
move.l %d4,%d5
move.l %d4,%d6
/* : %d4,%d5,%d6 = Y, %d1 = bu, %d2 = guv, %d3 = rv */
mac.l %a6, %d2, %acc0 /* bu */
mac.l %d5, %d2, %acc1 /* gu */
mac.l %d6, %d3, %acc1 /* gv */
move.l #26881894, %d0 /* rv factor */
mac.l %d0, %d3, %acc2 /* rv */
add.l %d3,%d4 /* get r */
add.l %d2,%d5 /* get g */
add.l %d1,%d6 /* get b */
/* luma for (very) first pixel */
clr.l %d1
move.b (%a1)+, %d1
moveq.l #-126, %d0
add.l %d1, %d0 /* y' (-0.5 ... +0.5) */
mac.l %a5, %d0, %acc0
mac.l %a5, %d0, %acc1
mac.l %a5, %d0, %acc2
move.l %d6,%d0 /* is clamping needed? */
or.l %d5,%d0
or.l %d4,%d0
asr.l #6,%d0
beq.b .yuv_no_clamp1 /* values in range: skip clamping */
moveq.l #63, %d0
cmp.l %d0, %d4
bls.s .yuv_red_ok1
spl.b %d4
and.l %d0, %d4
.yuv_red_ok1:
cmp.l %d0, %d5
bls.s .yuv_green_ok1
spl.b %d5
and.l %d0, %d5
.yuv_green_ok1:
cmp.l %d0, %d6
bls.s .yuv_blue_ok1
spl.b %d6
and.l %d0, %d6
.yuv_blue_ok1:
.yuv_no_clamp1:
/* : %d4 = R, %d5 = G, %d6 = B */
bra.b .yuv_line_entry
move.l %d5,%d0 /* save g for lower 9 bits */
lsl.l #3,%d4 /* R << 3 */
lsr.l #3,%d0 /* G >> 3 */
or.l %d4,%d0
move.w %d0,(%a0) /* |00000000|000000000|0000000r|rrrrrggg| */
lsl.l #6,%d5 /* G << 6 */
or.l %d5,%d6 /* |00000000|000000000|0000gggg|ggbbbbbb| */
move.w %d6,(%a0)
.yuv_line_loop:
/* chroma for first & second pixel */
clr.l %d2 /* load u component */
move.b (%a2)+, %d2
clr.l %d3 /* load v component */
move.b (%a3)+, %d3
moveq.l #-128, %d0
add.l %d0, %d2
add.l %d0, %d3
/** Write second pixel **/
clr.l %d4
move.b (%a1)+,%d4 /* get y component */
moveq.l #74,%d0
muls.w %d0,%d4
asr.l #8,%d4
subq.l #4,%d4
/* : %d4 = Y, %d1 = bu, %d2 = guv, %d3 = rv */
mac.l %a6, %d2, %acc0 /* bu */
mac.l %d5, %d2, %acc1 /* gu */
mac.l %d6, %d3, %acc1 /* gv */
move.l #26881894, %d0 /* rv factor */
mac.l %d0, %d3, %acc2 /* rv */
/* Add Y + each chroma component (can clobber %d1-%d3 values now) */
add.l %d4,%d3 /* get r */
add.l %d4,%d2 /* get g */
add.l %d4,%d1 /* get b */
/* luma for first pixel */
clr.l %d1
move.b (%a1)+, %d1
moveq.l #-126, %d0
add.l %d1, %d0 /* y' (-0.5 ... +0.5) */
mac.l %a5, %d0, %acc0
mac.l %a5, %d0, %acc1
mac.l %a5, %d0, %acc2
move.w %d4, (%a0)
/* 2nd LCD write is delayed one pixel to use it for filling the EMAC latency */
move.l %d1,%d0 /* is clamping needed? */
or.l %d2,%d0
or.l %d3,%d0
asr.l #6,%d0
beq.b .yuv_no_clamp2 /* values in range: skip clamping */
moveq.l #63, %d0
cmp.l %d0, %d3
bls.s .yuv_red_ok2
spl.b %d3
and.l %d0, %d3
.yuv_red_ok2:
cmp.l %d0, %d2
bls.s .yuv_green_ok2
spl.b %d2
and.l %d0, %d2
.yuv_green_ok2:
cmp.l %d0, %d1
bls.s .yuv_blue_ok2
spl.b %d1
and.l %d0, %d1
.yuv_blue_ok2:
.yuv_no_clamp2:
/* : %d3 = R, %d2 = G, %d1 = B */
/* convert to RGB666, pack and output */
.yuv_line_entry:
moveq.l #26, %d0
move.l %acc0, %d4
move.l %acc1, %d3
move.l %acc2, %d2
lsr.l %d0, %d4
lsr.l %d0, %d3
lsr.l %d0, %d2
lsl.l #6, %d2
or.l %d3, %d2 /* |00000000|00000000|0000Rrrr|rrGggggg| */
lsl.l #7, %d2
or.l %d2, %d3 /* |00000000|00000Rrr|rrrGgggg|g0Gggggg| */
lsl.l #6, %d3
or.l %d3, %d4 /* |0000000R|rrrrrGgg|ggg0Gggg|ggBbbbbb| */
eor.l %d7, %d4 /* |0000000r|rrrrrggg|ggg0gggg|ggbbbbbb| */
swap %d4
move.w %d4, (%a0)
swap %d4
move.l %d2,%d0 /* save g for lower 9 bits */
lsl.l #3,%d3 /* R << 3 */
lsr.l #3,%d0 /* G >> 3 */
or.l %d3,%d0 /* |00000000|000000000|0000000r|rrrrrggg| */
move.w %d0,(%a0)
lsl.l #6,%d2 /* G << 6 */
or.l %d2,%d1 /* |00000000|000000000|0000gggg|ggbbbbbb| */
move.w %d1,(%a0)
/* luma for second pixel as delta from the first */
clr.l %d0
move.b (%a1)+, %d0
sub.l %d1, %d0
mac.l %a5, %d0, %acc0
mac.l %a5, %d0, %acc1
mac.l %a5, %d0, %acc2
cmp.l %a1,%a5 /* run %a1 up to end of line */
bhi.w .yuv_line_loop1
move.w %d4, (%a0)
/* 2nd LCD write is delayed one pixel to use it for filling the EMAC latency */
/* convert to RGB666, pack and output */
moveq.l #26, %d0
movclr.l %acc0, %d4
movclr.l %acc1, %d3
movclr.l %acc2, %d2
lsr.l %d0, %d4
lsr.l %d0, %d3
lsr.l %d0, %d2
lsl.l #6, %d2
or.l %d3, %d2 /* |00000000|00000000|0000Rrrr|rrGggggg| */
lsl.l #7, %d2
or.l %d2, %d3 /* |00000000|00000Rrr|rrrGgggg|g0Gggggg| */
lsl.l #6, %d3
or.l %d3, %d4 /* |0000000R|rrrrrGgg|ggg0Gggg|ggBbbbbb| */
eor.l %d7, %d4 /* |0000000r|rrrrrggg|ggg0gggg|ggbbbbbb| */
swap %d4
move.w %d4, (%a0)
swap %d4
cmp.l %a1, %a4 /* run %a1 up to end of line */
bhi.w .yuv_line_loop
tst.l (44+4, %sp) /* use original Y pointer as a flag to */
beq.b .yuv_exit /* distinguish between first and second */
clr.l (44+4, %sp) /* pixel line */
/* Rewind chroma pointers */
movem.l (36+8, %sp), %a2-%a5 /* bu data, guv data, rv data, width */
lea.l (%a1, %a5), %a5 /* next end address */
movem.l (44+8, %sp), %a2-%a4 /* Cb data, Cr data, width */
lea.l (%a1, %a4), %a4 /* end address */
bra.w .yuv_line_loop
.yuv_exit:
move.w %d4, (%a0) /* write (very) last 2nd word */
.yuv_line_loop2:
move.b (%a2)+,%d1 /* read save chromas and sign extend */
extb.l %d1
move.b (%a3)+,%d2
extb.l %d2
move.b (%a4)+,%d3
extb.l %d3
clr.l %d4
move.b (%a1)+,%d4 /* get y component */
moveq.l #74,%d0
muls.w %d0,%d4
asr.l #8,%d4
subq.l #4,%d4
move.l %d4,%d5
move.l %d4,%d6
/* : %d4,%d5,%d6 = Y, %d1 = bu, %d2 = guv, %d3 = rv */
add.l %d3,%d4 /* get r */
add.l %d2,%d5 /* get g */
add.l %d1,%d6 /* get b */
move.l %d6,%d0 /* is clamping needed? */
or.l %d5,%d0
or.l %d4,%d0
asr.l #6,%d0
beq.b .yuv_no_clamp3 /* values in range: skip clamping */
moveq.l #63, %d0
cmp.l %d0, %d4
bls.s .yuv_red_ok3
spl.b %d4
and.l %d0, %d4
.yuv_red_ok3:
cmp.l %d0, %d5
bls.s .yuv_green_ok3
spl.b %d5
and.l %d0, %d5
.yuv_green_ok3:
cmp.l %d0, %d6
bls.s .yuv_blue_ok3
spl.b %d6
and.l %d0, %d6
.yuv_blue_ok3:
.yuv_no_clamp3:
/* : %d4 = R, %d5 = G, %d6 = B */
move.l %d5,%d0 /* save g for lower 9 bits */
lsl.l #3,%d4 /* R << 3 */
lsr.l #3,%d0 /* G >> 3 */
or.l %d4,%d0
move.w %d0,(%a0) /* |00000000|000000000|0000000r|rrrrrggg| */
lsl.l #6,%d5 /* G << 6 */
or.l %d5,%d6 /* |00000000|000000000|0000gggg|ggbbbbbb| */
move.w %d6,(%a0)
/** Write second pixel **/
clr.l %d4
move.b (%a1)+,%d4 /* get y component */
moveq.l #74,%d0
muls.w %d0,%d4
asr.l #8,%d4
subq.l #4,%d4
/* : %d4 = Y, %d1 = bu, %d2 = guv, %d3 = rv */
/* Add Y + each chroma component (can clobber %d1-%d3 values now) */
add.l %d4,%d3 /* get r */
add.l %d4,%d2 /* get g */
add.l %d4,%d1 /* get b */
move.l %d1,%d0 /* is clamping needed? */
or.l %d2,%d0
or.l %d3,%d0
asr.l #6,%d0
beq.b .yuv_no_clamp4 /* values in range: skip clamping */
moveq.l #63, %d0
cmp.l %d0, %d3
bls.s .yuv_red_ok4
spl.b %d3
and.l %d0, %d3
.yuv_red_ok4:
cmp.l %d0, %d2
bls.s .yuv_green_ok4
spl.b %d2
and.l %d0, %d2
.yuv_green_ok4:
cmp.l %d0, %d1
bls.s .yuv_blue_ok4
spl.b %d1
and.l %d0, %d1
.yuv_blue_ok4:
.yuv_no_clamp4:
/* : %d3 = R, %d2 = G, %d1 = B */
move.l %d2,%d0 /* save g for lower 9 bits */
lsl.l #3,%d3 /* R << 3 */
lsr.l #3,%d0 /* G >> 3 */
or.l %d3,%d0 /* |00000000|000000000|0000000r|rrrrrggg| */
move.w %d0,(%a0)
lsl.l #6,%d2 /* G << 6 */
or.l %d2,%d1 /* |00000000|000000000|0000gggg|ggbbbbbb| */
move.w %d1,(%a0)
cmp.l %a1,%a5 /* run %a1 up to end of line */
bhi.w .yuv_line_loop2
movem.l (%sp),%d2-%d6/%a2-%a5
lea.l (36,%sp),%sp /* restore registers */
movem.l (%sp), %d2-%d7/%a2-%a6
lea.l (44, %sp), %sp /* restore registers */
rts
.yuv_end:
.size lcd_write_yuv420_lines,.yuv_end-lcd_write_yuv420_lines
/* end lcd_write_yuv420_lines */
.size lcd_write_yuv420_lines, yuv_end - lcd_write_yuv420_lines
/* begin lcd_write_data */

View file

@ -429,11 +429,11 @@ void lcd_blit(const fb_data* data, int x, int by, int width,
/* Line write helper function for lcd_yuv_blit. Write two lines of yuv420.
* y should have two lines of Y back to back.
* bu and rv should contain the Cb and Cr data for the two lines of Y.
* Stores bu, guv and rv in respective buffers for use in second line.
* Needs EMAC set to saturated, signed integer mode.
*/
extern void lcd_write_yuv420_lines(const unsigned char *y,
unsigned char *bu, unsigned char *guv, unsigned char *rv,
int width);
const unsigned char *bu,
const unsigned char *rv, int width);
/* Performance function to blit a YUV bitmap directly to the LCD
* src_x, src_y, width and height should be even and within the LCD's
@ -446,7 +446,6 @@ void lcd_yuv_blit(unsigned char * const src[3],
/* IRAM Y, Cb/bu, guv and Cr/rv buffers. */
unsigned char y_ibuf[LCD_WIDTH*2];
unsigned char bu_ibuf[LCD_WIDTH/2];
unsigned char guv_ibuf[LCD_WIDTH/2];
unsigned char rv_ibuf[LCD_WIDTH/2];
const unsigned char *ysrc, *usrc, *vsrc;
const unsigned char *ysrc_max;
@ -457,28 +456,29 @@ void lcd_yuv_blit(unsigned char * const src[3],
if (r_entry_mode == R_ENTRY_MODE_SOLID)
hw_dither(true);
width = (width + 1) & ~1;
height = (height + 1) & ~1;
width &= ~1; /* stay on the safe side */
height &= ~1;
/* Set start position and window */
/* Set start position and window */
lcd_write_reg(R_RAM_ADDR_SET, (x << 8) | (y + y_offset));
lcd_write_reg(R_VERT_RAM_ADDR_POS, ((x + width - 1) << 8) | x);
lcd_begin_write_gram();
ysrc = src[0] + src_y*stride + src_x;
usrc = src[1] + (src_y*stride >> 2) + (src_x >> 1);
vsrc = src[2] + (usrc - src[1]);
ysrc_max = ysrc + height*stride;
ysrc = src[0] + src_y * stride + src_x;
usrc = src[1] + (src_y * stride >> 2) + (src_x >> 1);
vsrc = src[2] + (src_y * stride >> 2) + (src_x >> 1);
ysrc_max = ysrc + height * stride;
coldfire_set_macsr(EMAC_SATURATE);
do
{
memcpy(y_ibuf, ysrc, width);
memcpy(&y_ibuf[width], &ysrc[stride], width);
memcpy(y_ibuf + width, ysrc + stride, width);
memcpy(bu_ibuf, usrc, width >> 1);
memcpy(rv_ibuf, vsrc, width >> 1);
lcd_write_yuv420_lines(y_ibuf, bu_ibuf, guv_ibuf, rv_ibuf, width);
ysrc += stride << 1;
lcd_write_yuv420_lines(y_ibuf, bu_ibuf, rv_ibuf, width);
ysrc += 2 * stride;
usrc += stride >> 1;
vsrc += stride >> 1;
}

View file

@ -22,7 +22,7 @@
.section .icode, "ax", @progbits
/* lcd_write_yuv420_lines(), based on lcd-as-x5.S
/* lcd_write_yuv420_lines()
*
* See http://en.wikipedia.org/wiki/YCbCr
* ITU-R BT.601 (formerly CCIR 601):
@ -38,252 +38,146 @@
* |R| |1.000000 0.000000 1.402000| |Y'|
* |G| = |1.000000 -0.334136 -0.714136| |Pb|
* |B| |1.000000 1.772000 0.000000| |Pr|
* Scaled, normalized, rounded and tweaked to yield RGB666, as converting
* directly to RGB565 gives too much roundoff error:
* |R| |74 0 101| |Y' - 16| / 256
* |G| = |74 -24 -51| |Cb - 128| / 256
* |B| |74 128 0| |Cr - 128| / 256
* Scaled, normalized, rounded and tweaked to yield RGB565:
* |R| |19611723 0 26881894| |Y' - 16| >> 27
* |G| = |19611723 -6406711 -13692816| |Cb - 128| >> 26
* |B| |19611723 33976259 0| |Cr - 128| >> 27
*
* Needs EMAC set to saturated, signed integer mode.
*/
.align 2
.global lcd_write_yuv420_lines
.type lcd_write_yuv420_lines, @function
lcd_write_yuv420_lines:
lea.l (-36, %sp), %sp /* free up some registers */
movem.l %d2-%d6/%a2-%a5, (%sp)
lea.l (-44, %sp), %sp /* free up some registers */
movem.l %d2-%d7/%a2-%a6, (%sp)
lea.l 0xf0000002, %a0 /* LCD data port */
movem.l (36+4, %sp), %a1-%a5 /* Y data, Cb data, guv storage, Cr data, width */
lea.l (%a1, %a5), %a5 /* end address */
movem.l (44+4, %sp), %a1-%a4 /* Y data, Cb data, Cr data, width */
lea.l (%a1, %a4), %a4 /* end address */
.yuv_line_loop1:
/* chroma for first & second pixel */
clr.l %d1 /* load bu component */
move.b (%a2), %d1
clr.l %d3 /* load rv component */
move.b (%a4), %d3
move.l #19611723, %a5 /* y factor */
move.l #33976259, %a6 /* bu factor */
move.l #-6406711, %d5 /* gu factor */
move.l #-13692816, %d6 /* gv factor */
move.l #0x8410, %d7 /* bitmask for signed->unsigned conversion
* of R, G and B within RGB565 at once */
/* chroma for (very) first & second pixel */
clr.l %d2 /* load u component */
move.b (%a2)+, %d2
clr.l %d3 /* load v component */
move.b (%a3)+, %d3
moveq.l #-128, %d0
add.l %d0, %d1
add.l %d0, %d2
add.l %d0, %d3
move.l %d1, %d2 /* %d2 = cb component for guv */
asr.l #1, %d1 /* %d1 = 128 * (Cb - 128) / 256 */
move.b %d1, (%a2)+ /* save bu for next line */
moveq.l #-24, %d0
muls.w %d0, %d2 /* %d2 = -24 * (Cb - 128)*/
moveq.l #-51, %d0
muls.w %d3, %d0
add.l %d0, %d2 /* %d2 = -24 * (Cb - 128) - 51 * (Cr - 128) */
asr.l #8, %d2
move.b %d2, (%a3)+ /* save guv for next line */
moveq.l #101, %d0
muls.w %d0, %d3 /* %d3 = 101 * (Cr - 128) */
asr.l #8, %d3
move.b %d3, (%a4)+ /* save rv for next line */
mac.l %a6, %d2, %acc0 /* bu */
mac.l %d5, %d2, %acc1 /* gu */
mac.l %d6, %d3, %acc1 /* gv */
move.l #26881894, %d0 /* rv factor */
mac.l %d0, %d3, %acc2 /* rv */
/* luma for (very) first pixel */
clr.l %d1
move.b (%a1)+, %d1
moveq.l #-126, %d0
add.l %d1, %d0 /* y' (-0.5 ... +0.5) */
mac.l %a5, %d0, %acc0
mac.l %a5, %d0, %acc1
mac.l %a5, %d0, %acc2
bra.b .yuv_line_entry
.yuv_line_loop:
/* chroma for first & second pixel */
clr.l %d2 /* load u component */
move.b (%a2)+, %d2
clr.l %d3 /* load v component */
move.b (%a3)+, %d3
moveq.l #-128, %d0
add.l %d0, %d2
add.l %d0, %d3
mac.l %a6, %d2, %acc0 /* bu */
mac.l %d5, %d2, %acc1 /* gu */
mac.l %d6, %d3, %acc1 /* gv */
move.l #26881894, %d0 /* rv factor */
mac.l %d0, %d3, %acc2 /* rv */
/* luma for first pixel */
clr.l %d4 /* load y component */
move.b (%a1)+, %d4
moveq.l #74, %d0
muls.w %d0, %d4 /* %d4 = 36 * Y */
asr.l #8, %d4
subq.l #4, %d4 /* correction for (Y - 16) and rounding */
move.l %d4, %d5
move.l %d4, %d6
clr.l %d1
move.b (%a1)+, %d1
moveq.l #-126, %d0
add.l %d1, %d0 /* y' (-0.5 ... +0.5) */
mac.l %a5, %d0, %acc0
mac.l %a5, %d0, %acc1
mac.l %a5, %d0, %acc2
/* combine & write first pixel */
add.l %d1, %d4 /* %d4 = blue */
add.l %d2, %d5 /* %d5 = green */
add.l %d3, %d6 /* %d6 = red */
move.w %d4, (%a0)
/* LCD write is delayed one pixel to use it for filling the EMAC latency */
/* convert to RGB565, pack and output */
.yuv_line_entry:
moveq.l #27, %d0
move.l %acc0, %d2
move.l %acc1, %d3
move.l %acc2, %d4
lsr.l %d0, %d2
lsr.l %d0, %d4
moveq.l #26, %d0
lsr.l %d0, %d3
lsl.l #6, %d4
or.l %d3, %d4
lsl.l #5, %d4
or.l %d2, %d4
eor.l %d7, %d4
/* luma for second pixel as delta from the first */
clr.l %d0
move.b (%a1)+, %d0
sub.l %d1, %d0
mac.l %a5, %d0, %acc0
mac.l %a5, %d0, %acc1
mac.l %a5, %d0, %acc2
move.w %d4, (%a0)
/* LCD write is delayed one pixel to use it for filling the EMAC latency */
/* convert to RGB565, pack and output */
moveq.l #27, %d0
movclr.l %acc0, %d2
movclr.l %acc1, %d3
movclr.l %acc2, %d4
lsr.l %d0, %d2
lsr.l %d0, %d4
moveq.l #26, %d0
lsr.l %d0, %d3
lsl.l #6, %d4
or.l %d3, %d4
lsl.l #5, %d4
or.l %d2, %d4
eor.l %d7, %d4
cmp.l %a1, %a4 /* run %a1 up to end of line */
bhi.w .yuv_line_loop
move.l %d4, %d0 /* clamping */
or.l %d5, %d0
or.l %d6, %d0
asr.l #6, %d0
beq.s .yuv_all_ok1
moveq.l #63, %d0
cmp.l %d0, %d4
bls.s .yuv_blue_ok1
spl.b %d4
and.l %d0, %d4
.yuv_blue_ok1:
cmp.l %d0, %d5
bls.s .yuv_green_ok1
spl.b %d5
and.l %d0, %d5
.yuv_green_ok1:
cmp.l %d0, %d6
bls.s .yuv_red_ok1
spl.b %d6
and.l %d0, %d6
.yuv_red_ok1:
.yuv_all_ok1:
lsr.l #1, %d6 /* pack, convert to RGB565 and output */
lsr.l #1, %d4
lsl.l #6, %d6
or.l %d6, %d5
lsl.l #5, %d5
or.l %d5, %d4
move.w %d4, (%a0)
/* luma for second pixel */
clr.l %d4 /* load y component */
move.b (%a1)+, %d4
moveq.l #74, %d0
muls.w %d0, %d4 /* %d4 = 36 * Y */
asr.l #8, %d4
subq.l #4, %d4 /* correction for (Y - 16) and rounding */
/* combine & write second pixel */
add.l %d4, %d1 /* %d1 = blue */
add.l %d4, %d2 /* %d2 = green */
add.l %d4, %d3 /* %d3 = red */
move.l %d1, %d0 /* clamping */
or.l %d2, %d0
or.l %d3, %d0
asr.l #6, %d0
beq.s .yuv_all_ok2
moveq.l #63, %d0
cmp.l %d0, %d1
bls.s .yuv_blue_ok2
spl.b %d1
and.l %d0, %d1
.yuv_blue_ok2:
cmp.l %d0, %d2
bls.s .yuv_green_ok2
spl.b %d2
and.l %d0, %d2
.yuv_green_ok2:
cmp.l %d0, %d3
bls.s .yuv_red_ok2
spl.b %d3
and.l %d0, %d3
.yuv_red_ok2:
.yuv_all_ok2:
lsr.l #1, %d3 /* pack, convert to RGB565 and output */
lsr.l #1, %d1
lsl.l #6, %d3
or.l %d3, %d2
lsl.l #5, %d2
or.l %d2, %d1
move.w %d1, (%a0)
cmp.l %a1,%a5 /* run %a1 up to end of line */
bhi.w .yuv_line_loop1
tst.l (44+4, %sp) /* use original Y pointer as a flag to */
beq.b .yuv_exit /* distinguish between first and second */
clr.l (44+4, %sp) /* pixel line */
/* Rewind chroma pointers */
movem.l (36+8, %sp), %a2-%a5 /* bu data, guv data, rv data, width */
lea.l (%a1, %a5), %a5 /* next end address */
.yuv_line_loop2:
/* read saved chromas and sign extend */
move.b (%a2)+, %d1
extb.l %d1
move.b (%a3)+, %d2
extb.l %d2
move.b (%a4)+, %d3
extb.l %d3
/* luma for first pixel */
clr.l %d4 /* load y component */
move.b (%a1)+, %d4
moveq.l #74, %d0
muls.w %d0, %d4 /* %d4 = 36 * Y */
asr.l #8, %d4
subq.l #4, %d4 /* correction for (Y - 16) and rounding */
move.l %d4, %d5
move.l %d4, %d6
movem.l (44+8, %sp), %a2-%a4 /* Cb data, Cr data, width */
lea.l (%a1, %a4), %a4 /* end address */
bra.w .yuv_line_loop
/* combine & write first pixel */
add.l %d1, %d4 /* %d4 = blue */
add.l %d2, %d5 /* %d5 = green */
add.l %d3, %d6 /* %d6 = red */
move.l %d4, %d0 /* clamping */
or.l %d5, %d0
or.l %d6, %d0
asr.l #6, %d0
beq.s .yuv_all_ok3
moveq.l #63, %d0
cmp.l %d0, %d4
bls.s .yuv_blue_ok3
spl.b %d4
and.l %d0, %d4
.yuv_blue_ok3:
cmp.l %d0, %d5
bls.s .yuv_green_ok3
spl.b %d5
and.l %d0, %d5
.yuv_green_ok3:
cmp.l %d0, %d6
bls.s .yuv_red_ok3
spl.b %d6
and.l %d0, %d6
.yuv_red_ok3:
.yuv_all_ok3:
.yuv_exit:
move.w %d4, (%a0) /* write (very) last pixel */
lsr.l #1, %d6 /* pack, convert to RGB565 and output */
lsr.l #1, %d4
lsl.l #6, %d6
or.l %d6, %d5
lsl.l #5, %d5
or.l %d5, %d4
move.w %d4, (%a0)
/* luma for second pixel */
clr.l %d4 /* load y component */
move.b (%a1)+, %d4
moveq.l #74, %d0
muls.w %d0, %d4 /* %d4 = 36 * Y */
asr.l #8, %d4
subq.l #4, %d4 /* correction for (Y - 16) and rounding */
/* combine & write second pixel */
add.l %d4, %d1 /* %d1 = blue */
add.l %d4, %d2 /* %d2 = green */
add.l %d4, %d3 /* %d3 = red */
move.l %d1, %d0 /* clamping */
or.l %d2, %d0
or.l %d3, %d0
asr.l #6, %d0
beq.s .yuv_all_ok4
moveq.l #63, %d0
cmp.l %d0, %d1
bls.s .yuv_blue_ok4
spl.b %d1
and.l %d0, %d1
.yuv_blue_ok4:
cmp.l %d0, %d2
bls.s .yuv_green_ok4
spl.b %d2
and.l %d0, %d2
.yuv_green_ok4:
cmp.l %d0, %d3
bls.s .yuv_red_ok4
spl.b %d3
and.l %d0, %d3
.yuv_red_ok4:
.yuv_all_ok4:
lsr.l #1, %d3 /* pack, convert to RGB565 and output */
lsr.l #1, %d1
lsl.l #6, %d3
or.l %d3, %d2
lsl.l #5, %d2
or.l %d2, %d1
move.w %d1, (%a0)
cmp.l %a1, %a5 /* run %a1 up to end of line */
bhi.w .yuv_line_loop2
movem.l (%sp), %d2-%d6/%a2-%a5
lea.l (36, %sp), %sp /* restore registers */
movem.l (%sp), %d2-%d7/%a2-%a6
lea.l (44, %sp), %sp /* restore registers */
rts
.lcd_write_yuv420_lines_end:
.size lcd_write_yuv420_lines, .lcd_write_yuv420_lines_end - lcd_write_yuv420_lines
.yuv_end:
.size lcd_write_yuv420_lines, .yuv_end - lcd_write_yuv420_lines