Speed up of iPod nano 1G and iPod color LCD. Use HDD6330 asm part for YUV blitting, introduce special handling for full width screen updates. Speed up is about +30% for YUV on both color/nano1G.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28930 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Andree Buschmann 2010-12-29 23:17:47 +00:00
parent 1980fc3a61
commit b04d676706
2 changed files with 182 additions and 102 deletions

View file

@ -0,0 +1,152 @@
/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id:$
*
* Copyright (C) 2010 by Andree Buschmann
*
* Generic asm helper function used by YUV blitting.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
#include "cpu.h"
.section .icode, "ax", %progbits

/* Register addresses and bits used by the routine below. The register names
 * (LCD2_BLOCK_CTRL, LCD2_BLOCK_DATA, LCD2_BLOCK_TXOK) are the ones the C
 * driver uses for the same accesses. */
    .equ    YUVW_LCD2_REGS_HI,  0x70000000  @ base = 0x70008a00, built in
    .equ    YUVW_LCD2_REGS_LO,  0x8a00      @ two steps (mov + add)
    .equ    YUVW_CTRL_OFFSET,   0x20        @ LCD2_BLOCK_CTRL = base + 0x20
    .equ    YUVW_DATA_OFFSET,   0x100       @ LCD2_BLOCK_DATA = base + 0x100
    .equ    YUVW_TXOK,          0x1000000   @ LCD2_BLOCK_TXOK bit

/****************************************************************************
 * yuvw_pixel dst
 *
 * Reads one luma byte from [r0] (post-incrementing r0), combines it with the
 * chroma terms held in r10 (rv), r11 (guv) and r12 (bu), clamps the results
 * to 5/6/5 bits and leaves the packed, not yet byte-swapped RGB565 value
 * in \dst.
 *
 * Clobbers r7, r8, r9 and the flags. \dst may be r7 (it is only written
 * after the blue component in r7 has been consumed as a source operand).
 *
 * Clamp idiom: "cmp x, #max" followed by HI-conditional instructions treats
 * x as unsigned, so both negative values and values above max take the fixup.
 * "mvn x, x, asr #31" yields 0 for negative x and all ones for positive x;
 * the following "and" then gives 0 or max respectively.
 */
    .macro  yuvw_pixel dst
    ldrb    r7, [r0], #1            @ luma = *ysrc++
    sub     r7, r7, #16             @ luma - 16
    add     r8, r7, r7, asl #2      @ r8 = 5 * (luma - 16)
    add     r7, r8, r7, asl #5      @ r7 = 37 * (luma - 16)
    add     r9, r10, r7, asr #8     @ R = (r7 >> 8) + rv
    add     r8, r11, r7, asr #7     @ G = (r7 >> 7) + guv
    add     r7, r12, r7, asr #8     @ B = (r7 >> 8) + bu
    cmp     r9, #31                 @ clamp R to 0..31
    mvnhi   r9, r9, asr #31
    andhi   r9, r9, #31
    cmp     r8, #63                 @ clamp G to 0..63
    mvnhi   r8, r8, asr #31
    andhi   r8, r8, #63
    cmp     r7, #31                 @ clamp B to 0..31
    mvnhi   r7, r7, asr #31
    andhi   r7, r7, #31
    orr     \dst, r7, r8, lsl #5    @ dst = B | G << 5
    orr     \dst, \dst, r9, lsl #11 @     | R << 11
    .endm

/****************************************************************************
 * void lcd_yuv_write_inner_loop(unsigned char const * const ysrc,
 *                               unsigned char const * const usrc,
 *                               unsigned char const * const vsrc,
 *                               int width);
 *
 * Converts luma/chroma samples to RGB565 and writes them to LCD2_BLOCK_DATA,
 * two pixels per 32-bit store. Each iteration consumes one byte from usrc,
 * one byte from vsrc and two bytes from ysrc, i.e. both pixels of a pair
 * share the same chroma sample. Before every store the routine polls
 * LCD2_BLOCK_CTRL until the LCD2_BLOCK_TXOK bit is set.
 *
 * The loop body runs before width is tested, so at least one pixel pair is
 * always written; width is decremented by 2 per iteration and the loop
 * continues while the result is greater than zero.
 *
 * YUV -> RGB565 conversion
 *   |R|   |1.000000 -0.000001  1.402000|   |Y'|
 *   |G| = |1.000000 -0.334136 -0.714136|   |Pb|
 *   |B|   |1.000000  1.772000  0.000000|   |Pr|
 * Scaled, normalized, rounded and tweaked to yield RGB 565:
 *   |R|   |74   0 101|   |Y' -  16| >> 9
 *   |G| = |74 -24 -51|   |Cb - 128| >> 8
 *   |B|   |74 128   0|   |Cr - 128| >> 9
 *
 * Implementation notes:
 *  - The luma factor 74 is applied as 37 with the shifts reduced by one
 *    (>> 8 for R/B, >> 7 for G).
 *  - Luma and chroma contributions are shifted separately and then added.
 *  - Cb*128 >> 9 with rounding is computed as (Cb + 2) >> 2.
 *
 * Register usage inside the loop:
 *   r0 = ysrc, r1 = usrc, r2 = vsrc, r3 = remaining width
 *   r4 = register base (LCD2_BLOCK_CTRL - 0x20), r5 = LCD2_BLOCK_DATA
 *   r6 = output word, r7-r9 = scratch
 *   r10 = rv, r11 = guv (reused as scratch by the TXOK poll), r12 = bu
 */
    .align  2
    .global lcd_yuv_write_inner_loop
    .type   lcd_yuv_write_inner_loop, %function
lcd_yuv_write_inner_loop:
    stmfd   sp!, { r4-r11, lr }             @ save regs
    mov     r4, #YUVW_LCD2_REGS_HI          @ r4 = LCD2_BLOCK_CTRL - 0x20
    add     r4, r4, #YUVW_LCD2_REGS_LO
    add     r5, r4, #YUVW_DATA_OFFSET       @ r5 = LCD2_BLOCK_DATA

.Lyuvw_next_pair:
    ldrb    r7, [r1], #1                    @ Cb = *usrc++
    ldrb    r8, [r2], #1                    @ Cr = *vsrc++
    sub     r7, r7, #128                    @ Cb -= 128
    sub     r8, r8, #128                    @ Cr -= 128

    @ r10 = rv = (Cr*101 + 256) >> 9
    add     r10, r8, r8, asl #2             @ 5 * Cr
    add     r10, r10, r8, asl #5            @ 37 * Cr
    add     r10, r10, r8, asl #6            @ 101 * Cr
    add     r10, r10, #256
    mov     r10, r10, asr #9

    @ r11 = guv = (128 - (Cr*51 + Cb*24)) >> 8
    add     r11, r8, r8, asl #1             @ 3 * Cr
    add     r11, r11, r11, asl #4           @ 51 * Cr
    add     r11, r11, r7, asl #3            @ + 8 * Cb
    add     r11, r11, r7, asl #4            @ + 16 * Cb
    rsb     r11, r11, #128
    mov     r11, r11, asr #8

    @ r12 = bu = (Cb*128 + 256) >> 9 = (Cb + 2) >> 2
    add     r12, r7, #2
    mov     r12, r12, asr #2

    @ first pixel: byte-swapped into the low halfword of r6
    yuvw_pixel r6
    and     r7, r6, #0xff                   @ r7 = low byte
    mov     r6, r6, lsr #8                  @ r6 = high byte
    orr     r6, r6, r7, lsl #8              @ r6 = swapped 16-bit pixel

    @ second pixel: byte-swapped into the high halfword of r6
    yuvw_pixel r7
    orr     r6, r6, r7, lsl #24             @ low byte  -> bits 24..31
    mov     r7, r7, lsr #8
    orr     r6, r6, r7, lsl #16             @ high byte -> bits 16..23

#if 1 /* always-on switch around the TXOK poll */
.Lyuvw_wait_txok:
    @ while (!(LCD2_BLOCK_CTRL & LCD2_BLOCK_TXOK));
    @ r11 (guv) is no longer needed here; it is recomputed for the next pair.
    ldr     r11, [r4, #YUVW_CTRL_OFFSET]
    tst     r11, #YUVW_TXOK
    beq     .Lyuvw_wait_txok
#endif
    str     r6, [r5]                        @ send two pixels
    subs    r3, r3, #2                      @ width -= 2
    bgt     .Lyuvw_next_pair                @ loop while width > 0

    ldmpc   regs=r4-r11                     @ restore regs (macro from the
                                            @ included headers; presumably
                                            @ also returns by loading pc)
    .ltorg                                  @ dump constant pool
    .size   lcd_yuv_write_inner_loop, .-lcd_yuv_write_inner_loop

View file

@ -121,38 +121,14 @@ void lcd_init_device(void)
}
/*** update functions ***/
extern void lcd_yuv_write_inner_loop(unsigned char const * const ysrc,
unsigned char const * const usrc,
unsigned char const * const vsrc,
int width);
#define CSUB_X 2
#define CSUB_Y 2
/* YUV -> RGB565 conversion
* |R| |1.000000 -0.000001 1.402000| |Y'|
* |G| = |1.000000 -0.334136 -0.714136| |Pb|
* |B| |1.000000 1.772000 0.000000| |Pr|
* Scaled, normalized, rounded and tweaked to yield RGB 565:
* |R| |74 0 101| |Y' - 16| >> 9
* |G| = |74 -24 -51| |Cb - 128| >> 8
* |B| |74 128 0| |Cr - 128| >> 9
*/
#define RGBYFAC 74 /* 1.0 */
#define RVFAC 101 /* 1.402 */
#define GVFAC (-51) /* -0.714136 */
#define GUFAC (-24) /* -0.334136 */
#define BUFAC 128 /* 1.772 */
/* ROUNDOFFS contain constants for correct round-offs as well as
constant parts of the conversion matrix (e.g. (Y'-16)*RGBYFAC
-> constant part = -16*RGBYFAC). Through extraction of these
constant parts we save at least 4 subtractions in the conversion
loop */
#define ROUNDOFFSR (256 - 16*RGBYFAC - 128*RVFAC)
#define ROUNDOFFSG (128 - 16*RGBYFAC - 128*GVFAC - 128*GUFAC)
#define ROUNDOFFSB (256 - 16*RGBYFAC - 128*BUFAC)
#define MAX_5BIT 0x1f
#define MAX_6BIT 0x3f
/* Performance function to blit a YUV bitmap directly to the LCD */
void lcd_blit_yuv(unsigned char * const src[3],
int src_x, int src_y, int stride,
@ -222,7 +198,8 @@ void lcd_blit_yuv(unsigned char * const src[3],
const int stride_div_csub_x = stride/CSUB_X;
h=0;
while (1) {
while (1)
{
/* upsampling, YUV->RGB conversion and reduction to RGB565 in one go */
const unsigned char *ysrc = src[0] + stride * src_y + src_x;
@ -231,17 +208,11 @@ void lcd_blit_yuv(unsigned char * const src[3],
const unsigned char *usrc = src[1] + uvoffset;
const unsigned char *vsrc = src[2] + uvoffset;
const unsigned char *row_end = ysrc + width;
int yp, up, vp;
int red1, green1, blue1;
int red2, green2, blue2;
int rc, gc, bc;
int pixels_to_write;
fb_data pixel1,pixel2;
if (h==0) {
if (h==0)
{
while (!(LCD2_BLOCK_CTRL & LCD2_BLOCK_READY));
LCD2_BLOCK_CONFIG = 0;
@ -251,7 +222,8 @@ void lcd_blit_yuv(unsigned char * const src[3],
h = height;
/* calculate how much we can do in one go */
if (pixels_to_write > 0x10000) {
if (pixels_to_write > 0x10000)
{
h = (0x10000/2) / width;
pixels_to_write = (width * h) * 2;
}
@ -262,61 +234,7 @@ void lcd_blit_yuv(unsigned char * const src[3],
LCD2_BLOCK_CTRL = 0x34000000;
}
do
{
up = *usrc++;
vp = *vsrc++;
rc = RVFAC * vp + ROUNDOFFSR;
gc = GVFAC * vp + GUFAC * up + ROUNDOFFSG;
bc = BUFAC * up + ROUNDOFFSB;
/* Pixel 1 -> RGB565 */
yp = *ysrc++ * RGBYFAC;
red1 = (yp + rc) >> 9;
green1 = (yp + gc) >> 8;
blue1 = (yp + bc) >> 9;
/* Pixel 2 -> RGB565 */
yp = *ysrc++ * RGBYFAC;
red2 = (yp + rc) >> 9;
green2 = (yp + gc) >> 8;
blue2 = (yp + bc) >> 9;
/* Since out of bounds errors are relatively rare, we check two
pixels at once to see if any components are out of bounds, and
then fix whichever is broken. This works due to high values and
negative values both being !=0 when bitmasking them.
We first check for red and blue components (5bit range). */
if ((red1 | blue1 | red2 | blue2) & ~MAX_5BIT)
{
if (red1 & ~MAX_5BIT)
red1 = (red1 >> 31) ? 0 : MAX_5BIT;
if (blue1 & ~MAX_5BIT)
blue1 = (blue1 >> 31) ? 0 : MAX_5BIT;
if (red2 & ~MAX_5BIT)
red2 = (red2 >> 31) ? 0 : MAX_5BIT;
if (blue2 & ~MAX_5BIT)
blue2 = (blue2 >> 31) ? 0 : MAX_5BIT;
}
/* We second check for green component (6bit range) */
if ((green1 | green2) & ~MAX_6BIT)
{
if (green1 & ~MAX_6BIT)
green1 = (green1 >> 31) ? 0 : MAX_6BIT;
if (green2 & ~MAX_6BIT)
green2 = (green2 >> 31) ? 0 : MAX_6BIT;
}
pixel1 = swap16((red1 << 11) | (green1 << 5) | blue1);
pixel2 = swap16((red2 << 11) | (green2 << 5) | blue2);
while (!(LCD2_BLOCK_CTRL & LCD2_BLOCK_TXOK));
/* output 2 pixels */
LCD2_BLOCK_DATA = (pixel2 << 16) | pixel1;
}
while (ysrc < row_end);
lcd_yuv_write_inner_loop(ysrc,usrc,vsrc,width);
src_y++;
h--;
@ -415,6 +333,15 @@ void lcd_update_rect(int x, int y, int width, int height)
LCD2_BLOCK_CONFIG = 0xc0010000 | (pixels_to_write - 1);
LCD2_BLOCK_CTRL = 0x34000000;
if (LCD_WIDTH == width) {
/* for each row and column in a single loop */
for (r = 0; r < h*width; r += 2) {
while (!(LCD2_BLOCK_CTRL & LCD2_BLOCK_TXOK));
/* output 2 pixels */
LCD2_BLOCK_DATA = *addr++;
}
} else {
/* for each row */
for (r = 0; r < h; r++) {
/* for each column */
@ -426,6 +353,7 @@ void lcd_update_rect(int x, int y, int width, int height)
}
addr += (LCD_WIDTH - width)/2;
}
}
while (!(LCD2_BLOCK_CTRL & LCD2_BLOCK_READY));
LCD2_BLOCK_CONFIG = 0;