1
0
Fork 0
forked from len0rd/rockbox

Use pre-multiplication in scaler to save one multiply per color component on ARM and Coldfire, at the cost of an extra add/shift in the horizontal scaler to reduce values to a workable range. SH-1 retains the same basic math, as

the use of 16x16->32 hardware multiplication in the earlier scaler stages saves more than removing the 32x32->40 multiply to descale output.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21091 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Andrew Mahone 2009-05-26 20:00:47 +00:00
parent c4ed88f593
commit 92785b8f2f
5 changed files with 356 additions and 212 deletions

View file

@ -49,8 +49,8 @@ static void output_row_null(uint32_t row, void * row_in,
#else #else
uint32_t *lim = in + ctx->bm->width; uint32_t *lim = in + ctx->bm->width;
#endif #endif
for (; in < lim; in++) while (in < lim)
output = SC_MUL(*in + ctx->round, ctx->divisor); output = SC_OUT(*in++, ctx);
return; return;
} }

View file

@ -733,7 +733,7 @@ static void output_row_grey_32(uint32_t row, void * row_in,
uint32_t *qp = (uint32_t*)row_in; uint32_t *qp = (uint32_t*)row_in;
uint8_t *dest = (uint8_t*)ctx->bm->data + ctx->bm->width * row; uint8_t *dest = (uint8_t*)ctx->bm->data + ctx->bm->width * row;
for (col = 0; col < ctx->bm->width; col++) for (col = 0; col < ctx->bm->width; col++)
*dest++ = SC_MUL((*qp++) + ctx->round,ctx->divisor); *dest++ = SC_OUT(*qp++, ctx);
} }
static unsigned int get_size_grey(struct bitmap *bm) static unsigned int get_size_grey(struct bitmap *bm)

View file

@ -592,25 +592,12 @@ static inline PFreal fcos(int iangle)
return fsin(iangle + (IANGLE_MAX >> 2)); return fsin(iangle + (IANGLE_MAX >> 2));
} }
static inline uint32_t div255(uint32_t val) static inline unsigned scale_val(unsigned val, unsigned bits)
{ {
return ((((val >> 8) + val) >> 8) + val) >> 8; val = val * ((1 << bits) - 1);
return ((val >> 8) + val + 128) >> 8;
} }
#define SCALE_VAL(val,out) div255((val) * (out) + 127)
#define SCALE_VAL32(val, out) \
({ \
uint32_t val__ = (val) * (out); \
val__ = ((((val__ >> 8) + val__) >> 8) + val__ + 128) >> 8; \
val__; \
})
#define SCALE_VAL8(val, out) \
({ \
unsigned val__ = (val) * (out); \
val__ = ((val__ >> 8) + val__ + 128) >> 8; \
val__; \
})
static void output_row_8_transposed(uint32_t row, void * row_in, static void output_row_8_transposed(uint32_t row, void * row_in,
struct scaler_context *ctx) struct scaler_context *ctx)
{ {
@ -625,9 +612,9 @@ static void output_row_8_transposed(uint32_t row, void * row_in,
unsigned r, g, b; unsigned r, g, b;
for (; dest < end; dest += ctx->bm->height) for (; dest < end; dest += ctx->bm->height)
{ {
r = SCALE_VAL8(qp->red, 31); r = scale_val(qp->red, 5);
g = SCALE_VAL8(qp->green, 63); g = scale_val(qp->green, 6);
b = SCALE_VAL8((qp++)->blue, 31); b = scale_val((qp++)->blue, 5);
*dest = LCD_RGBPACK_LCD(r,g,b); *dest = LCD_RGBPACK_LCD(r,g,b);
} }
#endif #endif
@ -641,19 +628,15 @@ static void output_row_32_transposed(uint32_t row, void * row_in,
#ifdef USEGSLIB #ifdef USEGSLIB
uint32_t *qp = (uint32_t*)row_in; uint32_t *qp = (uint32_t*)row_in;
for (; dest < end; dest += ctx->bm->height) for (; dest < end; dest += ctx->bm->height)
*dest = SC_MUL((*qp++) + ctx->round, ctx->divisor); *dest = SC_OUT(*qp++, ctx);
#else #else
struct uint32_rgb *qp = (struct uint32_rgb*)row_in; struct uint32_rgb *qp = (struct uint32_rgb*)row_in;
uint32_t rb_mul = SCALE_VAL32(ctx->divisor, 31),
rb_rnd = SCALE_VAL32(ctx->round, 31),
g_mul = SCALE_VAL32(ctx->divisor, 63),
g_rnd = SCALE_VAL32(ctx->round, 63);
int r, g, b; int r, g, b;
for (; dest < end; dest += ctx->bm->height) for (; dest < end; dest += ctx->bm->height)
{ {
r = SC_MUL(qp->r + rb_rnd, rb_mul); r = scale_val(SC_OUT(qp->r, ctx), 5);
g = SC_MUL(qp->g + g_rnd, g_mul); g = scale_val(SC_OUT(qp->g, ctx), 6);
b = SC_MUL(qp->b + rb_rnd, rb_mul); b = scale_val(SC_OUT(qp->b, ctx), 5);
qp++; qp++;
*dest = LCD_RGBPACK_LCD(r,g,b); *dest = LCD_RGBPACK_LCD(r,g,b);
} }
@ -670,14 +653,14 @@ static void output_row_32_transposed_fromyuv(uint32_t row, void * row_in,
for (; dest < end; dest += ctx->bm->height) for (; dest < end; dest += ctx->bm->height)
{ {
unsigned r, g, b, y, u, v; unsigned r, g, b, y, u, v;
y = SC_MUL(qp->b + ctx->round, ctx->divisor); y = SC_OUT(qp->b, ctx);
u = SC_MUL(qp->g + ctx->round, ctx->divisor); u = SC_OUT(qp->g, ctx);
v = SC_MUL(qp->r + ctx->round, ctx->divisor); v = SC_OUT(qp->r, ctx);
qp++; qp++;
yuv_to_rgb(y, u, v, &r, &g, &b); yuv_to_rgb(y, u, v, &r, &g, &b);
r = (31 * r + (r >> 3) + 127) >> 8; r = scale_val(r, 5);
g = (63 * g + (g >> 2) + 127) >> 8; g = scale_val(g, 6);
b = (31 * b + (b >> 3) + 127) >> 8; b = scale_val(b, 5);
*dest = LCD_RGBPACK_LCD(r, g, b); *dest = LCD_RGBPACK_LCD(r, g, b);
} }
} }

View file

@ -131,20 +131,45 @@ int recalc_dimension(struct dim *dst, struct dim *src)
return false; \ return false; \
} }
/* Set up rounding and scale factors for horizontal area scaler */ #if defined(CPU_COLDFIRE)
static inline void scale_h_area_setup(struct scaler_context *ctx) #define MAC(op1, op2, num) \
asm volatile( \
"mac.l %0, %1, %%acc" #num \
: \
: "%d" (op1), "d" (op2)\
)
#define MAC_OUT(dest, num) \
asm volatile( \
"movclr.l %%acc" #num ", %0" \
: "=d" (dest) \
)
#elif defined(CPU_SH)
/* calculate the 32-bit product of signed 16-bit op1 and op2 */
static inline int32_t mul_s16_s16(int16_t op1, int16_t op2)
{ {
/* sum is output value * src->width */ return (int32_t)(op1 * op2);
SDEBUGF("scale_h_area_setup\n");
ctx->divisor = ctx->src->width;
} }
/* calculate the 32-bit product of unsigned 16-bit op1 and op2 */
static inline uint32_t mul_u16_u16(uint16_t op1, uint16_t op2)
{
return (uint32_t)(op1 * op2);
}
#endif
/* horizontal area average scaler */ /* horizontal area average scaler */
static bool scale_h_area(void *out_line_ptr, static bool scale_h_area(void *out_line_ptr,
struct scaler_context *ctx, bool accum) struct scaler_context *ctx, bool accum)
{ {
SDEBUGF("scale_h_area\n"); SDEBUGF("scale_h_area\n");
unsigned int ix, ox, oxe, mul; unsigned int ix, ox, oxe, mul;
#if defined(CPU_SH) || defined (TEST_SH_MATH)
const uint32_t h_i_val = ctx->src->width,
h_o_val = ctx->bm->width;
#else
const uint32_t h_i_val = ctx->h_i_val,
h_o_val = ctx->h_o_val;
#endif
#ifdef HAVE_LCD_COLOR #ifdef HAVE_LCD_COLOR
struct uint32_rgb rgbvalacc = { 0, 0, 0 }, struct uint32_rgb rgbvalacc = { 0, 0, 0 },
rgbvaltmp = { 0, 0, 0 }, rgbvaltmp = { 0, 0, 0 },
@ -161,31 +186,57 @@ static bool scale_h_area(void *out_line_ptr,
yield(); yield();
for (ix = 0; ix < (unsigned int)ctx->src->width; ix++) for (ix = 0; ix < (unsigned int)ctx->src->width; ix++)
{ {
oxe += ctx->bm->width; oxe += h_o_val;
/* end of current area has been reached */ /* end of current area has been reached */
/* fill buffer if needed */ /* fill buffer if needed */
FILL_BUF(part,ctx->store_part,ctx->args); FILL_BUF(part,ctx->store_part,ctx->args);
#ifdef HAVE_LCD_COLOR #ifdef HAVE_LCD_COLOR
if (oxe >= (unsigned int)ctx->src->width) if (oxe >= h_i_val)
{ {
/* "reset" error, which now represents partial coverage of next /* "reset" error, which now represents partial coverage of next
pixel by the next area pixel by the next area
*/ */
oxe -= ctx->src->width; oxe -= h_i_val;
#if defined(CPU_COLDFIRE)
/* Coldfire EMAC math */
/* add saved partial pixel from start of area */ /* add saved partial pixel from start of area */
rgbvalacc.r = rgbvalacc.r * ctx->bm->width + rgbvaltmp.r * mul; MAC(rgbvalacc.r, h_o_val, 0);
rgbvalacc.g = rgbvalacc.g * ctx->bm->width + rgbvaltmp.g * mul; MAC(rgbvalacc.g, h_o_val, 1);
rgbvalacc.b = rgbvalacc.b * ctx->bm->width + rgbvaltmp.b * mul; MAC(rgbvalacc.b, h_o_val, 2);
MAC(rgbvaltmp.r, mul, 0);
MAC(rgbvaltmp.g, mul, 1);
MAC(rgbvaltmp.b, mul, 2);
/* get new pixel , then add its partial coverage to this area */
mul = h_o_val - oxe;
rgbvaltmp.r = part->buf->red;
rgbvaltmp.g = part->buf->green;
rgbvaltmp.b = part->buf->blue;
MAC(rgbvaltmp.r, mul, 0);
MAC(rgbvaltmp.g, mul, 1);
MAC(rgbvaltmp.b, mul, 2);
MAC_OUT(rgbvalacc.r, 0);
MAC_OUT(rgbvalacc.g, 1);
MAC_OUT(rgbvalacc.b, 2);
#else
/* generic C math */
/* add saved partial pixel from start of area */
rgbvalacc.r = rgbvalacc.r * h_o_val + rgbvaltmp.r * mul;
rgbvalacc.g = rgbvalacc.g * h_o_val + rgbvaltmp.g * mul;
rgbvalacc.b = rgbvalacc.b * h_o_val + rgbvaltmp.b * mul;
/* get new pixel , then add its partial coverage to this area */ /* get new pixel , then add its partial coverage to this area */
rgbvaltmp.r = part->buf->red; rgbvaltmp.r = part->buf->red;
rgbvaltmp.g = part->buf->green; rgbvaltmp.g = part->buf->green;
rgbvaltmp.b = part->buf->blue; rgbvaltmp.b = part->buf->blue;
mul = ctx->bm->width - oxe; mul = h_o_val - oxe;
rgbvalacc.r += rgbvaltmp.r * mul; rgbvalacc.r += rgbvaltmp.r * mul;
rgbvalacc.g += rgbvaltmp.g * mul; rgbvalacc.g += rgbvaltmp.g * mul;
rgbvalacc.b += rgbvaltmp.b * mul; rgbvalacc.b += rgbvaltmp.b * mul;
#endif /* CPU */
rgbvalacc.r = (rgbvalacc.r + (1 << 21)) >> 22;
rgbvalacc.g = (rgbvalacc.g + (1 << 21)) >> 22;
rgbvalacc.b = (rgbvalacc.b + (1 << 21)) >> 22;
/* store or accumulate to output row */ /* store or accumulate to output row */
if (accum) if (accum)
{ {
@ -200,7 +251,7 @@ static bool scale_h_area(void *out_line_ptr,
rgbvalacc.r = 0; rgbvalacc.r = 0;
rgbvalacc.g = 0; rgbvalacc.g = 0;
rgbvalacc.b = 0; rgbvalacc.b = 0;
mul = ctx->bm->width - mul; mul = oxe;
ox += 1; ox += 1;
/* inside an area */ /* inside an area */
} else { } else {
@ -210,21 +261,45 @@ static bool scale_h_area(void *out_line_ptr,
rgbvalacc.b += part->buf->blue; rgbvalacc.b += part->buf->blue;
} }
#else #else
if (oxe >= (unsigned int)ctx->src->width) if (oxe >= h_i_val)
{ {
/* "reset" error, which now represents partial coverage of next /* "reset" error, which now represents partial coverage of next
pixel by the next area pixel by the next area
*/ */
oxe -= ctx->src->width; oxe -= h_i_val;
#if defined(CPU_COLDFIRE)
/* Coldfire EMAC math */
/* add saved partial pixel from start of area */ /* add saved partial pixel from start of area */
acc = MULUQ(acc, ctx->bm->width) + MULUQ(tmp, mul); MAC(acc, h_o_val, 0);
MAC(tmp, mul, 0);
/* get new pixel , then add its partial coverage to this area */
tmp = *(part->buf);
mul = h_o_val - oxe;
MAC(tmp, mul, 0);
MAC_OUT(acc, 0);
#elif defined(CPU_SH)
/* SH-1 16x16->32 math */
/* add saved partial pixel from start of area */
acc = mul_u16_u16(acc, h_o_val) + mul_u16_u16(tmp, mul);
/* get new pixel , then add its partial coverage to this area */ /* get new pixel , then add its partial coverage to this area */
tmp = *(part->buf); tmp = *(part->buf);
mul = ctx->bm->width - oxe; mul = h_o_val - oxe;
acc += MULUQ(tmp, mul); acc += mul_u16_u16(tmp, mul);
#else
/* generic C math */
/* add saved partial pixel from start of area */
acc = (acc * h_o_val) + (tmp * mul);
/* get new pixel , then add its partial coverage to this area */
tmp = *(part->buf);
mul = h_o_val - oxe;
acc += tmp * mul;
#endif /* CPU */
#if !(defined(CPU_SH) || defined(TEST_SH_MATH))
/* round, divide, and either store or accumulate to output row */ /* round, divide, and either store or accumulate to output row */
acc = (acc + (1 << 21)) >> 22;
#endif
if (accum) if (accum)
{ {
acc += out_line[ox]; acc += out_line[ox];
@ -232,7 +307,7 @@ static bool scale_h_area(void *out_line_ptr,
out_line[ox] = acc; out_line[ox] = acc;
/* reset accumulator */ /* reset accumulator */
acc = 0; acc = 0;
mul = ctx->bm->width - mul; mul = oxe;
ox += 1; ox += 1;
/* inside an area */ /* inside an area */
} else { } else {
@ -249,56 +324,56 @@ static bool scale_h_area(void *out_line_ptr,
/* vertical area average scaler */ /* vertical area average scaler */
static inline bool scale_v_area(struct rowset *rset, struct scaler_context *ctx) static inline bool scale_v_area(struct rowset *rset, struct scaler_context *ctx)
{ {
uint32_t mul, x, oy, iy, oye; uint32_t mul, oy, iy, oye;
#if defined(CPU_SH) || defined (TEST_SH_MATH)
const uint32_t v_i_val = ctx->src->height,
v_o_val = ctx->bm->height;
#else
const uint32_t v_i_val = ctx->v_i_val,
v_o_val = ctx->v_o_val;
#endif
/* Set up rounding and scale factors */ /* Set up rounding and scale factors */
ctx->divisor *= ctx->src->height;
ctx->round = ctx->divisor >> 1;
ctx->divisor = 1 + (-((ctx->divisor + 1) >> 1)) / ctx->divisor;
mul = 0; mul = 0;
oy = rset->rowstart; oy = rset->rowstart;
oye = 0; oye = 0;
#ifdef HAVE_LCD_COLOR #ifdef HAVE_LCD_COLOR
uint32_t *rowacc = (uint32_t *) ctx->buf, uint32_t *rowacc = (uint32_t *) ctx->buf,
*rowtmp = rowacc + 3 * ctx->bm->width; *rowtmp = rowacc + 3 * ctx->bm->width,
*rowacc_px, *rowtmp_px;
memset((void *)ctx->buf, 0, ctx->bm->width * 2 * sizeof(struct uint32_rgb)); memset((void *)ctx->buf, 0, ctx->bm->width * 2 * sizeof(struct uint32_rgb));
#else #else
uint32_t *rowacc = (uint32_t *) ctx->buf, uint32_t *rowacc = (uint32_t *) ctx->buf,
*rowtmp = rowacc + ctx->bm->width; *rowtmp = rowacc + ctx->bm->width,
*rowacc_px, *rowtmp_px;
memset((void *)ctx->buf, 0, ctx->bm->width * 2 * sizeof(uint32_t)); memset((void *)ctx->buf, 0, ctx->bm->width * 2 * sizeof(uint32_t));
#endif #endif
SDEBUGF("scale_v_area\n"); SDEBUGF("scale_v_area\n");
/* zero the accumulator and temp rows */ /* zero the accumulator and temp rows */
for (iy = 0; iy < (unsigned int)ctx->src->height; iy++) for (iy = 0; iy < (unsigned int)ctx->src->height; iy++)
{ {
oye += ctx->bm->height; oye += v_o_val;
/* end of current area has been reached */ /* end of current area has been reached */
if (oye >= (unsigned int)ctx->src->height) if (oye >= v_i_val)
{ {
/* "reset" error, which now represents partial coverage of the next /* "reset" error, which now represents partial coverage of the next
row by the next area row by the next area
*/ */
oye -= ctx->src->height; oye -= v_i_val;
/* add stored partial row to accumulator */ /* add stored partial row to accumulator */
#ifdef HAVE_LCD_COLOR for(rowacc_px = rowacc, rowtmp_px = rowtmp; rowacc_px != rowtmp;
for (x = 0; x < 3 * (unsigned int)ctx->bm->width; x++) rowacc_px++, rowtmp_px++)
#else *rowacc_px = *rowacc_px * v_o_val + *rowtmp_px * mul;
for (x = 0; x < (unsigned int)ctx->bm->width; x++)
#endif
rowacc[x] = rowacc[x] * ctx->bm->height + mul * rowtmp[x];
/* store new scaled row in temp row */ /* store new scaled row in temp row */
if(!ctx->h_scaler(rowtmp, ctx, false)) if(!ctx->h_scaler(rowtmp, ctx, false))
return false; return false;
/* add partial coverage by new row to this area, then round and /* add partial coverage by new row to this area, then round and
scale to final value scale to final value
*/ */
mul = ctx->bm->height - oye; mul = v_o_val - oye;
#ifdef HAVE_LCD_COLOR for(rowacc_px = rowacc, rowtmp_px = rowtmp; rowacc_px != rowtmp;
for (x = 0; x < 3 * (unsigned int)ctx->bm->width; x++) rowacc_px++, rowtmp_px++)
#else *rowacc_px += mul * *rowtmp_px;
for (x = 0; x < (unsigned int)ctx->bm->width; x++)
#endif
rowacc[x] += mul * rowtmp[x];
ctx->output_row(oy, (void*)rowacc, ctx); ctx->output_row(oy, (void*)rowacc, ctx);
/* clear accumulator row, store partial coverage for next row */ /* clear accumulator row, store partial coverage for next row */
#ifdef HAVE_LCD_COLOR #ifdef HAVE_LCD_COLOR
@ -319,20 +394,18 @@ static inline bool scale_v_area(struct rowset *rset, struct scaler_context *ctx)
} }
#ifdef HAVE_UPSCALER #ifdef HAVE_UPSCALER
/* Set up rounding and scale factors for the horizontal scaler. The divisor
is bm->width - 1, so that the first and last pixels in the row align
exactly between input and output
*/
static inline void scale_h_linear_setup(struct scaler_context *ctx)
{
ctx->divisor = ctx->bm->width - 1;
}
/* horizontal linear scaler */ /* horizontal linear scaler */
static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx, static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx,
bool accum) bool accum)
{ {
unsigned int ix, ox, ixe; unsigned int ix, ox, ixe;
#if defined(CPU_SH) || defined (TEST_SH_MATH)
const uint32_t h_i_val = ctx->src->width - 1,
h_o_val = ctx->bm->width - 1;
#else
const uint32_t h_i_val = ctx->h_i_val,
h_o_val = ctx->h_o_val;
#endif
/* type x = x is an ugly hack for hiding an uninitialized data warning. The /* type x = x is an ugly hack for hiding an uninitialized data warning. The
values are conditionally initialized before use, but other values are values are conditionally initialized before use, but other values are
set such that this will occur before these are used. set such that this will occur before these are used.
@ -348,27 +421,35 @@ static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx,
FILL_BUF_INIT(part,ctx->store_part,ctx->args); FILL_BUF_INIT(part,ctx->store_part,ctx->args);
ix = 0; ix = 0;
/* The error is set so that values are initialized on the first pass. */ /* The error is set so that values are initialized on the first pass. */
ixe = ctx->bm->width - 1; ixe = h_o_val;
/* give other tasks a chance to run */ /* give other tasks a chance to run */
yield(); yield();
for (ox = 0; ox < (uint32_t)ctx->bm->width; ox++) for (ox = 0; ox < (uint32_t)ctx->bm->width; ox++)
{ {
#ifdef HAVE_LCD_COLOR #ifdef HAVE_LCD_COLOR
if (ixe >= ((uint32_t)ctx->bm->width - 1)) if (ixe >= h_o_val)
{ {
/* Store the new "current" pixel value in rgbval, and the color /* Store the new "current" pixel value in rgbval, and the color
step value in rgbinc. step value in rgbinc.
*/ */
ixe -= (ctx->bm->width - 1); ixe -= h_o_val;
rgbinc.r = -(part->buf->red); rgbinc.r = -(part->buf->red);
rgbinc.g = -(part->buf->green); rgbinc.g = -(part->buf->green);
rgbinc.b = -(part->buf->blue); rgbinc.b = -(part->buf->blue);
rgbval.r = (part->buf->red) * (ctx->bm->width - 1); #if defined(CPU_COLDFIRE)
rgbval.g = (part->buf->green) * (ctx->bm->width - 1); /* Coldfire EMAC math */
rgbval.b = (part->buf->blue) * (ctx->bm->width - 1); MAC(part->buf->red, h_o_val, 0);
MAC(part->buf->green, h_o_val, 1);
MAC(part->buf->blue, h_o_val, 2);
#else
/* generic C math */
rgbval.r = (part->buf->red) * h_o_val;
rgbval.g = (part->buf->green) * h_o_val;
rgbval.b = (part->buf->blue) * h_o_val;
#endif /* CPU */
ix += 1; ix += 1;
/* If this wasn't the last pixel, add the next one to rgbinc. */ /* If this wasn't the last pixel, add the next one to rgbinc. */
if (ix < (uint32_t)ctx->src->width) { if (LIKELY(ix < (uint32_t)ctx->src->width)) {
part->buf++; part->buf++;
part->len--; part->len--;
/* Fetch new pixels if needed */ /* Fetch new pixels if needed */
@ -379,14 +460,28 @@ static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx,
/* Add a partial step to rgbval, if this pixel isn't precisely /* Add a partial step to rgbval, if this pixel isn't precisely
aligned with the new source pixel aligned with the new source pixel
*/ */
#if defined(CPU_COLDFIRE)
/* Coldfire EMAC math */
MAC(rgbinc.r, ixe, 0);
MAC(rgbinc.g, ixe, 1);
MAC(rgbinc.b, ixe, 2);
#else
/* generic C math */
rgbval.r += rgbinc.r * ixe; rgbval.r += rgbinc.r * ixe;
rgbval.g += rgbinc.g * ixe; rgbval.g += rgbinc.g * ixe;
rgbval.b += rgbinc.b * ixe; rgbval.b += rgbinc.b * ixe;
#endif
} }
/* Now multiple the color increment to its proper value */ #if defined(CPU_COLDFIRE)
rgbinc.r *= ctx->src->width - 1; /* get final EMAC result out of ACC registers */
rgbinc.g *= ctx->src->width - 1; MAC_OUT(rgbval.r, 0);
rgbinc.b *= ctx->src->width - 1; MAC_OUT(rgbval.g, 1);
MAC_OUT(rgbval.b, 2);
#endif
/* Now multiply the color increment to its proper value */
rgbinc.r *= h_i_val;
rgbinc.g *= h_i_val;
rgbinc.b *= h_i_val;
} else { } else {
rgbval.r += rgbinc.r; rgbval.r += rgbinc.r;
rgbval.g += rgbinc.g; rgbval.g += rgbinc.g;
@ -395,27 +490,36 @@ static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx,
/* round and scale values, and accumulate or store to output */ /* round and scale values, and accumulate or store to output */
if (accum) if (accum)
{ {
out_line[ox].r += rgbval.r; out_line[ox].r += (rgbval.r + (1 << 21)) >> 22;
out_line[ox].g += rgbval.g; out_line[ox].g += (rgbval.g + (1 << 21)) >> 22;
out_line[ox].b += rgbval.b; out_line[ox].b += (rgbval.b + (1 << 21)) >> 22;
} else { } else {
out_line[ox].r = rgbval.r; out_line[ox].r = (rgbval.r + (1 << 21)) >> 22;
out_line[ox].g = rgbval.g; out_line[ox].g = (rgbval.g + (1 << 21)) >> 22;
out_line[ox].b = rgbval.b; out_line[ox].b = (rgbval.b + (1 << 21)) >> 22;
} }
#else #else
if (ixe >= ((uint32_t)ctx->bm->width - 1)) if (ixe >= h_o_val)
{ {
/* Store the new "current" pixel value in rgbval, and the color /* Store the new "current" pixel value in rgbval, and the color
step value in rgbinc. step value in rgbinc.
*/ */
ixe -= (ctx->bm->width - 1); ixe -= h_o_val;
val = *(part->buf); val = *(part->buf);
inc = -val; inc = -val;
val = MULUQ(val, ctx->bm->width - 1); #if defined(CPU_COLDFIRE)
/* Coldfire EMAC math */
MAC(val, h_o_val, 0);
#elif defined(CPU_SH)
/* SH-1 16x16->32 math */
val = mul_u16_u16(val, h_o_val);
#else
/* generic C math */
val = val * h_o_val;
#endif
ix += 1; ix += 1;
/* If this wasn't the last pixel, add the next one to rgbinc. */ /* If this wasn't the last pixel, add the next one to rgbinc. */
if (ix < (uint32_t)ctx->src->width) { if (LIKELY(ix < (uint32_t)ctx->src->width)) {
part->buf++; part->buf++;
part->len--; part->len--;
/* Fetch new pixels if needed */ /* Fetch new pixels if needed */
@ -424,12 +528,40 @@ static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx,
/* Add a partial step to rgbval, if this pixel isn't precisely /* Add a partial step to rgbval, if this pixel isn't precisely
aligned with the new source pixel aligned with the new source pixel
*/ */
val += MULQ(inc, ixe); #if defined(CPU_COLDFIRE)
/* Coldfire EMAC math */
MAC(inc, ixe, 0);
#elif defined(CPU_SH)
/* SH-1 16x16->32 math */
val += mul_s16_s16(inc, ixe);
#else
/* generic C math */
val += inc * ixe;
#endif
} }
#if defined(CPU_COLDFIRE)
/* get final EMAC result out of ACC register */
MAC_OUT(val, 0);
#endif
/* Now multiply the color increment to its proper value */ /* Now multiply the color increment to its proper value */
inc = MULQ(inc, ctx->src->width - 1); #if defined(CPU_SH)
/* SH-1 16x16->32 math */
inc = mul_s16_s16(inc, h_i_val);
#else
/* generic C math */
inc *= h_i_val;
#endif
} else } else
val += inc; val += inc;
#if !(defined(CPU_SH) || defined(TEST_SH_MATH))
/* round and scale values, and accumulate or store to output */
if (accum)
{
out_line[ox] += (val + (1 << 21)) >> 22;
} else {
out_line[ox] = (val + (1 << 21)) >> 22;
}
#else
/* round and scale values, and accumulate or store to output */ /* round and scale values, and accumulate or store to output */
if (accum) if (accum)
{ {
@ -438,7 +570,8 @@ static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx,
out_line[ox] = val; out_line[ox] = val;
} }
#endif #endif
ixe += ctx->src->width - 1; #endif
ixe += h_i_val;
} }
return true; return true;
} }
@ -447,71 +580,66 @@ static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx,
static inline bool scale_v_linear(struct rowset *rset, static inline bool scale_v_linear(struct rowset *rset,
struct scaler_context *ctx) struct scaler_context *ctx)
{ {
uint32_t mul, x, iy, iye; uint32_t mul, iy, iye;
int32_t oy; int32_t oy;
/* Set up scale and rounding factors, the divisor is bm->height - 1 */ #if defined(CPU_SH) || defined (TEST_SH_MATH)
ctx->divisor *= (ctx->bm->height - 1); const uint32_t v_i_val = ctx->src->height - 1,
ctx->round = ctx->divisor >> 1; v_o_val = ctx->bm->height - 1;
ctx->divisor = 1 + (-((ctx->divisor + 1) >> 1)) / ctx->divisor; #else
/* Set up our two temp buffers. The names are generic because they'll be const uint32_t v_i_val = ctx->v_i_val,
swapped each time a new input row is read v_o_val = ctx->v_o_val;
#endif
/* Set up our buffers, to store the increment and current value for each
column, and one temp buffer used to read in new rows.
*/ */
#ifdef HAVE_LCD_COLOR #ifdef HAVE_LCD_COLOR
uint32_t *rowinc = (uint32_t *)(ctx->buf), uint32_t *rowinc = (uint32_t *)(ctx->buf),
*rowval = rowinc + 3 * ctx->bm->width, *rowval = rowinc + 3 * ctx->bm->width,
*rowtmp = rowval + 3 * ctx->bm->width; *rowtmp = rowval + 3 * ctx->bm->width,
#else #else
uint32_t *rowinc = (uint32_t *)(ctx->buf), uint32_t *rowinc = (uint32_t *)(ctx->buf),
*rowval = rowinc + ctx->bm->width, *rowval = rowinc + ctx->bm->width,
*rowtmp = rowval + ctx->bm->width; *rowtmp = rowval + ctx->bm->width,
#endif #endif
*rowinc_px, *rowval_px, *rowtmp_px;
SDEBUGF("scale_v_linear\n"); SDEBUGF("scale_v_linear\n");
mul = 0; mul = 0;
iy = 0; iy = 0;
iye = ctx->bm->height - 1; iye = v_o_val;
/* get first scaled row in rowtmp */ /* get first scaled row in rowtmp */
if(!ctx->h_scaler((void*)rowtmp, ctx, false)) if(!ctx->h_scaler((void*)rowtmp, ctx, false))
return false; return false;
for (oy = rset->rowstart; oy != rset->rowstop; oy += rset->rowstep) for (oy = rset->rowstart; oy != rset->rowstop; oy += rset->rowstep)
{ {
if (iye >= (uint32_t)ctx->bm->height - 1) if (iye >= v_o_val)
{ {
iye -= ctx->bm->height - 1; iye -= v_o_val;
iy += 1; iy += 1;
#ifdef HAVE_LCD_COLOR for(rowinc_px = rowinc, rowtmp_px = rowtmp, rowval_px = rowval;
for (x = 0; x < 3 * (uint32_t)ctx->bm->width; x++) rowinc_px < rowval; rowinc_px++, rowtmp_px++, rowval_px++)
#else
for (x = 0; x < (uint32_t)ctx->bm->width; x++)
#endif
{ {
rowinc[x] = -rowtmp[x]; *rowinc_px = -*rowtmp_px;
rowval[x] = rowtmp[x] * (ctx->bm->height - 1); *rowval_px = *rowtmp_px * v_o_val;
} }
if (iy < (uint32_t)ctx->src->height) if (iy < (uint32_t)ctx->src->height)
{ {
if (!ctx->h_scaler((void*)rowtmp, ctx, false)) if (!ctx->h_scaler((void*)rowtmp, ctx, false))
return false; return false;
#ifdef HAVE_LCD_COLOR for(rowinc_px = rowinc, rowtmp_px = rowtmp, rowval_px = rowval;
for (x = 0; x < 3 * (uint32_t)ctx->bm->width; x++) rowinc_px < rowval; rowinc_px++, rowtmp_px++, rowval_px++)
#else
for (x = 0; x < (uint32_t)ctx->bm->width; x++)
#endif
{ {
rowinc[x] += rowtmp[x]; *rowinc_px += *rowtmp_px;
rowval[x] += rowinc[x] * iye; *rowval_px += *rowinc_px * iye;
rowinc[x] *= ctx->src->height - 1; *rowinc_px *= v_i_val;
} }
} }
} else } else
#ifdef HAVE_LCD_COLOR for(rowinc_px = rowinc, rowval_px = rowval; rowinc_px < rowval;
for (x = 0; x < 3 * (uint32_t)ctx->bm->width; x++) rowinc_px++, rowval_px++)
#else *rowval_px += *rowinc_px;
for (x = 0; x < (uint32_t)ctx->bm->width; x++)
#endif
rowval[x] += rowinc[x];
ctx->output_row(oy, (void*)rowval, ctx); ctx->output_row(oy, (void*)rowval, ctx);
iye += ctx->src->height - 1; iye += v_i_val;
} }
return true; return true;
} }
@ -533,9 +661,9 @@ static void output_row_32_native_fromyuv(uint32_t row, void * row_in,
for (col = 0; col < ctx->bm->width; col++) { for (col = 0; col < ctx->bm->width; col++) {
if (ctx->dither) if (ctx->dither)
delta = DITHERXDY(col,dy); delta = DITHERXDY(col,dy);
y = SC_MUL(qp->b + ctx->round, ctx->divisor); y = SC_OUT(qp->b, ctx);
u = SC_MUL(qp->g + ctx->round, ctx->divisor); u = SC_OUT(qp->g, ctx);
v = SC_MUL(qp->r + ctx->round, ctx->divisor); v = SC_OUT(qp->r, ctx);
qp++; qp++;
yuv_to_rgb(y, u, v, &r, &g, &b); yuv_to_rgb(y, u, v, &r, &g, &b);
r = (31 * r + (r >> 3) + delta) >> 8; r = (31 * r + (r >> 3) + delta) >> 8;
@ -571,7 +699,7 @@ static void output_row_32_native(uint32_t row, void * row_in,
for (col = 0; col < ctx->bm->width; col++) { for (col = 0; col < ctx->bm->width; col++) {
if (ctx->dither) if (ctx->dither)
delta = DITHERXDY(col,dy); delta = DITHERXDY(col,dy);
bright = SC_MUL((*qp++) + ctx->round,ctx->divisor); bright = SC_OUT(*qp++, ctx);
bright = (3 * bright + (bright >> 6) + delta) >> 8; bright = (3 * bright + (bright >> 6) + delta) >> 8;
data |= (~bright & 3) << shift; data |= (~bright & 3) << shift;
shift -= 2; shift -= 2;
@ -594,7 +722,7 @@ static void output_row_32_native(uint32_t row, void * row_in,
for (col = 0; col < ctx->bm->width; col++) { for (col = 0; col < ctx->bm->width; col++) {
if (ctx->dither) if (ctx->dither)
delta = DITHERXDY(col,dy); delta = DITHERXDY(col,dy);
bright = SC_MUL((*qp++) + ctx->round, ctx->divisor); bright = SC_OUT(*qp++, ctx);
bright = (3 * bright + (bright >> 6) + delta) >> 8; bright = (3 * bright + (bright >> 6) + delta) >> 8;
*dest++ |= (~bright & 3) << shift; *dest++ |= (~bright & 3) << shift;
} }
@ -609,7 +737,7 @@ static void output_row_32_native(uint32_t row, void * row_in,
for (col = 0; col < ctx->bm->width; col++) { for (col = 0; col < ctx->bm->width; col++) {
if (ctx->dither) if (ctx->dither)
delta = DITHERXDY(col,dy); delta = DITHERXDY(col,dy);
bright = SC_MUL((*qp++) + ctx->round, ctx->divisor); bright = SC_OUT(*qp++, ctx);
bright = (3 * bright + (bright >> 6) + delta) >> 8; bright = (3 * bright + (bright >> 6) + delta) >> 8;
*dest++ |= vi_pattern[bright] << shift; *dest++ |= vi_pattern[bright] << shift;
} }
@ -625,9 +753,9 @@ static void output_row_32_native(uint32_t row, void * row_in,
if (ctx->dither) if (ctx->dither)
delta = DITHERXDY(col,dy); delta = DITHERXDY(col,dy);
q0 = *qp++; q0 = *qp++;
r = SC_MUL(q0.r + ctx->round, ctx->divisor); r = SC_OUT(q0.r, ctx);
g = SC_MUL(q0.g + ctx->round, ctx->divisor); g = SC_OUT(q0.g, ctx);
b = SC_MUL(q0.b + ctx->round, ctx->divisor); b = SC_OUT(q0.b, ctx);
r = (31 * r + (r >> 3) + delta) >> 8; r = (31 * r + (r >> 3) + delta) >> 8;
g = (63 * g + (g >> 2) + delta) >> 8; g = (63 * g + (g >> 2) + delta) >> 8;
b = (31 * b + (b >> 3) + delta) >> 8; b = (31 * b + (b >> 3) + delta) >> 8;
@ -664,13 +792,10 @@ int resize_on_load(struct bitmap *bm, bool dither, struct dim *src,
struct img_part* (*store_part)(void *args), struct img_part* (*store_part)(void *args),
void *args) void *args)
{ {
#ifdef HAVE_UPSCALER
const int sw = src->width; const int sw = src->width;
const int sh = src->height; const int sh = src->height;
const int dw = bm->width; const int dw = bm->width;
const int dh = bm->height; const int dh = bm->height;
#endif
int ret; int ret;
#ifdef HAVE_LCD_COLOR #ifdef HAVE_LCD_COLOR
unsigned int needed = sizeof(struct uint32_rgb) * 3 * bm->width; unsigned int needed = sizeof(struct uint32_rgb) * 3 * bm->width;
@ -721,6 +846,9 @@ int resize_on_load(struct bitmap *bm, bool dither, struct dim *src,
ctx.bm = bm; ctx.bm = bm;
ctx.src = src; ctx.src = src;
ctx.dither = dither; ctx.dither = dither;
#if defined(CPU_SH) || defined (TEST_SH_MATH)
uint32_t div;
#endif
#if !defined(PLUGIN) #if !defined(PLUGIN)
#if defined(HAVE_LCD_COLOR) && defined(HAVE_JPEG) #if defined(HAVE_LCD_COLOR) && defined(HAVE_JPEG)
ctx.output_row = format_index ? output_row_32_native_fromyuv ctx.output_row = format_index ? output_row_32_native_fromyuv
@ -740,23 +868,56 @@ int resize_on_load(struct bitmap *bm, bool dither, struct dim *src,
{ {
#endif #endif
ctx.h_scaler = scale_h_area; ctx.h_scaler = scale_h_area;
scale_h_area_setup(&ctx); #if defined(CPU_SH) || defined (TEST_SH_MATH)
div = sw;
#else
uint32_t h_div = (1U << 24) / sw;
ctx.h_i_val = sw * h_div;
ctx.h_o_val = dw * h_div;
#endif
#ifdef HAVE_UPSCALER #ifdef HAVE_UPSCALER
} else { } else {
ctx.h_scaler = scale_h_linear; ctx.h_scaler = scale_h_linear;
scale_h_linear_setup(&ctx); #if defined(CPU_SH) || defined (TEST_SH_MATH)
div = dw - 1;
#else
uint32_t h_div = (1U << 24) / (dw - 1);
ctx.h_i_val = (sw - 1) * h_div;
ctx.h_o_val = (dw - 1) * h_div;
#endif
} }
#endif #endif
SC_MUL_INIT; #ifdef CPU_COLDFIRE
coldfire_set_macsr(EMAC_UNSIGNED);
#endif
#ifdef HAVE_UPSCALER #ifdef HAVE_UPSCALER
if (sh > dh) if (sh > dh)
#endif
{
#if defined(CPU_SH) || defined (TEST_SH_MATH)
div *= sh;
ctx.recip = ((uint32_t)(-div)) / div + 1;
#else
uint32_t v_div = (1U << 22) / sh;
ctx.v_i_val = sh * v_div;
ctx.v_o_val = dh * v_div;
#endif #endif
ret = scale_v_area(rset, &ctx); ret = scale_v_area(rset, &ctx);
}
#ifdef HAVE_UPSCALER #ifdef HAVE_UPSCALER
else else
ret = scale_v_linear(rset, &ctx); {
#if defined(CPU_SH) || defined (TEST_SH_MATH)
div *= dh - 1;
ctx.recip = ((uint32_t)(-div)) / div + 1;
#else
uint32_t v_div = (1U << 22) / dh;
ctx.v_i_val = (sh - 1) * v_div;
ctx.v_o_val = (dh - 1) * v_div;
#endif
ret = scale_v_linear(rset, &ctx);
}
#endif #endif
SC_MUL_END;
#ifdef HAVE_ADJUSTABLE_CPU_FREQ #ifdef HAVE_ADJUSTABLE_CPU_FREQ
cpu_boost(false); cpu_boost(false);
#endif #endif

View file

@ -43,67 +43,61 @@
#define MAX_SC_STACK_ALLOC 0 #define MAX_SC_STACK_ALLOC 0
#define HAVE_UPSCALER 1 #define HAVE_UPSCALER 1
#if defined(CPU_COLDFIRE) #if defined(CPU_SH)
#define SC_MUL_INIT \ /* perform 32x32->40 unsigned multiply, round off and return top 8 bits */
unsigned long macsr_st = coldfire_get_macsr(); \ static inline uint32_t sc_mul_u32_rnd(uint32_t m, uint32_t n)
coldfire_set_macsr(EMAC_UNSIGNED);
#define SC_MUL_END coldfire_set_macsr(macsr_st);
#define SC_MUL(x, y) \
({ \
unsigned long t; \
asm ("mac.l %[a], %[b], %%acc0\n\t" \
"move.l %%accext01, %[t]\n\t" \
"move.l #0, %%acc0\n\t" \
: [t] "=r" (t) : [a] "r" (x), [b] "r" (y)); \
t; \
})
#elif (CONFIG_CPU == SH7034)
/* multiply two unsigned 32 bit values and return the top 32 bit
* of the 64 bit result */
static inline unsigned sc_mul32(unsigned a, unsigned b)
{ {
unsigned r, t1, t2, t3; unsigned r, t1, t2, t3;
unsigned h = 1 << 15;
/* notation:
m = ab, n = cd
final result is (((a *c) << 32) + ((b * c + a * d) << 16) + b * d +
(1 << 31)) >> 32
*/
asm ( asm (
"swap.w %[a], %[t1] \n" /* t1 = ba */ "swap.w %[m], %[t1]\n\t" /* t1 = ba */
"mulu %[t1], %[b] \n" /* a * d */ "mulu %[m], %[n]\n\t" /* b * d */
"swap.w %[b], %[t3] \n" /* t3 = dc */ "swap.w %[n], %[t3]\n\t" /* t3 = dc */
"sts macl, %[t2] \n" /* t2 = a * d */ "sts macl, %[r]\n\t" /* r = b * d */
"mulu %[t1], %[t3] \n" /* a * c */ "mulu %[m], %[t3]\n\t" /* b * c */
"sts macl, %[r] \n" /* hi = a * c */ "shlr16 %[r]\n\t"
"mulu %[a], %[t3] \n" /* b * c */ "sts macl, %[t2]\n\t" /* t2 = b * c */
"clrt \n" "mulu %[t1], %[t3]\n\t" /* a * c */
"sts macl, %[t3] \n" /* t3 = b * c */ "add %[t2], %[r]\n\t"
"addc %[t2], %[t3] \n" /* t3 += t2, carry -> t2 */ "sts macl, %[t3]\n\t" /* t3 = a * c */
"movt %[t2] \n" "mulu %[t1], %[n]\n\t" /* a * d */
"mulu %[a], %[b] \n" /* b * d */ "shll16 %[t3]\n\t"
"mov %[t3], %[t1] \n" /* t1t3 = t2t3 << 16 */ "sts macl, %[t2]\n\t" /* t2 = a * d */
"xtrct %[t2], %[t1] \n" "add %[t2], %[r]\n\t"
"shll16 %[t3] \n" "add %[t3], %[r]\n\t" /* r = ((b * d) >> 16) + (b * c + a * d) +
"sts macl, %[t2] \n" /* lo = b * d */ ((a * c) << 16) */
"clrt \n" /* hi.lo += t1t3 */ "add %[h], %[r]\n\t" /* round result */
"addc %[t3], %[t2] \n" "shlr16 %[r]\n\t" /* truncate result */
"addc %[t1], %[r] \n"
: /* outputs */ : /* outputs */
[r] "=&r"(r), [r] "=&r"(r),
[t1]"=&r"(t1), [t1]"=&r"(t1),
[t2]"=&r"(t2), [t2]"=&r"(t2),
[t3]"=&r"(t3) [t3]"=&r"(t3)
: /* inputs */ : /* inputs */
[a] "r" (a), [h] "r" (h),
[b] "r" (b) [m] "r" (m),
[n] "r" (n)
); );
return r; return r;
} }
#define SC_MUL(x, y) sc_mul32(x, y) #elif defined(TEST_SH_MATH)
#define SC_MUL_INIT static inline uint32_t sc_mul_u32_rnd(uint32_t op1, uint32_t op2)
#define SC_MUL_END {
uint64_t tmp = (uint64_t)op1 * op2;
tmp += 1LU << 31;
tmp >>= 32;
return tmp;
}
#else
#define SC_OUT(n, c) (((n) + (1 << 23)) >> 24)
#endif #endif
#ifndef SC_OUT
#ifndef SC_MUL #define SC_OUT(n, c) (sc_mul_u32_rnd(n, (c)->recip))
#define SC_MUL(x, y) ((x) * (uint64_t)(y) >> 32)
#define SC_MUL_INIT
#define SC_MUL_END
#endif #endif
struct img_part { struct img_part {
@ -130,8 +124,14 @@ struct uint32_rgb {
horizontal scaler, and row output horizontal scaler, and row output
*/ */
struct scaler_context { struct scaler_context {
uint32_t divisor; #if defined(CPU_SH) || defined(TEST_SH_MATH)
uint32_t round; uint32_t recip;
#else
uint32_t h_i_val;
uint32_t h_o_val;
uint32_t v_i_val;
uint32_t v_o_val;
#endif
struct bitmap *bm; struct bitmap *bm;
struct dim *src; struct dim *src;
unsigned char *buf; unsigned char *buf;