
* Assembler optimised gray_update_rect() and writearray() for arm (greyscale iPods).
* Some slight optimisations for coldfire (H1x0) and SH1 (archos).
* Comment and formatting cleanup.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10473 a1c6a512-1295-4272-9138-f99709370657
Jens Arnold 2006-08-07 17:21:38 +00:00
parent 8921b34e4b
commit c00d799fa3
3 changed files with 675 additions and 406 deletions
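For orientation before the diffs: gray_update_rect() works on blocks of 8 horizontally adjacent pixels. In both the new asm versions and the C reference code kept under #else below, the first step is a cheap change check that XORs the block's 8 current-buffer bytes against the back buffer as two 32-bit words; the per-pixel dithering work only runs when this is non-zero. A minimal C sketch of that check, as a reading aid (cbuf/bbuf are the block pointers used in the code):

#include <stdint.h>

/* Sketch of the 8-pixel change check, mirroring the C reference version
 * in the diff below; returns non-zero if any of the 8 pixels differs. */
static inline uint32_t block_changed(const unsigned char *cbuf,
                                     const unsigned char *bbuf)
{
    uint32_t change;

    change  = *(const uint32_t *)cbuf ^ *(const uint32_t *)bbuf;
    change |= *(const uint32_t *)(cbuf + 4) ^ *(const uint32_t *)(bbuf + 4);

    return change;
}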

View file

@@ -648,14 +648,165 @@ void gray_update_rect(int x, int y, int width, int height)
cbuf = _gray_info.cur_buffer + srcofs_row;
bbuf = _gray_info.back_buffer + srcofs_row;
-#if 0 /* CPU specific asm versions will go here */
+#ifdef CPU_ARM
asm volatile (
"ldr r0, [%[cbuf]] \n"
"ldr r1, [%[bbuf]] \n"
"eor r1, r0, r1 \n"
"ldr r0, [%[cbuf], #4] \n"
"ldr %[chg], [%[bbuf], #4] \n"
"eor %[chg], r0, %[chg] \n"
"orr %[chg], %[chg], r1 \n"
: /* outputs */
[chg] "=&r"(change)
: /* inputs */
[cbuf]"r"(cbuf),
[bbuf]"r"(bbuf)
: /* clobbers */
"r0", "r1"
);
if (change != 0)
{
unsigned char *addr, *end;
unsigned mask, trash;
pat_ptr = &pat_stack[8];
/* precalculate the bit patterns with random shifts
* for all 8 pixels and put them on an extra "stack" */
asm volatile (
"mov r3, #8 \n" /* loop count */
"mov %[mask], #0 \n"
".ur_pre_loop: \n"
"mov %[mask], %[mask], lsl #1 \n" /* shift mask */
"ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */
"ldrb r1, [%[bbuf]] \n" /* read back buffer */
"strb r0, [%[bbuf]], #1 \n" /* update back buffer */
"mov r2, #0 \n" /* preset for skipped pixel */
"cmp r0, r1 \n" /* no change? */
"beq .ur_skip \n" /* -> skip */
"ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */
"add r0, %[rnd], %[rnd], lsl #3 \n" /* multiply by 75 */
"add %[rnd], %[rnd], %[rnd], lsl #1 \n"
"add %[rnd], %[rnd], r0, lsl #3 \n"
"add %[rnd], %[rnd], #74 \n" /* add another 74 */
/* Since the lower bits are not very random: get bits 8..15 (need max. 5) */
"and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */
"cmp r1, %[dpth] \n" /* random >= depth ? */
"subhs r1, r1, %[dpth] \n" /* yes: random -= depth */
"mov r0, r2, lsl r1 \n" /** rotate pattern **/
"sub r1, %[dpth], r1 \n"
"orr r2, r0, r2, lsr r1 \n"
"orr %[mask], %[mask], #1 \n" /* set mask bit */
".ur_skip: \n"
"str r2, [%[patp], #-4]! \n" /* push on pattern stack */
"subs r3, r3, #1 \n" /* loop 8 times (pixel block) */
"bne .ur_pre_loop \n"
: /* outputs */
[cbuf]"+r"(cbuf),
[bbuf]"+r"(bbuf),
[patp]"+r"(pat_ptr),
[rnd] "+r"(_gray_random_buffer),
[mask]"=&r"(mask)
: /* inputs */
[bpat]"r"(_gray_info.bitpattern),
[dpth]"r"(_gray_info.depth),
[rmsk]"r"(_gray_info.randmask)
: /* clobbers */
"r0", "r1", "r2", "r3"
);
addr = dst_row;
end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
/* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */
asm volatile (
"ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */
"mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
"ands %[mask], %[mask], #0xff \n"
"beq .ur_sloop \n" /* short loop if nothing to keep */
".ur_floop: \n" /** full loop (there are bits to keep)**/
"movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */
"adc r0, r0, r0 \n" /* put bit into LSB for byte */
"movs r8, r8, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r7, r7, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r6, r6, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r5, r5, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r4, r4, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r3, r3, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r2, r2, lsr #1 \n"
"adc r0, r0, r0 \n"
"ldrb r1, [%[addr]] \n" /* read old value */
"and r1, r1, %[mask] \n" /* mask out replaced bits */
"orr r1, r1, r0 \n" /* set new bits */
"strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */
"cmp %[end], %[addr] \n" /* loop for all bitplanes */
"bne .ur_floop \n"
"b .ur_end \n"
".ur_sloop: \n" /** short loop (nothing to keep) **/
"movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */
"adc r0, r0, r0 \n" /* put bit into LSB for byte */
"movs r8, r8, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r7, r7, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r6, r6, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r5, r5, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r4, r4, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r3, r3, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r2, r2, lsr #1 \n"
"adc r0, r0, r0 \n"
"strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */
"cmp %[end], %[addr] \n" /* loop for all bitplanes */
"bne .ur_sloop \n"
".ur_end: \n"
: /* outputs */
[addr]"+r"(addr),
[mask]"+r"(mask),
[rx] "=&r"(trash)
: /* inputs */
[psiz]"r"(_gray_info.plane_size),
[end] "r"(end),
[patp]"[rx]"(pat_ptr)
: /* clobbers */
"r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
);
}
#else /* C version, for reference*/
#warning C version of gray_update_rect() used
(void)pat_ptr;
/* check whether anything changed in the 8-pixel block */
change = *(uint32_t *)cbuf ^ *(uint32_t *)bbuf;
-cbuf += sizeof(uint32_t);
-bbuf += sizeof(uint32_t);
-change |= *(uint32_t *)cbuf ^ *(uint32_t *)bbuf;
+change |= *(uint32_t *)(cbuf + 4) ^ *(uint32_t *)(bbuf + 4);
if (change != 0)
{
@@ -664,9 +815,6 @@ void gray_update_rect(int x, int y, int width, int height)
unsigned test = 1;
int i;
-cbuf = _gray_info.cur_buffer + srcofs_row;
-bbuf = _gray_info.back_buffer + srcofs_row;
/* precalculate the bit patterns with random shifts
* for all 8 pixels and put them on an extra "stack" */
for (i = 7; i >= 0; i--)
@@ -788,18 +936,18 @@ void gray_update_rect(int x, int y, int width, int height)
#if CONFIG_CPU == SH7034
asm volatile (
-"mov.l @%[cbuf]+,r1 \n"
-"mov.l @%[bbuf]+,r2 \n"
+"mov.l @%[cbuf],r1 \n"
+"mov.l @%[bbuf],r2 \n"
"xor r1,r2 \n"
-"mov.l @%[cbuf],r1 \n"
-"mov.l @%[bbuf],%[chg] \n"
+"mov.l @(4,%[cbuf]),r1 \n"
+"mov.l @(4,%[bbuf]),%[chg] \n"
"xor r1,%[chg] \n"
"or r2,%[chg] \n"
: /* outputs */
-[cbuf]"+r"(cbuf),
-[bbuf]"+r"(bbuf),
[chg] "=r"(change)
: /* inputs */
+[cbuf]"r"(cbuf),
+[bbuf]"r"(bbuf)
: /* clobbers */
"r1", "r2"
);
@@ -810,13 +958,11 @@ void gray_update_rect(int x, int y, int width, int height)
unsigned mask, trash;
pat_ptr = &pat_stack[8];
-cbuf = _gray_info.cur_buffer + srcofs_row;
-bbuf = _gray_info.back_buffer + srcofs_row;
/* precalculate the bit patterns with random shifts
* for all 8 pixels and put them on an extra "stack" */
asm volatile (
-"mov #8,r3 \n" /* loop count in r3: 8 pixels */
+"mov #8,r3 \n" /* loop count */
".ur_pre_loop: \n"
"mov.b @%[cbuf]+,r0\n" /* read current buffer */
@@ -860,10 +1006,11 @@ void gray_update_rect(int x, int y, int width, int height)
"rotcr %[mask] \n" /* get mask bit */ "rotcr %[mask] \n" /* get mask bit */
"mov.l r2,@-%[patp]\n" /* push on pattern stack */ "mov.l r2,@-%[patp]\n" /* push on pattern stack */
"add #-1,r3 \n" /* decrease loop count */ "add #-1,r3 \n" /* loop 8 times (pixel block) */
"cmp/pl r3 \n" /* loop count > 0? */ "cmp/pl r3 \n"
"bt .ur_pre_loop\n" /* yes: loop */ "bt .ur_pre_loop\n"
"shlr8 %[mask] \n"
"shlr8 %[mask] \n" /* shift mask to low byte */
"shlr16 %[mask] \n" "shlr16 %[mask] \n"
: /* outputs */ : /* outputs */
[cbuf]"+r"(cbuf), [cbuf]"+r"(cbuf),
@@ -885,77 +1032,77 @@ void gray_update_rect(int x, int y, int width, int height)
/* set the bits for all 8 pixels in all bytes according to the /* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */ * precalculated patterns on the pattern stack */
asm volatile ( asm volatile (
"mov.l @%[patp]+,r1\n" /* pop all 8 patterns */ "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */
"mov.l @%[patp]+,r2\n" "mov.l @%[patp]+,r2 \n"
"mov.l @%[patp]+,r3\n" "mov.l @%[patp]+,r3 \n"
"mov.l @%[patp]+,r6\n" "mov.l @%[patp]+,r6 \n"
"mov.l @%[patp]+,r7\n" "mov.l @%[patp]+,r7 \n"
"mov.l @%[patp]+,r8\n" "mov.l @%[patp]+,r8 \n"
"mov.l @%[patp]+,r9\n" "mov.l @%[patp]+,r9 \n"
"mov.l @%[patp],r10\n" "mov.l @%[patp],r10 \n"
"tst %[mask],%[mask] \n" /* nothing to keep? */ "tst %[mask],%[mask] \n"
"bt .ur_sloop \n" /* yes: jump to short loop */ "bt .ur_sloop \n" /* short loop if nothing to keep */
".ur_floop: \n" /** full loop (there are bits to keep)**/ ".ur_floop: \n" /** full loop (there are bits to keep)**/
"shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
"rotcl r0 \n" /* rotate t bit into r0 */ "rotcl r0 \n" /* rotate t bit into r0 */
"shlr r2 \n" "shlr r2 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r3 \n" "shlr r3 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r6 \n" "shlr r6 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r7 \n" "shlr r7 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r8 \n" "shlr r8 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r9 \n" "shlr r9 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r10 \n" "shlr r10 \n"
"mov.b @%[addr],%[rx] \n" /* read old value */ "mov.b @%[addr],%[rx] \n" /* read old value */
"rotcl r0 \n" "rotcl r0 \n"
"and %[mask],%[rx] \n" /* mask out unneeded bits */ "and %[mask],%[rx] \n" /* mask out replaced bits */
"or %[rx],r0 \n" /* set new bits */ "or %[rx],r0 \n" /* set new bits */
"mov.b r0,@%[addr] \n" /* store value to bitplane */ "mov.b r0,@%[addr] \n" /* store value to bitplane */
"add %[psiz],%[addr] \n" /* advance to next bitplane */ "add %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp/hi %[addr],%[end] \n" /* last bitplane done? */ "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */
"bt .ur_floop \n" /* no: loop */ "bt .ur_floop \n"
"bra .ur_end \n" "bra .ur_end \n"
"nop \n" "nop \n"
/* References to C library routines used in the precalc block */ /* References to C library routines used in the precalc block */
".align 2 \n" ".align 2 \n"
".ashlsi3: \n" /* C library routine: */ ".ashlsi3: \n" /* C library routine: */
".long ___ashlsi3 \n" /* shift r4 left by r5, res. in r0 */ ".long ___ashlsi3 \n" /* shift r4 left by r5, res. in r0 */
".lshrsi3: \n" /* C library routine: */ ".lshrsi3: \n" /* C library routine: */
".long ___lshrsi3 \n" /* shift r4 right by r5, res. in r0 */ ".long ___lshrsi3 \n" /* shift r4 right by r5, res. in r0 */
/* both routines preserve r4, destroy r5 and take ~16 cycles */ /* both routines preserve r4, destroy r5 and take ~16 cycles */
".ur_sloop: \n" /** short loop (nothing to keep) **/ ".ur_sloop: \n" /** short loop (nothing to keep) **/
"shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
"rotcl r0 \n" /* rotate t bit into r0 */ "rotcl r0 \n" /* rotate t bit into r0 */
"shlr r2 \n" "shlr r2 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r3 \n" "shlr r3 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r6 \n" "shlr r6 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r7 \n" "shlr r7 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r8 \n" "shlr r8 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r9 \n" "shlr r9 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r10 \n" "shlr r10 \n"
"rotcl r0 \n" "rotcl r0 \n"
"mov.b r0,@%[addr] \n" /* store byte to bitplane */ "mov.b r0,@%[addr] \n" /* store byte to bitplane */
"add %[psiz],%[addr] \n" /* advance to next bitplane */ "add %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp/hi %[addr],%[end] \n" /* last bitplane done? */ "cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */
"bt .ur_sloop \n" /* no: loop */ "bt .ur_sloop \n"
".ur_end: \n" ".ur_end: \n"
: /* outputs */ : /* outputs */
[addr]"+r"(addr), [addr]"+r"(addr),
[mask]"+r"(mask), [mask]"+r"(mask),
@@ -970,18 +1117,18 @@ void gray_update_rect(int x, int y, int width, int height)
}
#elif defined(CPU_COLDFIRE)
asm volatile (
-"move.l (%[cbuf])+,%%d0 \n"
-"move.l (%[bbuf])+,%%d1 \n"
+"move.l (%[cbuf]),%%d0 \n"
+"move.l (%[bbuf]),%%d1 \n"
"eor.l %%d0,%%d1 \n"
-"move.l (%[cbuf]),%%d0 \n"
-"move.l (%[bbuf]),%[chg]\n"
+"move.l (4,%[cbuf]),%%d0 \n"
+"move.l (4,%[bbuf]),%[chg] \n"
"eor.l %%d0,%[chg] \n"
"or.l %%d1,%[chg] \n"
: /* outputs */
-[cbuf]"+a"(cbuf),
-[bbuf]"+a"(bbuf),
[chg] "=&d"(change)
: /* inputs */
+[cbuf]"a"(cbuf),
+[bbuf]"a"(bbuf)
: /* clobbers */
"d0", "d1"
);
@@ -992,54 +1139,52 @@ void gray_update_rect(int x, int y, int width, int height)
unsigned mask, trash; unsigned mask, trash;
pat_ptr = &pat_stack[8]; pat_ptr = &pat_stack[8];
cbuf = _gray_info.cur_buffer + srcofs_row;
bbuf = _gray_info.back_buffer + srcofs_row;
/* precalculate the bit patterns with random shifts /* precalculate the bit patterns with random shifts
* for all 8 pixels and put them on an extra "stack" */ * for all 8 pixels and put them on an extra "stack" */
asm volatile ( asm volatile (
"moveq.l #8,%%d3 \n" /* loop count in d3: 8 pixels */ "moveq.l #8,%%d3 \n" /* loop count */
"clr.l %[mask] \n" "clr.l %[mask] \n"
".ur_pre_loop: \n" ".ur_pre_loop: \n"
"clr.l %%d0 \n" "clr.l %%d0 \n"
"move.b (%[cbuf])+,%%d0 \n" /* read current buffer */ "move.b (%[cbuf])+,%%d0 \n" /* read current buffer */
"clr.l %%d1 \n" "clr.l %%d1 \n"
"move.b (%[bbuf]),%%d1 \n" /* read back buffer */ "move.b (%[bbuf]),%%d1 \n" /* read back buffer */
"move.b %%d0,(%[bbuf])+ \n" /* update back buffer */ "move.b %%d0,(%[bbuf])+ \n" /* update back buffer */
"clr.l %%d2 \n" /* preset for skipped pixel */ "clr.l %%d2 \n" /* preset for skipped pixel */
"cmp.l %%d0,%%d1 \n" /* no change? */ "cmp.l %%d0,%%d1 \n" /* no change? */
"beq.b .ur_skip \n" /* -> skip */ "beq.b .ur_skip \n" /* -> skip */
"move.l (%%d0:l:4,%[bpat]),%%d2 \n" /* d2 = bitpattern[byte]; */ "move.l (%%d0:l:4,%[bpat]),%%d2 \n" /* d2 = bitpattern[byte]; */
"mulu.w #75,%[rnd] \n" /* multiply by 75 */ "mulu.w #75,%[rnd] \n" /* multiply by 75 */
"add.l #74,%[rnd] \n" /* add another 74 */ "add.l #74,%[rnd] \n" /* add another 74 */
/* Since the lower bits are not very random: */ /* Since the lower bits are not very random: */
"move.l %[rnd],%%d1 \n" "move.l %[rnd],%%d1 \n"
"lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */
"and.l %[rmsk],%%d1\n" /* mask out unneeded bits */ "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */
"cmp.l %[dpth],%%d1\n" /* random >= depth ? */ "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */
"blo.b .ur_ntrim \n" "blo.b .ur_ntrim \n"
"sub.l %[dpth],%%d1\n" /* yes: random -= depth; */ "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */
".ur_ntrim: \n" ".ur_ntrim: \n"
"move.l %%d2,%%d0 \n" "move.l %%d2,%%d0 \n" /** rotate pattern **/
"lsl.l %%d1,%%d0 \n" "lsl.l %%d1,%%d0 \n"
"sub.l %[dpth],%%d1\n" "sub.l %[dpth],%%d1 \n"
"neg.l %%d1 \n" /* d1 = depth - d1 */ "neg.l %%d1 \n" /* d1 = depth - d1 */
"lsr.l %%d1,%%d2 \n" "lsr.l %%d1,%%d2 \n"
"or.l %%d0,%%d2 \n" /* rotated_pattern = d2 | d0 */ "or.l %%d0,%%d2 \n" /* rotated_pattern = d2 | d0 */
"or.l #0x0100,%[mask] \n" /* set mask bit */ "or.l #0x0100,%[mask] \n" /* set mask bit */
".ur_skip: \n" ".ur_skip: \n"
"lsr.l #1,%[mask] \n" /* shift mask */ "lsr.l #1,%[mask] \n" /* shift mask */
"move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */
"subq.l #1,%%d3 \n" /* decrease loop count */ "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */
"bne.b .ur_pre_loop\n" /* yes: loop */ "bne.b .ur_pre_loop \n"
: /* outputs */ : /* outputs */
[cbuf]"+a"(cbuf), [cbuf]"+a"(cbuf),
[bbuf]"+a"(bbuf), [bbuf]"+a"(bbuf),
@@ -1061,79 +1206,79 @@ void gray_update_rect(int x, int y, int width, int height)
* precalculated patterns on the pattern stack */ * precalculated patterns on the pattern stack */
asm volatile ( asm volatile (
"movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n"
/* pop all 8 patterns */ /* pop all 8 patterns */
"not.l %[mask] \n" /* set mask -> keep mask */ "not.l %[mask] \n" /* "set" mask -> "keep" mask */
"and.l #0xFF,%[mask] \n" "and.l #0xFF,%[mask] \n"
"beq.b .ur_sstart \n" /* yes: jump to short loop */ "beq.b .ur_sstart \n" /* short loop if nothing to keep */
".ur_floop: \n" /** full loop (there are bits to keep)**/ ".ur_floop: \n" /** full loop (there are bits to keep)**/
"clr.l %%d0 \n" "clr.l %%d0 \n"
"lsr.l #1,%%d2 \n" /* shift out mask bit */ "lsr.l #1,%%d2 \n" /* shift out pattern bit */
"addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */ "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */
"lsr.l #1,%%d3 \n" "lsr.l #1,%%d3 \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d4 \n" "lsr.l #1,%%d4 \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d5 \n" "lsr.l #1,%%d5 \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d6 \n" "lsr.l #1,%%d6 \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"move.l %%a0,%%d1 \n" "move.l %%a0,%%d1 \n"
"lsr.l #1,%%d1 \n" "lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a0 \n" "move.l %%d1,%%a0 \n"
"move.l %%a1,%%d1 \n" "move.l %%a1,%%d1 \n"
"lsr.l #1,%%d1 \n" "lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a1 \n" "move.l %%d1,%%a1 \n"
"move.l %[ax],%%d1 \n" "move.l %[ax],%%d1 \n"
"lsr.l #1,%%d1 \n" "lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"move.l %%d1,%[ax] \n" "move.l %%d1,%[ax] \n"
"move.b (%[addr]),%%d1 \n" /* read old value */ "move.b (%[addr]),%%d1 \n" /* read old value */
"and.l %[mask],%%d1 \n" /* mask out unneeded bits */ "and.l %[mask],%%d1 \n" /* mask out replaced bits */
"or.l %%d0,%%d1 \n" /* set new bits */ "or.l %%d0,%%d1 \n" /* set new bits */
"move.b %%d1,(%[addr]) \n" /* store value to bitplane */ "move.b %%d1,(%[addr]) \n" /* store value to bitplane */
"add.l %[psiz],%[addr] \n" /* advance to next bitplane */ "add.l %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp.l %[addr],%[end] \n" /* last bitplane done? */ "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */
"bhi.b .ur_floop \n" /* no: loop */ "bhi.b .ur_floop \n"
"bra.b .ur_end \n" "bra.b .ur_end \n"
".ur_sstart: \n" ".ur_sstart: \n"
"move.l %%a0,%[mask]\n" /* mask isn't needed here, reuse reg */ "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */
".ur_sloop: \n" /** short loop (nothing to keep) **/ ".ur_sloop: \n" /** short loop (nothing to keep) **/
"clr.l %%d0 \n" "clr.l %%d0 \n"
"lsr.l #1,%%d2 \n" /* shift out mask bit */ "lsr.l #1,%%d2 \n" /* shift out pattern bit */
"addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */ "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */
"lsr.l #1,%%d3 \n" "lsr.l #1,%%d3 \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d4 \n" "lsr.l #1,%%d4 \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d5 \n" "lsr.l #1,%%d5 \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d6 \n" "lsr.l #1,%%d6 \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"lsr.l #1,%[mask] \n" "lsr.l #1,%[mask] \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"move.l %%a1,%%d1 \n" "move.l %%a1,%%d1 \n"
"lsr.l #1,%%d1 \n" "lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a1 \n" "move.l %%d1,%%a1 \n"
"move.l %[ax],%%d1 \n" "move.l %[ax],%%d1 \n"
"lsr.l #1,%%d1 \n" "lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n" "addx.l %%d0,%%d0 \n"
"move.l %%d1,%[ax] \n" "move.l %%d1,%[ax] \n"
"move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */
"add.l %[psiz],%[addr] \n" /* advance to next bitplane */ "add.l %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp.l %[addr],%[end] \n" /* last bitplane done? */ "cmp.l %[addr],%[end] \n" /* loop through all bitplanes */
"bhi.b .ur_sloop \n" /* no: loop */ "bhi.b .ur_sloop \n"
".ur_end: \n" ".ur_end: \n"
: /* outputs */ : /* outputs */
[addr]"+a"(addr), [addr]"+a"(addr),
[mask]"+d"(mask), [mask]"+d"(mask),
@@ -1151,9 +1296,7 @@ void gray_update_rect(int x, int y, int width, int height)
(void)pat_ptr;
/* check whether anything changed in the 8-pixel block */
change = *(uint32_t *)cbuf ^ *(uint32_t *)bbuf;
-cbuf += sizeof(uint32_t);
-bbuf += sizeof(uint32_t);
-change |= *(uint32_t *)cbuf ^ *(uint32_t *)bbuf;
+change |= *(uint32_t *)(cbuf + 4) ^ *(uint32_t *)(bbuf + 4);
if (change != 0)
{
@@ -1162,9 +1305,6 @@ void gray_update_rect(int x, int y, int width, int height)
unsigned test = 1;
int i;
-cbuf = _gray_info.cur_buffer + srcofs_row;
-bbuf = _gray_info.back_buffer + srcofs_row;
/* precalculate the bit patterns with random shifts
* for all 8 pixels and put them on an extra "stack" */
for (i = 0; i < 8; i++)
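All of the precalc loops above (ARM, SH1 and coldfire) implement the same per-pixel step described in their comments: advance a small linear congruential generator (multiply by 75, add 74), use bits 8..15 of it as the shift amount, and rotate the pixel's bit pattern by that amount within the greyscale depth. Roughly, in C (a sketch only; the parameter names mirror the asm operands [bpat], [dpth] and [rmsk], and rnd stands in for _gray_random_buffer; _writearray() additionally maps the source byte through _gray_info.idxtable first):

/* Per-pixel pattern setup as done in the precalc loops. */
static unsigned long rotated_pattern(unsigned value, const unsigned long *bpat,
                                     unsigned dpth, unsigned rmsk, unsigned *rnd)
{
    unsigned long pat = bpat[value];    /* pat = bitpattern[byte]; */
    unsigned r;

    *rnd = *rnd * 75 + 74;              /* LCG step, as in the asm */

    /* the low bits are not very random: take bits 8..15, mask them to
     * the needed width, and fold once so the shift count is below depth */
    r = (*rnd >> 8) & rmsk;
    if (r >= dpth)
        r -= dpth;

    /* rotate the depth-bit pattern by r bits */
    return (pat << r) | (pat >> (dpth - r));
}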

View file

@@ -876,8 +876,140 @@ static void _writearray(unsigned char *address, const unsigned char *src,
unsigned long pat_stack[8];
unsigned long *pat_ptr = &pat_stack[8];
unsigned char *addr, *end;
-#if 0 /* CPU specific asm versions will go here */
+#ifdef CPU_ARM
const unsigned char *_src;
unsigned _mask, trash;
_mask = mask;
_src = src;
/* precalculate the bit patterns with random shifts
for all 8 pixels and put them on an extra "stack" */
asm volatile (
"mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */
"mov r3, #8 \n" /* loop count */
".wa_loop: \n" /** load pattern for pixel **/
"mov r2, #0 \n" /* pattern for skipped pixel must be 0 */
"movs %[mask], %[mask], lsl #1 \n" /* shift out msb of mask */
"bcc .wa_skip \n" /* skip this pixel */
"ldrb r0, [%[src]] \n" /* load src byte */
"ldrb r0, [%[trns], r0] \n" /* idxtable into pattern index */
"ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */
"add r0, %[rnd], %[rnd], lsl #3 \n" /* multiply by 75 */
"add %[rnd], %[rnd], %[rnd], lsl #1 \n"
"add %[rnd], %[rnd], r0, lsl #3 \n"
"add %[rnd], %[rnd], #74 \n" /* add another 74 */
/* Since the lower bits are not very random: get bits 8..15 (need max. 5) */
"and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */
"cmp r1, %[dpth] \n" /* random >= depth ? */
"subhs r1, r1, %[dpth] \n" /* yes: random -= depth */
"mov r0, r2, lsl r1 \n" /** rotate pattern **/
"sub r1, %[dpth], r1 \n"
"orr r2, r0, r2, lsr r1 \n"
".wa_skip: \n"
"str r2, [%[patp], #-4]! \n" /* push on pattern stack */
"add %[src], %[src], #1 \n" /* src++; */
"subs r3, r3, #1 \n" /* loop 8 times (pixel block) */
"bne .wa_loop \n"
: /* outputs */
[src] "+r"(_src),
[patp]"+r"(pat_ptr),
[rnd] "+r"(_gray_random_buffer),
[mask]"+r"(_mask)
: /* inputs */
[bpat]"r"(_gray_info.bitpattern),
[trns]"r"(_gray_info.idxtable),
[dpth]"r"(_gray_info.depth),
[rmsk]"r"(_gray_info.randmask)
: /* clobbers */
"r0", "r1", "r2", "r3"
);
addr = address;
end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
_mask = mask;
/* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */
asm volatile (
"ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */
"mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
"ands %[mask], %[mask], #0xff \n"
"beq .wa_sloop \n" /* short loop if nothing to keep */
".wa_floop: \n" /** full loop (there are bits to keep)**/
"movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */
"adc r0, r0, r0 \n" /* put bit into LSB of byte */
"movs r8, r8, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r7, r7, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r6, r6, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r5, r5, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r4, r4, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r3, r3, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r2, r2, lsr #1 \n"
"adc r0, r0, r0 \n"
"ldrb r1, [%[addr]] \n" /* read old value */
"and r1, r1, %[mask] \n" /* mask out replaced bits */
"orr r1, r1, r0 \n" /* set new bits */
"strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */
"cmp %[end], %[addr] \n" /* loop through all bitplanes */
"bne .wa_floop \n"
"b .wa_end \n"
".wa_sloop: \n" /** short loop (nothing to keep) **/
"movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */
"adc r0, r0, r0 \n" /* put bit into LSB of byte */
"movs r8, r8, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r7, r7, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r6, r6, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r5, r5, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r4, r4, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r3, r3, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r2, r2, lsr #1 \n"
"adc r0, r0, r0 \n"
"strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */
"cmp %[end], %[addr] \n" /* loop through all bitplanes */
"bne .wa_sloop \n"
".wa_end: \n"
: /* outputs */
[addr]"+r"(addr),
[mask]"+r"(_mask),
[rx] "=&r"(trash)
: /* inputs */
[psiz]"r"(_gray_info.plane_size),
[end] "r"(end),
[patp]"[rx]"(pat_ptr)
: /* clobbers */
"r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
);
#else /* C version, for reference*/
#warning C version of _writearray() used
unsigned test = 0x80;
int i;
@@ -1027,52 +1159,52 @@ static void _writearray(unsigned char *address, const unsigned char *src,
/* precalculate the bit patterns with random shifts /* precalculate the bit patterns with random shifts
for all 8 pixels and put them on an extra "stack" */ for all 8 pixels and put them on an extra "stack" */
asm volatile ( asm volatile (
"mov #8,r3 \n" /* loop count in r3: 8 pixels */ "mov #8,r3 \n" /* loop count */
".wa_loop: \n" /** load pattern for pixel **/ ".wa_loop: \n" /** load pattern for pixel **/
"mov #0,r0 \n" /* pattern for skipped pixel must be 0 */ "mov #0,r0 \n" /* pattern for skipped pixel must be 0 */
"shlr %[mask] \n" /* shift out lsb of mask */ "shlr %[mask] \n" /* shift out lsb of mask */
"bf .wa_skip \n" /* skip this pixel */ "bf .wa_skip \n" /* skip this pixel */
"mov.b @%[src],r0 \n" /* load src byte */ "mov.b @%[src],r0 \n" /* load src byte */
"extu.b r0,r0 \n" /* extend unsigned */ "extu.b r0,r0 \n" /* extend unsigned */
"mov.b @(r0,%[trns]),r0\n" /* idxtable into pattern index */ "mov.b @(r0,%[trns]),r0\n" /* idxtable into pattern index */
"extu.b r0,r0 \n" /* extend unsigned */ "extu.b r0,r0 \n" /* extend unsigned */
"shll2 r0 \n" "shll2 r0 \n"
"mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */ "mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */
"mov #75,r0 \n" "mov #75,r0 \n"
"mulu r0,%[rnd] \n" /* multiply by 75 */ "mulu r0,%[rnd] \n" /* multiply by 75 */
"sts macl,%[rnd] \n" "sts macl,%[rnd] \n"
"add #74,%[rnd] \n" /* add another 74 */ "add #74,%[rnd] \n" /* add another 74 */
/* Since the lower bits are not very random: */ /* Since the lower bits are not very random: */
"swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */ "swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */
"and %[rmsk],r1 \n" /* mask out unneeded bits */ "and %[rmsk],r1 \n" /* mask out unneeded bits */
"cmp/hs %[dpth],r1 \n" /* random >= depth ? */ "cmp/hs %[dpth],r1 \n" /* random >= depth ? */
"bf .wa_ntrim \n" "bf .wa_ntrim \n"
"sub %[dpth],r1 \n" /* yes: random -= depth; */ "sub %[dpth],r1 \n" /* yes: random -= depth; */
".wa_ntrim: \n" ".wa_ntrim: \n"
"mov.l .ashlsi3,r0 \n" /** rotate pattern **/ "mov.l .ashlsi3,r0 \n" /** rotate pattern **/
"jsr @r0 \n" /* r4 -> r0, shift left by r5 */ "jsr @r0 \n" /* r4 -> r0, shift left by r5 */
"mov r1,r5 \n" "mov r1,r5 \n"
"mov %[dpth],r5 \n" "mov %[dpth],r5 \n"
"sub r1,r5 \n" /* r5 = depth - r1 */ "sub r1,r5 \n" /* r5 = depth - r1 */
"mov.l .lshrsi3,r1 \n" "mov.l .lshrsi3,r1 \n"
"jsr @r1 \n" /* r4 -> r0, shift right by r5 */ "jsr @r1 \n" /* r4 -> r0, shift right by r5 */
"mov r0,r1 \n" /* store previous result in r1 */ "mov r0,r1 \n" /* store previous result in r1 */
"or r1,r0 \n" /* rotated_pattern = r0 | r1 */ "or r1,r0 \n" /* rotated_pattern = r0 | r1 */
".wa_skip: \n" ".wa_skip: \n"
"mov.l r0,@-%[patp]\n" /* push on pattern stack */ "mov.l r0,@-%[patp] \n" /* push on pattern stack */
"add %[stri],%[src] \n" /* src += stride; */ "add %[stri],%[src] \n" /* src += stride; */
"add #-1,r3 \n" /* decrease loop count */ "add #-1,r3 \n" /* loop 8 times (pixel block) */
"cmp/pl r3 \n" /* loop count > 0? */ "cmp/pl r3 \n"
"bt .wa_loop \n" /* yes: loop */ "bt .wa_loop \n"
: /* outputs */ : /* outputs */
[src] "+r"(_src), [src] "+r"(_src),
[rnd] "+r"(_gray_random_buffer), [rnd] "+r"(_gray_random_buffer),
@@ -1095,79 +1227,79 @@ static void _writearray(unsigned char *address, const unsigned char *src,
/* set the bits for all 8 pixels in all bytes according to the /* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */ * precalculated patterns on the pattern stack */
asm volatile ( asm volatile (
"mov.l @%[patp]+,r1\n" /* pop all 8 patterns */ "mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */
"mov.l @%[patp]+,r2\n" "mov.l @%[patp]+,r2 \n"
"mov.l @%[patp]+,r3\n" "mov.l @%[patp]+,r3 \n"
"mov.l @%[patp]+,r6\n" "mov.l @%[patp]+,r6 \n"
"mov.l @%[patp]+,r7\n" "mov.l @%[patp]+,r7 \n"
"mov.l @%[patp]+,r8\n" "mov.l @%[patp]+,r8 \n"
"mov.l @%[patp]+,r9\n" "mov.l @%[patp]+,r9 \n"
"mov.l @%[patp],r10\n" "mov.l @%[patp],r10 \n"
"not %[mask],%[mask] \n" /* "set" mask -> "keep" mask */ "not %[mask],%[mask] \n" /* "set" mask -> "keep" mask */
"extu.b %[mask],%[mask] \n" /* mask out high bits */ "extu.b %[mask],%[mask] \n" /* mask out high bits */
"tst %[mask],%[mask] \n" /* nothing to keep? */ "tst %[mask],%[mask] \n"
"bt .wa_sloop \n" /* yes: jump to short loop */ "bt .wa_sloop \n" /* short loop if nothing to keep */
".wa_floop: \n" /** full loop (there are bits to keep)**/ ".wa_floop: \n" /** full loop (there are bits to keep)**/
"shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
"rotcl r0 \n" /* rotate t bit into r0 */ "rotcl r0 \n" /* rotate t bit into r0 */
"shlr r2 \n" "shlr r2 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r3 \n" "shlr r3 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r6 \n" "shlr r6 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r7 \n" "shlr r7 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r8 \n" "shlr r8 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r9 \n" "shlr r9 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r10 \n" "shlr r10 \n"
"mov.b @%[addr],%[rx] \n" /* read old value */ "mov.b @%[addr],%[rx] \n" /* read old value */
"rotcl r0 \n" "rotcl r0 \n"
"and %[mask],%[rx] \n" /* mask out unneeded bits */ "and %[mask],%[rx] \n" /* mask out replaced bits */
"or %[rx],r0 \n" /* set new bits */ "or %[rx],r0 \n" /* set new bits */
"mov.b r0,@%[addr] \n" /* store value to bitplane */ "mov.b r0,@%[addr] \n" /* store value to bitplane */
"add %[psiz],%[addr] \n" /* advance to next bitplane */ "add %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp/hi %[addr],%[end] \n" /* last bitplane done? */ "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */
"bt .wa_floop \n" /* no: loop */ "bt .wa_floop \n"
"bra .wa_end \n" "bra .wa_end \n"
"nop \n" "nop \n"
/* References to C library routines used in the precalc block */ /* References to C library routines used in the precalc block */
".align 2 \n" ".align 2 \n"
".ashlsi3: \n" /* C library routine: */ ".ashlsi3: \n" /* C library routine: */
".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */ ".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */
".lshrsi3: \n" /* C library routine: */ ".lshrsi3: \n" /* C library routine: */
".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */ ".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */
/* both routines preserve r4, destroy r5 and take ~16 cycles */ /* both routines preserve r4, destroy r5 and take ~16 cycles */
".wa_sloop: \n" /** short loop (nothing to keep) **/ ".wa_sloop: \n" /** short loop (nothing to keep) **/
"shlr r1 \n" /* rotate lsb of pattern 1 to t bit */ "shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
"rotcl r0 \n" /* rotate t bit into r0 */ "rotcl r0 \n" /* rotate t bit into r0 */
"shlr r2 \n" "shlr r2 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r3 \n" "shlr r3 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r6 \n" "shlr r6 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r7 \n" "shlr r7 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r8 \n" "shlr r8 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r9 \n" "shlr r9 \n"
"rotcl r0 \n" "rotcl r0 \n"
"shlr r10 \n" "shlr r10 \n"
"rotcl r0 \n" "rotcl r0 \n"
"mov.b r0,@%[addr] \n" /* store byte to bitplane */ "mov.b r0,@%[addr] \n" /* store byte to bitplane */
"add %[psiz],%[addr] \n" /* advance to next bitplane */ "add %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp/hi %[addr],%[end] \n" /* last bitplane done? */ "cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */
"bt .wa_sloop \n" /* no: loop */ "bt .wa_sloop \n"
".wa_end: \n" ".wa_end: \n"
: /* outputs */ : /* outputs */
[addr]"+r"(addr), [addr]"+r"(addr),
[mask]"+r"(_mask), [mask]"+r"(_mask),
@@ -1189,43 +1321,43 @@ static void _writearray(unsigned char *address, const unsigned char *src,
/* precalculate the bit patterns with random shifts /* precalculate the bit patterns with random shifts
for all 8 pixels and put them on an extra "stack" */ for all 8 pixels and put them on an extra "stack" */
asm volatile ( asm volatile (
"moveq.l #8,%%d3 \n" /* loop count in d3: 8 pixels */ "moveq.l #8,%%d3 \n" /* loop count */
".wa_loop: \n" /** load pattern for pixel **/ ".wa_loop: \n" /** load pattern for pixel **/
"clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */ "clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */
"lsr.l #1,%[mask] \n" /* shift out lsb of mask */ "lsr.l #1,%[mask] \n" /* shift out lsb of mask */
"bcc.b .wa_skip \n" /* skip this pixel */ "bcc.b .wa_skip \n" /* skip this pixel */
"clr.l %%d0 \n" "clr.l %%d0 \n"
"move.b (%[src]),%%d0 \n" /* load src byte */ "move.b (%[src]),%%d0 \n" /* load src byte */
"move.b (%%d0:l:1,%[trns]),%%d0\n" /* idxtable into pattern index */ "move.b (%%d0:l:1,%[trns]),%%d0\n" /* idxtable into pattern index */
"move.l (%%d0:l:4,%[bpat]),%%d2\n" /* d2 = bitpattern[byte]; */ "move.l (%%d0:l:4,%[bpat]),%%d2\n" /* d2 = bitpattern[byte]; */
"mulu.w #75,%[rnd] \n" /* multiply by 75 */ "mulu.w #75,%[rnd] \n" /* multiply by 75 */
"add.l #74,%[rnd] \n" /* add another 74 */ "add.l #74,%[rnd] \n" /* add another 74 */
/* Since the lower bits are not very random: */ /* Since the lower bits are not very random: */
"move.l %[rnd],%%d1 \n" "move.l %[rnd],%%d1 \n"
"lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */ "lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */
"and.l %[rmsk],%%d1\n" /* mask out unneeded bits */ "and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */
"cmp.l %[dpth],%%d1\n" /* random >= depth ? */ "cmp.l %[dpth],%%d1 \n" /* random >= depth ? */
"blo.b .wa_ntrim \n" "blo.b .wa_ntrim \n"
"sub.l %[dpth],%%d1\n" /* yes: random -= depth; */ "sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */
".wa_ntrim: \n" ".wa_ntrim: \n"
"move.l %%d2,%%d0 \n" "move.l %%d2,%%d0 \n" /** rotate pattern **/
"lsl.l %%d1,%%d0 \n" "lsl.l %%d1,%%d0 \n"
"sub.l %[dpth],%%d1\n" "sub.l %[dpth],%%d1 \n"
"neg.l %%d1 \n" /* d1 = depth - d1 */ "neg.l %%d1 \n" /* d1 = depth - d1 */
"lsr.l %%d1,%%d2 \n" "lsr.l %%d1,%%d2 \n"
"or.l %%d0,%%d2 \n" "or.l %%d0,%%d2 \n"
".wa_skip: \n" ".wa_skip: \n"
"move.l %%d2,-(%[patp]) \n" /* push on pattern stack */ "move.l %%d2,-(%[patp]) \n" /* push on pattern stack */
"add.l %[stri],%[src] \n" /* src += stride; */ "add.l %[stri],%[src] \n" /* src += stride; */
"subq.l #1,%%d3 \n" /* decrease loop count */ "subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */
"bne.b .wa_loop \n" /* yes: loop */ "bne.b .wa_loop \n"
: /* outputs */ : /* outputs */
[src] "+a"(_src), [src] "+a"(_src),
[patp]"+a"(pat_ptr), [patp]"+a"(pat_ptr),
@@ -1250,78 +1382,76 @@ static void _writearray(unsigned char *address, const unsigned char *src,
asm volatile ( asm volatile (
"movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n" "movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n"
/* pop all 8 patterns */ /* pop all 8 patterns */
"not.l %[mask] \n" /* "set" mask -> "keep" mask */ "not.l %[mask] \n" /* "set" mask -> "keep" mask */
"and.l #0xFF,%[mask] \n" "and.l #0xFF,%[mask] \n"
"beq.b .wa_sstart \n" /* yes: jump to short loop */ "beq.b .wa_sstart \n" /* short loop if nothing to keep */
".wa_floop: \n" /** full loop (there are bits to keep)**/ ".wa_floop: \n" /** full loop (there are bits to keep)**/
"clr.l %%d0 \n" "lsr.l #1,%%d2 \n" /* shift out pattern bit */
"lsr.l #1,%%d2 \n" /* shift out mask bit */ "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */
"addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */ "lsr.l #1,%%d3 \n"
"lsr.l #1,%%d3 \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "lsr.l #1,%%d4 \n"
"lsr.l #1,%%d4 \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "lsr.l #1,%%d5 \n"
"lsr.l #1,%%d5 \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "lsr.l #1,%%d6 \n"
"lsr.l #1,%%d6 \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "move.l %%a0,%%d1 \n"
"move.l %%a0,%%d1 \n" "lsr.l #1,%%d1 \n"
"lsr.l #1,%%d1 \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "move.l %%d1,%%a0 \n"
"move.l %%d1,%%a0 \n" "move.l %%a1,%%d1 \n"
"move.l %%a1,%%d1 \n" "lsr.l #1,%%d1 \n"
"lsr.l #1,%%d1 \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "move.l %%d1,%%a1 \n"
"move.l %%d1,%%a1 \n" "move.l %[ax],%%d1 \n"
"move.l %[ax],%%d1 \n" "lsr.l #1,%%d1 \n"
"lsr.l #1,%%d1 \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "move.l %%d1,%[ax] \n"
"move.l %%d1,%[ax] \n"
"move.b (%[addr]),%%d1 \n" /* read old value */ "move.b (%[addr]),%%d1 \n" /* read old value */
"and.l %[mask],%%d1 \n" /* mask out unneeded bits */ "and.l %[mask],%%d1 \n" /* mask out replaced bits */
"or.l %%d0,%%d1 \n" /* set new bits */ "or.l %%d0,%%d1 \n" /* set new bits */
"move.b %%d1,(%[addr]) \n" /* store value to bitplane */ "move.b %%d1,(%[addr]) \n" /* store value to bitplane */
"add.l %[psiz],%[addr] \n" /* advance to next bitplane */ "add.l %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp.l %[addr],%[end] \n" /* last bitplane done? */ "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */
"bhi.b .wa_floop \n" /* no: loop */ "bhi.b .wa_floop \n"
"bra.b .wa_end \n" "bra.b .wa_end \n"
".wa_sstart: \n" ".wa_sstart: \n"
"move.l %%a0,%[mask]\n" /* mask isn't needed here, reuse reg */ "move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */
".wa_sloop: \n" /** short loop (nothing to keep) **/ ".wa_sloop: \n" /** short loop (nothing to keep) **/
"clr.l %%d0 \n" "lsr.l #1,%%d2 \n" /* shift out pattern bit */
"lsr.l #1,%%d2 \n" /* shift out mask bit */ "addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */
"addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */ "lsr.l #1,%%d3 \n"
"lsr.l #1,%%d3 \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "lsr.l #1,%%d4 \n"
"lsr.l #1,%%d4 \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "lsr.l #1,%%d5 \n"
"lsr.l #1,%%d5 \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "lsr.l #1,%%d6 \n"
"lsr.l #1,%%d6 \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "lsr.l #1,%[mask] \n"
"lsr.l #1,%[mask] \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "move.l %%a1,%%d1 \n"
"move.l %%a1,%%d1 \n" "lsr.l #1,%%d1 \n"
"lsr.l #1,%%d1 \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "move.l %%d1,%%a1 \n"
"move.l %%d1,%%a1 \n" "move.l %[ax],%%d1 \n"
"move.l %[ax],%%d1 \n" "lsr.l #1,%%d1 \n"
"lsr.l #1,%%d1 \n" "addx.l %%d0,%%d0 \n"
"addx.l %%d0,%%d0 \n" "move.l %%d1,%[ax] \n"
"move.l %%d1,%[ax] \n"
"move.b %%d0,(%[addr]) \n" /* store byte to bitplane */ "move.b %%d0,(%[addr]) \n" /* store byte to bitplane */
"add.l %[psiz],%[addr] \n" /* advance to next bitplane */ "add.l %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp.l %[addr],%[end] \n" /* last bitplane done? */ "cmp.l %[addr],%[end] \n" /* loop for all bitplanes */
"bhi.b .wa_sloop \n" /* no: loop */ "bhi.b .wa_sloop \n"
".wa_end: \n" ".wa_end: \n"
: /* outputs */ : /* outputs */
[addr]"+a"(addr), [addr]"+a"(addr),
[mask]"+d"(_mask), [mask]"+d"(_mask),

View file

@@ -283,32 +283,32 @@ void gray_ub_scroll_left(int count)
if (count) if (count)
{ {
asm ( asm (
"mov r4, %[high] \n" "mov r4, %[high] \n" /* rows = height */
".sl_rloop: \n" ".sl_rloop: \n" /* repeat for every row */
"mov r5, %[addr] \n" "mov r5, %[addr] \n" /* get start address */
"mov r2, %[dpth] \n" "mov r2, %[dpth] \n" /* planes = depth */
".sl_oloop: \n" ".sl_oloop: \n" /* repeat for every bitplane */
"mov r6, r5 \n" "mov r6, r5 \n" /* get start address */
"mov r3, %[cols] \n" "mov r3, %[cols] \n" /* cols = col_count */
"mov r1, #0 \n" "mov r1, #0 \n" /* fill with zero */
".sl_iloop: \n" ".sl_iloop: \n" /* repeat for all cols */
"mov r1, r1, lsr #8 \n" "mov r1, r1, lsr #8 \n" /* shift right to get residue */
"ldrb r0, [r6, #-1]! \n" "ldrb r0, [r6, #-1]! \n" /* decrement addr & get data byte */
"orr r1, r1, r0, lsl %[cnt] \n" "orr r1, r1, r0, lsl %[cnt] \n" /* combine with last residue */
"strb r1, [r6] \n" "strb r1, [r6] \n" /* store data */
"subs r3, r3, #1 \n" "subs r3, r3, #1 \n" /* cols-- */
"bne .sl_iloop \n" "bne .sl_iloop \n"
"add r5, r5, %[psiz] \n" "add r5, r5, %[psiz] \n" /* start_address += plane_size */
"subs r2, r2, #1 \n" "subs r2, r2, #1 \n" /* planes-- */
"bne .sl_oloop \n" "bne .sl_oloop \n"
"add %[addr],%[addr],%[bwid] \n" "add %[addr],%[addr],%[bwid] \n" /* start_address += bwidth */
"subs r4, r4, #1 \n" "subs r4, r4, #1 \n" /* rows-- */
"bne .sl_rloop \n" "bne .sl_rloop \n"
: /* outputs */ : /* outputs */
: /* inputs */ : /* inputs */
@@ -364,32 +364,32 @@ void gray_ub_scroll_right(int count)
if (count) if (count)
{ {
asm ( asm (
"mov r4, %[high] \n" "mov r4, %[high] \n" /* rows = height */
".sr_rloop: \n" ".sr_rloop: \n" /* repeat for every row */
"mov r5, %[addr] \n" "mov r5, %[addr] \n" /* get start address */
"mov r2, %[dpth] \n" "mov r2, %[dpth] \n" /* planes = depth */
".sr_oloop: \n" ".sr_oloop: \n" /* repeat for every bitplane */
"mov r6, r5 \n" "mov r6, r5 \n" /* get start address */
"mov r3, %[cols] \n" "mov r3, %[cols] \n" /* cols = col_count */
"mov r1, #0 \n" "mov r1, #0 \n" /* fill with zero */
".sr_iloop: \n" ".sr_iloop: \n" /* repeat for all cols */
"ldrb r0, [r6] \n" "ldrb r0, [r6] \n" /* get data byte */
"orr r1, r0, r1, lsl #8 \n" "orr r1, r0, r1, lsl #8 \n" /* combine w/ old data shifted to 2nd byte */
"mov r0, r1, lsr %[cnt] \n" "mov r0, r1, lsr %[cnt] \n" /* shift right */
"strb r0, [r6], #1 \n" "strb r0, [r6], #1 \n" /* store data, increment addr */
"subs r3, r3, #1 \n" "subs r3, r3, #1 \n" /* cols-- */
"bne .sr_iloop \n" "bne .sr_iloop \n"
"add r5, r5, %[psiz] \n" "add r5, r5, %[psiz] \n" /* start_address += plane_size */
"subs r2, r2, #1 \n" "subs r2, r2, #1 \n" /* planes-- */
"bne .sr_oloop \n" "bne .sr_oloop \n"
"add %[addr],%[addr],%[bwid] \n" "add %[addr],%[addr],%[bwid] \n" /* start_address += bwidth */
"subs r4, r4, #1 \n" "subs r4, r4, #1 \n" /* rows-- */
"bne .sr_rloop \n" "bne .sr_rloop \n"
: /* outputs */ : /* outputs */
: /* inputs */ : /* inputs */
@@ -714,8 +714,7 @@ void gray_ub_scroll_up(int count)
"move.b (%%a1),%%d0 \n" /* get data byte */
"lsl.l #8,%%d1 \n" /* old data to 2nd byte */
"or.l %%d1,%%d0 \n" /* combine old data */
-"clr.l %%d1 \n"
-"move.b %%d0,%%d1 \n" /* keep data for next round */
+"move.l %%d0,%%d1 \n" /* keep data for next round */
"lsr.l %[cnt],%%d0 \n" /* shift right */
"move.b %%d0,(%%a1) \n" /* store data */