
* Assembler optimised gray_update_rect() and writearray() for arm (greyscale iPods).
* Some slight optimisations for coldfire (H1x0) and SH1 (archos).
* Comment and formatting cleanup.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10473 a1c6a512-1295-4272-9138-f99709370657
Jens Arnold 2006-08-07 17:21:38 +00:00
parent 8921b34e4b
commit c00d799fa3
3 changed files with 675 additions and 406 deletions
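For orientation, here is a hedged C sketch (illustrative, not code from this commit) of what the new per-CPU asm does for each 8-pixel block in gray_update_rect(): compare the current and the back buffer, build one randomly rotated bit pattern per changed pixel, then scatter one bit of every pattern into each bitplane. Names such as update_block(), rotated_pattern() and write_planes() are made up for this sketch; the two helpers are sketched further down, next to the asm blocks that implement them.

#include <stdint.h>

/* illustrative helpers, sketched after the respective asm blocks below */
unsigned long rotated_pattern(unsigned long pat, unsigned *rnd,
                              unsigned randmask, unsigned depth);
void write_planes(unsigned char *addr, unsigned char *end, long plane_size,
                  unsigned long pat[8], unsigned keep);

void update_block(unsigned char *cbuf, unsigned char *bbuf,
                  unsigned char *plane, long plane_size, unsigned depth,
                  const unsigned long *bitpattern,
                  unsigned *rnd, unsigned randmask)
{
    unsigned long pat[8];
    unsigned keep = 0xff;          /* bits belonging to unchanged pixels */
    uint32_t change;
    int i;

    /* quick test: did anything in the 8-pixel block change at all?
     * (the real buffers are word aligned, so the 32 bit loads are safe) */
    change  = *(uint32_t *)cbuf ^ *(uint32_t *)bbuf;
    change |= *(uint32_t *)(cbuf + 4) ^ *(uint32_t *)(bbuf + 4);
    if (change == 0)
        return;

    for (i = 0; i < 8; i++)        /* one pattern per pixel */
    {
        if (cbuf[i] != bbuf[i])
        {
            pat[i] = rotated_pattern(bitpattern[cbuf[i]],
                                     rnd, randmask, depth);
            bbuf[i] = cbuf[i];     /* update the back buffer */
            keep &= ~(0x80u >> i); /* this pixel's bits get replaced */
        }
        else
            pat[i] = 0;            /* unchanged pixel keeps its old bits */
    }

    write_planes(plane, plane + depth * plane_size, plane_size, pat, keep);
}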

View file

@@ -648,14 +648,165 @@ void gray_update_rect(int x, int y, int width, int height)
cbuf = _gray_info.cur_buffer + srcofs_row;
bbuf = _gray_info.back_buffer + srcofs_row;
#if 0 /* CPU specific asm versions will go here */
#ifdef CPU_ARM
asm volatile (
"ldr r0, [%[cbuf]] \n"
"ldr r1, [%[bbuf]] \n"
"eor r1, r0, r1 \n"
"ldr r0, [%[cbuf], #4] \n"
"ldr %[chg], [%[bbuf], #4] \n"
"eor %[chg], r0, %[chg] \n"
"orr %[chg], %[chg], r1 \n"
: /* outputs */
[chg] "=&r"(change)
: /* inputs */
[cbuf]"r"(cbuf),
[bbuf]"r"(bbuf)
: /* clobbers */
"r0", "r1"
);
if (change != 0)
{
unsigned char *addr, *end;
unsigned mask, trash;
pat_ptr = &pat_stack[8];
/* precalculate the bit patterns with random shifts
* for all 8 pixels and put them on an extra "stack" */
asm volatile (
"mov r3, #8 \n" /* loop count */
"mov %[mask], #0 \n"
".ur_pre_loop: \n"
"mov %[mask], %[mask], lsl #1 \n" /* shift mask */
"ldrb r0, [%[cbuf]], #1 \n" /* read current buffer */
"ldrb r1, [%[bbuf]] \n" /* read back buffer */
"strb r0, [%[bbuf]], #1 \n" /* update back buffer */
"mov r2, #0 \n" /* preset for skipped pixel */
"cmp r0, r1 \n" /* no change? */
"beq .ur_skip \n" /* -> skip */
"ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */
"add r0, %[rnd], %[rnd], lsl #3 \n" /* multiply by 75 */
"add %[rnd], %[rnd], %[rnd], lsl #1 \n"
"add %[rnd], %[rnd], r0, lsl #3 \n"
"add %[rnd], %[rnd], #74 \n" /* add another 74 */
/* Since the lower bits are not very random: get bits 8..15 (need max. 5) */
"and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */
"cmp r1, %[dpth] \n" /* random >= depth ? */
"subhs r1, r1, %[dpth] \n" /* yes: random -= depth */
"mov r0, r2, lsl r1 \n" /** rotate pattern **/
"sub r1, %[dpth], r1 \n"
"orr r2, r0, r2, lsr r1 \n"
"orr %[mask], %[mask], #1 \n" /* set mask bit */
".ur_skip: \n"
"str r2, [%[patp], #-4]! \n" /* push on pattern stack */
"subs r3, r3, #1 \n" /* loop 8 times (pixel block) */
"bne .ur_pre_loop \n"
: /* outputs */
[cbuf]"+r"(cbuf),
[bbuf]"+r"(bbuf),
[patp]"+r"(pat_ptr),
[rnd] "+r"(_gray_random_buffer),
[mask]"=&r"(mask)
: /* inputs */
[bpat]"r"(_gray_info.bitpattern),
[dpth]"r"(_gray_info.depth),
[rmsk]"r"(_gray_info.randmask)
: /* clobbers */
"r0", "r1", "r2", "r3"
);
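/* Illustrative C equivalent (not part of this commit) of the per-pixel work
 * in the precalc loop above: step the simple x*75+74 pseudo-random
 * generator, take bits 8..15 because the low bits of such a generator are
 * poorly distributed, reduce the value below 'depth' with one conditional
 * subtract (randmask is assumed to be the smallest 2^n - 1 covering depth,
 * so a single subtract is enough), and rotate the pixel's bit pattern by
 * that amount.  Assumes depth < 32 so the shifts stay in range. */
unsigned long rotated_pattern(unsigned long pat, unsigned *rnd,
                              unsigned randmask, unsigned depth)
{
    unsigned shift;

    *rnd = *rnd * 75 + 74;          /* LCG step, as in the asm above */
    shift = (*rnd >> 8) & randmask; /* use bits 8..15, mask to range */
    if (shift >= depth)             /* fold into 0 .. depth-1 */
        shift -= depth;

    /* rotate left by 'shift' within the pattern's 'depth' significant bits */
    return (pat << shift) | (pat >> (depth - shift));
}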
addr = dst_row;
end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
/* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */
asm volatile (
"ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */
"mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
"ands %[mask], %[mask], #0xff \n"
"beq .ur_sloop \n" /* short loop if nothing to keep */
".ur_floop: \n" /** full loop (there are bits to keep)**/
"movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */
"adc r0, r0, r0 \n" /* put bit into LSB for byte */
"movs r8, r8, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r7, r7, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r6, r6, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r5, r5, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r4, r4, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r3, r3, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r2, r2, lsr #1 \n"
"adc r0, r0, r0 \n"
"ldrb r1, [%[addr]] \n" /* read old value */
"and r1, r1, %[mask] \n" /* mask out replaced bits */
"orr r1, r1, r0 \n" /* set new bits */
"strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */
"cmp %[end], %[addr] \n" /* loop for all bitplanes */
"bne .ur_floop \n"
"b .ur_end \n"
".ur_sloop: \n" /** short loop (nothing to keep) **/
"movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */
"adc r0, r0, r0 \n" /* put bit into LSB for byte */
"movs r8, r8, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r7, r7, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r6, r6, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r5, r5, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r4, r4, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r3, r3, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r2, r2, lsr #1 \n"
"adc r0, r0, r0 \n"
"strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */
"cmp %[end], %[addr] \n" /* loop for all bitplanes */
"bne .ur_sloop \n"
".ur_end: \n"
: /* outputs */
[addr]"+r"(addr),
[mask]"+r"(mask),
[rx] "=&r"(trash)
: /* inputs */
[psiz]"r"(_gray_info.plane_size),
[end] "r"(end),
[patp]"[rx]"(pat_ptr)
: /* clobbers */
"r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
);
}
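/* Illustrative C version (not part of this commit) of the plane-writing asm
 * above.  Per bitplane, the next bit of each of the 8 pattern words is
 * gathered into one byte; 'keep' is the inverted change mask.  When keep is
 * zero every pixel in the block changed, so the read-modify-write can be
 * dropped -- which is exactly why the asm has a separate .ur_sloop ("short
 * loop") next to the full .ur_floop. */
void write_planes(unsigned char *addr, unsigned char *end, long plane_size,
                  unsigned long pat[8], unsigned keep)
{
    do
    {
        unsigned data = 0;
        int i;

        for (i = 0; i < 8; i++)        /* gather one bit per pixel */
        {
            data = (data << 1) | (pat[i] & 1);
            pat[i] >>= 1;
        }

        if (keep)                      /* some pixels keep their old bits */
            *addr = (*addr & keep) | data;
        else                           /* whole byte is replaced */
            *addr = (unsigned char)data;

        addr += plane_size;            /* advance to the next bitplane */
    }
    while (addr < end);
}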
#else /* C version, for reference*/
#warning C version of gray_update_rect() used
(void)pat_ptr;
/* check whether anything changed in the 8-pixel block */
change = *(uint32_t *)cbuf ^ *(uint32_t *)bbuf;
cbuf += sizeof(uint32_t);
bbuf += sizeof(uint32_t);
change |= *(uint32_t *)cbuf ^ *(uint32_t *)bbuf;
change |= *(uint32_t *)(cbuf + 4) ^ *(uint32_t *)(bbuf + 4);
if (change != 0)
{
@@ -664,9 +815,6 @@ void gray_update_rect(int x, int y, int width, int height)
unsigned test = 1;
int i;
cbuf = _gray_info.cur_buffer + srcofs_row;
bbuf = _gray_info.back_buffer + srcofs_row;
/* precalculate the bit patterns with random shifts
* for all 8 pixels and put them on an extra "stack" */
for (i = 7; i >= 0; i--)
@@ -711,7 +859,7 @@ void gray_update_rect(int x, int y, int width, int height)
for (i = 7; i >= 0; i--)
data = (data << 1) | ((pat_stack[i] & test) ? 1 : 0);
*addr = data;
addr += _gray_info.plane_size;
test <<= 1;
@@ -788,18 +936,18 @@ void gray_update_rect(int x, int y, int width, int height)
#if CONFIG_CPU == SH7034
asm volatile (
"mov.l @%[cbuf]+,r1 \n"
"mov.l @%[bbuf]+,r2 \n"
"xor r1,r2 \n"
"mov.l @%[cbuf],r1 \n"
"mov.l @%[bbuf],%[chg] \n"
"xor r1,%[chg] \n"
"or r2,%[chg] \n"
"mov.l @%[cbuf],r1 \n"
"mov.l @%[bbuf],r2 \n"
"xor r1,r2 \n"
"mov.l @(4,%[cbuf]),r1 \n"
"mov.l @(4,%[bbuf]),%[chg] \n"
"xor r1,%[chg] \n"
"or r2,%[chg] \n"
: /* outputs */
[cbuf]"+r"(cbuf),
[bbuf]"+r"(bbuf),
[chg] "=r"(change)
: /* inputs */
[cbuf]"r"(cbuf),
[bbuf]"r"(bbuf)
: /* clobbers */
"r1", "r2"
);
@@ -810,13 +958,11 @@ void gray_update_rect(int x, int y, int width, int height)
unsigned mask, trash;
pat_ptr = &pat_stack[8];
cbuf = _gray_info.cur_buffer + srcofs_row;
bbuf = _gray_info.back_buffer + srcofs_row;
/* precalculate the bit patterns with random shifts
* for all 8 pixels and put them on an extra "stack" */
asm volatile (
"mov #8,r3 \n" /* loop count in r3: 8 pixels */
"mov #8,r3 \n" /* loop count */
".ur_pre_loop: \n"
"mov.b @%[cbuf]+,r0\n" /* read current buffer */
@@ -860,10 +1006,11 @@ void gray_update_rect(int x, int y, int width, int height)
"rotcr %[mask] \n" /* get mask bit */
"mov.l r2,@-%[patp]\n" /* push on pattern stack */
"add #-1,r3 \n" /* decrease loop count */
"cmp/pl r3 \n" /* loop count > 0? */
"bt .ur_pre_loop\n" /* yes: loop */
"shlr8 %[mask] \n"
"add #-1,r3 \n" /* loop 8 times (pixel block) */
"cmp/pl r3 \n"
"bt .ur_pre_loop\n"
"shlr8 %[mask] \n" /* shift mask to low byte */
"shlr16 %[mask] \n"
: /* outputs */
[cbuf]"+r"(cbuf),
@@ -885,77 +1032,77 @@ void gray_update_rect(int x, int y, int width, int height)
/* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */
asm volatile (
"mov.l @%[patp]+,r1\n" /* pop all 8 patterns */
"mov.l @%[patp]+,r2\n"
"mov.l @%[patp]+,r3\n"
"mov.l @%[patp]+,r6\n"
"mov.l @%[patp]+,r7\n"
"mov.l @%[patp]+,r8\n"
"mov.l @%[patp]+,r9\n"
"mov.l @%[patp],r10\n"
"mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */
"mov.l @%[patp]+,r2 \n"
"mov.l @%[patp]+,r3 \n"
"mov.l @%[patp]+,r6 \n"
"mov.l @%[patp]+,r7 \n"
"mov.l @%[patp]+,r8 \n"
"mov.l @%[patp]+,r9 \n"
"mov.l @%[patp],r10 \n"
"tst %[mask],%[mask] \n" /* nothing to keep? */
"bt .ur_sloop \n" /* yes: jump to short loop */
"tst %[mask],%[mask] \n"
"bt .ur_sloop \n" /* short loop if nothing to keep */
".ur_floop: \n" /** full loop (there are bits to keep)**/
"shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
"rotcl r0 \n" /* rotate t bit into r0 */
"shlr r2 \n"
"rotcl r0 \n"
"shlr r3 \n"
"rotcl r0 \n"
"shlr r6 \n"
"rotcl r0 \n"
"shlr r7 \n"
"rotcl r0 \n"
"shlr r8 \n"
"rotcl r0 \n"
"shlr r9 \n"
"rotcl r0 \n"
"shlr r10 \n"
".ur_floop: \n" /** full loop (there are bits to keep)**/
"shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
"rotcl r0 \n" /* rotate t bit into r0 */
"shlr r2 \n"
"rotcl r0 \n"
"shlr r3 \n"
"rotcl r0 \n"
"shlr r6 \n"
"rotcl r0 \n"
"shlr r7 \n"
"rotcl r0 \n"
"shlr r8 \n"
"rotcl r0 \n"
"shlr r9 \n"
"rotcl r0 \n"
"shlr r10 \n"
"mov.b @%[addr],%[rx] \n" /* read old value */
"rotcl r0 \n"
"and %[mask],%[rx] \n" /* mask out unneeded bits */
"or %[rx],r0 \n" /* set new bits */
"mov.b r0,@%[addr] \n" /* store value to bitplane */
"rotcl r0 \n"
"and %[mask],%[rx] \n" /* mask out replaced bits */
"or %[rx],r0 \n" /* set new bits */
"mov.b r0,@%[addr] \n" /* store value to bitplane */
"add %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp/hi %[addr],%[end] \n" /* last bitplane done? */
"bt .ur_floop \n" /* no: loop */
"cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */
"bt .ur_floop \n"
"bra .ur_end \n"
"nop \n"
"bra .ur_end \n"
"nop \n"
/* References to C library routines used in the precalc block */
".align 2 \n"
".ashlsi3: \n" /* C library routine: */
".long ___ashlsi3 \n" /* shift r4 left by r5, res. in r0 */
".lshrsi3: \n" /* C library routine: */
".long ___lshrsi3 \n" /* shift r4 right by r5, res. in r0 */
".align 2 \n"
".ashlsi3: \n" /* C library routine: */
".long ___ashlsi3 \n" /* shift r4 left by r5, res. in r0 */
".lshrsi3: \n" /* C library routine: */
".long ___lshrsi3 \n" /* shift r4 right by r5, res. in r0 */
/* both routines preserve r4, destroy r5 and take ~16 cycles */
".ur_sloop: \n" /** short loop (nothing to keep) **/
"shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
"rotcl r0 \n" /* rotate t bit into r0 */
"shlr r2 \n"
"rotcl r0 \n"
"shlr r3 \n"
"rotcl r0 \n"
"shlr r6 \n"
"rotcl r0 \n"
"shlr r7 \n"
"rotcl r0 \n"
"shlr r8 \n"
"rotcl r0 \n"
"shlr r9 \n"
"rotcl r0 \n"
"shlr r10 \n"
"rotcl r0 \n"
".ur_sloop: \n" /** short loop (nothing to keep) **/
"shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
"rotcl r0 \n" /* rotate t bit into r0 */
"shlr r2 \n"
"rotcl r0 \n"
"shlr r3 \n"
"rotcl r0 \n"
"shlr r6 \n"
"rotcl r0 \n"
"shlr r7 \n"
"rotcl r0 \n"
"shlr r8 \n"
"rotcl r0 \n"
"shlr r9 \n"
"rotcl r0 \n"
"shlr r10 \n"
"rotcl r0 \n"
"mov.b r0,@%[addr] \n" /* store byte to bitplane */
"add %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp/hi %[addr],%[end] \n" /* last bitplane done? */
"bt .ur_sloop \n" /* no: loop */
"cmp/hi %[addr],%[end] \n" /* loop through all bitplanes */
"bt .ur_sloop \n"
".ur_end: \n"
".ur_end: \n"
: /* outputs */
[addr]"+r"(addr),
[mask]"+r"(mask),
@@ -970,18 +1117,18 @@ void gray_update_rect(int x, int y, int width, int height)
}
#elif defined(CPU_COLDFIRE)
asm volatile (
"move.l (%[cbuf])+,%%d0 \n"
"move.l (%[bbuf])+,%%d1 \n"
"eor.l %%d0,%%d1 \n"
"move.l (%[cbuf]),%%d0 \n"
"move.l (%[bbuf]),%[chg]\n"
"eor.l %%d0,%[chg] \n"
"or.l %%d1,%[chg] \n"
"move.l (%[cbuf]),%%d0 \n"
"move.l (%[bbuf]),%%d1 \n"
"eor.l %%d0,%%d1 \n"
"move.l (4,%[cbuf]),%%d0 \n"
"move.l (4,%[bbuf]),%[chg] \n"
"eor.l %%d0,%[chg] \n"
"or.l %%d1,%[chg] \n"
: /* outputs */
[cbuf]"+a"(cbuf),
[bbuf]"+a"(bbuf),
[chg] "=&d"(change)
: /* inputs */
[cbuf]"a"(cbuf),
[bbuf]"a"(bbuf)
: /* clobbers */
"d0", "d1"
);
@@ -992,54 +1139,52 @@ void gray_update_rect(int x, int y, int width, int height)
unsigned mask, trash;
pat_ptr = &pat_stack[8];
cbuf = _gray_info.cur_buffer + srcofs_row;
bbuf = _gray_info.back_buffer + srcofs_row;
/* precalculate the bit patterns with random shifts
* for all 8 pixels and put them on an extra "stack" */
asm volatile (
"moveq.l #8,%%d3 \n" /* loop count in d3: 8 pixels */
"clr.l %[mask] \n"
"moveq.l #8,%%d3 \n" /* loop count */
"clr.l %[mask] \n"
".ur_pre_loop: \n"
"clr.l %%d0 \n"
"move.b (%[cbuf])+,%%d0 \n" /* read current buffer */
"clr.l %%d1 \n"
"move.b (%[bbuf]),%%d1 \n" /* read back buffer */
"move.b %%d0,(%[bbuf])+ \n" /* update back buffer */
"clr.l %%d2 \n" /* preset for skipped pixel */
"cmp.l %%d0,%%d1 \n" /* no change? */
"beq.b .ur_skip \n" /* -> skip */
".ur_pre_loop: \n"
"clr.l %%d0 \n"
"move.b (%[cbuf])+,%%d0 \n" /* read current buffer */
"clr.l %%d1 \n"
"move.b (%[bbuf]),%%d1 \n" /* read back buffer */
"move.b %%d0,(%[bbuf])+ \n" /* update back buffer */
"clr.l %%d2 \n" /* preset for skipped pixel */
"cmp.l %%d0,%%d1 \n" /* no change? */
"beq.b .ur_skip \n" /* -> skip */
"move.l (%%d0:l:4,%[bpat]),%%d2 \n" /* d2 = bitpattern[byte]; */
"move.l (%%d0:l:4,%[bpat]),%%d2 \n" /* d2 = bitpattern[byte]; */
"mulu.w #75,%[rnd] \n" /* multiply by 75 */
"add.l #74,%[rnd] \n" /* add another 74 */
"mulu.w #75,%[rnd] \n" /* multiply by 75 */
"add.l #74,%[rnd] \n" /* add another 74 */
/* Since the lower bits are not very random: */
"move.l %[rnd],%%d1 \n"
"lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */
"and.l %[rmsk],%%d1\n" /* mask out unneeded bits */
"move.l %[rnd],%%d1 \n"
"lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */
"and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */
"cmp.l %[dpth],%%d1\n" /* random >= depth ? */
"blo.b .ur_ntrim \n"
"sub.l %[dpth],%%d1\n" /* yes: random -= depth; */
".ur_ntrim: \n"
"cmp.l %[dpth],%%d1 \n" /* random >= depth ? */
"blo.b .ur_ntrim \n"
"sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */
".ur_ntrim: \n"
"move.l %%d2,%%d0 \n"
"lsl.l %%d1,%%d0 \n"
"sub.l %[dpth],%%d1\n"
"neg.l %%d1 \n" /* d1 = depth - d1 */
"lsr.l %%d1,%%d2 \n"
"or.l %%d0,%%d2 \n" /* rotated_pattern = d2 | d0 */
"move.l %%d2,%%d0 \n" /** rotate pattern **/
"lsl.l %%d1,%%d0 \n"
"sub.l %[dpth],%%d1 \n"
"neg.l %%d1 \n" /* d1 = depth - d1 */
"lsr.l %%d1,%%d2 \n"
"or.l %%d0,%%d2 \n" /* rotated_pattern = d2 | d0 */
"or.l #0x0100,%[mask] \n" /* set mask bit */
".ur_skip: \n"
"lsr.l #1,%[mask] \n" /* shift mask */
".ur_skip: \n"
"lsr.l #1,%[mask] \n" /* shift mask */
"move.l %%d2,-(%[patp]) \n" /* push on pattern stack */
"subq.l #1,%%d3 \n" /* decrease loop count */
"bne.b .ur_pre_loop\n" /* yes: loop */
"subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */
"bne.b .ur_pre_loop \n"
: /* outputs */
[cbuf]"+a"(cbuf),
[bbuf]"+a"(bbuf),
@@ -1061,79 +1206,79 @@ void gray_update_rect(int x, int y, int width, int height)
* precalculated patterns on the pattern stack */
asm volatile (
"movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n"
/* pop all 8 patterns */
"not.l %[mask] \n" /* set mask -> keep mask */
/* pop all 8 patterns */
"not.l %[mask] \n" /* "set" mask -> "keep" mask */
"and.l #0xFF,%[mask] \n"
"beq.b .ur_sstart \n" /* yes: jump to short loop */
"beq.b .ur_sstart \n" /* short loop if nothing to keep */
".ur_floop: \n" /** full loop (there are bits to keep)**/
"clr.l %%d0 \n"
"lsr.l #1,%%d2 \n" /* shift out mask bit */
"addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */
"lsr.l #1,%%d3 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d4 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d5 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d6 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%a0,%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a0 \n"
"move.l %%a1,%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a1 \n"
"move.l %[ax],%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%[ax] \n"
".ur_floop: \n" /** full loop (there are bits to keep)**/
"clr.l %%d0 \n"
"lsr.l #1,%%d2 \n" /* shift out pattern bit */
"addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */
"lsr.l #1,%%d3 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d4 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d5 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d6 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%a0,%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a0 \n"
"move.l %%a1,%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a1 \n"
"move.l %[ax],%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%[ax] \n"
"move.b (%[addr]),%%d1 \n" /* read old value */
"and.l %[mask],%%d1 \n" /* mask out unneeded bits */
"and.l %[mask],%%d1 \n" /* mask out replaced bits */
"or.l %%d0,%%d1 \n" /* set new bits */
"move.b %%d1,(%[addr]) \n" /* store value to bitplane */
"add.l %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp.l %[addr],%[end] \n" /* last bitplane done? */
"bhi.b .ur_floop \n" /* no: loop */
"cmp.l %[addr],%[end] \n" /* loop through all bitplanes */
"bhi.b .ur_floop \n"
"bra.b .ur_end \n"
"bra.b .ur_end \n"
".ur_sstart: \n"
"move.l %%a0,%[mask]\n" /* mask isn't needed here, reuse reg */
".ur_sstart: \n"
"move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */
".ur_sloop: \n" /** short loop (nothing to keep) **/
"clr.l %%d0 \n"
"lsr.l #1,%%d2 \n" /* shift out mask bit */
"addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */
"lsr.l #1,%%d3 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d4 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d5 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d6 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%[mask] \n"
"addx.l %%d0,%%d0 \n"
"move.l %%a1,%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a1 \n"
"move.l %[ax],%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%[ax] \n"
".ur_sloop: \n" /** short loop (nothing to keep) **/
"clr.l %%d0 \n"
"lsr.l #1,%%d2 \n" /* shift out pattern bit */
"addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */
"lsr.l #1,%%d3 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d4 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d5 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d6 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%[mask] \n"
"addx.l %%d0,%%d0 \n"
"move.l %%a1,%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a1 \n"
"move.l %[ax],%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%[ax] \n"
"move.b %%d0,(%[addr]) \n" /* store byte to bitplane */
"add.l %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp.l %[addr],%[end] \n" /* last bitplane done? */
"bhi.b .ur_sloop \n" /* no: loop */
"cmp.l %[addr],%[end] \n" /* loop through all bitplanes */
"bhi.b .ur_sloop \n"
".ur_end: \n"
".ur_end: \n"
: /* outputs */
[addr]"+a"(addr),
[mask]"+d"(mask),
@@ -1151,9 +1296,7 @@ void gray_update_rect(int x, int y, int width, int height)
(void)pat_ptr;
/* check whether anything changed in the 8-pixel block */
change = *(uint32_t *)cbuf ^ *(uint32_t *)bbuf;
cbuf += sizeof(uint32_t);
bbuf += sizeof(uint32_t);
change |= *(uint32_t *)cbuf ^ *(uint32_t *)bbuf;
change |= *(uint32_t *)(cbuf + 4) ^ *(uint32_t *)(bbuf + 4);
if (change != 0)
{
@@ -1162,9 +1305,6 @@ void gray_update_rect(int x, int y, int width, int height)
unsigned test = 1;
int i;
cbuf = _gray_info.cur_buffer + srcofs_row;
bbuf = _gray_info.back_buffer + srcofs_row;
/* precalculate the bit patterns with random shifts
* for all 8 pixels and put them on an extra "stack" */
for (i = 0; i < 8; i++)

View file

@@ -876,8 +876,140 @@ static void _writearray(unsigned char *address, const unsigned char *src,
unsigned long pat_stack[8];
unsigned long *pat_ptr = &pat_stack[8];
unsigned char *addr, *end;
#if 0 /* CPU specific asm versions will go here */
#ifdef CPU_ARM
const unsigned char *_src;
unsigned _mask, trash;
_mask = mask;
_src = src;
/* precalculate the bit patterns with random shifts
for all 8 pixels and put them on an extra "stack" */
asm volatile (
"mov %[mask], %[mask], lsl #24 \n" /* shift mask to upper byte */
"mov r3, #8 \n" /* loop count */
".wa_loop: \n" /** load pattern for pixel **/
"mov r2, #0 \n" /* pattern for skipped pixel must be 0 */
"movs %[mask], %[mask], lsl #1 \n" /* shift out msb of mask */
"bcc .wa_skip \n" /* skip this pixel */
"ldrb r0, [%[src]] \n" /* load src byte */
"ldrb r0, [%[trns], r0] \n" /* idxtable into pattern index */
"ldr r2, [%[bpat], r0, lsl #2] \n" /* r2 = bitpattern[byte]; */
"add r0, %[rnd], %[rnd], lsl #3 \n" /* multiply by 75 */
"add %[rnd], %[rnd], %[rnd], lsl #1 \n"
"add %[rnd], %[rnd], r0, lsl #3 \n"
"add %[rnd], %[rnd], #74 \n" /* add another 74 */
/* Since the lower bits are not very random: get bits 8..15 (need max. 5) */
"and r1, %[rmsk], %[rnd], lsr #8 \n" /* ..and mask out unneeded bits */
"cmp r1, %[dpth] \n" /* random >= depth ? */
"subhs r1, r1, %[dpth] \n" /* yes: random -= depth */
"mov r0, r2, lsl r1 \n" /** rotate pattern **/
"sub r1, %[dpth], r1 \n"
"orr r2, r0, r2, lsr r1 \n"
".wa_skip: \n"
"str r2, [%[patp], #-4]! \n" /* push on pattern stack */
"add %[src], %[src], #1 \n" /* src++; */
"subs r3, r3, #1 \n" /* loop 8 times (pixel block) */
"bne .wa_loop \n"
: /* outputs */
[src] "+r"(_src),
[patp]"+r"(pat_ptr),
[rnd] "+r"(_gray_random_buffer),
[mask]"+r"(_mask)
: /* inputs */
[bpat]"r"(_gray_info.bitpattern),
[trns]"r"(_gray_info.idxtable),
[dpth]"r"(_gray_info.depth),
[rmsk]"r"(_gray_info.randmask)
: /* clobbers */
"r0", "r1", "r2", "r3"
);
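/* Illustrative C version (not part of this commit) of the _writearray()
 * precalc loop above.  Compared with gray_update_rect() there are two
 * differences: the caller's 'mask' selects which of the 8 pixels are drawn
 * at all (skipped pixels contribute a zero pattern and keep their old
 * bits), and the source byte is first translated through idxtable before
 * indexing the bitpattern table.  rotated_pattern() is the hypothetical
 * helper sketched above; the function and parameter names here are made up
 * for illustration. */
unsigned long rotated_pattern(unsigned long pat, unsigned *rnd,
                              unsigned randmask, unsigned depth);

void precalc_patterns(const unsigned char *src, unsigned mask,
                      const unsigned char *idxtable,
                      const unsigned long *bitpattern,
                      unsigned *rnd, unsigned randmask, unsigned depth,
                      unsigned long pat[8])
{
    int i;

    for (i = 0; i < 8; i++, src++)      /* ARM version: src++ per pixel */
    {
        if (mask & (0x80u >> i))        /* pixel selected by the mask? */
            pat[i] = rotated_pattern(bitpattern[idxtable[*src]],
                                     rnd, randmask, depth);
        else
            pat[i] = 0;                 /* skipped: contributes no bits */
    }
}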
addr = address;
end = addr + MULU16(_gray_info.depth, _gray_info.plane_size);
_mask = mask;
/* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */
asm volatile (
"ldmia %[patp], {r2 - r8, %[rx]} \n" /* pop all 8 patterns */
"mvn %[mask], %[mask] \n" /* "set" mask -> "keep" mask */
"ands %[mask], %[mask], #0xff \n"
"beq .wa_sloop \n" /* short loop if nothing to keep */
".wa_floop: \n" /** full loop (there are bits to keep)**/
"movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */
"adc r0, r0, r0 \n" /* put bit into LSB of byte */
"movs r8, r8, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r7, r7, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r6, r6, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r5, r5, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r4, r4, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r3, r3, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r2, r2, lsr #1 \n"
"adc r0, r0, r0 \n"
"ldrb r1, [%[addr]] \n" /* read old value */
"and r1, r1, %[mask] \n" /* mask out replaced bits */
"orr r1, r1, r0 \n" /* set new bits */
"strb r1, [%[addr]], %[psiz] \n" /* store value, advance to next bpl */
"cmp %[end], %[addr] \n" /* loop through all bitplanes */
"bne .wa_floop \n"
"b .wa_end \n"
".wa_sloop: \n" /** short loop (nothing to keep) **/
"movs %[rx], %[rx], lsr #1 \n" /* shift out pattern bit */
"adc r0, r0, r0 \n" /* put bit into LSB of byte */
"movs r8, r8, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r7, r7, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r6, r6, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r5, r5, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r4, r4, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r3, r3, lsr #1 \n"
"adc r0, r0, r0 \n"
"movs r2, r2, lsr #1 \n"
"adc r0, r0, r0 \n"
"strb r0, [%[addr]], %[psiz] \n" /* store byte, advance to next bpl */
"cmp %[end], %[addr] \n" /* loop through all bitplanes */
"bne .wa_sloop \n"
".wa_end: \n"
: /* outputs */
[addr]"+r"(addr),
[mask]"+r"(_mask),
[rx] "=&r"(trash)
: /* inputs */
[psiz]"r"(_gray_info.plane_size),
[end] "r"(end),
[patp]"[rx]"(pat_ptr)
: /* clobbers */
"r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8"
);
#else /* C version, for reference*/
#warning C version of _writearray() used
unsigned test = 0x80;
int i;
@@ -1027,52 +1159,52 @@ static void _writearray(unsigned char *address, const unsigned char *src,
/* precalculate the bit patterns with random shifts
for all 8 pixels and put them on an extra "stack" */
asm volatile (
"mov #8,r3 \n" /* loop count in r3: 8 pixels */
"mov #8,r3 \n" /* loop count */
".wa_loop: \n" /** load pattern for pixel **/
"mov #0,r0 \n" /* pattern for skipped pixel must be 0 */
"shlr %[mask] \n" /* shift out lsb of mask */
"bf .wa_skip \n" /* skip this pixel */
".wa_loop: \n" /** load pattern for pixel **/
"mov #0,r0 \n" /* pattern for skipped pixel must be 0 */
"shlr %[mask] \n" /* shift out lsb of mask */
"bf .wa_skip \n" /* skip this pixel */
"mov.b @%[src],r0 \n" /* load src byte */
"extu.b r0,r0 \n" /* extend unsigned */
"mov.b @%[src],r0 \n" /* load src byte */
"extu.b r0,r0 \n" /* extend unsigned */
"mov.b @(r0,%[trns]),r0\n" /* idxtable into pattern index */
"extu.b r0,r0 \n" /* extend unsigned */
"shll2 r0 \n"
"extu.b r0,r0 \n" /* extend unsigned */
"shll2 r0 \n"
"mov.l @(r0,%[bpat]),r4\n" /* r4 = bitpattern[byte]; */
"mov #75,r0 \n"
"mulu r0,%[rnd] \n" /* multiply by 75 */
"sts macl,%[rnd] \n"
"add #74,%[rnd] \n" /* add another 74 */
"mov #75,r0 \n"
"mulu r0,%[rnd] \n" /* multiply by 75 */
"sts macl,%[rnd] \n"
"add #74,%[rnd] \n" /* add another 74 */
/* Since the lower bits are not very random: */
"swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */
"and %[rmsk],r1 \n" /* mask out unneeded bits */
"swap.b %[rnd],r1 \n" /* get bits 8..15 (need max. 5) */
"and %[rmsk],r1 \n" /* mask out unneeded bits */
"cmp/hs %[dpth],r1 \n" /* random >= depth ? */
"bf .wa_ntrim \n"
"sub %[dpth],r1 \n" /* yes: random -= depth; */
".wa_ntrim: \n"
"cmp/hs %[dpth],r1 \n" /* random >= depth ? */
"bf .wa_ntrim \n"
"sub %[dpth],r1 \n" /* yes: random -= depth; */
".wa_ntrim: \n"
"mov.l .ashlsi3,r0 \n" /** rotate pattern **/
"jsr @r0 \n" /* r4 -> r0, shift left by r5 */
"mov r1,r5 \n"
"mov.l .ashlsi3,r0 \n" /** rotate pattern **/
"jsr @r0 \n" /* r4 -> r0, shift left by r5 */
"mov r1,r5 \n"
"mov %[dpth],r5 \n"
"sub r1,r5 \n" /* r5 = depth - r1 */
"mov.l .lshrsi3,r1 \n"
"jsr @r1 \n" /* r4 -> r0, shift right by r5 */
"mov r0,r1 \n" /* store previous result in r1 */
"mov %[dpth],r5 \n"
"sub r1,r5 \n" /* r5 = depth - r1 */
"mov.l .lshrsi3,r1 \n"
"jsr @r1 \n" /* r4 -> r0, shift right by r5 */
"mov r0,r1 \n" /* store previous result in r1 */
"or r1,r0 \n" /* rotated_pattern = r0 | r1 */
"or r1,r0 \n" /* rotated_pattern = r0 | r1 */
".wa_skip: \n"
"mov.l r0,@-%[patp]\n" /* push on pattern stack */
".wa_skip: \n"
"mov.l r0,@-%[patp] \n" /* push on pattern stack */
"add %[stri],%[src] \n" /* src += stride; */
"add #-1,r3 \n" /* decrease loop count */
"cmp/pl r3 \n" /* loop count > 0? */
"bt .wa_loop \n" /* yes: loop */
"add #-1,r3 \n" /* loop 8 times (pixel block) */
"cmp/pl r3 \n"
"bt .wa_loop \n"
: /* outputs */
[src] "+r"(_src),
[rnd] "+r"(_gray_random_buffer),
@@ -1095,79 +1227,79 @@ static void _writearray(unsigned char *address, const unsigned char *src,
/* set the bits for all 8 pixels in all bytes according to the
* precalculated patterns on the pattern stack */
asm volatile (
"mov.l @%[patp]+,r1\n" /* pop all 8 patterns */
"mov.l @%[patp]+,r2\n"
"mov.l @%[patp]+,r3\n"
"mov.l @%[patp]+,r6\n"
"mov.l @%[patp]+,r7\n"
"mov.l @%[patp]+,r8\n"
"mov.l @%[patp]+,r9\n"
"mov.l @%[patp],r10\n"
"mov.l @%[patp]+,r1 \n" /* pop all 8 patterns */
"mov.l @%[patp]+,r2 \n"
"mov.l @%[patp]+,r3 \n"
"mov.l @%[patp]+,r6 \n"
"mov.l @%[patp]+,r7 \n"
"mov.l @%[patp]+,r8 \n"
"mov.l @%[patp]+,r9 \n"
"mov.l @%[patp],r10 \n"
"not %[mask],%[mask] \n" /* "set" mask -> "keep" mask */
"extu.b %[mask],%[mask] \n" /* mask out high bits */
"tst %[mask],%[mask] \n" /* nothing to keep? */
"bt .wa_sloop \n" /* yes: jump to short loop */
"tst %[mask],%[mask] \n"
"bt .wa_sloop \n" /* short loop if nothing to keep */
".wa_floop: \n" /** full loop (there are bits to keep)**/
"shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
"rotcl r0 \n" /* rotate t bit into r0 */
"shlr r2 \n"
"rotcl r0 \n"
"shlr r3 \n"
"rotcl r0 \n"
"shlr r6 \n"
"rotcl r0 \n"
"shlr r7 \n"
"rotcl r0 \n"
"shlr r8 \n"
"rotcl r0 \n"
"shlr r9 \n"
"rotcl r0 \n"
"shlr r10 \n"
".wa_floop: \n" /** full loop (there are bits to keep)**/
"shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
"rotcl r0 \n" /* rotate t bit into r0 */
"shlr r2 \n"
"rotcl r0 \n"
"shlr r3 \n"
"rotcl r0 \n"
"shlr r6 \n"
"rotcl r0 \n"
"shlr r7 \n"
"rotcl r0 \n"
"shlr r8 \n"
"rotcl r0 \n"
"shlr r9 \n"
"rotcl r0 \n"
"shlr r10 \n"
"mov.b @%[addr],%[rx] \n" /* read old value */
"rotcl r0 \n"
"and %[mask],%[rx] \n" /* mask out unneeded bits */
"or %[rx],r0 \n" /* set new bits */
"mov.b r0,@%[addr] \n" /* store value to bitplane */
"rotcl r0 \n"
"and %[mask],%[rx] \n" /* mask out replaced bits */
"or %[rx],r0 \n" /* set new bits */
"mov.b r0,@%[addr] \n" /* store value to bitplane */
"add %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp/hi %[addr],%[end] \n" /* last bitplane done? */
"bt .wa_floop \n" /* no: loop */
"cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */
"bt .wa_floop \n"
"bra .wa_end \n"
"nop \n"
"bra .wa_end \n"
"nop \n"
/* References to C library routines used in the precalc block */
".align 2 \n"
".ashlsi3: \n" /* C library routine: */
".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */
".lshrsi3: \n" /* C library routine: */
".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */
".align 2 \n"
".ashlsi3: \n" /* C library routine: */
".long ___ashlsi3 \n" /* shift r4 left by r5, result in r0 */
".lshrsi3: \n" /* C library routine: */
".long ___lshrsi3 \n" /* shift r4 right by r5, result in r0 */
/* both routines preserve r4, destroy r5 and take ~16 cycles */
".wa_sloop: \n" /** short loop (nothing to keep) **/
"shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
"rotcl r0 \n" /* rotate t bit into r0 */
"shlr r2 \n"
"rotcl r0 \n"
"shlr r3 \n"
"rotcl r0 \n"
"shlr r6 \n"
"rotcl r0 \n"
"shlr r7 \n"
"rotcl r0 \n"
"shlr r8 \n"
"rotcl r0 \n"
"shlr r9 \n"
"rotcl r0 \n"
"shlr r10 \n"
"rotcl r0 \n"
"mov.b r0,@%[addr] \n" /* store byte to bitplane */
".wa_sloop: \n" /** short loop (nothing to keep) **/
"shlr r1 \n" /* rotate lsb of pattern 1 to t bit */
"rotcl r0 \n" /* rotate t bit into r0 */
"shlr r2 \n"
"rotcl r0 \n"
"shlr r3 \n"
"rotcl r0 \n"
"shlr r6 \n"
"rotcl r0 \n"
"shlr r7 \n"
"rotcl r0 \n"
"shlr r8 \n"
"rotcl r0 \n"
"shlr r9 \n"
"rotcl r0 \n"
"shlr r10 \n"
"rotcl r0 \n"
"mov.b r0,@%[addr] \n" /* store byte to bitplane */
"add %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp/hi %[addr],%[end] \n" /* last bitplane done? */
"bt .wa_sloop \n" /* no: loop */
"cmp/hi %[addr],%[end] \n" /* loop for all bitplanes */
"bt .wa_sloop \n"
".wa_end: \n"
".wa_end: \n"
: /* outputs */
[addr]"+r"(addr),
[mask]"+r"(_mask),
@@ -1189,43 +1321,43 @@ static void _writearray(unsigned char *address, const unsigned char *src,
/* precalculate the bit patterns with random shifts
for all 8 pixels and put them on an extra "stack" */
asm volatile (
"moveq.l #8,%%d3 \n" /* loop count in d3: 8 pixels */
"moveq.l #8,%%d3 \n" /* loop count */
".wa_loop: \n" /** load pattern for pixel **/
"clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */
"lsr.l #1,%[mask] \n" /* shift out lsb of mask */
"bcc.b .wa_skip \n" /* skip this pixel */
".wa_loop: \n" /** load pattern for pixel **/
"clr.l %%d2 \n" /* pattern for skipped pixel must be 0 */
"lsr.l #1,%[mask] \n" /* shift out lsb of mask */
"bcc.b .wa_skip \n" /* skip this pixel */
"clr.l %%d0 \n"
"clr.l %%d0 \n"
"move.b (%[src]),%%d0 \n" /* load src byte */
"move.b (%%d0:l:1,%[trns]),%%d0\n" /* idxtable into pattern index */
"move.l (%%d0:l:4,%[bpat]),%%d2\n" /* d2 = bitpattern[byte]; */
"mulu.w #75,%[rnd] \n" /* multiply by 75 */
"add.l #74,%[rnd] \n" /* add another 74 */
"mulu.w #75,%[rnd] \n" /* multiply by 75 */
"add.l #74,%[rnd] \n" /* add another 74 */
/* Since the lower bits are not very random: */
"move.l %[rnd],%%d1 \n"
"lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */
"and.l %[rmsk],%%d1\n" /* mask out unneeded bits */
"move.l %[rnd],%%d1 \n"
"lsr.l #8,%%d1 \n" /* get bits 8..15 (need max. 5) */
"and.l %[rmsk],%%d1 \n" /* mask out unneeded bits */
"cmp.l %[dpth],%%d1\n" /* random >= depth ? */
"blo.b .wa_ntrim \n"
"sub.l %[dpth],%%d1\n" /* yes: random -= depth; */
".wa_ntrim: \n"
"cmp.l %[dpth],%%d1 \n" /* random >= depth ? */
"blo.b .wa_ntrim \n"
"sub.l %[dpth],%%d1 \n" /* yes: random -= depth; */
".wa_ntrim: \n"
"move.l %%d2,%%d0 \n"
"lsl.l %%d1,%%d0 \n"
"sub.l %[dpth],%%d1\n"
"neg.l %%d1 \n" /* d1 = depth - d1 */
"lsr.l %%d1,%%d2 \n"
"or.l %%d0,%%d2 \n"
"move.l %%d2,%%d0 \n" /** rotate pattern **/
"lsl.l %%d1,%%d0 \n"
"sub.l %[dpth],%%d1 \n"
"neg.l %%d1 \n" /* d1 = depth - d1 */
"lsr.l %%d1,%%d2 \n"
"or.l %%d0,%%d2 \n"
".wa_skip: \n"
".wa_skip: \n"
"move.l %%d2,-(%[patp]) \n" /* push on pattern stack */
"add.l %[stri],%[src] \n" /* src += stride; */
"subq.l #1,%%d3 \n" /* decrease loop count */
"bne.b .wa_loop \n" /* yes: loop */
"subq.l #1,%%d3 \n" /* loop 8 times (pixel block) */
"bne.b .wa_loop \n"
: /* outputs */
[src] "+a"(_src),
[patp]"+a"(pat_ptr),
@@ -1250,78 +1382,76 @@ static void _writearray(unsigned char *address, const unsigned char *src,
asm volatile (
"movem.l (%[patp]),%%d2-%%d6/%%a0-%%a1/%[ax] \n"
/* pop all 8 patterns */
"not.l %[mask] \n" /* "set" mask -> "keep" mask */
"not.l %[mask] \n" /* "set" mask -> "keep" mask */
"and.l #0xFF,%[mask] \n"
"beq.b .wa_sstart \n" /* yes: jump to short loop */
"beq.b .wa_sstart \n" /* short loop if nothing to keep */
".wa_floop: \n" /** full loop (there are bits to keep)**/
"clr.l %%d0 \n"
"lsr.l #1,%%d2 \n" /* shift out mask bit */
"addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */
"lsr.l #1,%%d3 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d4 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d5 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d6 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%a0,%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a0 \n"
"move.l %%a1,%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a1 \n"
"move.l %[ax],%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%[ax] \n"
".wa_floop: \n" /** full loop (there are bits to keep)**/
"lsr.l #1,%%d2 \n" /* shift out pattern bit */
"addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */
"lsr.l #1,%%d3 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d4 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d5 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d6 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%a0,%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a0 \n"
"move.l %%a1,%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a1 \n"
"move.l %[ax],%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%[ax] \n"
"move.b (%[addr]),%%d1 \n" /* read old value */
"and.l %[mask],%%d1 \n" /* mask out unneeded bits */
"and.l %[mask],%%d1 \n" /* mask out replaced bits */
"or.l %%d0,%%d1 \n" /* set new bits */
"move.b %%d1,(%[addr]) \n" /* store value to bitplane */
"add.l %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp.l %[addr],%[end] \n" /* last bitplane done? */
"bhi.b .wa_floop \n" /* no: loop */
"cmp.l %[addr],%[end] \n" /* loop for all bitplanes */
"bhi.b .wa_floop \n"
"bra.b .wa_end \n"
"bra.b .wa_end \n"
".wa_sstart: \n"
"move.l %%a0,%[mask]\n" /* mask isn't needed here, reuse reg */
".wa_sstart: \n"
"move.l %%a0,%[mask] \n" /* mask isn't needed here, reuse reg */
".wa_sloop: \n" /** short loop (nothing to keep) **/
"clr.l %%d0 \n"
"lsr.l #1,%%d2 \n" /* shift out mask bit */
"addx.l %%d0,%%d0 \n" /* puts bit into LSB, shifts left by 1 */
"lsr.l #1,%%d3 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d4 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d5 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d6 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%[mask] \n"
"addx.l %%d0,%%d0 \n"
"move.l %%a1,%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a1 \n"
"move.l %[ax],%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%[ax] \n"
".wa_sloop: \n" /** short loop (nothing to keep) **/
"lsr.l #1,%%d2 \n" /* shift out pattern bit */
"addx.l %%d0,%%d0 \n" /* put bit into LSB of byte */
"lsr.l #1,%%d3 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d4 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d5 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%%d6 \n"
"addx.l %%d0,%%d0 \n"
"lsr.l #1,%[mask] \n"
"addx.l %%d0,%%d0 \n"
"move.l %%a1,%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%%a1 \n"
"move.l %[ax],%%d1 \n"
"lsr.l #1,%%d1 \n"
"addx.l %%d0,%%d0 \n"
"move.l %%d1,%[ax] \n"
"move.b %%d0,(%[addr]) \n" /* store byte to bitplane */
"add.l %[psiz],%[addr] \n" /* advance to next bitplane */
"cmp.l %[addr],%[end] \n" /* last bitplane done? */
"bhi.b .wa_sloop \n" /* no: loop */
"cmp.l %[addr],%[end] \n" /* loop for all bitplanes */
"bhi.b .wa_sloop \n"
".wa_end: \n"
".wa_end: \n"
: /* outputs */
[addr]"+a"(addr),
[mask]"+d"(_mask),

View file

@@ -283,32 +283,32 @@ void gray_ub_scroll_left(int count)
if (count)
{
asm (
"mov r4, %[high] \n"
"mov r4, %[high] \n" /* rows = height */
".sl_rloop: \n"
"mov r5, %[addr] \n"
"mov r2, %[dpth] \n"
".sl_rloop: \n" /* repeat for every row */
"mov r5, %[addr] \n" /* get start address */
"mov r2, %[dpth] \n" /* planes = depth */
".sl_oloop: \n"
"mov r6, r5 \n"
"mov r3, %[cols] \n"
"mov r1, #0 \n"
".sl_oloop: \n" /* repeat for every bitplane */
"mov r6, r5 \n" /* get start address */
"mov r3, %[cols] \n" /* cols = col_count */
"mov r1, #0 \n" /* fill with zero */
".sl_iloop: \n"
"mov r1, r1, lsr #8 \n"
"ldrb r0, [r6, #-1]! \n"
"orr r1, r1, r0, lsl %[cnt] \n"
"strb r1, [r6] \n"
".sl_iloop: \n" /* repeat for all cols */
"mov r1, r1, lsr #8 \n" /* shift right to get residue */
"ldrb r0, [r6, #-1]! \n" /* decrement addr & get data byte */
"orr r1, r1, r0, lsl %[cnt] \n" /* combine with last residue */
"strb r1, [r6] \n" /* store data */
"subs r3, r3, #1 \n"
"bne .sl_iloop \n"
"subs r3, r3, #1 \n" /* cols-- */
"bne .sl_iloop \n"
"add r5, r5, %[psiz] \n"
"subs r2, r2, #1 \n"
"bne .sl_oloop \n"
"add r5, r5, %[psiz] \n" /* start_address += plane_size */
"subs r2, r2, #1 \n" /* planes-- */
"bne .sl_oloop \n"
"add %[addr],%[addr],%[bwid] \n"
"subs r4, r4, #1 \n"
"add %[addr],%[addr],%[bwid] \n" /* start_address += bwidth */
"subs r4, r4, #1 \n" /* rows-- */
"bne .sl_rloop \n"
: /* outputs */
: /* inputs */
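The newly commented scroll loops move whole rows of bitplane bytes sideways by a sub-byte amount, carrying the bits shifted out of one byte into its neighbour. Below is a hedged C sketch (not code from this commit) of the inner two loops of the ARM gray_ub_scroll_left() above; gray_ub_scroll_right() mirrors it in the other direction. The function name and byte ordering are illustrative, chosen to match the right-to-left walk of the asm (ldrb r0, [r6, #-1]!).

void scroll_row_left(unsigned char *row, int col_count, int count)
{
    unsigned char *p = row + col_count;  /* one past the last byte */
    unsigned residue = 0;                /* bits carried over from the byte
                                            processed before (to the right) */

    while (p != row)
    {
        unsigned data;

        p--;                             /* walk right to left, like r6 */
        data = (residue >> 8) | ((unsigned)*p << count);
        *p = (unsigned char)data;        /* low byte goes back to the plane */
        residue = data;                  /* high bits feed the next byte */
    }
}

The asm wraps this in two more loops, one over the bitplanes (advancing by plane_size) and one over the rows (advancing by bwidth), as its new comments spell out.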
@@ -364,32 +364,32 @@ void gray_ub_scroll_right(int count)
if (count)
{
asm (
"mov r4, %[high] \n"
"mov r4, %[high] \n" /* rows = height */
".sr_rloop: \n"
"mov r5, %[addr] \n"
"mov r2, %[dpth] \n"
".sr_rloop: \n" /* repeat for every row */
"mov r5, %[addr] \n" /* get start address */
"mov r2, %[dpth] \n" /* planes = depth */
".sr_oloop: \n"
"mov r6, r5 \n"
"mov r3, %[cols] \n"
"mov r1, #0 \n"
".sr_oloop: \n" /* repeat for every bitplane */
"mov r6, r5 \n" /* get start address */
"mov r3, %[cols] \n" /* cols = col_count */
"mov r1, #0 \n" /* fill with zero */
".sr_iloop: \n"
"ldrb r0, [r6] \n"
"orr r1, r0, r1, lsl #8 \n"
"mov r0, r1, lsr %[cnt] \n"
"strb r0, [r6], #1 \n"
".sr_iloop: \n" /* repeat for all cols */
"ldrb r0, [r6] \n" /* get data byte */
"orr r1, r0, r1, lsl #8 \n" /* combine w/ old data shifted to 2nd byte */
"mov r0, r1, lsr %[cnt] \n" /* shift right */
"strb r0, [r6], #1 \n" /* store data, increment addr */
"subs r3, r3, #1 \n"
"bne .sr_iloop \n"
"subs r3, r3, #1 \n" /* cols-- */
"bne .sr_iloop \n"
"add r5, r5, %[psiz] \n"
"subs r2, r2, #1 \n"
"bne .sr_oloop \n"
"add r5, r5, %[psiz] \n" /* start_address += plane_size */
"subs r2, r2, #1 \n" /* planes-- */
"bne .sr_oloop \n"
"add %[addr],%[addr],%[bwid] \n"
"subs r4, r4, #1 \n"
"add %[addr],%[addr],%[bwid] \n" /* start_address += bwidth */
"subs r4, r4, #1 \n" /* rows-- */
"bne .sr_rloop \n"
: /* outputs */
: /* inputs */
@@ -714,8 +714,7 @@ void gray_ub_scroll_up(int count)
"move.b (%%a1),%%d0 \n" /* get data byte */
"lsl.l #8,%%d1 \n" /* old data to 2nd byte */
"or.l %%d1,%%d0 \n" /* combine old data */
"clr.l %%d1 \n"
"move.b %%d0,%%d1 \n" /* keep data for next round */
"move.l %%d0,%%d1 \n" /* keep data for next round */
"lsr.l %[cnt],%%d0 \n" /* shift right */
"move.b %%d0,(%%a1) \n" /* store data */