Use a hand-written constants table on ARMv5+ for the JPEG IDCT, and load four 16-bit constants at a time with ldrd. This is not useful on ARMv4, since one load per constant would still be needed, and the limited range of ldrsh would force multiple copies of the table.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21535 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Andrew Mahone 2009-06-28 02:32:43 +00:00
parent 99ae7bcc43
commit 815dcfdd35

View file

@ -113,7 +113,11 @@ jpeg_idct2h:
results can not be stored merged. results can not be stored merged.
*/ */
stmdb sp!, { r4-r5, lr } stmdb sp!, { r4-r5, lr }
#if ARM_ARCH < 5
ldr r14, =4112 ldr r14, =4112
#else
ldrsh r14, .Lpool4+2
#endif
1: 1:
ldrsh r12, [r0] ldrsh r12, [r0]
ldrsh r4, [r0, #2] ldrsh r4, [r0, #2]
@ -140,7 +144,7 @@ jpeg_idct2h:
ldmia sp!, { r4-r5, pc } ldmia sp!, { r4-r5, pc }
#else #else
stmdb sp!, { r4, lr } stmdb sp!, { r4, lr }
ldr r14, =4112 ldrsh r14, .Lpool4+2
1: 1:
ldr r12, [r0] ldr r12, [r0]
sadd16 r12, r12, r14 sadd16 r12, r12, r14
@ -198,27 +202,26 @@ jpeg_idct4v:
ldmia sp!, { r4-r7, pc } ldmia sp!, { r4-r7, pc }
#elif ARM_ARCH < 6 #elif ARM_ARCH < 6
stmdb sp!, { r4-r8, lr } stmdb sp!, { r4-r8, lr }
ldr r8, =1024 mov r8, #1024
ldr r14, =4433 ldrd r4, .Lpool4
ldr r12, =3302955134
1: 1:
ldrsh r5, [r0, #48] ldrsh r14, [r0, #48]
ldrsh r3, [r0, #16] ldrsh r3, [r0, #16]
ldrsh r4, [r0, #32] ldrsh r12, [r0, #32]
ldrsh r2, [r0] ldrsh r2, [r0]
add r6, r3, r5 /* r6 = z1 = d1 + d3 */ add r6, r3, r14 /* r6 = z1 = d1 + d3 */
add r7, r2, r4 /* r7 = tmp10 >> 2 = d0 + d2 */ add r7, r2, r12 /* r7 = tmp10 >> 2 = d0 + d2 */
smlabb r6, r14, r6, r8 /* z1 *= 4433 */ smlabb r6, r5, r6, r8 /* z1 *= 4433 */
sub r2, r2, r4 /* r2 = tmp12 >> 2= d0 - d2 */ sub r2, r2, r12 /* r2 = tmp12 >> 2= d0 - d2 */
smlabb r3, r12, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */ smlatb r3, r5, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
smlatb r5, r12, r5, r6 /* r5 = tmp0 = z1 - z3 * 15137 */ smlabb r14, r4, r14, r6 /* r14 = tmp0 = z1 - z3 * 15137 */
mov r7, r7, lsl #2 mov r7, r7, lsl #2
mov r2, r2, lsl #2 mov r2, r2, lsl #2
add r4, r7, r3, asr #11 /* r4 = o0 */ add r12, r7, r3, asr #11 /* r12 = o0 */
sub r7, r7, r3, asr #11 /* r7 = o3 */ sub r7, r7, r3, asr #11 /* r7 = o3 */
add r3, r2, r5, asr #11 /* r3 = o1 */ add r3, r2, r14, asr #11 /* r3 = o1 */
sub r2, r2, r5, asr #11 /* r2 = o2 */ sub r2, r2, r14, asr #11 /* r2 = o2 */
strh r4, [r0] strh r12, [r0]
strh r7, [r0, #48] strh r7, [r0, #48]
strh r3, [r0, #16] strh r3, [r0, #16]
strh r2, [r0, #32] strh r2, [r0, #32]
@ -228,9 +231,8 @@ jpeg_idct4v:
ldmia sp!, { r4-r8, pc } ldmia sp!, { r4-r8, pc }
#else #else
stmdb sp!, { r4-r10, lr } stmdb sp!, { r4-r10, lr }
ldr r2, =1024 ldrd r2, .Lpool4
ldr r3, =4433 mov r12, #1024
ldr r12, =3302955134
1: 1:
ldr r6, [r0, #32] ldr r6, [r0, #32]
ldr r4, [r0] ldr r4, [r0]
@ -247,12 +249,12 @@ jpeg_idct4v:
/* multiplication expands values beyond 16 bits, so this part needs to be /* multiplication expands values beyond 16 bits, so this part needs to be
split. the values will be merged below so that the rest of the addition split. the values will be merged below so that the rest of the addition
can be done in parallel */ can be done in parallel */
smlabb r9, r3, r6, r2 /* r9 = z1[0] = (d1 * d3) * 4433 + 1024 */ smlabb r9, r3, r6, r12 /* r9 = z1[0] = (d1 * d3) * 4433 + 1024 */
smlabt r6, r3, r6, r2 /* r6 = z1[1] = (d1 * d3) * 4433 + 1024 */ smlabt r6, r3, r6, r12 /* r6 = z1[1] = (d1 * d3) * 4433 + 1024 */
smlabb r10, r12, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */ smlatb r10, r3, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
smlatb r14, r12, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */ smlabb r14, r2, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
smlabt r5, r12, r5, r6 /* r5 = tmp2[1] */ smlatt r5, r3, r5, r6 /* r5 = tmp2[1] */
smlatt r6, r12, r7, r6 /* r6 = tmp0[1] */ smlabt r6, r2, r7, r6 /* r6 = tmp0[1] */
mov r8, r8, lsl #2 /* complete the parallel shift started */ mov r8, r8, lsl #2 /* complete the parallel shift started */
mov r4, r4, lsl #2 /* with the earlier bic instructions */ mov r4, r4, lsl #2 /* with the earlier bic instructions */
/* tmp2 are in r10, r5; tmp0 are in r14, r6 */ /* tmp2 are in r10, r5; tmp0 are in r14, r6 */
@ -276,6 +278,17 @@ jpeg_idct4v:
#endif #endif
.size jpeg_idct4v, .-jpeg_idct4v .size jpeg_idct4v, .-jpeg_idct4v
#if ARM_ARCH > 4
/* 16-bit constants used by the ARMv5+ paths of jpeg_idct2h, jpeg_idct4v and
   jpeg_idct4h. Callers fetch the whole table with "ldrd rN, .Lpool4" (two
   registers, four halfwords) and then select individual values with the
   bottom/top halfword multiply forms (smlabb, smlatb, ...), with
   "rN, lsr #16", or read a single entry with "ldrsh r14, .Lpool4+2".
   The order of the entries therefore must not change.
   NOTE(review): which entry ends up in the bottom vs. top halfword of each
   register depends on byte order; the halfword selections in the callers
   match a little-endian layout - confirm before using on a big-endian
   target. */
/* Presumably aligned for ldrd, which needs a doubleword-aligned address.
   In GNU as for ARM ".align 4" means 2^4 = 16 bytes, i.e. more than the
   8 bytes required - TODO confirm that 16 was intended. */
.align 4
.Lpool4:
.short -15137 /* rN bottom half: tmp0 = z1 - z3 * 15137 */
.short 4112 /* rN top half: added to d0; also loaded alone via .Lpool4+2 */
.short 4433 /* rN+1 bottom half: z1 = (d1 + d3) * 4433 */
.short 6270 /* rN+1 top half: tmp2 = z1 + z2 * 6270 */
/* Return to word alignment for the code that follows. The table is exactly
   8 bytes after a 16-byte boundary, so this should add no padding. */
.align 2
#endif
jpeg_idct4h: jpeg_idct4h:
#if ARM_ARCH < 5 #if ARM_ARCH < 5
stmdb sp!, { r4-r10, lr } stmdb sp!, { r4-r10, lr }
@ -328,88 +341,85 @@ jpeg_idct4h:
cmp r0, r2 cmp r0, r2
bcc 1b bcc 1b
ldmia sp!, { r4-r10, pc } ldmia sp!, { r4-r10, pc }
#elif ARM_ARCH < 6 #elif ARM_ARCH < 6 || 1
stmdb sp!, { r4-r10, lr } stmdb sp!, { r4-r9, lr }
ldr r10, =4433 ldrd r4, .Lpool4
ldr r14, =4112
ldr r12, =3302955134
1: 1:
ldrsh r7, [r0, #6] ldrsh r7, [r0, #6]
ldrsh r5, [r0, #2] ldrsh r14, [r0, #2]
ldrsh r4, [r0] ldrsh r12, [r0]
ldrsh r6, [r0, #4] ldrsh r6, [r0, #4]
add r8, r5, r7 /* r8 = z1 = d1 + d3 */ add r8, r14, r7 /* r8 = z1 = d1 + d3 */
add r4, r4, r14 add r12, r12, r4, lsr #16
smulbb r8, r10, r8 /* z1 *= 4433 */ smulbb r8, r5, r8 /* z1 *= 4433 */
add r9, r4, r6 /* r9 = tmp10 >> 13 = d0 + d2 */ add r9, r12, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
smlabb r5, r12, r5, r8 /* r5 = tmp2 = z1 + z2 * 6270 */ smlatb r14, r5, r14, r8 /* r14= tmp2 = z1 + z2 * 6270 */
smlatb r7, r12, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */ smlabb r7, r4, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 */ sub r12, r12, r6 /* r12= tmp12 >> 13 = d0 - d2 */
add r6, r5, r9, lsl #13 /* r6 = o0 */ add r6, r14, r9, lsl #13 /* r6 = o0 */
rsb r9, r5, r9, lsl #13 /* r9 = o3 */ rsb r9, r14, r9, lsl #13 /* r9 = o3 */
add r5, r7, r4, lsl #13 /* r5 = o1 */ add r14, r7, r12, lsl #13 /* r14= o1 */
rsb r4, r7, r4, lsl #13 /* r4 = o2 */ rsb r12, r7, r12, lsl #13 /* r12= o2 */
mov r6, r6, asr #18 mov r6, r6, asr #18
mov r5, r5, asr #18 mov r14, r14, asr #18
mov r4, r4, asr #18 mov r12, r12, asr #18
mov r9, r9, asr #18 mov r9, r9, asr #18
cmp r6, #255 cmp r6, #255
mvnhi r6, r6, asr #31 mvnhi r6, r6, asr #31
cmp r5, #255 cmp r14, #255
mvnhi r5, r5, asr #31 mvnhi r14, r14, asr #31
cmp r4, #255 cmp r12, #255
mvnhi r4, r4, asr #31 mvnhi r12, r12, asr #31
cmp r9, #255 cmp r9, #255
mvnhi r9, r9, asr #31 mvnhi r9, r9, asr #31
#ifdef HAVE_LCD_COLOR #ifdef HAVE_LCD_COLOR
strb r6, [r1] strb r6, [r1]
strb r5, [r1, #4] strb r14, [r1, #4]
strb r4, [r1, #8] strb r12, [r1, #8]
strb r9, [r1, #12] strb r9, [r1, #12]
#else #else
strb r6, [r1] strb r6, [r1]
strb r5, [r1, #1] strb r14, [r1, #1]
strb r4, [r1, #2] strb r12, [r1, #2]
strb r9, [r1, #3] strb r9, [r1, #3]
#endif #endif
add r0, r0, #16 add r0, r0, #16
add r1, r1, r3 add r1, r1, r3
cmp r0, r2 cmp r0, r2
bcc 1b bcc 1b
ldmia sp!, { r4-r10, pc } ldmia sp!, { r4-r9, pc }
#else #else
stmdb sp!, { r4-r9, lr } stmdb sp!, { r4-r9, lr }
ldr r9, =4433 ldrd r4, .Lpool4
ldr r14, =4112 mov r9, r4, lsr #16
ldr r12, =3302955134
1: 1:
ldmia r0, { r4-r5 } ldmia r0, { r12, r14 }
sadd16 r4, r4, r14 sadd16 r12, r12, r9
sadd16 r6, r4, r5 /* r6lo = d0 + d2, r6hi = d1 + d3 */ sadd16 r6, r12, r14 /* r6lo = d0 + d2, r6hi = d1 + d3 */
ssub16 r7, r4, r5 /* r7lo = d0 - d2 */ ssub16 r7, r12, r14 /* r7lo = d0 - d2 */
smulbt r8, r9, r6 smulbt r8, r5, r6
sxth r6, r6 sxth r6, r6
smlabt r4, r12, r4, r8 /* r4 = tmp2 = z1 + z2 * 6270 */ smlatt r12, r5, r12, r8 /* r12= tmp2 = z1 + z2 * 6270 */
smlatt r5, r12, r5, r8 /* r5 = tmp0 = z1 - z3 * 15137 */ smlabt r14, r4, r14, r8 /* r14= tmp0 = z1 - z3 * 15137 */
sxth r7, r7 sxth r7, r7
add r8, r4, r6, lsl #13 /* r8 = o0 */ add r8, r12, r6, lsl #13 /* r8 = o0 */
rsb r6, r4, r6, lsl #13 /* r6 = o3 */ rsb r6, r12, r6, lsl #13 /* r6 = o3 */
add r4, r5, r7, lsl #13 /* r4 = o1 */ add r12, r14, r7, lsl #13 /* r12= o1 */
rsb r5, r5, r7, lsl #13 /* r5 = o2 */ rsb r14, r14, r7, lsl #13 /* r14= o2 */
usat r8, #8, r8, asr #18 usat r8, #8, r8, asr #18
usat r6, #8, r6, asr #18 usat r6, #8, r6, asr #18
usat r4, #8, r4, asr #18 usat r12, #8, r12, asr #18
usat r5, #8, r5, asr #18 usat r14, #8, r14, asr #18
#ifdef HAVE_LCD_COLOR #ifdef HAVE_LCD_COLOR
strb r8, [r1] strb r8, [r1]
strb r6, [r1, #12] strb r6, [r1, #12]
strb r4, [r1, #4] strb r12, [r1, #4]
strb r5, [r1, #8] strb r14, [r1, #8]
#else #else
strb r8, [r1] strb r8, [r1]
strb r6, [r1, #3] strb r6, [r1, #3]
strb r4, [r1, #1] strb r12, [r1, #1]
strb r5, [r1, #2] strb r14, [r1, #2]
#endif #endif
add r0, r0, #16 add r0, r0, #16
add r1, r1, r3 add r1, r1, r3
@ -450,7 +460,7 @@ jpeg_idct8v:
mov r11, r11, asr #16 /* r11 = z3 = d6 */ mov r11, r11, asr #16 /* r11 = z3 = d6 */
add r8, r8, #8192 add r8, r8, #8192
add r9, r10, r11 add r9, r10, r11
mov r8, r8, asr #3 /* r8 = z4 = (d0 + 4112) << 13 */ mov r8, r8, asr #3 /* r8 = z4 = (d0 << 13) + 1024 */
mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */ mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */
ldr r14, =6270 ldr r14, =6270
mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */ mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */