mirror of
https://github.com/Rockbox/rockbox.git
synced 2025-11-20 10:32:42 -05:00
Use a hand-written constants table on ARMv5+ for the JPEG IDCT, and load four 16-bit constants at a time with ldrd. This is not useful on ARMv4, since one load per constant would still be needed, and the limited range of ldrsh would force multiple copies of the table.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21535 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
99ae7bcc43
commit
815dcfdd35
1 changed files with 86 additions and 76 deletions
|
|
@ -113,7 +113,11 @@ jpeg_idct2h:
|
||||||
results can not be stored merged.
|
results can not be stored merged.
|
||||||
*/
|
*/
|
||||||
stmdb sp!, { r4-r5, lr }
|
stmdb sp!, { r4-r5, lr }
|
||||||
|
#if ARM_ARCH < 5
|
||||||
ldr r14, =4112
|
ldr r14, =4112
|
||||||
|
#else
|
||||||
|
ldrsh r14, .Lpool4+2
|
||||||
|
#endif
|
||||||
1:
|
1:
|
||||||
ldrsh r12, [r0]
|
ldrsh r12, [r0]
|
||||||
ldrsh r4, [r0, #2]
|
ldrsh r4, [r0, #2]
|
||||||
|
|
@ -140,7 +144,7 @@ jpeg_idct2h:
|
||||||
ldmia sp!, { r4-r5, pc }
|
ldmia sp!, { r4-r5, pc }
|
||||||
#else
|
#else
|
||||||
stmdb sp!, { r4, lr }
|
stmdb sp!, { r4, lr }
|
||||||
ldr r14, =4112
|
ldrsh r14, .Lpool4+2
|
||||||
1:
|
1:
|
||||||
ldr r12, [r0]
|
ldr r12, [r0]
|
||||||
sadd16 r12, r12, r14
|
sadd16 r12, r12, r14
|
||||||
|
|
@ -198,27 +202,26 @@ jpeg_idct4v:
|
||||||
ldmia sp!, { r4-r7, pc }
|
ldmia sp!, { r4-r7, pc }
|
||||||
#elif ARM_ARCH < 6
|
#elif ARM_ARCH < 6
|
||||||
stmdb sp!, { r4-r8, lr }
|
stmdb sp!, { r4-r8, lr }
|
||||||
ldr r8, =1024
|
mov r8, #1024
|
||||||
ldr r14, =4433
|
ldrd r4, .Lpool4
|
||||||
ldr r12, =3302955134
|
|
||||||
1:
|
1:
|
||||||
ldrsh r5, [r0, #48]
|
ldrsh r14, [r0, #48]
|
||||||
ldrsh r3, [r0, #16]
|
ldrsh r3, [r0, #16]
|
||||||
ldrsh r4, [r0, #32]
|
ldrsh r12, [r0, #32]
|
||||||
ldrsh r2, [r0]
|
ldrsh r2, [r0]
|
||||||
add r6, r3, r5 /* r6 = z1 = d1 + d3 */
|
add r6, r3, r14 /* r6 = z1 = d1 + d3 */
|
||||||
add r7, r2, r4 /* r7 = tmp10 >> 2 = d0 + d2 */
|
add r7, r2, r12 /* r7 = tmp10 >> 2 = d0 + d2 */
|
||||||
smlabb r6, r14, r6, r8 /* z1 *= 4433 */
|
smlabb r6, r5, r6, r8 /* z1 *= 4433 */
|
||||||
sub r2, r2, r4 /* r2 = tmp12 >> 2= d0 - d2 */
|
sub r2, r2, r12 /* r2 = tmp12 >> 2= d0 - d2 */
|
||||||
smlabb r3, r12, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
|
smlatb r3, r5, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
|
||||||
smlatb r5, r12, r5, r6 /* r5 = tmp0 = z1 - z3 * 15137 */
|
smlabb r14, r4, r14, r6 /* r14 = tmp0 = z1 - z3 * 15137 */
|
||||||
mov r7, r7, lsl #2
|
mov r7, r7, lsl #2
|
||||||
mov r2, r2, lsl #2
|
mov r2, r2, lsl #2
|
||||||
add r4, r7, r3, asr #11 /* r4 = o0 */
|
add r12, r7, r3, asr #11 /* r12 = o0 */
|
||||||
sub r7, r7, r3, asr #11 /* r7 = o3 */
|
sub r7, r7, r3, asr #11 /* r7 = o3 */
|
||||||
add r3, r2, r5, asr #11 /* r3 = o1 */
|
add r3, r2, r14, asr #11 /* r3 = o1 */
|
||||||
sub r2, r2, r5, asr #11 /* r2 = o2 */
|
sub r2, r2, r14, asr #11 /* r2 = o2 */
|
||||||
strh r4, [r0]
|
strh r12, [r0]
|
||||||
strh r7, [r0, #48]
|
strh r7, [r0, #48]
|
||||||
strh r3, [r0, #16]
|
strh r3, [r0, #16]
|
||||||
strh r2, [r0, #32]
|
strh r2, [r0, #32]
|
||||||
|
|
@ -228,9 +231,8 @@ jpeg_idct4v:
|
||||||
ldmia sp!, { r4-r8, pc }
|
ldmia sp!, { r4-r8, pc }
|
||||||
#else
|
#else
|
||||||
stmdb sp!, { r4-r10, lr }
|
stmdb sp!, { r4-r10, lr }
|
||||||
ldr r2, =1024
|
ldrd r2, .Lpool4
|
||||||
ldr r3, =4433
|
mov r12, #1024
|
||||||
ldr r12, =3302955134
|
|
||||||
1:
|
1:
|
||||||
ldr r6, [r0, #32]
|
ldr r6, [r0, #32]
|
||||||
ldr r4, [r0]
|
ldr r4, [r0]
|
||||||
|
|
@ -247,12 +249,12 @@ jpeg_idct4v:
|
||||||
/* multiplication expands values beyond 16 bits, so this part needs to be
|
/* multiplication expands values beyond 16 bits, so this part needs to be
|
||||||
split. the values will be merged below so that the rest of the addition
|
split. the values will be merged below so that the rest of the addition
|
||||||
can be done in parallel */
|
can be done in parallel */
|
||||||
smlabb r9, r3, r6, r2 /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */
|
smlabb r9, r3, r6, r12 /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */
|
||||||
smlabt r6, r3, r6, r2 /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */
|
smlabt r6, r3, r6, r12 /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */
|
||||||
smlabb r10, r12, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
|
smlatb r10, r3, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
|
||||||
smlatb r14, r12, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
|
smlabb r14, r2, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
|
||||||
smlabt r5, r12, r5, r6 /* r5 = tmp2[1] */
|
smlatt r5, r3, r5, r6 /* r5 = tmp2[1] */
|
||||||
smlatt r6, r12, r7, r6 /* r6 = tmp0[1] */
|
smlabt r6, r2, r7, r6 /* r6 = tmp0[1] */
|
||||||
mov r8, r8, lsl #2 /* complete the parallel shift started */
|
mov r8, r8, lsl #2 /* complete the parallel shift started */
|
||||||
mov r4, r4, lsl #2 /* with the earlier bic instructions */
|
mov r4, r4, lsl #2 /* with the earlier bic instructions */
|
||||||
/* tmp2 are in r10, r5; tmp0 are in r14, r6 */
|
/* tmp2 are in r10, r5; tmp0 are in r14, r6 */
|
||||||
|
|
@ -276,6 +278,17 @@ jpeg_idct4v:
|
||||||
#endif
|
#endif
|
||||||
.size jpeg_idct4v, .-jpeg_idct4v
|
.size jpeg_idct4v, .-jpeg_idct4v
|
||||||
|
|
||||||
|
/* .Lpool4: hand-written table of the four 16-bit constants shared by the
 * ARMv5+ code paths in this file. It is read either as a whole with
 * "ldrd rN, .Lpool4" or one halfword at a time with "ldrsh rX, .Lpool4+2".
 *
 * Byte offset / value / how the visible code consumes it after ldrd rN:
 *   +0  -15137  bottom half of rN    (smlabb/smlabt ..., rN, ...   "z3 * 15137")
 *   +2    4112  top half of rN       (rN, lsr #16; also ldrsh .Lpool4+2)
 *   +4    4433  bottom half of rN+1  (smulbb/smlabb ..., rN+1, ... "z1 *= 4433")
 *   +6    6270  top half of rN+1     (smlatb/smlatt ..., rN+1, ... "z2 * 6270")
 * The bottom/top assignment assumes a little-endian target -- TODO confirm.
 * The order of the entries must not change: every user selects a constant
 * by register half (b/t multiply variants) or by byte offset.
 */
#if ARM_ARCH > 4
|
||||||
|
.align 4 /* NOTE(review): GNU as on ARM reads this as 2^4 = 16-byte alignment; ldrd presumably needs only 8 -- confirm intent */
|
||||||
|
.Lpool4:
|
||||||
|
.short -15137 /* +0: subtracted term multiplier (comments elsewhere: "z1 - z3 * 15137") */
|
||||||
|
.short 4112 /* +2: offset added to d0 before the row transform */
|
||||||
|
.short 4433 /* +4: multiplier applied to z1 = d1 + d3 */
|
||||||
|
.short 6270 /* +6: multiplier applied to z2 (d1) */
|
||||||
|
|
||||||
|
.align 2 /* presumably restores 4-byte alignment for the instructions that follow */
|
||||||
|
#endif
|
||||||
|
|
||||||
jpeg_idct4h:
|
jpeg_idct4h:
|
||||||
#if ARM_ARCH < 5
|
#if ARM_ARCH < 5
|
||||||
stmdb sp!, { r4-r10, lr }
|
stmdb sp!, { r4-r10, lr }
|
||||||
|
|
@ -328,88 +341,85 @@ jpeg_idct4h:
|
||||||
cmp r0, r2
|
cmp r0, r2
|
||||||
bcc 1b
|
bcc 1b
|
||||||
ldmia sp!, { r4-r10, pc }
|
ldmia sp!, { r4-r10, pc }
|
||||||
#elif ARM_ARCH < 6
|
#elif ARM_ARCH < 6 || 1
|
||||||
stmdb sp!, { r4-r10, lr }
|
stmdb sp!, { r4-r9, lr }
|
||||||
ldr r10, =4433
|
ldrd r4, .Lpool4
|
||||||
ldr r14, =4112
|
|
||||||
ldr r12, =3302955134
|
|
||||||
1:
|
1:
|
||||||
ldrsh r7, [r0, #6]
|
ldrsh r7, [r0, #6]
|
||||||
ldrsh r5, [r0, #2]
|
ldrsh r14, [r0, #2]
|
||||||
ldrsh r4, [r0]
|
ldrsh r12, [r0]
|
||||||
ldrsh r6, [r0, #4]
|
ldrsh r6, [r0, #4]
|
||||||
add r8, r5, r7 /* r8 = z1 = d1 + d3 */
|
add r8, r14, r7 /* r8 = z1 = d1 + d3 */
|
||||||
add r4, r4, r14
|
add r12, r12, r4, lsr #16
|
||||||
smulbb r8, r10, r8 /* z1 *= 4433 */
|
smulbb r8, r5, r8 /* z1 *= 4433 */
|
||||||
add r9, r4, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
|
add r9, r12, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
|
||||||
smlabb r5, r12, r5, r8 /* r5 = tmp2 = z1 + z2 * 6270 */
|
smlatb r14, r5, r14, r8 /* r14= tmp2 = z1 + z2 * 6270 */
|
||||||
smlatb r7, r12, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
|
smlabb r7, r4, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
|
||||||
sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 */
|
sub r12, r12, r6 /* r12= tmp12 >> 13 = d0 - d2 */
|
||||||
add r6, r5, r9, lsl #13 /* r6 = o0 */
|
add r6, r14, r9, lsl #13 /* r6 = o0 */
|
||||||
rsb r9, r5, r9, lsl #13 /* r9 = o3 */
|
rsb r9, r14, r9, lsl #13 /* r9 = o3 */
|
||||||
add r5, r7, r4, lsl #13 /* r5 = o1 */
|
add r14, r7, r12, lsl #13 /* r14= o1 */
|
||||||
rsb r4, r7, r4, lsl #13 /* r4 = o2 */
|
rsb r12, r7, r12, lsl #13 /* r12= o2 */
|
||||||
mov r6, r6, asr #18
|
mov r6, r6, asr #18
|
||||||
mov r5, r5, asr #18
|
mov r14, r14, asr #18
|
||||||
mov r4, r4, asr #18
|
mov r12, r12, asr #18
|
||||||
mov r9, r9, asr #18
|
mov r9, r9, asr #18
|
||||||
cmp r6, #255
|
cmp r6, #255
|
||||||
mvnhi r6, r6, asr #31
|
mvnhi r6, r6, asr #31
|
||||||
cmp r5, #255
|
cmp r14, #255
|
||||||
mvnhi r5, r5, asr #31
|
mvnhi r14, r14, asr #31
|
||||||
cmp r4, #255
|
cmp r12, #255
|
||||||
mvnhi r4, r4, asr #31
|
mvnhi r12, r12, asr #31
|
||||||
cmp r9, #255
|
cmp r9, #255
|
||||||
mvnhi r9, r9, asr #31
|
mvnhi r9, r9, asr #31
|
||||||
#ifdef HAVE_LCD_COLOR
|
#ifdef HAVE_LCD_COLOR
|
||||||
strb r6, [r1]
|
strb r6, [r1]
|
||||||
strb r5, [r1, #4]
|
strb r14, [r1, #4]
|
||||||
strb r4, [r1, #8]
|
strb r12, [r1, #8]
|
||||||
strb r9, [r1, #12]
|
strb r9, [r1, #12]
|
||||||
#else
|
#else
|
||||||
strb r6, [r1]
|
strb r6, [r1]
|
||||||
strb r5, [r1, #1]
|
strb r14, [r1, #1]
|
||||||
strb r4, [r1, #2]
|
strb r12, [r1, #2]
|
||||||
strb r9, [r1, #3]
|
strb r9, [r1, #3]
|
||||||
#endif
|
#endif
|
||||||
add r0, r0, #16
|
add r0, r0, #16
|
||||||
add r1, r1, r3
|
add r1, r1, r3
|
||||||
cmp r0, r2
|
cmp r0, r2
|
||||||
bcc 1b
|
bcc 1b
|
||||||
ldmia sp!, { r4-r10, pc }
|
ldmia sp!, { r4-r9, pc }
|
||||||
#else
|
#else
|
||||||
stmdb sp!, { r4-r9, lr }
|
stmdb sp!, { r4-r9, lr }
|
||||||
ldr r9, =4433
|
ldrd r4, .Lpool4
|
||||||
ldr r14, =4112
|
mov r9, r4, lsr #16
|
||||||
ldr r12, =3302955134
|
|
||||||
1:
|
1:
|
||||||
ldmia r0, { r4-r5 }
|
ldmia r0, { r12, r14 }
|
||||||
sadd16 r4, r4, r14
|
sadd16 r12, r12, r9
|
||||||
sadd16 r6, r4, r5 /* r6lo = d0 + d2, r6hi = d1 + d3 */
|
sadd16 r6, r12, r14 /* r6lo = d0 + d2, r6hi = d1 + d3 */
|
||||||
ssub16 r7, r4, r5 /* r7lo = d0 - d2 */
|
ssub16 r7, r12, r14 /* r7lo = d0 - d2 */
|
||||||
smulbt r8, r9, r6
|
smulbt r8, r5, r6
|
||||||
sxth r6, r6
|
sxth r6, r6
|
||||||
smlabt r4, r12, r4, r8 /* r4 = tmp2 = z1 + z2 * 6270 */
|
smlatt r12, r5, r12, r8 /* r12= tmp2 = z1 + z2 * 6270 */
|
||||||
smlatt r5, r12, r5, r8 /* r5 = tmp0 = z1 - z3 * 15137 */
|
smlabt r14, r4, r14, r8 /* r14= tmp0 = z1 - z3 * 15137 */
|
||||||
sxth r7, r7
|
sxth r7, r7
|
||||||
add r8, r4, r6, lsl #13 /* r8 = o0 */
|
add r8, r12, r6, lsl #13 /* r8 = o0 */
|
||||||
rsb r6, r4, r6, lsl #13 /* r6 = o3 */
|
rsb r6, r12, r6, lsl #13 /* r6 = o3 */
|
||||||
add r4, r5, r7, lsl #13 /* r4 = o1 */
|
add r12, r14, r7, lsl #13 /* r12= o1 */
|
||||||
rsb r5, r5, r7, lsl #13 /* r5 = o2 */
|
rsb r14, r14, r7, lsl #13 /* r14= o2 */
|
||||||
usat r8, #8, r8, asr #18
|
usat r8, #8, r8, asr #18
|
||||||
usat r6, #8, r6, asr #18
|
usat r6, #8, r6, asr #18
|
||||||
usat r4, #8, r4, asr #18
|
usat r12, #8, r12, asr #18
|
||||||
usat r5, #8, r5, asr #18
|
usat r14, #8, r14, asr #18
|
||||||
#ifdef HAVE_LCD_COLOR
|
#ifdef HAVE_LCD_COLOR
|
||||||
strb r8, [r1]
|
strb r8, [r1]
|
||||||
strb r6, [r1, #12]
|
strb r6, [r1, #12]
|
||||||
strb r4, [r1, #4]
|
strb r12, [r1, #4]
|
||||||
strb r5, [r1, #8]
|
strb r14, [r1, #8]
|
||||||
#else
|
#else
|
||||||
strb r8, [r1]
|
strb r8, [r1]
|
||||||
strb r6, [r1, #3]
|
strb r6, [r1, #3]
|
||||||
strb r4, [r1, #1]
|
strb r12, [r1, #1]
|
||||||
strb r5, [r1, #2]
|
strb r14, [r1, #2]
|
||||||
#endif
|
#endif
|
||||||
add r0, r0, #16
|
add r0, r0, #16
|
||||||
add r1, r1, r3
|
add r1, r1, r3
|
||||||
|
|
@ -450,7 +460,7 @@ jpeg_idct8v:
|
||||||
mov r11, r11, asr #16 /* r11 = z3 = d6 */
|
mov r11, r11, asr #16 /* r11 = z3 = d6 */
|
||||||
add r8, r8, #8192
|
add r8, r8, #8192
|
||||||
add r9, r10, r11
|
add r9, r10, r11
|
||||||
mov r8, r8, asr #3 /* r8 = z4 = (d0 + 4112) << 13 */
|
mov r8, r8, asr #3 /* r8 = z4 = (d0 << 13) + 1024 */
|
||||||
mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */
|
mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */
|
||||||
ldr r14, =6270
|
ldr r14, =6270
|
||||||
mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */
|
mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue