rockbox/apps/plugins/mpegplayer/libmpeg2/idct_arm.S
Solomon Peachy 8c86fb6da0 arm: Use -masm-syntax-unified when compiling with gcc8 or newer
Annoyingly, this makes all of the '.S' files we compile get treated as
divided syntax, so we need to make the syntax in them explicit.

Change-Id: I56a3916b7b24c84a1214a5d6bc4ed4d651f002cf
2024-05-08 21:45:42 -04:00

445 lines
14 KiB
ArmAsm

/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2007 by Michael Sevakis
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
.syntax unified
.global mpeg2_idct_copy
.type mpeg2_idct_copy, %function
.global mpeg2_idct_add
.type mpeg2_idct_add, %function
/* Custom calling convention:
* r0 contains block pointer and is non-volatile
* all non-volatile c context saved and restored on its behalf
*/
.idct:
add r12, r0, #128
1:
ldrsh r1, [r0, #0] /* d0 */
ldrsh r2, [r0, #2] /* d1 */
ldrsh r3, [r0, #4] /* d2 */
ldrsh r4, [r0, #6] /* d3 */
ldrsh r5, [r0, #8] /* d0 */
ldrsh r6, [r0, #10] /* d1 */
ldrsh r7, [r0, #12] /* d2 */
ldrsh r8, [r0, #14] /* d3 */
orrs r9, r2, r3
orrseq r9, r4, r5
orrseq r9, r6, r7
cmpeq r8, #0
bne 2f
mov r1, r1, asl #15
bic r1, r1, #0x8000
orr r1, r1, r1, lsr #16
str r1, [r0], #4
str r1, [r0], #4
str r1, [r0], #4
str r1, [r0], #4
cmp r0, r12
blo 1b
b 3f
2:
mov r1, r1, asl #11 /* r1 = d0 = (block[0] << 11) + 2048 */
add r1, r1, #2048
add r1, r1, r3, asl #11 /* r1 = t0 = d0 + (block[2] << 11) */
sub r3, r1, r3, asl #12 /* r3 = t1 = d0 - (block[2] << 11) */
add r9, r2, r4 /* r9 = tmp = (d1+d3)*(1108/4) */
add r10, r9, r9, asl #2
add r10, r10, r9, asl #4
add r9, r10, r9, asl #8
add r10, r2, r2, asl #4 /* r2 = t2 = tmp + (d1*(1568/32)*8) */
add r2, r10, r2, asl #5
add r2, r9, r2, asl #3
add r10, r4, r4, asl #2 /* r4 = t3 = tmp - (d3*(3784/8)*2) */
rsb r10, r10, r4, asl #6
add r4, r4, r10, asl #3
sub r4, r9, r4, asl #1
/* t2 & t3 are 1/4 final value here */
add r1, r1, r2, asl #2 /* r1 = a0 = t0 + t2 */
sub r2, r1, r2, asl #3 /* r2 = a3 = t0 - t2 */
add r3, r3, r4, asl #2 /* r3 = a1 = t1 + t3 */
sub r4, r3, r4, asl #3 /* r4 = a2 = t1 - t3 */
add r9, r8, r5 /* r9 = tmp = 565*(d3 + d0) */
add r10, r9, r9, asl #4
add r10, r10, r10, asl #5
add r9, r10, r9, asl #2
add r10, r5, r5, asl #4 /* r5 = t0 = tmp + (((2276/4)*d0)*4) */
add r10, r10, r10, asl #5
add r5, r10, r5, asl #3
add r5, r9, r5, asl #2
add r10, r8, r8, asl #2 /* r8 = t1 = tmp - (((3406/2)*d3)*2) */
add r10, r10, r10, asl #4
add r10, r10, r8, asl #7
rsb r8, r8, r10, asl #3
sub r8, r9, r8, asl #1
add r9, r6, r7 /* r9 = tmp = (2408/8)*(d1 + d2) */
add r10, r9, r9, asl #3
add r10, r10, r10, asl #5
add r9, r10, r9, asl #2
add r10, r7, r7, asl #3 /* r7 = t2 = (tmp*8) - 799*d2 */
add r10, r10, r7, asl #4
rsb r7, r7, r10, asl #5
rsb r7, r7, r9, asl #3
sub r10, r6, r6, asl #4 /* r6 = t3 = (tmp*8) - 4017*d1 */
sub r10, r10, r6, asl #6
add r10, r10, r6, asl #12
add r6, r10, r6
rsb r6, r6, r9, asl #3
/* t0 = r5, t1 = r8, t2 = r7, t3 = r6*/
add r9, r5, r7 /* r9 = b0 = t0 + t2 */
add r10, r8, r6 /* r10 = b3 = t1 + t3 */
sub r5, r5, r7 /* t0 -= t2 */
sub r8, r8, r6 /* t1 -= t3 */
add r6, r5, r8 /* r6 = t0 + t1 */
sub r7, r5, r8 /* r7 = t0 - t1 */
add r11, r6, r6, asr #2 /* r6 = b1 = r6*(181/128) */
add r11, r11, r11, asr #5
add r6, r11, r6, asr #3
add r11, r7, r7, asr #2 /* r7 = b2 = r7*(181/128) */
add r11, r11, r11, asr #5
add r7, r11, r7, asr #3
/* r1 = a0, r3 = a1, r4 = a2, r2 = a3 */
/* r9 = b0, r6 = b1*2, r7 = b2*2, r10 = b3 */
add r5, r1, r9 /* block[0] = (a0 + b0) >> 12 */
mov r5, r5, asr #12
strh r5, [r0], #2
add r8, r3, r6, asr #1 /* block[1] = (a1 + b1) >> 12 */
mov r8, r8, asr #12
strh r8, [r0], #2
add r5, r4, r7, asr #1 /* block[2] = (a2 + b2) >> 12 */
mov r5, r5, asr #12
strh r5, [r0], #2
add r8, r2, r10 /* block[3] = (a3 + b3) >> 12 */
mov r8, r8, asr #12
strh r8, [r0], #2
sub r5, r2, r10 /* block[4] = (a3 - b3) >> 12 */
mov r5, r5, asr #12
strh r5, [r0], #2
sub r8, r4, r7, asr #1 /* block[5] = (a2 - b2) >> 12 */
mov r8, r8, asr #12
strh r8, [r0], #2
sub r5, r3, r6, asr #1 /* block[6] = (a1 - b1) >> 12 */
mov r5, r5, asr #12
strh r5, [r0], #2
sub r8, r1, r9 /* block[7] = (a0 - b0) >> 12 */
mov r8, r8, asr #12
strh r8, [r0], #2
cmp r0, r12
blo 1b
3:
sub r0, r0, #128
add r12, r0, #16
4:
ldrsh r1, [r0, #0*8] /* d0 */
ldrsh r2, [r0, #2*8] /* d1 */
ldrsh r3, [r0, #4*8] /* d2 */
ldrsh r4, [r0, #6*8] /* d3 */
ldrsh r5, [r0, #8*8] /* d0 */
ldrsh r6, [r0, #10*8] /* d1 */
ldrsh r7, [r0, #12*8] /* d2 */
ldrsh r8, [r0, #14*8] /* d3 */
mov r1, r1, asl #11 /* r1 = d0 = (block[0] << 11) + 2048 */
add r1, r1, #65536
add r1, r1, r3, asl #11 /* r1 = t0 = d0 + d2:(block[2] << 11) */
sub r3, r1, r3, asl #12 /* r3 = t1 = d0 - d2:(block[2] << 11) */
add r9, r2, r4 /* r9 = tmp = (d1+d3)*(1108/4) */
add r10, r9, r9, asl #2
add r10, r10, r9, asl #4
add r9, r10, r9, asl #8
add r11, r2, r2, asl #4 /* r2 = t2 = tmp + (d1*(1568/32)*8) */
add r2, r11, r2, asl #5
add r2, r9, r2, asl #3
add r10, r4, r4, asl #2 /* r4 = t3 = tmp - (d3*(3784/8)*2) */
rsb r10, r10, r4, asl #6
add r4, r4, r10, asl #3
sub r4, r9, r4, asl #1
/* t2 & t3 are 1/4 final value here */
add r1, r1, r2, asl #2 /* r1 = a0 = t0 + t2 */
sub r2, r1, r2, asl #3 /* r2 = a3 = t0 - t2 */
add r3, r3, r4, asl #2 /* r3 = a1 = t1 + t3 */
sub r4, r3, r4, asl #3 /* r4 = a2 = t1 - t3 */
add r9, r8, r5 /* r9 = tmp = 565*(d3 + d0) */
add r10, r9, r9, asl #4
add r10, r10, r10, asl #5
add r9, r10, r9, asl #2
add r10, r5, r5, asl #4 /* r5 = t0 = tmp + (((2276/4)*d0)*4) */
add r10, r10, r10, asl #5
add r5, r10, r5, asl #3
add r5, r9, r5, asl #2
add r10, r8, r8, asl #2 /* r8 = t1 = tmp - (((3406/2)*d3)*2) */
add r10, r10, r10, asl #4
add r10, r10, r8, asl #7
rsb r8, r8, r10, asl #3
sub r8, r9, r8, asl #1
add r9, r6, r7 /* r9 = tmp = (2408/8)*(d1 + d2) */
add r10, r9, r9, asl #3
add r10, r10, r10, asl #5
add r9, r10, r9, asl #2
add r10, r7, r7, asl #3 /* r7 = t2 = (tmp*8) - 799*d2 */
add r10, r10, r7, asl #4
rsb r7, r7, r10, asl #5
rsb r7, r7, r9, asl #3
sub r10, r6, r6, asl #4 /* r6 = t3 = (tmp*8) - 4017*d1 */
sub r10, r10, r6, asl #6
add r10, r10, r6, asl #12
add r6, r10, r6
rsb r6, r6, r9, asl #3
/* t0=r5, t1=r8, t2=r7, t3=r6*/
add r9, r5, r7 /* r9 = b0 = t0 + t2 */
add r10, r8, r6 /* r10 = b3 = t1 + t3 */
sub r5, r5, r7 /* t0 -= t2 */
sub r8, r8, r6 /* t1 -= t3 */
add r6, r5, r8 /* r6 = t0 + t1 */
sub r7, r5, r8 /* r7 = t0 - t1 */
add r11, r6, r6, asr #2 /* r6 = b1 = r5*(181/128) */
add r11, r11, r11, asr #5
add r6, r11, r6, asr #3
add r11, r7, r7, asr #2 /* r7 = b2 = r6*(181/128) */
add r11, r11, r11, asr #5
add r7, r11, r7, asr #3
/* r1 = a0, r3 = a1, r4 = a2, r2 = a3 */
/* r9 = b0, r6 = b1*2, r7 = b2*2, r10 = b3 */
add r5, r1, r9 /* block[0] = (a0 + b0) >> 17 */
mov r5, r5, asr #17
strh r5, [r0, #0*8]
add r8, r3, r6, asr #1 /* block[1] = (a1 + b1) >> 17 */
mov r8, r8, asr #17
strh r8, [r0, #2*8]
add r5, r4, r7, asr #1 /* block[2] = (a2 + b2) >> 17 */
mov r5, r5, asr #17
strh r5, [r0, #4*8]
add r8, r2, r10 /* block[3] = (a3 + b3) >> 17 */
mov r8, r8, asr #17
strh r8, [r0, #6*8]
sub r5, r2, r10 /* block[4] = (a3 - b3) >> 17 */
mov r5, r5, asr #17
strh r5, [r0, #8*8]
sub r8, r4, r7, asr #1 /* block[5] = (a2 - b2) >> 17 */
mov r8, r8, asr #17
strh r8, [r0, #10*8]
sub r5, r3, r6, asr #1 /* block[6] = (a1 - b1) >> 17 */
mov r5, r5, asr #17
strh r5, [r0, #12*8]
sub r8, r1, r9 /* block[7] = (a0 - b0) >> 17 */
mov r8, r8, asr #17
strh r8, [r0, #14*8]
add r0, r0, #2
cmp r0, r12
blo 4b
sub r0, r0, #16
bx lr
mpeg2_idct_copy:
stmfd sp!, { r1-r2, r4-r11, lr }
bl .idct
ldmfd sp!, { r1-r2 }
mov r11, #0
add r12, r0, #128
1:
ldrsh r3, [r0, #0]
ldrsh r4, [r0, #2]
ldrsh r5, [r0, #4]
ldrsh r6, [r0, #6]
ldrsh r7, [r0, #8]
ldrsh r8, [r0, #10]
ldrsh r9, [r0, #12]
ldrsh r10, [r0, #14]
cmp r3, #255
mvnhi r3, r3, asr #31
strb r3, [r1, #0]
str r11, [r0], #4
cmp r4, #255
mvnhi r4, r4, asr #31
strb r4, [r1, #1]
cmp r5, #255
mvnhi r5, r5, asr #31
strb r5, [r1, #2]
str r11, [r0], #4
cmp r6, #255
mvnhi r6, r6, asr #31
strb r6, [r1, #3]
cmp r7, #255
mvnhi r7, r7, asr #31
strb r7, [r1, #4]
str r11, [r0], #4
cmp r8, #255
mvnhi r8, r8, asr #31
strb r8, [r1, #5]
cmp r9, #255
mvnhi r9, r9, asr #31
strb r9, [r1, #6]
str r11, [r0], #4
cmp r10, #255
mvnhi r10, r10, asr #31
strb r10, [r1, #7]
add r1, r1, r2
cmp r0, r12
blo 1b
ldmpc regs=r4-r11
mpeg2_idct_add:
cmp r0, #129
mov r0, r1
ldrsheq r1, [r0, #0]
bne 1f
and r1, r1, #0x70
cmp r1, #0x40
bne 3f
1:
stmfd sp!, { r2-r11, lr }
bl .idct
ldmfd sp!, { r1-r2 }
mov r11, #0
add r12, r0, #128
2:
ldrb r3, [r1, #0]
ldrb r4, [r1, #1]
ldrb r5, [r1, #2]
ldrb r6, [r1, #3]
ldrsh r7, [r0, #0]
ldrsh r8, [r0, #2]
ldrsh r9, [r0, #4]
ldrsh r10, [r0, #6]
add r7, r7, r3
ldrb r3, [r1, #4]
cmp r7, #255
mvnhi r7, r7, asr #31
strb r7, [r1, #0]
ldrsh r7, [r0, #8]
add r8, r8, r4
ldrb r4, [r1, #5]
cmp r8, #255
mvnhi r8, r8, asr #31
strb r8, [r1, #1]
ldrsh r8, [r0, #10]
add r9, r9, r5
ldrb r5, [r1, #6]
cmp r9, #255
mvnhi r9, r9, asr #31
strb r9, [r1, #2]
ldrsh r9, [r0, #12]
add r10, r10, r6
ldrb r6, [r1, #7]
cmp r10, #255
mvnhi r10, r10, asr #31
strb r10, [r1, #3]
ldrsh r10, [r0, #14]
str r11, [r0], #4
add r7, r7, r3
cmp r7, #255
mvnhi r7, r7, asr #31
strb r7, [r1, #4]
str r11, [r0], #4
add r8, r8, r4
cmp r8, #255
mvnhi r8, r8, asr #31
strb r8, [r1, #5]
str r11, [r0], #4
add r9, r9, r5
cmp r9, #255
mvnhi r9, r9, asr #31
strb r9, [r1, #6]
add r10, r10, r6
cmp r10, #255
mvnhi r10, r10, asr #31
strb r10, [r1, #7]
str r11, [r0], #4
add r1, r1, r2
cmp r0, r12
blo 2b
ldmpc regs=r4-r11
3:
stmfd sp!, { r4-r5, lr }
ldrsh r1, [r0, #0] /* r1 = block[0] */
mov r4, #0
strh r4, [r0, #0] /* block[0] = 0 */
strh r4, [r0, #126] /* block[63] = 0 */
add r1, r1, #64 /* r1 = DC << 7 */
add r0, r2, r3, asl #3
4:
ldrb r4, [r2, #0]
ldrb r5, [r2, #1]
ldrb r12, [r2, #2]
ldrb lr, [r2, #3]
add r4, r4, r1, asr #7
cmp r4, #255
mvnhi r4, r4, asr #31
strb r4, [r2, #0]
add r5, r5, r1, asr #7
cmp r5, #255
mvnhi r5, r5, asr #31
strb r5, [r2, #1]
add r12, r12, r1, asr #7
cmp r12, #255
mvnhi r12, r12, asr #31
strb r12, [r2, #2]
add lr, lr, r1, asr #7
cmp lr, #255
mvnhi lr, lr, asr #31
strb lr, [r2, #3]
ldrb r4, [r2, #4]
ldrb r5, [r2, #5]
ldrb r12, [r2, #6]
ldrb lr, [r2, #7]
add r4, r4, r1, asr #7
cmp r4, #255
mvnhi r4, r4, asr #31
strb r4, [r2, #4]
add r5, r5, r1, asr #7
cmp r5, #255
mvnhi r5, r5, asr #31
strb r5, [r2, #5]
add r12, r12, r1, asr #7
cmp r12, #255
mvnhi r12, r12, asr #31
strb r12, [r2, #6]
add lr, lr, r1, asr #7
cmp lr, #255
mvnhi lr, lr, asr #31
strb lr, [r2, #7]
add r2, r2, r3
cmp r2, r0
blo 4b
ldmpc regs=r4-r5