1
0
Fork 0
forked from len0rd/rockbox

Branch optimisation in both C (giving hints to gcc - verified using -fprofile-arcs and gcov) and asm files. Biggest effect on coldfire (-c1000: +8%, -c2000: +5%), but ARM also profits a bit (less than 1% on ARM7TDMI, around 1% on ARM1136).

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19199 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Jens Arnold 2008-11-24 18:40:49 +00:00
parent 66c0cf2eb1
commit 3761c0108c
6 changed files with 143 additions and 121 deletions

View file

@ -70,6 +70,15 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
#define ICODE_ATTR
#define ICODE_ATTR_DEMAC
/* Branch-prediction hints: wrap a condition in LIKELY() when it is expected
 * to be true most of the time, and in UNLIKELY() when it is expected to be
 * false most of the time. Both macros evaluate their argument exactly once
 * and yield 1 for a non-zero argument and 0 otherwise.
 *
 * With gcc 3 or newer the hint is handed to the compiler through
 * __builtin_expect(). On any other compiler no hint is given; the macros
 * only normalise the condition to 0/1 so that their value does not depend
 * on which compiler is in use. */
#if defined(__GNUC__) && __GNUC__ >= 3
#define LIKELY(x)   __builtin_expect(!!(x), 1)
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
#define LIKELY(x)   (!!(x))
#define UNLIKELY(x) (!!(x))
#endif
#endif /* !ROCKBOX */
/* Defaults */

View file

@ -283,13 +283,13 @@ static inline void update_rice(struct rice_t* rice, int x)
{
rice->ksum += ((x + 1) / 2) - ((rice->ksum + 16) >> 5);
if (rice->k == 0) {
if (UNLIKELY(rice->k == 0)) {
rice->k = 1;
} else {
uint32_t lim = 1 << (rice->k + 4);
if (rice->ksum < lim) {
if (UNLIKELY(rice->ksum < lim)) {
rice->k--;
} else if (rice->ksum >= 2 * lim) {
} else if (UNLIKELY(rice->ksum >= 2 * lim)) {
rice->k++;
}
}
@ -300,11 +300,12 @@ static inline int entropy_decode3980(struct rice_t* rice)
int base, x, pivot, overflow;
pivot = rice->ksum >> 5;
if (pivot == 0) pivot=1;
if (UNLIKELY(pivot == 0))
pivot=1;
overflow = range_get_symbol_3980();
if (overflow == (MODEL_ELEMENTS-1)) {
if (UNLIKELY(overflow == (MODEL_ELEMENTS-1))) {
overflow = range_decode_short() << 16;
overflow |= range_decode_short();
}
@ -352,7 +353,7 @@ static inline int entropy_decode3970(struct rice_t* rice)
int overflow = range_get_symbol_3970();
if (overflow == (MODEL_ELEMENTS - 1)) {
if (UNLIKELY(overflow == (MODEL_ELEMENTS - 1))) {
tmpk = range_decode_bits(5);
overflow = 0;
} else {
@ -435,13 +436,13 @@ int ICODE_ATTR_DEMAC entropy_decode(struct ape_ctx_t* ape_ctx,
memset(decoded1, 0, blockstodecode * sizeof(int32_t));
} else {
if (ape_ctx->fileversion > 3970) {
while (blockstodecode--) {
while (LIKELY(blockstodecode--)) {
*(decoded0++) = entropy_decode3980(&riceY);
if (decoded1 != NULL)
*(decoded1++) = entropy_decode3980(&riceX);
}
} else {
while (blockstodecode--) {
while (LIKELY(blockstodecode--)) {
*(decoded0++) = entropy_decode3970(&riceY);
if (decoded1 != NULL)
*(decoded1++) = entropy_decode3970(&riceX);

View file

@ -100,7 +100,7 @@ struct filter_t {
#if defined(CPU_ARM) && (ARM_ARCH >= 6)
#define SATURATE(x) ({int __res; asm("ssat %0, #16, %1" : "=r"(__res) : "r"(x)); __res; })
#else
#define SATURATE(x) (((x) == (int16_t)(x)) ? (x) : ((x) >> 31) ^ 0x7FFF);
/* Clamp x to the signed 16-bit range. A value that already fits (hinted as
 * the common case) is returned unchanged. Otherwise (x >> 31) ^ 0x7FFF
 * yields 0x7FFF for a non-negative x and -0x8000 for a negative x; this
 * assumes a 32-bit int with an arithmetic right shift - TODO confirm for
 * every supported target. Note that x is evaluated more than once, so do
 * not pass an expression with side effects.
 * The definition deliberately has no trailing semicolon so that the macro
 * can be used inside larger expressions and as a function argument. */
#define SATURATE(x) (LIKELY((x) == (int16_t)(x)) ? (x) : (((x) >> 31) ^ 0x7FFF))
#endif
/* Apply the filter with state f to count entries in data[] */
@ -109,20 +109,22 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
int32_t* data, int count)
{
int res;
int absres;
int absres;
#ifdef PREPARE_SCALARPRODUCT
PREPARE_SCALARPRODUCT
#endif
while(count--)
while(LIKELY(count--))
{
res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER));
if (*data < 0)
vector_add(f->coeffs, f->adaptcoeffs - ORDER);
else if (*data > 0)
vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
if (LIKELY(*data != 0)) {
if (*data < 0)
vector_add(f->coeffs, f->adaptcoeffs - ORDER);
else
vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
}
res += *data;
@ -136,11 +138,11 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
/* Update the adaption coefficients */
absres = (res < 0 ? -res : res);
if (absres > (f->avg * 3))
if (UNLIKELY(absres > (f->avg * 3)))
*f->adaptcoeffs = ((res >> 25) & 64) - 32;
else if (absres > (f->avg * 4) / 3)
*f->adaptcoeffs = ((res >> 26) & 32) - 16;
else if (absres > 0)
else if (LIKELY(absres > 0))
*f->adaptcoeffs = ((res >> 27) & 16) - 8;
else
*f->adaptcoeffs = 0;
@ -154,7 +156,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
f->adaptcoeffs++;
/* Have we filled the history buffer? */
if (f->delay == f->history_end) {
if (UNLIKELY(f->delay == f->history_end)) {
memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
(ORDER*2) * sizeof(filter_int));
f->adaptcoeffs = f->coeffs + ORDER*2;
@ -172,14 +174,16 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
PREPARE_SCALARPRODUCT
#endif
while(count--)
while(LIKELY(count--))
{
res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER));
if (*data < 0)
vector_add(f->coeffs, f->adaptcoeffs - ORDER);
else if (*data > 0)
vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
if (LIKELY(*data != 0)) {
if (*data < 0)
vector_add(f->coeffs, f->adaptcoeffs - ORDER);
else
vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
}
/* Convert res from (32-FRACBITS).FRACBITS fixed-point format to an
integer (rounding to nearest) and add the input value to
@ -199,7 +203,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
f->adaptcoeffs++;
/* Have we filled the history buffer? */
if (f->delay == f->history_end) {
if (UNLIKELY(f->delay == f->history_end)) {
memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
(ORDER*2) * sizeof(filter_int));
f->adaptcoeffs = f->coeffs + ORDER*2;

View file

@ -468,14 +468,24 @@ loop:
add r11, r12, #historybuffer @ r11 := &p->historybuffer[0]
sub r10, r14, #PREDICTOR_HISTORY_SIZE*4
sub r10, r14, #PREDICTOR_HISTORY_SIZE*4
@ r10 := p->buf - PREDICTOR_HISTORY_SIZE
cmp r10, r11
bne endofloop
beq move_hist @ The history buffer is full, we need to do a memmove
@ The history buffer is full, we need to do a memmove:
@ Check loop count
ldr r0, [sp, #8]
subs r0, r0, #1
strne r0, [sp, #8]
bne loop
done:
str r14, [r12] @ Save value of p->buf
add sp, sp, #12 @ Don't bother restoring r1-r3
ldmia sp!, {r4-r11, pc}
move_hist:
@ dest = r11 (p->historybuffer)
@ src = r14 (p->buf)
@ n = 200
@ -493,15 +503,10 @@ loop:
add r14, r12, #historybuffer @ p->buf = &p->historybuffer[0]
endofloop:
@ Check loop count
@ Check loop count
ldr r0, [sp, #8]
subs r0, r0, #1
strne r0, [sp, #8]
bne loop
done:
str r14, [r12] @ Save value of p->buf
add sp, sp, #12 @ Don't bother restoring r1-r3
ldmia sp!, {r4-r11, pc}
b done

View file

@ -486,10 +486,18 @@ predictor_decode_stereo:
| %a3 = &p->historybuffer[PREDICTOR_HISTORY_SIZE]
cmp.l %a3, %a5
bne.s .endofloop
beq.s .move_hist | The history buffer is full, we need to do a memmove
| The history buffer is full, we need to do a memmove:
subq.l #1, (8,%sp) | decrease loop count
bne.w .loop
.done:
move.l %a5, (%a6) | Save value of p->buf
movem.l (3*4,%sp), %d2-%d7/%a2-%a6
lea.l (14*4,%sp), %sp
rts
.move_hist:
lea.l (historybuffer,%a6), %a3
| dest = %a3 (p->historybuffer)
@ -497,33 +505,19 @@ predictor_decode_stereo:
| n = 200
movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes
lea.l (40,%a5), %a5
movem.l %d0-%d7/%a0-%a1, (%a3)
lea.l (40,%a3), %a3
movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes
lea.l (40,%a5), %a5
movem.l %d0-%d7/%a0-%a1, (%a3)
lea.l (40,%a3), %a3
movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes
lea.l (40,%a5), %a5
movem.l %d0-%d7/%a0-%a1, (%a3)
lea.l (40,%a3), %a3
movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes
lea.l (40,%a5), %a5
movem.l %d0-%d7/%a0-%a1, (%a3)
lea.l (40,%a3), %a3
movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes
lea.l (40,%a5), %a5
movem.l %d0-%d7/%a0-%a1, (%a3)
lea.l (40,%a3), %a3
movem.l (40,%a5), %d0-%d7/%a0-%a1 | 40 bytes
movem.l %d0-%d7/%a0-%a1, (40,%a3)
movem.l (80,%a5), %d0-%d7/%a0-%a1 | 40 bytes
movem.l %d0-%d7/%a0-%a1, (80,%a3)
movem.l (120,%a5), %d0-%d7/%a0-%a1 | 40 bytes
movem.l %d0-%d7/%a0-%a1, (120,%a3)
movem.l (160,%a5), %d0-%d7/%a0-%a1 | 40 bytes
movem.l %d0-%d7/%a0-%a1, (160,%a3)
lea.l (historybuffer,%a6), %a5 | p->buf = &p->historybuffer[0]
move.l %a3, %a5 | p->buf = &p->historybuffer[0]
.endofloop:
subq.l #1, (8,%sp) | decrease loop count
bne.w .loop
move.l %a5, (%a6) | Save value of p->buf
movem.l (3*4,%sp), %d2-%d7/%a2-%a6
lea.l (14*4,%sp), %sp
rts
bra.s .done

View file

@ -75,7 +75,7 @@ int ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
{
int32_t predictionA, predictionB;
while (count--)
while (LIKELY(count--))
{
/* Predictor Y */
p->buf[YDELAYA] = p->YlastA;
@ -134,60 +134,66 @@ int ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
p->XlastA = *decoded1 + ((predictionA + (predictionB >> 1)) >> 10);
p->XfilterA = p->XlastA + ((p->XfilterA * 31) >> 5);
if (*decoded0 > 0)
if (LIKELY(*decoded0 != 0))
{
p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA];
p->YcoeffsA[1] -= p->buf[YADAPTCOEFFSA-1];
p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2];
p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3];
if (*decoded0 > 0)
{
p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA];
p->YcoeffsA[1] -= p->buf[YADAPTCOEFFSA-1];
p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2];
p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3];
p->YcoeffsB[0] -= p->buf[YADAPTCOEFFSB];
p->YcoeffsB[1] -= p->buf[YADAPTCOEFFSB-1];
p->YcoeffsB[2] -= p->buf[YADAPTCOEFFSB-2];
p->YcoeffsB[3] -= p->buf[YADAPTCOEFFSB-3];
p->YcoeffsB[4] -= p->buf[YADAPTCOEFFSB-4];
}
else if (*decoded0 < 0)
{
p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA];
p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1];
p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2];
p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3];
p->YcoeffsB[0] -= p->buf[YADAPTCOEFFSB];
p->YcoeffsB[1] -= p->buf[YADAPTCOEFFSB-1];
p->YcoeffsB[2] -= p->buf[YADAPTCOEFFSB-2];
p->YcoeffsB[3] -= p->buf[YADAPTCOEFFSB-3];
p->YcoeffsB[4] -= p->buf[YADAPTCOEFFSB-4];
}
else
{
p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA];
p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1];
p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2];
p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3];
p->YcoeffsB[0] += p->buf[YADAPTCOEFFSB];
p->YcoeffsB[1] += p->buf[YADAPTCOEFFSB-1];
p->YcoeffsB[2] += p->buf[YADAPTCOEFFSB-2];
p->YcoeffsB[3] += p->buf[YADAPTCOEFFSB-3];
p->YcoeffsB[4] += p->buf[YADAPTCOEFFSB-4];
p->YcoeffsB[0] += p->buf[YADAPTCOEFFSB];
p->YcoeffsB[1] += p->buf[YADAPTCOEFFSB-1];
p->YcoeffsB[2] += p->buf[YADAPTCOEFFSB-2];
p->YcoeffsB[3] += p->buf[YADAPTCOEFFSB-3];
p->YcoeffsB[4] += p->buf[YADAPTCOEFFSB-4];
}
}
*(decoded0++) = p->YfilterA;
if (*decoded1 > 0)
if (LIKELY(*decoded1 != 0))
{
p->XcoeffsA[0] -= p->buf[XADAPTCOEFFSA];
p->XcoeffsA[1] -= p->buf[XADAPTCOEFFSA-1];
p->XcoeffsA[2] -= p->buf[XADAPTCOEFFSA-2];
p->XcoeffsA[3] -= p->buf[XADAPTCOEFFSA-3];
if (*decoded1 > 0)
{
p->XcoeffsA[0] -= p->buf[XADAPTCOEFFSA];
p->XcoeffsA[1] -= p->buf[XADAPTCOEFFSA-1];
p->XcoeffsA[2] -= p->buf[XADAPTCOEFFSA-2];
p->XcoeffsA[3] -= p->buf[XADAPTCOEFFSA-3];
p->XcoeffsB[0] -= p->buf[XADAPTCOEFFSB];
p->XcoeffsB[1] -= p->buf[XADAPTCOEFFSB-1];
p->XcoeffsB[2] -= p->buf[XADAPTCOEFFSB-2];
p->XcoeffsB[3] -= p->buf[XADAPTCOEFFSB-3];
p->XcoeffsB[4] -= p->buf[XADAPTCOEFFSB-4];
}
else if (*decoded1 < 0)
{
p->XcoeffsA[0] += p->buf[XADAPTCOEFFSA];
p->XcoeffsA[1] += p->buf[XADAPTCOEFFSA-1];
p->XcoeffsA[2] += p->buf[XADAPTCOEFFSA-2];
p->XcoeffsA[3] += p->buf[XADAPTCOEFFSA-3];
p->XcoeffsB[0] -= p->buf[XADAPTCOEFFSB];
p->XcoeffsB[1] -= p->buf[XADAPTCOEFFSB-1];
p->XcoeffsB[2] -= p->buf[XADAPTCOEFFSB-2];
p->XcoeffsB[3] -= p->buf[XADAPTCOEFFSB-3];
p->XcoeffsB[4] -= p->buf[XADAPTCOEFFSB-4];
}
else
{
p->XcoeffsA[0] += p->buf[XADAPTCOEFFSA];
p->XcoeffsA[1] += p->buf[XADAPTCOEFFSA-1];
p->XcoeffsA[2] += p->buf[XADAPTCOEFFSA-2];
p->XcoeffsA[3] += p->buf[XADAPTCOEFFSA-3];
p->XcoeffsB[0] += p->buf[XADAPTCOEFFSB];
p->XcoeffsB[1] += p->buf[XADAPTCOEFFSB-1];
p->XcoeffsB[2] += p->buf[XADAPTCOEFFSB-2];
p->XcoeffsB[3] += p->buf[XADAPTCOEFFSB-3];
p->XcoeffsB[4] += p->buf[XADAPTCOEFFSB-4];
p->XcoeffsB[0] += p->buf[XADAPTCOEFFSB];
p->XcoeffsB[1] += p->buf[XADAPTCOEFFSB-1];
p->XcoeffsB[2] += p->buf[XADAPTCOEFFSB-2];
p->XcoeffsB[3] += p->buf[XADAPTCOEFFSB-3];
p->XcoeffsB[4] += p->buf[XADAPTCOEFFSB-4];
}
}
*(decoded1++) = p->XfilterA;
@ -196,7 +202,7 @@ int ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
p->buf++;
/* Have we filled the history buffer? */
if (p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE) {
if (UNLIKELY(p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE)) {
memmove(p->historybuffer, p->buf,
PREDICTOR_SIZE * sizeof(int32_t));
p->buf = p->historybuffer;
@ -215,7 +221,7 @@ int ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p,
currentA = p->YlastA;
while (count--)
while (LIKELY(count--))
{
A = *decoded0;
@ -232,25 +238,28 @@ int ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p,
p->buf[YADAPTCOEFFSA] = SIGN(p->buf[YDELAYA]);
p->buf[YADAPTCOEFFSA-1] = SIGN(p->buf[YDELAYA-1]);
if (A > 0)
if (LIKELY(A != 0))
{
p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA];
p->YcoeffsA[1] -= p->buf[YADAPTCOEFFSA-1];
p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2];
p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3];
}
else if (A < 0)
{
p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA];
p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1];
p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2];
p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3];
if (A > 0)
{
p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA];
p->YcoeffsA[1] -= p->buf[YADAPTCOEFFSA-1];
p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2];
p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3];
}
else
{
p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA];
p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1];
p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2];
p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3];
}
}
p->buf++;
/* Have we filled the history buffer? */
if (p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE) {
if (UNLIKELY(p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE)) {
memmove(p->historybuffer, p->buf,
PREDICTOR_SIZE * sizeof(int32_t));
p->buf = p->historybuffer;