forked from len0rd/rockbox
Branch optimisation in both C (giving hints to gcc - verified using -fprofile-arcs and gcov) and asm files. Biggest effect on coldfire (-c1000: +8%, -c2000: +5%), but ARM also profits a bit (less than 1% on ARM7TDMI, around 1% on ARM1136).
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19199 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
66c0cf2eb1
commit
3761c0108c
6 changed files with 143 additions and 121 deletions
|
@ -70,6 +70,15 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
|
|||
#define ICODE_ATTR
|
||||
#define ICODE_ATTR_DEMAC
|
||||
|
||||
/* Branch-prediction hints. Wrap a condition in LIKELY() when it is expected
 * to be true most of the time, or in UNLIKELY() when it is expected to be
 * false. With gcc 3 or newer this maps onto __builtin_expect(); with any
 * other compiler the macros simply pass the condition through unchanged. */
#if !defined(__GNUC__) || __GNUC__ < 3
#define LIKELY(x)   (x)
#define UNLIKELY(x) (x)
#else
#define LIKELY(x)   __builtin_expect(!!(x), 1)
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
#endif
|
||||
|
||||
#endif /* !ROCKBOX */
|
||||
|
||||
/* Defaults */
|
||||
|
|
|
@ -283,13 +283,13 @@ static inline void update_rice(struct rice_t* rice, int x)
|
|||
{
|
||||
rice->ksum += ((x + 1) / 2) - ((rice->ksum + 16) >> 5);
|
||||
|
||||
if (rice->k == 0) {
|
||||
if (UNLIKELY(rice->k == 0)) {
|
||||
rice->k = 1;
|
||||
} else {
|
||||
uint32_t lim = 1 << (rice->k + 4);
|
||||
if (rice->ksum < lim) {
|
||||
if (UNLIKELY(rice->ksum < lim)) {
|
||||
rice->k--;
|
||||
} else if (rice->ksum >= 2 * lim) {
|
||||
} else if (UNLIKELY(rice->ksum >= 2 * lim)) {
|
||||
rice->k++;
|
||||
}
|
||||
}
|
||||
|
@ -300,11 +300,12 @@ static inline int entropy_decode3980(struct rice_t* rice)
|
|||
int base, x, pivot, overflow;
|
||||
|
||||
pivot = rice->ksum >> 5;
|
||||
if (pivot == 0) pivot=1;
|
||||
if (UNLIKELY(pivot == 0))
|
||||
pivot=1;
|
||||
|
||||
overflow = range_get_symbol_3980();
|
||||
|
||||
if (overflow == (MODEL_ELEMENTS-1)) {
|
||||
if (UNLIKELY(overflow == (MODEL_ELEMENTS-1))) {
|
||||
overflow = range_decode_short() << 16;
|
||||
overflow |= range_decode_short();
|
||||
}
|
||||
|
@ -352,7 +353,7 @@ static inline int entropy_decode3970(struct rice_t* rice)
|
|||
|
||||
int overflow = range_get_symbol_3970();
|
||||
|
||||
if (overflow == (MODEL_ELEMENTS - 1)) {
|
||||
if (UNLIKELY(overflow == (MODEL_ELEMENTS - 1))) {
|
||||
tmpk = range_decode_bits(5);
|
||||
overflow = 0;
|
||||
} else {
|
||||
|
@ -435,13 +436,13 @@ int ICODE_ATTR_DEMAC entropy_decode(struct ape_ctx_t* ape_ctx,
|
|||
memset(decoded1, 0, blockstodecode * sizeof(int32_t));
|
||||
} else {
|
||||
if (ape_ctx->fileversion > 3970) {
|
||||
while (blockstodecode--) {
|
||||
while (LIKELY(blockstodecode--)) {
|
||||
*(decoded0++) = entropy_decode3980(&riceY);
|
||||
if (decoded1 != NULL)
|
||||
*(decoded1++) = entropy_decode3980(&riceX);
|
||||
}
|
||||
} else {
|
||||
while (blockstodecode--) {
|
||||
while (LIKELY(blockstodecode--)) {
|
||||
*(decoded0++) = entropy_decode3970(&riceY);
|
||||
if (decoded1 != NULL)
|
||||
*(decoded1++) = entropy_decode3970(&riceX);
|
||||
|
|
|
@ -100,7 +100,7 @@ struct filter_t {
|
|||
#if defined(CPU_ARM) && (ARM_ARCH >= 6)
|
||||
#define SATURATE(x) ({int __res; asm("ssat %0, #16, %1" : "=r"(__res) : "r"(x)); __res; })
|
||||
#else
|
||||
#define SATURATE(x) (((x) == (int16_t)(x)) ? (x) : ((x) >> 31) ^ 0x7FFF);
|
||||
/* Clamp x to the signed 16-bit range. Values that already fit in an int16_t
 * (the expected common case) are returned unchanged; anything else becomes
 * ((x) >> 31) ^ 0x7FFF, i.e. 0x7FFF for positive x and -0x8000 for negative
 * x, assuming x is a 32-bit int and >> on a negative value is an arithmetic
 * shift (which is what gcc does).
 * x is evaluated more than once, so it must not have side effects.
 * The definition deliberately has no trailing semicolon, so the macro can be
 * used inside larger expressions and in unbraced if/else bodies. */
#define SATURATE(x) (LIKELY((x) == (int16_t)(x)) ? (x) : (((x) >> 31) ^ 0x7FFF))
|
||||
#endif
|
||||
|
||||
/* Apply the filter with state f to count entries in data[] */
|
||||
|
@ -115,14 +115,16 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
|
|||
PREPARE_SCALARPRODUCT
|
||||
#endif
|
||||
|
||||
while(count--)
|
||||
while(LIKELY(count--))
|
||||
{
|
||||
res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER));
|
||||
|
||||
if (LIKELY(*data != 0)) {
|
||||
if (*data < 0)
|
||||
vector_add(f->coeffs, f->adaptcoeffs - ORDER);
|
||||
else if (*data > 0)
|
||||
else
|
||||
vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
|
||||
}
|
||||
|
||||
res += *data;
|
||||
|
||||
|
@ -136,11 +138,11 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
|
|||
/* Update the adaption coefficients */
|
||||
absres = (res < 0 ? -res : res);
|
||||
|
||||
if (absres > (f->avg * 3))
|
||||
if (UNLIKELY(absres > (f->avg * 3)))
|
||||
*f->adaptcoeffs = ((res >> 25) & 64) - 32;
|
||||
else if (absres > (f->avg * 4) / 3)
|
||||
*f->adaptcoeffs = ((res >> 26) & 32) - 16;
|
||||
else if (absres > 0)
|
||||
else if (LIKELY(absres > 0))
|
||||
*f->adaptcoeffs = ((res >> 27) & 16) - 8;
|
||||
else
|
||||
*f->adaptcoeffs = 0;
|
||||
|
@ -154,7 +156,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
|
|||
f->adaptcoeffs++;
|
||||
|
||||
/* Have we filled the history buffer? */
|
||||
if (f->delay == f->history_end) {
|
||||
if (UNLIKELY(f->delay == f->history_end)) {
|
||||
memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
|
||||
(ORDER*2) * sizeof(filter_int));
|
||||
f->adaptcoeffs = f->coeffs + ORDER*2;
|
||||
|
@ -172,14 +174,16 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
|
|||
PREPARE_SCALARPRODUCT
|
||||
#endif
|
||||
|
||||
while(count--)
|
||||
while(LIKELY(count--))
|
||||
{
|
||||
res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER));
|
||||
|
||||
if (LIKELY(*data != 0)) {
|
||||
if (*data < 0)
|
||||
vector_add(f->coeffs, f->adaptcoeffs - ORDER);
|
||||
else if (*data > 0)
|
||||
else
|
||||
vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
|
||||
}
|
||||
|
||||
/* Convert res from (32-FRACBITS).FRACBITS fixed-point format to an
|
||||
integer (rounding to nearest) and add the input value to
|
||||
|
@ -199,7 +203,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
|
|||
f->adaptcoeffs++;
|
||||
|
||||
/* Have we filled the history buffer? */
|
||||
if (f->delay == f->history_end) {
|
||||
if (UNLIKELY(f->delay == f->history_end)) {
|
||||
memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
|
||||
(ORDER*2) * sizeof(filter_int));
|
||||
f->adaptcoeffs = f->coeffs + ORDER*2;
|
||||
|
|
|
@ -472,10 +472,20 @@ loop:
|
|||
@ r10 := p->buf - PREDICTOR_HISTORY_SIZE
|
||||
|
||||
cmp r10, r11
|
||||
bne endofloop
|
||||
beq move_hist @ The history buffer is full, we need to do a memmove
|
||||
|
||||
@ The history buffer is full, we need to do a memmove:
|
||||
@ Check loop count
|
||||
ldr r0, [sp, #8]
|
||||
subs r0, r0, #1
|
||||
strne r0, [sp, #8]
|
||||
bne loop
|
||||
|
||||
done:
|
||||
str r14, [r12] @ Save value of p->buf
|
||||
add sp, sp, #12 @ Don't bother restoring r1-r3
|
||||
ldmia sp!, {r4-r11, pc}
|
||||
|
||||
move_hist:
|
||||
@ dest = r11 (p->historybuffer)
|
||||
@ src = r14 (p->buf)
|
||||
@ n = 200
|
||||
|
@ -493,15 +503,10 @@ loop:
|
|||
|
||||
add r14, r12, #historybuffer @ p->buf = &p->historybuffer[0]
|
||||
|
||||
|
||||
endofloop:
|
||||
@ Check loop count
|
||||
@ Check loop count
|
||||
ldr r0, [sp, #8]
|
||||
subs r0, r0, #1
|
||||
strne r0, [sp, #8]
|
||||
bne loop
|
||||
|
||||
done:
|
||||
str r14, [r12] @ Save value of p->buf
|
||||
add sp, sp, #12 @ Don't bother restoring r1-r3
|
||||
ldmia sp!, {r4-r11, pc}
|
||||
b done
|
||||
|
|
|
@ -486,10 +486,18 @@ predictor_decode_stereo:
|
|||
| %a3 = &p->historybuffer[PREDICTOR_HISTORY_SIZE]
|
||||
|
||||
cmp.l %a3, %a5
|
||||
bne.s .endofloop
|
||||
beq.s .move_hist | The history buffer is full, we need to do a memmove
|
||||
|
||||
| The history buffer is full, we need to do a memmove:
|
||||
subq.l #1, (8,%sp) | decrease loop count
|
||||
bne.w .loop
|
||||
|
||||
.done:
|
||||
move.l %a5, (%a6) | Save value of p->buf
|
||||
movem.l (3*4,%sp), %d2-%d7/%a2-%a6
|
||||
lea.l (14*4,%sp), %sp
|
||||
rts
|
||||
|
||||
.move_hist:
|
||||
lea.l (historybuffer,%a6), %a3
|
||||
|
||||
| dest = %a3 (p->historybuffer)
|
||||
|
@ -497,33 +505,19 @@ predictor_decode_stereo:
|
|||
| n = 200
|
||||
|
||||
movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes
|
||||
lea.l (40,%a5), %a5
|
||||
movem.l %d0-%d7/%a0-%a1, (%a3)
|
||||
lea.l (40,%a3), %a3
|
||||
movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes
|
||||
lea.l (40,%a5), %a5
|
||||
movem.l %d0-%d7/%a0-%a1, (%a3)
|
||||
lea.l (40,%a3), %a3
|
||||
movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes
|
||||
lea.l (40,%a5), %a5
|
||||
movem.l %d0-%d7/%a0-%a1, (%a3)
|
||||
lea.l (40,%a3), %a3
|
||||
movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes
|
||||
lea.l (40,%a5), %a5
|
||||
movem.l %d0-%d7/%a0-%a1, (%a3)
|
||||
lea.l (40,%a3), %a3
|
||||
movem.l (%a5), %d0-%d7/%a0-%a1 | 40 bytes
|
||||
lea.l (40,%a5), %a5
|
||||
movem.l %d0-%d7/%a0-%a1, (%a3)
|
||||
lea.l (40,%a3), %a3
|
||||
movem.l (40,%a5), %d0-%d7/%a0-%a1 | 40 bytes
|
||||
movem.l %d0-%d7/%a0-%a1, (40,%a3)
|
||||
movem.l (80,%a5), %d0-%d7/%a0-%a1 | 40 bytes
|
||||
movem.l %d0-%d7/%a0-%a1, (80,%a3)
|
||||
movem.l (120,%a5), %d0-%d7/%a0-%a1 | 40 bytes
|
||||
movem.l %d0-%d7/%a0-%a1, (120,%a3)
|
||||
movem.l (160,%a5), %d0-%d7/%a0-%a1 | 40 bytes
|
||||
movem.l %d0-%d7/%a0-%a1, (160,%a3)
|
||||
|
||||
lea.l (historybuffer,%a6), %a5 | p->buf = &p->historybuffer[0]
|
||||
move.l %a3, %a5 | p->buf = &p->historybuffer[0]
|
||||
|
||||
.endofloop:
|
||||
subq.l #1, (8,%sp) | decrease loop count
|
||||
bne.w .loop
|
||||
|
||||
move.l %a5, (%a6) | Save value of p->buf
|
||||
movem.l (3*4,%sp), %d2-%d7/%a2-%a6
|
||||
lea.l (14*4,%sp), %sp
|
||||
rts
|
||||
bra.s .done
|
||||
|
|
|
@ -75,7 +75,7 @@ int ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
|
|||
{
|
||||
int32_t predictionA, predictionB;
|
||||
|
||||
while (count--)
|
||||
while (LIKELY(count--))
|
||||
{
|
||||
/* Predictor Y */
|
||||
p->buf[YDELAYA] = p->YlastA;
|
||||
|
@ -134,6 +134,8 @@ int ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
|
|||
p->XlastA = *decoded1 + ((predictionA + (predictionB >> 1)) >> 10);
|
||||
p->XfilterA = p->XlastA + ((p->XfilterA * 31) >> 5);
|
||||
|
||||
if (LIKELY(*decoded0 != 0))
|
||||
{
|
||||
if (*decoded0 > 0)
|
||||
{
|
||||
p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA];
|
||||
|
@ -147,7 +149,7 @@ int ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
|
|||
p->YcoeffsB[3] -= p->buf[YADAPTCOEFFSB-3];
|
||||
p->YcoeffsB[4] -= p->buf[YADAPTCOEFFSB-4];
|
||||
}
|
||||
else if (*decoded0 < 0)
|
||||
else
|
||||
{
|
||||
p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA];
|
||||
p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1];
|
||||
|
@ -160,9 +162,12 @@ int ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
|
|||
p->YcoeffsB[3] += p->buf[YADAPTCOEFFSB-3];
|
||||
p->YcoeffsB[4] += p->buf[YADAPTCOEFFSB-4];
|
||||
}
|
||||
}
|
||||
|
||||
*(decoded0++) = p->YfilterA;
|
||||
|
||||
if (LIKELY(*decoded1 != 0))
|
||||
{
|
||||
if (*decoded1 > 0)
|
||||
{
|
||||
p->XcoeffsA[0] -= p->buf[XADAPTCOEFFSA];
|
||||
|
@ -176,7 +181,7 @@ int ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
|
|||
p->XcoeffsB[3] -= p->buf[XADAPTCOEFFSB-3];
|
||||
p->XcoeffsB[4] -= p->buf[XADAPTCOEFFSB-4];
|
||||
}
|
||||
else if (*decoded1 < 0)
|
||||
else
|
||||
{
|
||||
p->XcoeffsA[0] += p->buf[XADAPTCOEFFSA];
|
||||
p->XcoeffsA[1] += p->buf[XADAPTCOEFFSA-1];
|
||||
|
@ -189,6 +194,7 @@ int ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
|
|||
p->XcoeffsB[3] += p->buf[XADAPTCOEFFSB-3];
|
||||
p->XcoeffsB[4] += p->buf[XADAPTCOEFFSB-4];
|
||||
}
|
||||
}
|
||||
|
||||
*(decoded1++) = p->XfilterA;
|
||||
|
||||
|
@ -196,7 +202,7 @@ int ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
|
|||
p->buf++;
|
||||
|
||||
/* Have we filled the history buffer? */
|
||||
if (p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE) {
|
||||
if (UNLIKELY(p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE)) {
|
||||
memmove(p->historybuffer, p->buf,
|
||||
PREDICTOR_SIZE * sizeof(int32_t));
|
||||
p->buf = p->historybuffer;
|
||||
|
@ -215,7 +221,7 @@ int ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p,
|
|||
|
||||
currentA = p->YlastA;
|
||||
|
||||
while (count--)
|
||||
while (LIKELY(count--))
|
||||
{
|
||||
A = *decoded0;
|
||||
|
||||
|
@ -232,6 +238,8 @@ int ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p,
|
|||
p->buf[YADAPTCOEFFSA] = SIGN(p->buf[YDELAYA]);
|
||||
p->buf[YADAPTCOEFFSA-1] = SIGN(p->buf[YDELAYA-1]);
|
||||
|
||||
if (LIKELY(A != 0))
|
||||
{
|
||||
if (A > 0)
|
||||
{
|
||||
p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA];
|
||||
|
@ -239,18 +247,19 @@ int ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p,
|
|||
p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2];
|
||||
p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3];
|
||||
}
|
||||
else if (A < 0)
|
||||
else
|
||||
{
|
||||
p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA];
|
||||
p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1];
|
||||
p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2];
|
||||
p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3];
|
||||
}
|
||||
}
|
||||
|
||||
p->buf++;
|
||||
|
||||
/* Have we filled the history buffer? */
|
||||
if (p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE) {
|
||||
if (UNLIKELY(p->buf == p->historybuffer + PREDICTOR_HISTORY_SIZE)) {
|
||||
memmove(p->historybuffer, p->buf,
|
||||
PREDICTOR_SIZE * sizeof(int32_t));
|
||||
p->buf = p->historybuffer;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue