forked from len0rd/rockbox
Do some SPC codec optimizing for ARMv6 (as a training exercise), tweak realtime BRR for all CPUs that use it, and add Gaussian ASM optimization for all ARM CPUs that can use it. Add some LIKELY/UNLIKELY branch hints. On Gigabeat-S this gives a +22% speedup. On Gigabeat F, about a +5% speedup. For less-powerful players, there is no real change aside possibly from the branch hints.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25771 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
3adac47c61
commit
9f157ad584
2 changed files with 456 additions and 94 deletions
|
|
@ -57,6 +57,16 @@ void DSP_write( struct Spc_Dsp* this, int i, int data )
|
|||
}
|
||||
}
|
||||
|
||||
#if ARM_ARCH >= 6
|
||||
/* ARMv6 CLAMP16: saturate n in place to signed 16 bits using SSAT, i.e. if ( n < -32768 ) n = -32768; */
|
||||
/* if ( n > 32767 ) n = 32767; n must be a modifiable int lvalue; the macro evaluates to the clamped n */
|
||||
#define CLAMP16( n ) \
|
||||
({ \
|
||||
asm ("ssat %0, #16, %1" \
|
||||
: "=r" ( n ) : "r"( n ) ); \
|
||||
n; /* value of the statement expression */ \
|
||||
})
|
||||
#else
|
||||
/* if ( n < -32768 ) out = -32768; */
|
||||
/* if ( n > 32767 ) out = 32767; */
|
||||
#define CLAMP16( n ) \
|
||||
|
|
@ -65,6 +75,7 @@ void DSP_write( struct Spc_Dsp* this, int i, int data )
|
|||
n = 0x7FFF ^ (n >> 31); \
|
||||
n; \
|
||||
})
|
||||
#endif
|
||||
|
||||
#if SPC_BRRCACHE
|
||||
static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
|
||||
|
|
@ -418,7 +429,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
/* Key on events are delayed */
|
||||
int key_on_delay = voice->key_on_delay;
|
||||
|
||||
if ( --key_on_delay >= 0 ) /* <1% of the time */
|
||||
if ( UNLIKELY ( --key_on_delay >= 0 ) ) /* <1% of the time */
|
||||
{
|
||||
key_on(this,voice,sd,raw_voice,key_on_delay,vbit);
|
||||
}
|
||||
|
|
@ -438,13 +449,13 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
int env_mode = voice->env_mode;
|
||||
int adsr0 = raw_voice->adsr [0];
|
||||
int env_timer;
|
||||
if ( env_mode != state_release ) /* 99% of the time */
|
||||
if ( LIKELY ( env_mode != state_release ) ) /* 99% of the time */
|
||||
{
|
||||
env_timer = voice->env_timer;
|
||||
if ( adsr0 & 0x80 ) /* 79% of the time */
|
||||
if ( LIKELY ( adsr0 & 0x80 ) ) /* 79% of the time */
|
||||
{
|
||||
int adsr1 = raw_voice->adsr [1];
|
||||
if ( env_mode == state_sustain ) /* 74% of the time */
|
||||
if ( LIKELY ( env_mode == state_sustain ) ) /* 74% of the time */
|
||||
{
|
||||
if ( (env_timer -= env_rates [adsr1 & 0x1F]) > 0 )
|
||||
goto write_env_timer;
|
||||
|
|
@ -607,25 +618,12 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
goto skip_decode;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* header */
|
||||
int const block_header = *addr;
|
||||
addr += 9;
|
||||
voice->addr = addr;
|
||||
voice->block_header = block_header;
|
||||
int const filter = (block_header & 0x0C) - 0x08;
|
||||
|
||||
/* scaling (invalid scaling gives -4096 for neg nybble,
|
||||
0 for pos) */
|
||||
static unsigned char const right_shifts [16] = {
|
||||
5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 29, 29, 29,
|
||||
};
|
||||
static unsigned char const left_shifts [16] = {
|
||||
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11
|
||||
};
|
||||
int const scale = block_header >> 4;
|
||||
int const right_shift = right_shifts [scale];
|
||||
int const left_shift = left_shifts [scale];
|
||||
|
||||
/* previous samples */
|
||||
int smp2 = voice->samples [BRR_BLOCK_SIZE + 1];
|
||||
|
|
@ -650,54 +648,117 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
/* force sample to end on next decode */
|
||||
voice->block_header = 1;
|
||||
}
|
||||
|
||||
do /* decode and filter 16 samples */
|
||||
|
||||
int const filter = block_header & 0x0c;
|
||||
int const scale = block_header >> 4;
|
||||
|
||||
if ( filter == 0x08 ) /* filter 2 (30-90% of the time) */
|
||||
{
|
||||
/* Get nybble, sign-extend, then scale
|
||||
get byte, select which nybble, sign-extend, then shift
|
||||
based on scaling. also handles invalid scaling values.*/
|
||||
int delta = (int) (int8_t) (addr [offset >> 3] <<
|
||||
(offset & 4)) >> right_shift << left_shift;
|
||||
|
||||
out [offset >> 2] = smp2;
|
||||
|
||||
if ( filter == 0 ) /* mode 0x08 (30-90% of the time) */
|
||||
/* y[n] = x[n] + 61/32 * y[n-1] - 15/16 * y[n-2] */
|
||||
do /* decode and filter 16 samples */
|
||||
{
|
||||
/* Get nybble, sign-extend, then scale
|
||||
get byte, select which nybble, sign-extend, then shift
|
||||
based on scaling. */
|
||||
int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
|
||||
delta = (delta << scale) >> 1;
|
||||
|
||||
if (scale > 0xc)
|
||||
delta = (delta >> 17) << 11;
|
||||
|
||||
out [offset >> 2] = smp2;
|
||||
|
||||
delta -= smp2 >> 1;
|
||||
delta += smp2 >> 5;
|
||||
smp2 = smp1;
|
||||
delta += smp1;
|
||||
delta += (-smp1 - (smp1 >> 1)) >> 5;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( filter == -4 ) /* mode 0x04 */
|
||||
{
|
||||
delta += smp1 >> 1;
|
||||
delta += (-smp1) >> 5;
|
||||
}
|
||||
else if ( filter > -4 ) /* mode 0x0C */
|
||||
{
|
||||
delta -= smp2 >> 1;
|
||||
delta += (smp2 + (smp2 >> 1)) >> 4;
|
||||
delta += smp1;
|
||||
delta += (-smp1 * 13) >> 7;
|
||||
}
|
||||
|
||||
delta = CLAMP16( delta );
|
||||
smp2 = smp1;
|
||||
smp1 = (int16_t) (delta * 2); /* sign-extend */
|
||||
}
|
||||
|
||||
delta = CLAMP16( delta );
|
||||
smp1 = (int16_t) (delta * 2); /* sign-extend */
|
||||
while ( (offset += 4) != 0 );
|
||||
}
|
||||
while ( (offset += 4) != 0 );
|
||||
|
||||
else if ( filter == 0x04 ) /* filter 1 */
|
||||
{
|
||||
/* y[n] = x[n] + 15/16 * y[n-1] */
|
||||
do /* decode and filter 16 samples */
|
||||
{
|
||||
/* Get nybble, sign-extend, then scale
|
||||
get byte, select which nybble, sign-extend, then shift
|
||||
based on scaling. */
|
||||
int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
|
||||
delta = (delta << scale) >> 1;
|
||||
|
||||
if (scale > 0xc)
|
||||
delta = (delta >> 17) << 11;
|
||||
|
||||
out [offset >> 2] = smp2;
|
||||
|
||||
delta += smp1 >> 1;
|
||||
delta += (-smp1) >> 5;
|
||||
|
||||
delta = CLAMP16( delta );
|
||||
smp2 = smp1;
|
||||
smp1 = (int16_t) (delta * 2); /* sign-extend */
|
||||
}
|
||||
while ( (offset += 4) != 0 );
|
||||
}
|
||||
else if ( filter == 0x0c ) /* filter 3 */
|
||||
{
|
||||
/* y[n] = x[n] + 115/64 * y[n-1] - 13/16 * y[n-2] */
|
||||
do /* decode and filter 16 samples */
|
||||
{
|
||||
/* Get nybble, sign-extend, then scale
|
||||
get byte, select which nybble, sign-extend, then shift
|
||||
based on scaling. */
|
||||
int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
|
||||
delta = (delta << scale) >> 1;
|
||||
|
||||
if (scale > 0xc)
|
||||
delta = (delta >> 17) << 11;
|
||||
|
||||
out [offset >> 2] = smp2;
|
||||
|
||||
delta -= smp2 >> 1;
|
||||
delta += (smp2 + (smp2 >> 1)) >> 4;
|
||||
delta += smp1;
|
||||
delta += (-smp1 * 13) >> 7;
|
||||
|
||||
delta = CLAMP16( delta );
|
||||
smp2 = smp1;
|
||||
smp1 = (int16_t) (delta * 2); /* sign-extend */
|
||||
}
|
||||
while ( (offset += 4) != 0 );
|
||||
}
|
||||
else /* filter 0 */
|
||||
{
|
||||
/* y[n] = x[n] */
|
||||
do /* decode and filter 16 samples */
|
||||
{
|
||||
/* Get nybble, sign-extend, then scale
|
||||
get byte, select which nybble, sign-extend, then shift
|
||||
based on scaling. */
|
||||
int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
|
||||
delta = (delta << scale) >> 1;
|
||||
|
||||
if (scale > 0xc)
|
||||
delta = (delta >> 17) << 11;
|
||||
|
||||
out [offset >> 2] = smp2;
|
||||
|
||||
smp2 = smp1;
|
||||
smp1 = delta * 2;
|
||||
}
|
||||
while ( (offset += 4) != 0 );
|
||||
}
|
||||
|
||||
out [0] = smp2;
|
||||
out [1] = smp1;
|
||||
|
||||
skip_decode:;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* !SPC_BRRCACHE */
|
||||
/* Get rate (with possible modulation) */
|
||||
int rate = VOICE_RATE(vr);
|
||||
if ( this->r.g.pitch_mods & vbit )
|
||||
|
|
@ -754,13 +815,87 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
|
||||
/* Use faster gaussian interpolation when exact result isn't needed
|
||||
by pitch modulator of next channel */
|
||||
int amp_0, amp_1;
|
||||
if ( !(slow_gaussian & vbit) ) /* 99% of the time */
|
||||
int amp_0, amp_1; /* Also serve as temps _0, and _1 */
|
||||
if ( LIKELY ( !(slow_gaussian & vbit) ) ) /* 99% of the time */
|
||||
{
|
||||
/* Main optimization is lack of clamping. Not a problem since
|
||||
output never goes more than +/- 16 outside 16-bit range and
|
||||
things are clamped later anyway. Other optimization is to
|
||||
preserve fractional accuracy, eliminating several masks. */
|
||||
#if defined (CPU_ARM)
|
||||
int output;
|
||||
int _2, _3; /* All-purpose temps */
|
||||
/* Multiple ASM blocks keep regs free and reduce result
|
||||
* latency issues. */
|
||||
#if ARM_ARCH >= 6
|
||||
/* Interpolate */
|
||||
asm volatile (
|
||||
"ldr %[_0], [%[interp]] \r\n" /* _0=i0i1 */
|
||||
"ldr %[_2], [%[fwd]] \r\n" /* _2=f0f1 */
|
||||
"ldr %[_1], [%[interp], #4] \r\n" /* _1=i2i3 */
|
||||
"ldr %[_3], [%[rev]] \r\n" /* _3=r0r1 */
|
||||
"smuad %[out], %[_0], %[_2] \r\n" /* out=f0*i0 + f1*i1 */
|
||||
"smladx %[out], %[_1], %[_3], %[out] \r\n" /* out+=r1*i2 + r0*i3 */
|
||||
: [out]"=&r"(output),
|
||||
[_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
|
||||
[_2]"=&r"(_2), [_3]"=&r"(_3)
|
||||
: [fwd]"r"(fwd), [rev]"r"(rev),
|
||||
[interp]"r"(interp));
|
||||
/* Apply voice envelope */
|
||||
asm volatile (
|
||||
"mov %[_2], %[out], asr #(11-5) \r\n" /* To do >> 16 later */
|
||||
"mul %[out], %[_2], %[envx] \r\n" /* and avoid exp. shift */
|
||||
: [out]"+r"(output), [_2]"=&r"(_2)
|
||||
: [envx]"r"((int)voice->envx));
|
||||
/* Apply left and right volume */
|
||||
asm volatile (
|
||||
"smulwb %[amp_0], %[out], %[vvol_0] \r\n" /* (32x16->48)[47:16]->[31:0] */
|
||||
"smulwb %[amp_1], %[out], %[vvol_1] \r\n"
|
||||
: [out]"+r"(output),
|
||||
[amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)
|
||||
: [vvol_0]"r"(voice->volume[0]),
|
||||
[vvol_1]"r"(voice->volume[1]));
|
||||
|
||||
raw_voice->outx = output >> (8+5); /* 'output' still 5 bits too big */
|
||||
#else /* ARM_ARCH < 6 */
|
||||
/* Perform gaussian interpolation on four samples */
|
||||
asm volatile (
|
||||
"ldrsh %[_0], [%[interp]] \r\n"
|
||||
"ldrsh %[_2], [%[fwd]] \r\n"
|
||||
"ldrsh %[_1], [%[interp], #2] \r\n"
|
||||
"ldrsh %[_3], [%[fwd], #2] \r\n"
|
||||
"mul %[out], %[_0], %[_2] \r\n" /* out= fwd[0]*interp[0] */
|
||||
"ldrsh %[_0], [%[interp], #4] \r\n"
|
||||
"ldrsh %[_2], [%[rev], #2] \r\n"
|
||||
"mla %[out], %[_1], %[_3], %[out] \r\n" /* out+=fwd[1]*interp[1] */
|
||||
"ldrsh %[_1], [%[interp], #6] \r\n"
|
||||
"ldrsh %[_3], [%[rev]] \r\n"
|
||||
"mla %[out], %[_0], %[_2], %[out] \r\n" /* out+=rev[1]*interp[2] */
|
||||
"mla %[out], %[_1], %[_3], %[out] \r\n" /* out+=rev[0]*interp[3] */
|
||||
: [out]"=&r"(output),
|
||||
[_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
|
||||
[_2]"=&r"(_2), [_3]"=&r"(_3)
|
||||
: [fwd]"r"(fwd), [rev]"r"(rev),
|
||||
[interp]"r"(interp));
|
||||
/* Apply voice envelope */
|
||||
asm volatile (
|
||||
"mov %[_2], %[out], asr #11 \r\n"
|
||||
"mul %[out], %[_2], %[envx] \r\n"
|
||||
: [out]"+r"(output), [_2]"=&r"(_2)
|
||||
: [envx]"r"((int)voice->envx));
|
||||
/* Reduce and apply left and right volume */
|
||||
asm volatile (
|
||||
"mov %[out], %[out], asr #11 \r\n"
|
||||
"mul %[amp_0], %[out], %[vvol_0] \r\n"
|
||||
"mul %[amp_1], %[out], %[vvol_1] \r\n"
|
||||
: [out]"+r"(output),
|
||||
[amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)
|
||||
: [vvol_0]"r"((int)voice->volume[0]),
|
||||
[vvol_1]"r"((int)voice->volume[1]));
|
||||
|
||||
raw_voice->outx = output >> 8;
|
||||
#endif /* ARM_ARCH */
|
||||
#else /* Unoptimized CPU */
|
||||
int output = (((fwd [0] * interp [0] +
|
||||
fwd [1] * interp [1] +
|
||||
rev [1] * interp [2] +
|
||||
|
|
@ -769,11 +904,121 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
/* duplicated here to give compiler more to run in parallel */
|
||||
amp_0 = voice->volume [0] * output;
|
||||
amp_1 = voice->volume [1] * output;
|
||||
|
||||
raw_voice->outx = output >> 8;
|
||||
#endif /* CPU_* */
|
||||
}
|
||||
else
|
||||
else /* slow gaussian */
|
||||
{
|
||||
#if defined(CPU_ARM)
|
||||
#if ARM_ARCH >= 6
|
||||
int output = *(int16_t*) &this->noise;
|
||||
|
||||
if ( !(this->r.g.noise_enables & vbit) )
|
||||
{
|
||||
/* Interpolate */
|
||||
int _2, _3;
|
||||
asm volatile (
|
||||
/* NOTE: often-unaligned accesses */
|
||||
"ldr %[_0], [%[interp]] \r\n" /* _0=i0i1 */
|
||||
"ldr %[_2], [%[fwd]] \r\n" /* _2=f0f1 */
|
||||
"ldr %[_1], [%[interp], #4] \r\n" /* _1=i2i3 */
|
||||
"ldr %[_3], [%[rev]] \r\n" /* _3=r0r1 */
|
||||
"smulbb %[out], %[_0], %[_2] \r\n" /* out=f0*i0 */
|
||||
"smultt %[_0], %[_0], %[_2] \r\n" /* _0=f1*i1 */
|
||||
"smulbt %[_2], %[_1], %[_3] \r\n" /* _2=r1*i2 */
|
||||
"smultb %[_3], %[_1], %[_3] \r\n" /* _3=r0*i3 */
|
||||
: [out]"=r"(output),
|
||||
[_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
|
||||
[_2]"=&r"(_2), [_3]"=&r"(_3)
|
||||
: [fwd]"r"(fwd), [rev]"r"(rev),
|
||||
[interp]"r"(interp));
|
||||
asm volatile (
|
||||
"mov %[out], %[out], asr#12 \r\n"
|
||||
"add %[_0], %[out], %[_0], asr #12 \r\n"
|
||||
"add %[_2], %[_0], %[_2], asr #12 \r\n"
|
||||
"pkhbt %[_0], %[_2], %[_3], asl #4 \r\n" /* _3[31:16], _2[15:0] */
|
||||
"sadd16 %[_0], %[_0], %[_0] \r\n" /* _3[31:16]*2, _2[15:0]*2 */
|
||||
"qsubaddx %[out], %[_0], %[_0] \r\n" /* out[15:0]=
|
||||
* sat16(_3[31:16]+_2[15:0]) */
|
||||
: [out]"+r"(output),
|
||||
[_0]"+r"(amp_0), [_2]"+r"(_2), [_3]"+r"(_3));
|
||||
}
|
||||
/* Apply voice envelope */
|
||||
asm volatile (
|
||||
"smulbb %[out], %[out], %[envx] \r\n"
|
||||
: [out]"+r"(output)
|
||||
: [envx]"r"(voice->envx));
|
||||
/* Reduce and apply left and right volume */
|
||||
asm volatile (
|
||||
"mov %[out], %[out], asr #11 \r\n"
|
||||
"bic %[out], %[out], #0x1 \r\n"
|
||||
"mul %[amp_0], %[out], %[vvol_0] \r\n"
|
||||
"mul %[amp_1], %[out], %[vvol_1] \r\n"
|
||||
: [out]"+r"(output),
|
||||
[amp_1]"=r"(amp_1), [amp_0]"=r"(amp_0)
|
||||
: [vvol_0]"r"((int)voice->volume[0]),
|
||||
[vvol_1]"r"((int)voice->volume[1]));
|
||||
|
||||
prev_outx = output;
|
||||
raw_voice->outx = output >> 8;
|
||||
#else /* ARM_ARCH < 6 */
|
||||
int output = *(int16_t*) &this->noise;
|
||||
|
||||
if ( !(this->r.g.noise_enables & vbit) )
|
||||
{
|
||||
/* Interpolate */
|
||||
int _2, _3;
|
||||
asm volatile (
|
||||
"ldrsh %[_0], [%[interp]] \r\n"
|
||||
"ldrsh %[_2], [%[fwd]] \r\n"
|
||||
"ldrsh %[_1], [%[interp], #2] \r\n"
|
||||
"ldrsh %[_3], [%[fwd], #2] \r\n"
|
||||
"mul %[out], %[_2], %[_0] \r\n" /* fwd[0]*interp[0] */
|
||||
"ldrsh %[_2], [%[rev], #2] \r\n"
|
||||
"mul %[_0], %[_3], %[_1] \r\n" /* fwd[1]*interp[1] */
|
||||
"ldrsh %[_1], [%[interp], #4] \r\n"
|
||||
"mov %[out], %[out], asr #12 \r\n"
|
||||
"ldrsh %[_3], [%[rev]] \r\n"
|
||||
"mul %[_2], %[_1], %[_2] \r\n" /* rev[1]*interp[2] */
|
||||
"ldrsh %[_1], [%[interp], #6] \r\n"
|
||||
"add %[_0], %[out], %[_0], asr #12 \r\n"
|
||||
"mul %[_3], %[_1], %[_3] \r\n" /* rev[0]*interp[3] */
|
||||
"add %[_2], %[_0], %[_2], asr #12 \r\n"
|
||||
"mov %[_2], %[_2], lsl #17 \r\n"
|
||||
"mov %[_3], %[_3], asr #12 \r\n"
|
||||
"mov %[_3], %[_3], asl #1 \r\n"
|
||||
"add %[out], %[_3], %[_2], asr #16 \r\n"
|
||||
: [out]"=r"(output),
|
||||
[_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
|
||||
[_2]"=&r"(_2), [_3]"=&r"(_3)
|
||||
: [fwd]"r"(fwd), [rev]"r"(rev),
|
||||
[interp]"r"(interp));
|
||||
|
||||
output = CLAMP16(output);
|
||||
}
|
||||
/* Apply voice envelope */
|
||||
asm volatile (
|
||||
"mul %[_0], %[out], %[envx] \r\n"
|
||||
: [_0]"=r"(amp_0)
|
||||
: [out]"r"(output), [envx]"r"((int)voice->envx));
|
||||
/* Reduce and apply left and right volume */
|
||||
asm volatile (
|
||||
"mov %[out], %[amp_0], asr #11 \r\n" /* amp_0 = _0 */
|
||||
"bic %[out], %[out], #0x1 \r\n"
|
||||
"mul %[amp_0], %[out], %[vvol_0] \r\n"
|
||||
"mul %[amp_1], %[out], %[vvol_1] \r\n"
|
||||
: [out]"+r"(output), [amp_0]"+r"(amp_0),
|
||||
[amp_1]"=r"(amp_1)
|
||||
: [vvol_0]"r"((int)voice->volume[0]),
|
||||
[vvol_1]"r"((int)voice->volume[1]));
|
||||
|
||||
prev_outx = output;
|
||||
raw_voice->outx = output >> 8;
|
||||
#endif /* ARM_ARCH >= 6 */
|
||||
#else /* Unoptimized CPU */
|
||||
int output = *(int16_t*) &this->noise;
|
||||
|
||||
if ( !(this->r.g.noise_enables & vbit) )
|
||||
{
|
||||
output = (fwd [0] * interp [0]) & ~0xFFF;
|
||||
|
|
@ -788,8 +1033,10 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
/* duplicated here to give compiler more to run in parallel */
|
||||
amp_0 = voice->volume [0] * output;
|
||||
amp_1 = voice->volume [1] * output;
|
||||
|
||||
prev_outx = output;
|
||||
raw_voice->outx = (int8_t) (output >> 8);
|
||||
raw_voice->outx = output >> 8;
|
||||
#endif /* CPU_* */
|
||||
}
|
||||
#else /* SPCNOINTERP */
|
||||
/* two-point linear interpolation */
|
||||
|
|
@ -826,16 +1073,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
"asr.l %[sh], %[y1] \r\n"
|
||||
"add.l %[y0], %[y1] \r\n"
|
||||
: [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0)
|
||||
: [s]"a"(voice->samples), [sh]"d"(12)
|
||||
);
|
||||
: [s]"a"(voice->samples), [sh]"d"(12));
|
||||
}
|
||||
|
||||
/* apply voice envelope to output */
|
||||
asm volatile (
|
||||
"mac.w %[output]l, %[envx]l, %%acc0 \r\n"
|
||||
"mac.w %[out]l, %[envx]l, %%acc0 \r\n"
|
||||
:
|
||||
: [output]"r"(amp_0), [envx]"r"(voice->envx)
|
||||
);
|
||||
: [out]"r"(amp_0), [envx]"r"(voice->envx));
|
||||
|
||||
/* advance voice position */
|
||||
voice->position += rate;
|
||||
|
|
@ -843,15 +1088,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
/* fetch output, scale and apply left and right
|
||||
voice volume */
|
||||
asm volatile (
|
||||
"movclr.l %%acc0, %[output] \r\n"
|
||||
"asr.l %[sh], %[output] \r\n"
|
||||
"mac.l %[vvol_0], %[output], %%acc0 \r\n"
|
||||
"mac.l %[vvol_1], %[output], %%acc1 \r\n"
|
||||
: [output]"=&d"(amp_0)
|
||||
"movclr.l %%acc0, %[out] \r\n"
|
||||
"asr.l %[sh], %[out] \r\n"
|
||||
"mac.l %[vvol_0], %[out], %%acc0 \r\n"
|
||||
"mac.l %[vvol_1], %[out], %%acc1 \r\n"
|
||||
: [out]"=&d"(amp_0)
|
||||
: [vvol_0]"r"((int)voice->volume[0]),
|
||||
[vvol_1]"r"((int)voice->volume[1]),
|
||||
[sh]"d"(11)
|
||||
);
|
||||
[sh]"d"(11));
|
||||
|
||||
/* save this output into previous, scale and save in
|
||||
output register */
|
||||
|
|
@ -862,14 +1106,16 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
asm volatile (
|
||||
"movclr.l %%acc0, %[amp_0] \r\n"
|
||||
"movclr.l %%acc1, %[amp_1] \r\n"
|
||||
: [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)
|
||||
);
|
||||
: [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1));
|
||||
#elif defined (CPU_ARM)
|
||||
int amp_0, amp_1;
|
||||
|
||||
if ( (this->r.g.noise_enables & vbit) != 0 ) {
|
||||
if ( (this->r.g.noise_enables & vbit) != 0 )
|
||||
{
|
||||
amp_0 = *(int16_t *)&this->noise;
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
uint32_t f = voice->position;
|
||||
amp_0 = (uint32_t)voice->samples;
|
||||
|
||||
|
|
@ -882,8 +1128,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
"sub %[y1], %[y1], %[y0] \r\n"
|
||||
"mul %[f], %[y1], %[f] \r\n"
|
||||
"add %[y0], %[y0], %[f], asr #12 \r\n"
|
||||
: [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1)
|
||||
);
|
||||
: [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1));
|
||||
}
|
||||
|
||||
voice->position += rate;
|
||||
|
|
@ -893,8 +1138,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
"mov %[amp_0], %[amp_1], asr #11 \r\n"
|
||||
"mov %[amp_1], %[amp_0], asr #8 \r\n"
|
||||
: [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1)
|
||||
: [envx]"r"(voice->envx)
|
||||
);
|
||||
: [envx]"r"(voice->envx));
|
||||
|
||||
prev_outx = amp_0;
|
||||
raw_voice->outx = (int8_t)amp_1;
|
||||
|
|
@ -904,8 +1148,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
"mul %[amp_0], %[vol_0], %[amp_0] \r\n"
|
||||
: [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1)
|
||||
: [vol_0]"r"((int)voice->volume[0]),
|
||||
[vol_1]"r"((int)voice->volume[1])
|
||||
);
|
||||
[vol_1]"r"((int)voice->volume[1]));
|
||||
#else /* Unoptimized CPU */
|
||||
int output;
|
||||
|
||||
|
|
@ -1089,25 +1332,116 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
echo_pos = 0;
|
||||
this->echo_pos = echo_pos;
|
||||
|
||||
int fb_0 = GET_LE16SA( echo_ptr );
|
||||
int fb_1 = GET_LE16SA( echo_ptr + 2 );
|
||||
#if ARM_ARCH >= 6
|
||||
int32_t *fir_ptr, *fir_coeff;
|
||||
int fb_0, fb_1;
|
||||
|
||||
/* Apply FIR */
|
||||
fb_0 = *(uint32_t *)echo_ptr;
|
||||
|
||||
/* Keep last 8 samples */
|
||||
asm volatile (
|
||||
"add %[fir_p], %[t_fir_p], #4 \r\n"
|
||||
"bic %[t_fir_p], %[fir_p], %[mask] \r\n"
|
||||
"str %[fb_0], [%[fir_p], #-4] \r\n"
|
||||
/* duplicate at +8 eliminates wrap checking below */
|
||||
"str %[fb_0], [%[fir_p], #28] \r\n"
|
||||
: [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr)
|
||||
: [fb_0]"r"(fb_0), [mask]"i"(~FIR_BUF_MASK));
|
||||
|
||||
fir_coeff = (int32_t *)this->fir_coeff;
|
||||
|
||||
/* Fugly, but the best version found. */
|
||||
int _0;
|
||||
asm volatile ( /* L0R0 = acc0 */
|
||||
"ldmia %[fir_p]!, { r2-r5 } \r\n" /* L1R1-L4R4 = r2-r5 */
|
||||
"ldmia %[fir_c]!, { r0-r1 } \r\n" /* C0C1-C2C3 = r0-r1 */
|
||||
"pkhbt %[_0], %[acc0], r2, asl #16 \r\n" /* L0R0,L1R1->L0L1,R0R1 */
|
||||
"pkhtb r2, r2, %[acc0], asr #16 \r\n"
|
||||
"smuad %[acc0], %[_0], r0 \r\n" /* acc0=L0*C0+L1*C1 */
|
||||
"smuad %[acc1], r2, r0 \r\n" /* acc1=R0*C0+R1*C1 */
|
||||
"pkhbt %[_0], r3, r4, asl #16 \r\n" /* L2R2,L3R3->L2L3,R2R3 */
|
||||
"pkhtb r4, r4, r3, asr #16 \r\n"
|
||||
"smlad %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L2*C2+L3*C3 */
|
||||
"smlad %[acc1], r4, r1, %[acc1] \r\n" /* acc1+=R2*C2+R3*C3 */
|
||||
"ldmia %[fir_p], { r2-r4 } \r\n" /* L5R5-L7R7 = r2-r4 */
|
||||
"ldmia %[fir_c], { r0-r1 } \r\n" /* C4C5-C6C7 = r0-r1 */
|
||||
"pkhbt %[_0], r5, r2, asl #16 \r\n" /* L4R4,L5R5->L4L5,R4R5 */
|
||||
"pkhtb r2, r2, r5, asr #16 \r\n"
|
||||
"smlad %[acc0], %[_0], r0, %[acc0] \r\n" /* acc0+=L4*C4+L5*C5 */
|
||||
"smlad %[acc1], r2, r0, %[acc1] \r\n" /* acc1+=R4*C4+R5*C5 */
|
||||
"pkhbt %[_0], r3, r4, asl #16 \r\n" /* L6R6,L7R7->L6L7,R6R7 */
|
||||
"pkhtb r4, r4, r3, asr #16 \r\n"
|
||||
"smlad %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L6*C6+L7*C7 */
|
||||
"smlad %[acc1], r4, r1, %[acc1] \r\n" /* acc1+=R6*C6+R7*C7 */
|
||||
: [acc0]"+r"(fb_0), [acc1]"=&r"(fb_1), [_0]"=&r"(_0),
|
||||
[fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
|
||||
:
|
||||
: "r0", "r1", "r2", "r3", "r4", "r5");
|
||||
|
||||
/* Generate output */
|
||||
int amp_0, amp_1;
|
||||
|
||||
asm volatile (
|
||||
"mul %[amp_0], %[gvol_0], %[chans_0] \r\n"
|
||||
"mul %[amp_1], %[gvol_1], %[chans_1] \r\n"
|
||||
: [amp_0]"=&r"(amp_0), [amp_1]"=&r"(amp_1)
|
||||
: [gvol_0]"r"(global_vol_0), [gvol_1]"r"(global_vol_1),
|
||||
[chans_0]"r"(chans_0), [chans_1]"r"(chans_1));
|
||||
asm volatile (
|
||||
"mla %[amp_0], %[fb_0], %[ev_0], %[amp_0] \r\n"
|
||||
"mla %[amp_1], %[fb_1], %[ev_1], %[amp_1] \r\n"
|
||||
: [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1)
|
||||
: [fb_0]"r"(fb_0), [fb_1]"r"(fb_1),
|
||||
[ev_0]"r"((int)this->r.g.echo_volume_0),
|
||||
[ev_1]"r"((int)this->r.g.echo_volume_1));
|
||||
|
||||
out_buf [ 0] = amp_0 >> global_muting;
|
||||
out_buf [WAV_CHUNK_SIZE] = amp_1 >> global_muting;
|
||||
out_buf ++;
|
||||
|
||||
if ( !(this->r.g.flags & 0x20) )
|
||||
{
|
||||
/* Feedback into echo buffer */
|
||||
int e0, e1;
|
||||
|
||||
asm volatile (
|
||||
"mov %[e0], %[echo_0], asl #7 \r\n"
|
||||
"mov %[e1], %[echo_1], asl #7 \r\n"
|
||||
"mla %[e0], %[fb_0], %[efb], %[e0] \r\n"
|
||||
"mla %[e1], %[fb_1], %[efb], %[e1] \r\n"
|
||||
: [e0]"=&r"(e0), [e1]"=&r"(e1)
|
||||
: [echo_0]"r"(echo_0), [echo_1]"r"(echo_1),
|
||||
[fb_0]"r"(fb_0), [fb_1]"r"(fb_1),
|
||||
[efb]"r"((int)this->r.g.echo_feedback));
|
||||
asm volatile (
|
||||
"ssat %[e0], #16, %[e0], asr #14 \r\n"
|
||||
"ssat %[e1], #16, %[e1], asr #14 \r\n"
|
||||
"pkhbt %[e0], %[e0], %[e1], lsl #16 \r\n"
|
||||
"str %[e0], [%[echo_p]] \r\n"
|
||||
: [e0]"+r"(e0), [e1]"+r"(e1)
|
||||
: [echo_p]"r"(echo_ptr));
|
||||
}
|
||||
#else /* ARM_ARCH < 6 */
|
||||
int fb_0 = GET_LE16SA( echo_ptr );
|
||||
int fb_1 = GET_LE16SA( echo_ptr + 2 );
|
||||
int32_t *fir_ptr, *fir_coeff;
|
||||
|
||||
/* Keep last 8 samples */
|
||||
int32_t *fir_ptr = this->fir_ptr;
|
||||
|
||||
/* Apply FIR */
|
||||
asm volatile (
|
||||
"str %[fb_0], [%[fir_p]], #4 \r\n"
|
||||
"str %[fb_1], [%[fir_p]], #4 \r\n"
|
||||
"add %[fir_p], %[t_fir_p], #8 \r\n"
|
||||
"bic %[t_fir_p], %[fir_p], %[mask] \r\n"
|
||||
"str %[fb_0], [%[fir_p], #-8] \r\n"
|
||||
"str %[fb_1], [%[fir_p], #-4] \r\n"
|
||||
/* duplicate at +8 eliminates wrap checking below */
|
||||
"str %[fb_0], [%[fir_p], #56] \r\n"
|
||||
"str %[fb_1], [%[fir_p], #60] \r\n"
|
||||
: [fir_p]"+r"(fir_ptr)
|
||||
: [fb_0]"r"(fb_0), [fb_1]"r"(fb_1)
|
||||
);
|
||||
"str %[fb_0], [%[fir_p], #56] \r\n"
|
||||
"str %[fb_1], [%[fir_p], #60] \r\n"
|
||||
: [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr)
|
||||
: [fb_0]"r"(fb_0), [fb_1]"r"(fb_1), [mask]"i"(~FIR_BUF_MASK));
|
||||
|
||||
this->fir_ptr = (int32_t *)((intptr_t)fir_ptr & FIR_BUF_MASK);
|
||||
int32_t *fir_coeff = this->fir_coeff;
|
||||
fir_coeff = this->fir_coeff;
|
||||
|
||||
asm volatile (
|
||||
"ldmia %[fir_c]!, { r0-r1 } \r\n"
|
||||
|
|
@ -1137,8 +1471,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
: [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1),
|
||||
[fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
|
||||
:
|
||||
: "r0", "r1", "r2", "r3", "r4", "r5"
|
||||
);
|
||||
: "r0", "r1", "r2", "r3", "r4", "r5");
|
||||
|
||||
/* Generate output */
|
||||
int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
|
||||
|
|
@ -1160,6 +1493,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
|
|||
e1 = CLAMP16( e1 );
|
||||
SET_LE16A( echo_ptr + 2, e1 );
|
||||
}
|
||||
#endif /* ARM_ARCH */
|
||||
#else /* Unoptimized CPU */
|
||||
/* Read feedback from echo buffer */
|
||||
int echo_pos = this->echo_pos;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue