SWCODEC & Coldfire: Do some more DSP straightening out. Do as much Coldfire optimizing as seems reasonably possible by jumping through some hoops to avoid stalls. Further boost reduction will just be fractional points if taken to extremes -- not worth it. Wrap up the ASM for a while.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@12905 a1c6a512-1295-4272-9138-f99709370657
2007-03-25 04:03:44 +00:00 · 2007-03-25 04:03:44 +00:00 · 369c2a37b7
commit 369c2a37b7
parent cd630c9e0a
3 changed files with 469 additions and 370 deletions
--- a/apps/dsp.c
+++ b/apps/dsp.c
@ -38,9 +38,14 @@
 #define WORD_FRACBITS       27

 #define NATIVE_DEPTH        16
+/* If the buffer sizes change, check the assembly code! */
 #define SAMPLE_BUF_COUNT    256
 #define RESAMPLE_BUF_COUNT  (256 * 4)   /* Enough for 11,025 Hz -> 44,100 Hz*/
 #define DEFAULT_GAIN        0x01000000
+#define SAMPLE_BUF_LEFT_CHANNEL 0
+#define SAMPLE_BUF_RIGHT_CHANNEL (SAMPLE_BUF_COUNT/2)
+#define RESAMPLE_BUF_LEFT_CHANNEL 0
+#define RESAMPLE_BUF_RIGHT_CHANNEL (RESAMPLE_BUF_COUNT/2)

 /* enums to index conversion properly with stereo mode and other settings */
 enum
@ -66,11 +71,10 @@ enum
 * NOTE: Any assembly routines that use these structures must be updated
 * if current data members are moved or changed.
 */
-                                        /* 32-bit achitecture offset */
 struct resample_data
 {
-    long delta;                         /* 00h */
-    long phase;                         /* 04h */
+    uint32_t delta;                     /* 00h */
+    uint32_t phase;                     /* 04h */
    int32_t last_sample[2];             /* 08h */
                                        /* 10h */
 };
@ -93,9 +97,10 @@ struct dsp_data
    int output_scale;                   /* 00h */
    int num_channels;                   /* 04h */
    struct resample_data resample_data; /* 08h */
-    int clip_min;                       /* 18h */
-    int clip_max;                       /* 2ch */
-                                        /* 30h */
+    int32_t clip_min;                   /* 18h */
+    int32_t clip_max;                   /* 1ch */
+    int32_t gain;                       /* 20h - Note that this is in S8.23 format. */ 
+                                        /* 24h */
 };

 /* No asm...yet */
@ -132,13 +137,18 @@ struct eq_state
 #include <dsp_asm.h>

 /* Typedefs keep things much neater in this case */
-typedef int (*sample_input_fn_type)(int count, const char *src[],
-                                    int32_t *dst[]);    
+typedef void (*sample_input_fn_type)(int count, const char *src[],
+                                     int32_t *dst[]);    
 typedef int (*resample_fn_type)(int count, struct dsp_data *data,
                                int32_t *src[], int32_t *dst[]);
 typedef void (*sample_output_fn_type)(int count, struct dsp_data *data,
                                      int32_t *src[], int16_t *dst);
+/* Single-DSP channel processing in place */
 typedef void (*channels_process_fn_type)(int count, int32_t *buf[]);
+/* DSP local channel processing in place */
+typedef void (*channels_process_dsp_fn_type)(int count, struct dsp_data *data,
+                                             int32_t *buf[]);
+

 /*
 ***************************************************************************/
@ -152,16 +162,16 @@ struct dsp_config
    int  sample_bytes;
    int  stereo_mode;
    int  frac_bits;
-    long gain;          /* Note that this is in S8.23 format. */
    /* Functions that change depending upon settings - NULL if stage is
       disabled */
-    sample_input_fn_type        input_samples;
-    resample_fn_type            resample;
-    sample_output_fn_type       output_samples;
+    sample_input_fn_type         input_samples;
+    resample_fn_type             resample;
+    sample_output_fn_type        output_samples;
    /* These will be NULL for the voice codec and is more economical that
       way */
-    channels_process_fn_type    apply_crossfeed;
-    channels_process_fn_type    channels_process;
+    channels_process_dsp_fn_type apply_gain;
+    channels_process_fn_type     apply_crossfeed;
+    channels_process_fn_type     channels_process;
 };

 /* General DSP config */
@ -211,7 +221,7 @@ static struct dsp_config *dsp IDATA_ATTR = audio_dsp;
 * of copying needed is minimized for that case.
 */

-static int32_t sample_buf[SAMPLE_BUF_COUNT] IBSS_ATTR;
+int32_t sample_buf[SAMPLE_BUF_COUNT] IBSS_ATTR;
 static int32_t resample_buf[RESAMPLE_BUF_COUNT] IBSS_ATTR;

 /* set a new dsp and return old one */
@ -258,23 +268,20 @@ void sound_set_pitch(int permille)
    dsp_configure(DSP_SWITCH_FREQUENCY, dsp->codec_frequency);
 }

-/* Convert at most count samples to the internal format, if needed. Returns
- * number of samples ready for further processing. Updates src to point
- * past the samples "consumed" and dst is set to point to the samples to
- * consume. Note that for mono, dst[0] equals dst[1], as there is no point
- * in processing the same data twice.
+/* Convert count samples to the internal format, if needed.  Updates src
+ * to point past the samples "consumed" and dst is set to point to the
+ * samples to consume. Note that for mono, dst[0] equals dst[1], as there
+ * is no point in processing the same data twice.
 */

 /* convert count 16-bit mono to 32-bit mono */
-static int sample_input_lte_native_mono(
+static void sample_input_lte_native_mono(
    int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
-
    const int16_t *s = (int16_t *) src[0];
    const int16_t * const send = s + count;
-    int32_t *d = dst[0] = dst[1] = sample_buf;
-    const int scale = WORD_SHIFT;
+    int32_t *d = dst[0] = dst[1] = &sample_buf[SAMPLE_BUF_LEFT_CHANNEL];
+    int scale = WORD_SHIFT;

    do
    {
@ -283,21 +290,17 @@ static int sample_input_lte_native_mono(
    while (s < send);

    src[0] = (char *)s;
-
-    return count;
 }

 /* convert count 16-bit interleaved stereo to 32-bit noninterleaved */
-static int sample_input_lte_native_i_stereo(
+static void sample_input_lte_native_i_stereo(
    int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
-
    const int32_t *s = (int32_t *) src[0];
    const int32_t * const send = s + count;
-    int32_t *dl = dst[0] = sample_buf;
-    int32_t *dr = dst[1] = sample_buf + SAMPLE_BUF_COUNT/2;
-    const int scale = WORD_SHIFT;
+    int32_t *dl = dst[0] = &sample_buf[SAMPLE_BUF_LEFT_CHANNEL];
+    int32_t *dr = dst[1] = &sample_buf[SAMPLE_BUF_RIGHT_CHANNEL];
+    int scale = WORD_SHIFT;

    do
    {
@ -313,22 +316,18 @@ static int sample_input_lte_native_i_stereo(
    while (s < send);

    src[0] = (char *)s;
-
-    return count;
 }

 /* convert count 16-bit noninterleaved stereo to 32-bit noninterleaved */
-static int sample_input_lte_native_ni_stereo(
+static void sample_input_lte_native_ni_stereo(
    int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
-
    const int16_t *sl = (int16_t *) src[0];
    const int16_t *sr = (int16_t *) src[1];
    const int16_t * const slend = sl + count;
-    int32_t *dl = dst[0] = sample_buf;
-    int32_t *dr = dst[1] = sample_buf + SAMPLE_BUF_COUNT/2;
-    const int scale = WORD_SHIFT;
+    int32_t *dl = dst[0] = &sample_buf[SAMPLE_BUF_LEFT_CHANNEL];
+    int32_t *dr = dst[1] = &sample_buf[SAMPLE_BUF_RIGHT_CHANNEL];
+    int scale = WORD_SHIFT;

    do
    {
@ -339,35 +338,24 @@ static int sample_input_lte_native_ni_stereo(

    src[0] = (char *)sl;
    src[1] = (char *)sr;
-
-    return count;
 }

 /* convert count 32-bit mono to 32-bit mono */
-static int sample_input_gt_native_mono(
+static void sample_input_gt_native_mono(
    int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
-
    dst[0] = dst[1] = (int32_t *)src[0];
    src[0] = (char *)(dst[0] + count);
-
-    return count;
 }

 /* convert count 32-bit interleaved stereo to 32-bit noninterleaved stereo */
-static int sample_input_gt_native_i_stereo(
+static void sample_input_gt_native_i_stereo(
    int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
-
    const int32_t *s = (int32_t *)src[0];
    const int32_t * const send = s + 2*count;
-    int32_t *dl = sample_buf;
-    int32_t *dr = sample_buf + SAMPLE_BUF_COUNT/2;
-
-    dst[0] = dl;
-    dst[1] = dr;
+    int32_t *dl = dst[0] = &sample_buf[SAMPLE_BUF_LEFT_CHANNEL];
+    int32_t *dr = dst[1] = &sample_buf[SAMPLE_BUF_RIGHT_CHANNEL];

    do
    {
@ -377,22 +365,16 @@ static int sample_input_gt_native_i_stereo(
    while (s < send);

    src[0] = (char *)send;
-
-    return count;
 }

 /* convert 32 bit-noninterleaved stereo to 32-bit noninterleaved stereo */
-static int sample_input_gt_native_ni_stereo(
+static void sample_input_gt_native_ni_stereo(
    int count, const char *src[], int32_t *dst[])
 {
-    count = MIN(SAMPLE_BUF_COUNT/2, count);
-
    dst[0] = (int32_t *)src[0];
    dst[1] = (int32_t *)src[1];
    src[0] = (char *)(dst[0] + count);
    src[1] = (char *)(dst[1] + count);
-
-    return count;
 }

 /**
@ -573,12 +555,6 @@ static void sample_output_new_format(void)
    dsp->output_samples = sample_output_functions[out];
 }

-static void resampler_set_delta(int frequency)
-{
-    dsp->data.resample_data.delta = (unsigned long) 
-        frequency * 65536LL / NATIVE_FREQUENCY;
-}
-
 /**
 * Linear interpolation resampling that introduces a one sample delay because
 * of our inability to look into the future at the end of a frame.
@ -587,9 +563,9 @@ static void resampler_set_delta(int frequency)
 static int dsp_downsample(int count, struct dsp_data *data,
                          int32_t *src[], int32_t *dst[])
 {
-    int  ch = data->num_channels - 1;
-    long delta = data->resample_data.delta;
-    long phase, pos;
+    int ch = data->num_channels - 1;
+    uint32_t delta = data->resample_data.delta;
+    uint32_t phase, pos;
    int32_t *d;

    /* Rolled channel loop actually showed slightly faster. */
@ -610,7 +586,7 @@ static int dsp_downsample(int count, struct dsp_data *data,
        if (pos > 0)
            last = s[pos - 1];

-        while (pos < count)
+        while (pos < (uint32_t)count)
        {
            *d++ = last + FRACMUL((phase & 0xffff) << 15, s[pos] - last);
            phase += delta;
@ -625,12 +601,12 @@ static int dsp_downsample(int count, struct dsp_data *data,
    return d - dst[0];
 }

-static int dsp_upsample(int count,  struct dsp_data *data,
+static int dsp_upsample(int count, struct dsp_data *data,
                        int32_t *src[], int32_t *dst[])
 {
    int  ch = data->num_channels - 1;
-    long delta = data->resample_data.delta;
-    long phase, pos;
+    uint32_t delta = data->resample_data.delta;
+    uint32_t phase, pos;
    int32_t *d;

    /* Rolled channel loop actually showed slightly faster. */
@ -653,7 +629,7 @@ static int dsp_upsample(int count,  struct dsp_data *data,
            pos = phase >> 16;
        }

-        while (pos < count)
+        while (pos < (uint32_t)count)
        {
            last = s[pos - 1];
            *d++ = last + FRACMUL((phase & 0xffff) << 15, s[pos] - last);
@ -669,24 +645,43 @@ static int dsp_upsample(int count,  struct dsp_data *data,
 }
 #endif /* DSP_HAVE_ASM_RESAMPLING */

+static void resampler_new_delta(void)
+{
+    dsp->data.resample_data.delta = (unsigned long) 
+        dsp->frequency * 65536LL / NATIVE_FREQUENCY;
+
+    if (dsp->frequency == NATIVE_FREQUENCY)
+    {
+        /* NOTE: If fully glitch-free transitions from no resampling to
+           resampling are desired, last_sample history should be maintained
+           even when not resampling. */
+        dsp->resample = NULL;
+        dsp->data.resample_data.phase = 0;
+        dsp->data.resample_data.last_sample[0] = 0;
+        dsp->data.resample_data.last_sample[1] = 0;
+    }
+    else if (dsp->frequency < NATIVE_FREQUENCY)
+        dsp->resample = dsp_upsample;
+    else
+        dsp->resample = dsp_downsample;
+}
+
 /* Resample count stereo samples. Updates the src array, if resampling is
 * done, to refer to the resampled data. Returns number of stereo samples
 * for further processing.
 */
 static inline int resample(int count, int32_t *src[])
 {
-    if (dsp->resample)
+    int32_t *dst[2] =
    {
-        int32_t *dst[2] =
-        {
-            resample_buf,
-            resample_buf + RESAMPLE_BUF_COUNT/2,
-        };
+        &resample_buf[RESAMPLE_BUF_LEFT_CHANNEL],
+        &resample_buf[RESAMPLE_BUF_RIGHT_CHANNEL],
+    };

-        count = dsp->resample(count, &dsp->data, src, dst);
-        src[0] = dst[0];
-        src[1] = dst[dsp->data.num_channels - 1];
-    }
+    count = dsp->resample(count, &dsp->data, src, dst);
+
+    src[0] = dst[0];
+    src[1] = dst[dsp->data.num_channels - 1];

    return count;
 }
@ -810,30 +805,59 @@ void dsp_set_crossfeed_cross_params(long lf_gain, long hf_gain, long cutoff)
    c[2] <<= 4;
 }

+/* Apply a constant gain to the samples (e.g., for ReplayGain).
+ * Note that this must be called before the resampler.
+ */
+#ifndef DSP_HAVE_ASM_APPLY_GAIN
+static void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
+{
+    const int32_t gain = data->gain;
+    int ch = data->num_channels - 1;
+
+    do
+    {
+        int32_t *s = buf[ch];
+        int32_t *d = buf[ch];
+        int32_t  samp = *s++;
+        int i = 0;
+
+        do
+        {
+            FRACMUL_8_LOOP(samp, gain, s, d);
+        }
+        while (++i < count);
+    }
+    while (--ch >= 0);
+}
+#endif /* DSP_HAVE_ASM_APPLY_GAIN */
+
 /* Combine all gains to a global gain. */
 static void set_gain(struct dsp_config *dsp)
 {
-    dsp->gain = DEFAULT_GAIN;
+    dsp->data.gain = DEFAULT_GAIN;

    /* Replay gain not relevant to voice */
    if (dsp == audio_dsp && replaygain)
    {
-        dsp->gain = replaygain;
+        dsp->data.gain = replaygain;
    }
    
    if (eq_enabled && eq_precut)
    {
-        dsp->gain = (long) (((int64_t) dsp->gain * eq_precut) >> 24);
+        dsp->data.gain =
+            (long) (((int64_t) dsp->data.gain * eq_precut) >> 24);
    }
    
-    if (dsp->gain == DEFAULT_GAIN)
+    if (dsp->data.gain == DEFAULT_GAIN)
    {
-        dsp->gain = 0;
+        dsp->data.gain = 0;
    }
    else
    {
-        dsp->gain >>= 1;
+        dsp->data.gain >>= 1;
    }
+
+    dsp->apply_gain = dsp->data.gain != 0 ? dsp_apply_gain : NULL;
 }

 /**
@ -927,50 +951,6 @@ static void eq_process(int count, int32_t *buf[])
    }
 }

-/* Apply a constant gain to the samples (e.g., for ReplayGain). May update
- * the src array if gain was applied.
- * Note that this must be called before the resampler.
- */
-static void apply_gain(int count, int32_t *buf[])
-{
-    int32_t *sl, *sr;
-    int32_t s, *d;
-    long gain;
-    int i;
-
-    if (new_gain)
-    {
-        /* Gain has changed */
-        dsp_set_replaygain();
-        if (dsp->gain == 0)
-            return; /* No gain to apply now */
-    }
-
-    sl = buf[0], sr = buf[1];
-    gain = dsp->gain;
-
-    if (sl != sr)
-    {
-        d = &sample_buf[SAMPLE_BUF_COUNT / 2];
-        buf[1] = d;
-        s = *sr++;
-
-        for (i = 0; i < count; i++)
-            FRACMUL_8_LOOP(s, gain, sr, d);
-    }
-    else
-    {
-        buf[1] = &sample_buf[0];
-    }
-
-    d = &sample_buf[0];
-    buf[0] = d;
-    s = *sl++;
-
-    for (i = 0; i < count; i++)
-        FRACMUL_8_LOOP(s, gain, sl, d);
-}
-
 void dsp_set_stereo_width(int value)
 {
    long width, straight, cross;
@ -993,35 +973,6 @@ void dsp_set_stereo_width(int value)
    dsp_sw_cross = cross << 8;
 }

-/**
- * Implements the different channel configurations and stereo width.
- */
-
-/* SOUND_CHAN_STEREO mode is a noop so has no function - just outline one for
- * completeness. */
-#if 0
-static void channels_process_sound_chan_stereo(int count, int32_t *buf[])
-{
-    /* The channels are each just themselves */
-    (void)count; (void)buf;
-}
-#endif
-
-#ifndef DSP_HAVE_ASM_SOUND_CHAN_MONO
-static void channels_process_sound_chan_mono(int count, int32_t *buf[])
-{
-    int32_t *sl = buf[0], *sr = buf[1];
-
-    do
-    {
-        int32_t lr = *sl/2 + *sr/2;
-        *sl++ = lr;
-        *sr++ = lr;
-    }
-    while (--count > 0);
-}
-#endif /* DSP_HAVE_ASM_SOUND_CHAN_MONO */
-
 #if CONFIG_CODEC == SWCODEC

 #ifdef HAVE_SW_TONE_CONTROLS
@ -1063,6 +1014,35 @@ int dsp_callback(int msg, intptr_t param)
 }
 #endif

+/**
+ * Implements the different channel configurations and stereo width.
+ */
+
+/* SOUND_CHAN_STEREO mode is a noop so has no function - just outline one for
+ * completeness. */
+#if 0
+static void channels_process_sound_chan_stereo(int count, int32_t *buf[])
+{
+    /* The channels are each just themselves */
+    (void)count; (void)buf;
+}
+#endif
+
+#ifndef DSP_HAVE_ASM_SOUND_CHAN_MONO
+static void channels_process_sound_chan_mono(int count, int32_t *buf[])
+{
+    int32_t *sl = buf[0], *sr = buf[1];
+
+    do
+    {
+        int32_t lr = *sl/2 + *sr/2;
+        *sl++ = lr;
+        *sr++ = lr;
+    }
+    while (--count > 0);
+}
+#endif /* DSP_HAVE_ASM_SOUND_CHAN_MONO */
+
 #ifndef DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
 static void channels_process_sound_chan_custom(int count, int32_t *buf[])
 {
@ -1151,30 +1131,47 @@ int dsp_process(char *dst, const char *src[], int count)
    coldfire_set_macsr(EMAC_FRACTIONAL | EMAC_SATURATE);
 #endif

+    if (new_gain)
+        dsp_set_replaygain(); /* Gain has changed */
+
+    /* Testing function pointers for NULL is preferred since the pointer
+       will be preloaded to be used for the call if not. */
    while (count > 0)
    {
-        samples = dsp->input_samples(count, src, tmp);
+        samples = MIN(SAMPLE_BUF_COUNT/2, count);
        count -= samples;
-        if (dsp->gain != 0)
-            apply_gain(samples, tmp);
-        if ((samples = resample(samples, tmp)) <= 0)
+
+        dsp->input_samples(samples, src, tmp);
+
+        if (dsp->apply_gain)
+            dsp->apply_gain(samples, &dsp->data, tmp);
+
+        if (dsp->resample && (samples = resample(samples, tmp)) <= 0)
            break; /* I'm pretty sure we're downsampling here */
+
        if (dsp->apply_crossfeed)
            dsp->apply_crossfeed(samples, tmp);
+
        /* TODO: EQ and tone controls need separate structs for audio and voice
         * DSP processing thanks to filter history. isn't really audible now, but
-         * might be the day we start handling voice more delicately.
+         * might be the day we start handling voice more delicately. Planned
+         * changes may well run all relevant channels through the same EQ so
+         * perhaps not.
         */
        if (eq_enabled)
            eq_process(samples, tmp);
+
 #ifdef HAVE_SW_TONE_CONTROLS
        if ((bass | treble) != 0)
            eq_filter(tmp, &tone_filter, samples, dsp->data.num_channels,
                      FILTER_BISHELF_SHIFT);
 #endif
+
        if (dsp->channels_process)
            dsp->channels_process(samples, tmp);
+
        dsp->output_samples(samples, &dsp->data, tmp, (int16_t *)dst);
+
        written += samples;
        dst += samples * sizeof (int16_t) * 2;
        yield();
@ -1245,9 +1242,6 @@ bool dsp_configure(int setting, intptr_t value)
        if (dsp == audio_dsp)
        {
            *var = value;
-            /* In case current gain is zero, force at least one call
-               to apply_gain or apply_gain won't pick up on new_gain */
-            audio_dsp->gain = -1;
            new_gain = true;
        }
    }
@ -1282,15 +1276,7 @@ bool dsp_configure(int setting, intptr_t value)
        else
            dsp->frequency = dsp->codec_frequency;

-        resampler_set_delta(dsp->frequency);
-
-        if (dsp->frequency == NATIVE_FREQUENCY)
-            dsp->resample = NULL;
-        else if (dsp->frequency < NATIVE_FREQUENCY)
-            dsp->resample = dsp_upsample;
-        else
-            dsp->resample = dsp_downsample;
-
+        resampler_new_delta();
        break;

    case DSP_SET_SAMPLE_DEPTH:
@ -1348,7 +1334,7 @@ bool dsp_configure(int setting, intptr_t value)
    case DSP_FLUSH:
        memset(&dsp->data.resample_data, 0,
               sizeof (dsp->data.resample_data));
-        resampler_set_delta(dsp->frequency);
+        resampler_new_delta();
        dither_init();
        break;

--- a/apps/dsp_asm.h
+++ b/apps/dsp_asm.h
@ -22,32 +22,61 @@
 #ifndef _DSP_ASM_H
 #define _DSP_ASM_H

+/* Set the appropriate #defines based on CPU or whatever matters */
 #ifndef SIMULATOR

-#if defined(CPU_COLDFIRE) || defined(CPU_ARM)
-#define DSP_HAVE_ASM_CROSSFEED
-void apply_crossfeed(int count, int32_t *buf[]);
+#if defined(CPU_ARM)
 #define DSP_HAVE_ASM_RESAMPLING
-int dsp_downsample(int count, struct dsp_data *data, int32_t *src[], int32_t *dst[]);
-int dsp_upsample(int count, struct dsp_data *data, int32_t *src[], int32_t *dst[]);
-#endif /* defined(CPU_COLDFIRE) || defined(CPU_ARM) */
-
-#if defined (CPU_COLDFIRE)
+#define DSP_HAVE_ASM_CROSSFEED
+#elif defined (CPU_COLDFIRE)
+#define DSP_HAVE_ASM_APPLY_GAIN
+#define DSP_HAVE_ASM_RESAMPLING
+#define DSP_HAVE_ASM_CROSSFEED
 #define DSP_HAVE_ASM_SOUND_CHAN_MONO
-void channels_process_sound_chan_mono(int count, int32_t *buf[]);
 #define DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
-void channels_process_sound_chan_custom(int count, int32_t *buf[]);
 #define DSP_HAVE_ASM_SOUND_CHAN_KARAOKE
-void channels_process_sound_chan_karaoke(int count, int32_t *buf[]);
-
 #define DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO
-void sample_output_mono(int count, struct dsp_data *data,
-                        int32_t *src[], int16_t *dst);
 #define DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
-void sample_output_stereo(int count, struct dsp_data *data,
-                          int32_t *src[], int16_t *dst);
 #endif /* CPU_COLDFIRE */

 #endif /* SIMULATOR */

+/* Declare prototypes based upon what's #defined above */
+#ifdef DSP_HAVE_ASM_CROSSFEED
+void apply_crossfeed(int count, int32_t *buf[]);
+#endif
+
+#ifdef DSP_HAVE_ASM_APPLY_GAIN
+void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[]);
+#endif /* DSP_HAVE_ASM_APPLY_GAIN */
+
+#ifdef DSP_HAVE_ASM_RESAMPLING
+int dsp_upsample(int count, struct dsp_data *data,
+                 int32_t *src[], int32_t *dst[]);
+int dsp_downsample(int count, struct dsp_data *data,
+                   int32_t *src[], int32_t *dst[]);
+#endif /* DSP_HAVE_ASM_RESAMPLING */
+
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_MONO
+void channels_process_sound_chan_mono(int count, int32_t *buf[]);
+#endif
+
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
+void channels_process_sound_chan_custom(int count, int32_t *buf[]);
+#endif
+
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_KARAOKE
+void channels_process_sound_chan_karaoke(int count, int32_t *buf[]);
+#endif
+
+#ifdef DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
+void sample_output_stereo(int count, struct dsp_data *data,
+                          int32_t *src[], int16_t *dst);
+#endif
+
+#ifdef DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO
+void sample_output_mono(int count, struct dsp_data *data,
+                        int32_t *src[], int16_t *dst);
+#endif
+
 #endif /* _DSP_ASM_H */
--- a/apps/dsp_cf.S
+++ b/apps/dsp_cf.S
@ -19,68 +19,117 @@
 ****************************************************************************/

 /****************************************************************************
- * void apply_crossfeed(int count, int32_t *src[])
+ * void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
 */
    .section    .text
+	.align      2
+    .global     dsp_apply_gain
+dsp_apply_gain:
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  |
+    movem.l     28(%sp), %a0-%a1        | %a0 = data,
+                                        | %a1 = buf
+	move.l      4(%a0), %d1             | %d1 = data->num_channels
+    move.l      32(%a0), %a0            | %a0 = data->gain (in s8.23)
+10: | channel loop                      |
+	move.l      24(%sp), %d0            | %d0 = count
+    move.l      -4(%a1, %d1.l*4), %a2   | %a2 = s = buf[ch-1]
+    move.l      %a2, %a3                | %a3 = d = s
+    move.l      (%a2)+, %d2             | %d2 = *s++,
+    mac.l       %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1)
+    subq.l      #1, %d0                 | --count > 0 ? : effectively n++
+    ble.b       30f | loop done         | no? finish up
+20: | loop                              |
+    move.l      %accext01, %d4          | fetch S(n-1)[7:0]
+    movclr.l    %acc0, %d3              | fetch S(n-1)[40:8] in %d3[31:0]
+    asl.l       #8, %d3                 | *s++ = (S(n-1)[40:8] << 8) | S(n-1)[7:0]
+    mac.l       %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1)
+    move.b      %d4, %d3                |
+    move.l      %d3, (%a3)+             |
+    subq.l      #1, %d0                 | --count > 0 ? : effectively n++
+    bgt.b       20b | loop              | yes? do more samples
+30: | loop done                         |
+    move.l      %accext01, %d4          | fetch S(n-1)[7:0]
+    movclr.l    %acc0, %d3              | fetch S(n-1)[40:8] in %d3[31:0]
+    asl.l       #8, %d3                 | *s = (S(n-1)[40:8] << 8) | S(n-1)[7:0]
+    move.b      %d4, %d3                |
+    move.l      %d3, (%a3)              |
+	subq.l      #1, %d1                 | next channel
+	bgt.b       10b | channel loop      |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup stack
+    rts                                 |
+    .size       dsp_apply_gain,.-dsp_apply_gain
+
+/****************************************************************************
+ * void apply_crossfeed(int count, int32_t *buf[])
+ */
+    .section    .text
+        .align      2
    .global     apply_crossfeed 
 apply_crossfeed:
-    lea.l       -44(%sp), %sp
+    lea.l       -44(%sp), %sp           |
    movem.l     %d2-%d7/%a2-%a6, (%sp)  | save all regs
    movem.l     48(%sp), %d7/%a4        | %d7 = count, %a4 = src
    movem.l     (%a4), %a4-%a5          | %a4 = src[0], %a5 = src[1]
-    lea.l       crossfeed_data, %a1
-    move.l      (%a1)+, %a6             | a6 = direct gain
+    lea.l       crossfeed_data, %a1     | %a1 = &crossfeed_data
+    move.l      (%a1)+, %d6             | %d6 = direct gain
    movem.l     12(%a1), %d0-%d3        | fetch filter history samples
    move.l      132(%a1), %a0           | fetch delay line address
    movem.l     (%a1), %a1-%a3          | load filter coefs
+    lea.l       crossfeed_data+136, %a6 | %a6 = delay line wrap limit
+    bra.b       20f | loop start        | go to loop start point
    /* Register usage in loop:
     * %a0 = delay_p, %a1..%a3 = b0, b1, a1 (filter coefs),
-     * %a4 = src[0], %a5 = src[1], %a6 = direct gain,
+     * %a4 = buf[0], %a5 = buf[1],
+     * %a6 = delay line pointer wrap limit,
     * %d0..%d3 = history
-     * %d4..%d6 = temp.
+     * %d4..%d5 = temp.
+     * %d6 = direct gain,
     * %d7 = count
     */
-.cfloop:
-    mac.l       %a2, %d0, 4(%a0), %d0, %acc0 | acc  = b1*dr[n - 1] d0 = dr[n]
-    mac.l       %a1, %d0             , %acc0 | acc += b0*dr[n]
-    mac.l       %a3, %d1,  (%a4), %d4, %acc0 | acc += a1*y_l[n - 1], load L
-    move.l      %acc0, %d1              | get filtered delayed sample
-    mac.l       %a6, %d4, %acc0         | acc += gain*x_l[n]
-    movclr.l    %acc0, %d6              |
-    move.l      %d6, (%a4)+             | write result
-
-    mac.l       %a2, %d2, (%a0), %d2, %acc0 | acc  = b1*dl[n - 1], d2 = dl[n]
-    mac.l       %a1, %d2            , %acc0 | acc += b0*dl[n]
-    mac.l       %a3, %d3, (%a5), %d5, %acc0 | acc += a1*y_r[n - 1], load R
-    movem.l     %d4-%d5, (%a0)          | save left & right inputs to delay line
-    move.l      %acc0, %d3              | get filtered delayed sample
-    mac.l       %a6, %d5, %acc0         | acc += gain*x_r[n]
-    lea.l       8(%a0), %a0             | increment delay pointer
-    movclr.l    %acc0, %d6              |
-    move.l      %d6, (%a5)+             | write result
-
-    cmpa.l      #crossfeed_data+136, %a0| wrap a0 if passed end
-    bge.b       .cfwrap                 |
-    .word       0x51fb                  | tpf.l - trap the buffer wrap
-.cfwrap:
-    lea.l       -104(%a0), %a0          | wrap
-    subq.l      #1, %d7                 | --count < 0 ?
-    bgt.b       .cfloop                 |
+10: | loop                              |
+    movclr.l    %acc0, %d4              | write outputs
+    move.l      %d4, (%a4)+             | .
+    movclr.l    %acc1, %d5              | .
+    move.l      %d5, (%a5)+             | .
+20: | loop start                        |
+    mac.l       %a2, %d0, (%a0)+, %d0, %acc0 | %acc0  = b1*dl[n - 1], %d0 = dl[n]
+    mac.l       %a1, %d0             , %acc0 | %acc0 += b0*dl[n]
+    mac.l       %a3, %d1, (%a5),  %d5, %acc0 | %acc0 += a1*y_r[n - 1], load R
+    mac.l       %a2, %d2, (%a0)+, %d2, %acc1 | %acc1  = b1*dr[n - 1], %d2 = dr[n]
+    mac.l       %a1, %d2             , %acc1 | %acc1 += b0*dr[n]
+    mac.l       %a3, %d3, (%a4),  %d4, %acc1 | %acc1 += a1*y_l[n - 1], load L
+    movem.l     %d4-%d5, -8(%a0)        | save left & right inputs to delay line
+    move.l      %acc0, %d3              | get filtered delayed left sample (y_l[n])
+    move.l      %acc1, %d1              | get filtered delayed right sample (y_r[n])
+    mac.l       %d6, %d4, %acc0         | %acc0 += gain*x_l[n]
+    mac.l       %d6, %d5, %acc1         | %acc1 += gain*x_r[n]
+    cmp.l       %a6, %a0                | wrap %a0 if passed end
+    bhs.b       30f | wrap buffer       |
+    .word       0x51fb | tpf.l          | trap the buffer wrap
+30: | wrap buffer                       | ...fwd taken branches more costly
+    lea.l       -104(%a0), %a0          | wrap it up
+    subq.l      #1, %d7                 | --count > 0 ?
+    bgt.b       10b | loop              | yes? do more
+    movclr.l    %acc0, %d4              | write last outputs
+    move.l      %d4, (%a4)              | .
+    movclr.l    %acc1, %d5              | .
+    move.l      %d5, (%a5)              | .
    lea.l       crossfeed_data+16, %a1  | save data back to struct
    movem.l     %d0-%d3, (%a1)          | ...history
    move.l      %a0, 120(%a1)           | ...delay_p
    movem.l     (%sp), %d2-%d7/%a2-%a6  | restore all regs
-    lea.l       44(%sp), %sp
-    rts
-.cfend:
-    .size       apply_crossfeed,.cfend-apply_crossfeed
-
+    lea.l       44(%sp), %sp            |
+    rts                                 |
+    .size       apply_crossfeed,.-apply_crossfeed 

 /****************************************************************************
 * int dsp_downsample(int count, struct dsp_data *data,
 *                    in32_t *src[], int32_t *dst[])
 */
    .section    .text
+	.align      2
    .global     dsp_downsample
 dsp_downsample:
    lea.l       -40(%sp), %sp           | save non-clobberables
@ -92,7 +141,7 @@ dsp_downsample:
    movem.l     4(%a0), %d3-%d4         | %d3 = ch = data->num_channels
                                        | %d4 = delta = data->resample_data.delta
    moveq.l     #16, %d7                | %d7 = shift
-.dschannel_loop:
+10: | channel loop                      |
    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
@ -102,15 +151,15 @@ dsp_downsample:
    move.l      %d5, %d6                | %d6 = pos = phase >> 16
    lsr.l       %d7, %d6                |
    cmp.l       %d2, %d6                | past end of samples?
-    bge.b       .dsloop_skip            | yes? skip loop
+    bge.b       40f | skip resample loop| yes? skip loop
    tst.l       %d6                     | need last sample of prev. frame?
-    bne.b       .dsloop                 | no? start main loop
+    bne.b       20f | resample loop     | no? start main loop
    move.l      (%a3, %d6.l*4), %d1     | %d1 = s[pos]
-    bra.b       .dsuse_last_start       | start with last (last in %d0)
-.dsloop:
+    bra.b       30f | resample start last | start with last (last in %d0)
+20: | resample loop                     |
    lea.l       -4(%a3, %d6.l*4), %a5   | load s[pos-1] and s[pos]
    movem.l     (%a5), %d0-%d1          |
-.dsuse_last_start:
+30: | resample start last               |
    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
    move.l      %d0, %acc0              | %acc0 = previous sample
    move.l      %d5, %d0                | frac = (phase << 16) >> 1
@ -123,11 +172,11 @@ dsp_downsample:
    movclr.l    %acc0, %d0              |
    move.l      %d0, (%a4)+             | *d++ = %d0
    cmp.l       %d2, %d6                | pos < count?
-    blt.b       .dsloop                 | yes? continue resampling
-.dsloop_skip:
+    blt.b       20b | resample loop     | yes? continue resampling
+40: | skip resample loop                |
    subq.l      #1, %d3                 | ch > 0?
-    bgt.b       .dschannel_loop         | yes? process next channel
-    asl.l       %d7, %d2                | wrap phase to start of next frame
+    bgt.b       10b | channel loop      | yes? process next channel
+    lsl.l       %d7, %d2                | wrap phase to start of next frame
    sub.l       %d2, %d5                | data->resample_data.phase =
    move.l      %d5, 12(%a0)            | ... phase - (count << 16)
    move.l      %a4, %d0                | return d - d[0]
@ -136,14 +185,14 @@ dsp_downsample:
    movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
    lea.l       40(%sp), %sp            | cleanup stack
    rts                                 | buh-bye
-.dsend:
-    .size       dsp_downsample,.dsend-dsp_downsample
+    .size       dsp_downsample,.-dsp_downsample

 /****************************************************************************
 * int dsp_upsample(int count, struct dsp_data *dsp,
- *                  in32_t *src[], int32_t *dst[])
+ *                  int32_t *src[], int32_t *dst[])
 */
    .section    .text
+	.align      2
    .global     dsp_upsample
 dsp_upsample:
    lea.l       -40(%sp), %sp           | save non-clobberables
@ -154,47 +203,55 @@ dsp_upsample:
                                        | %a2 = dst
    movem.l      4(%a0), %d3-%d4        | %d3 = ch = channels
                                        | %d4 = delta = data->resample_data.delta
-    swap        %d4                     | swap delta to high word to use
-                                        | carries to increment position
-.uschannel_loop:
+    swap        %d4                     | swap delta to high word to use...
+                                        | ...carries to increment position
+10: | channel loop                      |
    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
    lea.l       12(%a0, %d3.l*4), %a4   | %a4 = &data->resample_data.last_sample[ch-1]
-    lea.l       (%a3, %d2.l*4), %a5     | %a5 = src_end = &src[count]
+    lea.l       -4(%a3, %d2.l*4), %a5   | %a5 = src_end = &src[count-1]
    move.l      (%a4), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
-    move.l      -(%a5), (%a4)           | data->resample_data.last_sample[ch-1] = s[count-1]
+    move.l      (%a5), (%a4)            | data->resample_data.last_sample[ch-1] = s[count-1]
    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
+    move.l      (%a3)+, %d1             | fetch first sample - might throw this...
+                                        | ...away later but we'll be preincremented
+    move.l      %d1, %d6                | save sample value
+    sub.l       %d0, %d1                | %d1 = diff = s[0] - last
    swap        %d5                     | swap phase to high word to use
                                        | carries to increment position
-    move.l      %d5, %d6                | %d6 = pos = phase >> 16
+    move.l      %d5, %d7                | %d7 = pos = phase >> 16
    clr.w       %d5                     |
-    eor.l       %d5, %d6                | pos == 0?
-    beq.b       .usstart_0              | no? transistion from down
-    cmp.l       %d2, %d6                | past end of samples?
-    bge.b       .usloop_skip            | yes? skip loop
-    lea.l       -4(%a3, %d6.l*4), %a3   | %a3 = s = &s[pos-1] (previous)
-    move.l      (%a3)+, %d0             | %d0 = *s++
-    .word       0x51fa                  | tpf.w - trap next instruction
-.usloop_1:
+    eor.l       %d5, %d7                | pos == 0?
+    beq.b       40f | loop start        | yes? start loop
+    cmp.l       %d2, %d7                | past end of samples?
+    bge.b       50f | skip resample loop| yes? go to next channel and collect info
+    lea.l       (%a3, %d7.l*4), %a3     | %a3 = s = &s[pos+1]
+	movem.l     -8(%a3), %d0-%d1        | %d0 = s[pos-1], %d1 = s[pos]
+    move.l      %d1, %d6                | save sample value
+    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
+	bra.b       40f | loop start        |
+20: | next sample loop                  |
    move.l      %d6, %d0                | move previous sample to %d0
-.usstart_0:
    move.l      (%a3)+, %d1             | fetch next sample
    move.l      %d1, %d6                | save sample value
    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
-.usloop_0:
+30: | same sample loop                  |
+    movclr.l    %acc0, %d7              | %d7 = result
+    move.l      %d7, (%a4)+             | *d++ = %d7
+40: | loop start                        |
    lsr.l       #1, %d5                 | make phase into frac
+    move.l      %d0, %acc0              | %acc0 = s[pos-1]
    mac.l       %d1, %d5, %acc0         | %acc0 = diff * frac
    lsl.l       #1, %d5                 | restore frac to phase
-    movclr.l    %acc0, %d7              | %d7 = product
-    add.l       %d0, %d7                | %d7 = last + product
-    move.l      %d7, (%a4)+             | *d++ = %d7
    add.l       %d4, %d5                | phase += delta
-    bcc.b       .usloop_0               | load next values?
+    bcc.b       30b | same sample loop  | load next values?
    cmp.l       %a5, %a3                | src <= src_end?
-    ble.b       .usloop_1               | yes? continue resampling
-.usloop_skip:
+    bls.b       20b | next sample loop  | yes? continue resampling
+    movclr.l    %acc0, %d7              | %d7 = result
+    move.l      %d7, (%a4)+             | *d++ = %d7
+50: | skip resample loop                |
    subq.l      #1, %d3                 | ch > 0?
-    bgt.b       .uschannel_loop         | yes? process next channel
+    bgt.b       10b | channel loop      | yes? process next channel
    swap        %d5                     | wrap phase to start of next frame
    move.l      %d5, 12(%a0)            | ...and save in data->resample_data.phase
    move.l      %a4, %d0                | return d - d[0]
@ -203,12 +260,7 @@ dsp_upsample:
    asr.l       #2, %d0                 | convert bytes->samples
    lea.l       40(%sp), %sp            | cleanup stack
    rts                                 | buh-bye
-.usend:
-    .size       dsp_upsample,.usend-dsp_upsample
-
-/* These routines might benefit from burst transfers but we'll keep them
- * small for now since they're rather light weight
- */
+    .size       dsp_upsample,.-dsp_upsample

 /****************************************************************************
 * void channels_process_sound_chan_mono(int count, int32_t *buf[])
@ -216,31 +268,39 @@ dsp_upsample:
 * Mix left and right channels 50/50 into a center channel.
 */
    .section    .text
+	.align      2
    .global     channels_process_sound_chan_mono
 channels_process_sound_chan_mono:
    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
-    lea.l       -12(%sp), %sp           | save registers
-    move.l      %macsr, %d1             |
-    movem.l     %d1-%d3, (%sp)          |
-    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  |
    movem.l     (%a0), %a0-%a1          | get channel pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
    move.l      #0x40000000, %d3        | %d3 = 0.5
-1:
-    move.l     (%a0), %d1               | L = R = l/2 + r/2
-    mac.l      %d1, %d3, (%a1), %d2, %acc0 |
-    mac.l      %d2, %d3, %acc0          |
-    movclr.l   %acc0, %d1               |
-    move.l     %d1, (%a0)+              | output to original buffer
-    move.l     %d1, (%a1)+              |
-    subq.l     #1, %d0                  |
-    bgt.s      1b                       |
-    movem.l    (%sp), %d1-%d3           | restore registers
-    move.l     %d1, %macsr              |
-    lea.l      12(%sp), %sp             | cleanup
-    rts
-.cpmono_end:
-    .size       channels_process_sound_chan_mono, .cpmono_end-channels_process_sound_chan_mono
-
+    move.l      (%a0)+, %d1             | prime the input registers
+    move.l      (%a1)+, %d2             |
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 |
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc0 |
+    subq.l      #1, %d0                 |
+    ble.s       20f | loop done         |
+10: | loop                              |
+    movclr.l    %acc0, %d4              | L = R = l/2 + r/2
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 |
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc0 |
+    move.l      %d4, (%a2)+             | output to original buffer
+    move.l      %d4, (%a3)+             |
+    subq.l      #1, %d0                 |
+    bgt.s       10b | loop              |
+20: | loop done                         |
+    movclr.l    %acc0, %d4              | output last sample
+    move.l      %d4, (%a2)              |
+    move.l      %d4, (%a3)              |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_mono, \
+                .-channels_process_sound_chan_mono

 /****************************************************************************
 * void channels_process_sound_chan_custom(int count, int32_t *buf[])
@ -248,34 +308,47 @@ channels_process_sound_chan_mono:
 * Apply stereo width (narrowing/expanding) effect.
 */
    .section    .text
+	.align      2
    .global     channels_process_sound_chan_custom
 channels_process_sound_chan_custom:
    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
-    lea.l       -16(%sp), %sp           | save registers
-    move.l      %macsr, %d1             |
-    movem.l     %d1-%d4, (%sp)          |
-    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
+    lea.l       -28(%sp), %sp           | save registers
+    movem.l     %d2-%d6/%a2-%a3, (%sp)  |
    movem.l     (%a0), %a0-%a1          | get channel pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
    move.l      dsp_sw_gain, %d3        | load straight (mid) gain
    move.l      dsp_sw_cross, %d4       | load cross (side) gain
-1:
-    move.l      (%a0), %d1              |
-    mac.l       %d1, %d3, (%a1), %d2, %acc0 |  L = l*gain + r*cross
-    mac.l       %d1, %d4            , %acc1 |  R = r*gain + l*cross
-    mac.l       %d2, %d4            , %acc0 |
-    mac.l       %d2, %d3            , %acc1 |
-    movclr.l    %acc0, %d1              |
-    movclr.l    %acc1, %d2              |
-    move.l      %d1, (%a0)+             |
-    move.l      %d2, (%a1)+             |
+    move.l      (%a0)+, %d1             | prime the input registers
+    move.l      (%a1)+, %d2             |
+    mac.l       %d1, %d3             , %acc0 |  L = l*gain + r*cross
+    mac.l       %d1, %d4, (%a0)+, %d1, %acc1 |  R = r*gain + l*cross
+    mac.l       %d2, %d4             , %acc0 |
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc1 |
    subq.l      #1, %d0                 |
-    bgt.s       1b                      |
-    movem.l     (%sp), %d1-%d4          | restore registers
-    move.l      %d1, %macsr             |
-    lea.l       16(%sp), %sp            | cleanup
-    rts
-.cpcustom_end:
-    .size       channels_process_sound_chan_custom, .cpcustom_end-channels_process_sound_chan_custom
+    ble.b       20f | loop done         |
+10: | loop                              |
+    movclr.l    %acc0, %d5              |
+    movclr.l    %acc1, %d6              |
+15: | loop start                        |
+    mac.l       %d1, %d3             , %acc0 |  L = l*gain + r*cross
+    mac.l       %d1, %d4, (%a0)+, %d1, %acc1 |  R = r*gain + l*cross
+    mac.l       %d2, %d4             , %acc0 |
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc1 |
+    move.l      %d5, (%a2)+             |
+    move.l      %d6, (%a3)+             |
+    subq.l      #1, %d0                 |
+    bgt.s       10b | loop              |
+20: | loop done                         |
+    movclr.l    %acc0, %d5              | output last sample
+    movclr.l    %acc1, %d6              |
+    move.l      %d5, (%a2)              |
+    move.l      %d6, (%a3)              |
+    movem.l     (%sp), %d2-%d6/%a2-%a3  | restore registers
+    lea.l       28(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_custom, \
+                .-channels_process_sound_chan_custom

 /****************************************************************************
 *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
@ -283,31 +356,42 @@ channels_process_sound_chan_custom:
 *  Separate channels into side channels.
 */
    .section    .text
+	.align      2
    .global     channels_process_sound_chan_karaoke
 channels_process_sound_chan_karaoke:
    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
-    lea.l       -16(%sp), %sp           | save registers
-    move.l      %macsr, %d1             |
-    movem.l     %d1-%d4, (%sp)          |
-    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
-    movem.l     (%a0), %a0-%a1          | get channel pointers
-    move.l      #0x40000000, %d4        | %d3 = 0.5
-1:
-    move.l     (%a0), %d1               |
-    msac.l     %d1, %d4, (%a1), %d2, %acc0 | R = r/2 - l/2
-    mac.l      %d2, %d4            , %acc0 |
-    movclr.l   %acc0, %d1               |
-    move.l     %d1, (%a1)+              |
-    neg.l      %d1                      | L = -R = -(r/2 - l/2) = l/2 - r/2
-    move.l     %d1, (%a0)+              |
-    subq.l     #1, %d0                  |
-    bgt.s      1b                       |
-    movem.l    (%sp), %d1-%d4           | restore registers
-    move.l     %d1, %macsr              |
-    lea.l      16(%sp), %sp             | cleanup
-    rts
-.cpkaraoke_end:
-    .size       channels_process_sound_chan_karaoke, .cpkaraoke_end-channels_process_sound_chan_karaoke
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  |
+    movem.l     (%a0), %a0-%a1          | get channel src pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
+    move.l      #0x40000000, %d3        | %d3 = 0.5
+    move.l      (%a0)+, %d1             | prime the input registers
+    move.l      (%a1)+, %d2             |
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2
+    msac.l      %d2, %d3, (%a1)+, %d2, %acc0 |
+    subq.l      #1, %d0                 |
+    ble.b       20f | loop done         |
+10: | loop                              |
+    movclr.l    %acc0, %d4              |
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2
+    msac.l      %d2, %d3, (%a1)+, %d2, %acc0 |
+    move.l      %d4, (%a2)+             |
+    neg.l       %d4                     | R = -L = -(l/2 - r/2) = r/2 - l/2
+    move.l      %d4, (%a3)+             |
+    subq.l      #1, %d0                 |
+    bgt.s       10b | loop              |
+20: | loop done                         |
+    movclr.l    %acc0, %d4              | output last sample
+    move.l      %d4, (%a2)              |
+    neg.l       %d4                     | R = -L = -(l/2 - r/2) = r/2 - l/2
+    move.l      %d4, (%a3)              |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_karaoke, \
+                .-channels_process_sound_chan_karaoke
+
 /****************************************************************************
 * void sample_output_stereo(int count, struct dsp_data *data,
 *                               int32_t *src[], int16_t *dst)
@ -329,6 +413,7 @@ channels_process_sound_chan_karaoke:
 *
 */
    .section   .text
+	.align      2
    .global    sample_output_stereo
 sample_output_stereo:
    lea.l       -44(%sp), %sp             | save registers
@ -348,11 +433,11 @@ sample_output_stereo:
    add.l       %a4, %d0                  |
    and.l       #0xfffffff0, %d0          |
    cmp.l       %a0, %d0                  | at least a full line?
-    bhi.w       .sos_longloop_1_start     | no? jump to trailing longword
+    bhi.w       40f | long loop 1 start   | no? do as trailing longwords
    sub.l       #16, %d0                  | %d0 = first line bound
    cmp.l       %a4, %d0                  | any leading longwords?
-    bls.b       .sos_lineloop_start       | no? jump to line loop
-.sos_longloop_0:
+    bls.b       20f | line loop start     | no? start line loop
+10: | long loop 0                         |
    move.l      (%a2)+, %d1               | read longword from L and R
    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | shift L to high word
    mac.l       %d2, %a1, %acc1           | shift R to high word
@ -362,10 +447,10 @@ sample_output_stereo:
    move.w      %d2, %d1                  | interleave MS 16 bits of each 
    move.l      %d1, (%a4)+               | ...and write both
    cmp.l       %a4, %d0                  |
-    bhi.b       .sos_longloop_0           |
-.sos_lineloop_start:
+    bhi.b       10b | long loop 0         |
+20: | line loop start                     |
    lea.l       -12(%a0), %a5             | %a5 = at or just before last line bound
-.sos_lineloop:
+30: | line loop                           |
    move.l      (%a3)+, %d4               | get next 4 R samples and scale
    mac.l       %d4, %a1, (%a3)+, %d5, %acc0 | with saturation
    mac.l       %d5, %a1, (%a3)+, %d6, %acc1 |
@ -394,11 +479,11 @@ sample_output_stereo:
    move.w      %d7, %d3                  |
    movem.l     %d0-%d3, -16(%a4)         | write four stereo samples
    cmp.l       %a4, %a5                  |
-    bhi.b       .sos_lineloop             |
-.sos_longloop_1_start:
+    bhi.b       30b | line loop           |
+40: | long loop 1 start                   |
    cmp.l       %a4, %a0                  | any longwords left?
-    bls.b       .sos_done                 | no? finished.
-.sos_longloop_1:
+    bls.b       60f | output end          | no? stop
+50: | long loop 1                         |
    move.l      (%a2)+, %d1               | handle trailing longwords
    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | the same way as leading ones
    mac.l       %d2, %a1, %acc1           |
@ -408,14 +493,13 @@ sample_output_stereo:
    move.w      %d2, %d1                  |
    move.l      %d1, (%a4)+               |
    cmp.l       %a4, %a0                  |
-    bhi.b       .sos_longloop_1           |
-.sos_done:
+    bhi.b       50b | long loop 1         |
+60: | output end                          |
    movem.l     (%sp), %d1-%d7/%a2-%a5    | restore registers
    move.l      %d1, %macsr               |
    lea.l       44(%sp), %sp              | cleanup
    rts                                   |
-.sos_end:
-    .size      sample_output_stereo, .sos_end-sample_output_stereo
+    .size      sample_output_stereo, .-sample_output_stereo

 /****************************************************************************
 * void sample_output_mono(int count, struct dsp_data *data,
@ -424,6 +508,7 @@ sample_output_stereo:
 * Same treatment as sample_output_stereo but for one channel.
 */
    .section   .text
+	.align      2
    .global    sample_output_mono
 sample_output_mono:
    lea.l       -28(%sp), %sp             | save registers
@ -442,11 +527,11 @@ sample_output_mono:
    add.l       %a3, %d0                  |
    and.l       #0xfffffff0, %d0          |
    cmp.l       %a0, %d0                  | at least a full line?
-    bhi.w       .som_longloop_1_start     | no? jump to trailing longword
+    bhi.w       40f | long loop 1 start   | no? do as trailing longwords
    sub.l       #16, %d0                  | %d0 = first line bound
    cmp.l       %a3, %d0                  | any leading longwords?
-    bls.b       .som_lineloop_start       | no? jump to line loop
-.som_longloop_0:
+    bls.b       20f | line loop start     | no? start line loop
+10: | long loop 0                         |
    move.l      (%a2)+, %d1               | read longword from L and R
    mac.l       %d1, %d5, %acc0           | shift L to high word
    movclr.l    %acc0, %d1                | get possibly saturated results
@ -455,10 +540,10 @@ sample_output_mono:
    move.w      %d2, %d1                  | duplicate single channel into
    move.l      %d1, (%a3)+               | L and R
    cmp.l       %a3, %d0                  |
-    bhi.b       .som_longloop_0           |
-.som_lineloop_start:
+    bhi.b       10b | long loop 0         |
+20: | line loop start                     |
    lea.l       -12(%a0), %a1             | %a1 = at or just before last line bound
-.som_lineloop:
+30: | line loop                           |
    move.l      (%a2)+, %d0               | get next 4 L samples and scale
    mac.l       %d0, %d5, (%a2)+, %d1, %acc0 | with saturation
    mac.l       %d1, %d5, (%a2)+, %d2, %acc1 |
@ -483,11 +568,11 @@ sample_output_mono:
    move.w      %d4, %d3                  |
    movem.l     %d0-%d3, -16(%a3)         | write four stereo samples
    cmp.l       %a3, %a1                  |
-    bhi.b       .som_lineloop             |
-.som_longloop_1_start:
+    bhi.b       30b | line loop           |
+40: | long loop 1 start                   |
    cmp.l       %a3, %a0                  | any longwords left?
-    bls.b       .som_done                 | no? finished.
-.som_longloop_1:
+    bls.b       60f | output end          | no? stop
+50: | long loop 1                         |
    move.l      (%a2)+, %d1               | handle trailing longwords
    mac.l       %d1, %d5, %acc0           | the same way as leading ones
    movclr.l    %acc0, %d1                |
@ -496,11 +581,10 @@ sample_output_mono:
    move.w      %d2, %d1                  |
    move.l      %d1, (%a3)+               |
    cmp.l       %a3, %a0                  |
-    bhi.b       .som_longloop_1           |
-.som_done:
+    bhi.b       50b | long loop 1         |
+60: | output end                          |
    movem.l     (%sp), %d1-%d5/%a2-%a3    | restore registers
    move.l      %d1, %macsr               |
    lea.l       28(%sp), %sp              | cleanup
    rts                                   |
-.som_end:
-    .size      sample_output_mono, .som_end-sample_output_mono
+    .size      sample_output_mono, .-sample_output_mono