Build librbcodec with DSP and metadata.

All associated files are moved to /lib/rbcodec. Change-Id: I572ddd2b8a996aae1e98c081d06b1ed356dce222
2025-11-12 22:52:28 -05:00 · 2011-06-24 01:25:21 -04:00 · 2011-06-24 01:25:21 -04:00 · b5716df4cb
commit b5716df4cb
parent 24bd9d5393
80 changed files with 97 additions and 112 deletions
--- a/lib/rbcodec/dsp/compressor.c
+++ b/lib/rbcodec/dsp/compressor.c
@ -0,0 +1,363 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2009 Jeffrey Goode
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+#include "fixedpoint.h"
+#include "fracmul.h"
+#include "settings.h"
+#include "dsp.h"
+#include "compressor.h"
+
+/* Define LOGF_ENABLE to enable logf output in this file */
+/*#define LOGF_ENABLE*/
+#include "logf.h"
+
+static int32_t comp_rel_slope IBSS_ATTR;   /* S7.24 format */
+static int32_t comp_makeup_gain IBSS_ATTR; /* S7.24 format */
+static int32_t comp_curve[66] IBSS_ATTR;   /* S7.24 format */
+static int32_t release_gain IBSS_ATTR;     /* S7.24 format */
+
+#define UNITY (1L << 24)                   /* unity gain in S7.24 format */
+
+/** COMPRESSOR UPDATE
+ *  Called via the menu system to configure the compressor process.
+ *  Reads the five compressor_* fields of global_settings and compares them
+ *  with the values remembered from the previous call.  If any changed and
+ *  the threshold is below 0 dB, comp_curve[], comp_makeup_gain and
+ *  comp_rel_slope are recomputed and release_gain is reset to unity.
+ *  Returns true when the compressor is active (threshold < 0). */
+bool compressor_update(void)
+{
+    static int curr_set[5];
+    int new_set[5] = {
+        global_settings.compressor_threshold,
+        global_settings.compressor_makeup_gain,
+        global_settings.compressor_ratio,
+        global_settings.compressor_knee,
+        global_settings.compressor_release_time};
+    
+    /* make menu values useful:
+       a ratio of 0 (last comp_ratios entry) selects a hard limit, and
+       release becomes a sample count at NATIVE_FREQUENCY (the setting is
+       presumably in milliseconds, given the division by 1000).
+       NOTE(review): new_set[2] indexes comp_ratios unchecked -- assumes the
+       settings layer keeps it within 0..4 */
+    int  threshold  =  new_set[0];
+    bool auto_gain  = (new_set[1] == 1);
+    const int comp_ratios[] = {2, 4, 6, 10, 0};
+    int  ratio      =  comp_ratios[new_set[2]];
+    bool soft_knee  = (new_set[3] == 1);
+    int  release    =  new_set[4] * NATIVE_FREQUENCY / 1000;
+
+    bool changed = false;
+    bool active  = (threshold < 0);
+
+    for (int i = 0; i < 5; i++) /* remember (and log) each changed setting */
+    {
+        if (curr_set[i] != new_set[i])
+        {
+            changed = true;
+            curr_set[i] = new_set[i];
+            
+#if defined(ROCKBOX_HAS_LOGF) && defined(LOGF_ENABLE)
+            switch (i)
+            {
+            case 0:
+                logf("   Compressor Threshold: %d dB\tEnabled: %s",
+                    threshold, active ? "Yes" : "No");
+                break;
+            case 1:
+                logf("   Compressor Makeup Gain: %s",
+                    auto_gain ? "Auto" : "Off");
+                break;
+            case 2:
+                if (ratio)
+                    { logf("   Compressor Ratio: %d:1", ratio); }
+                else
+                    { logf("   Compressor Ratio: Limit"); }
+                break;
+            case 3:
+                logf("   Compressor Knee: %s", soft_knee?"Soft":"Hard");
+                break;
+            case 4:
+                logf("   Compressor Release: %d", release);
+                break;
+            }
+#endif
+        }
+    }
+
+    if (changed && active)
+    {
+        /* configure variables for compressor operation */
+        static const int32_t db[] = {
+            /* positive db equivalents in S15.16 format:
+               db[i] = -20 * log10(i / 64), e.g. db[32] = 0x060546 = ~6.02;
+               db[0] is a placeholder that the loop below never reads */
+            0x000000, 0x241FA4, 0x1E1A5E, 0x1A94C8,
+            0x181518, 0x1624EA, 0x148F82, 0x1338BD,
+            0x120FD2, 0x1109EB, 0x101FA4, 0x0F4BB6,
+            0x0E8A3C, 0x0DD840, 0x0D3377, 0x0C9A0E,
+            0x0C0A8C, 0x0B83BE, 0x0B04A5, 0x0A8C6C,
+            0x0A1A5E, 0x09ADE1, 0x094670, 0x08E398,
+            0x0884F6, 0x082A30, 0x07D2FA, 0x077F0F,
+            0x072E31, 0x06E02A, 0x0694C8, 0x064BDF,
+            0x060546, 0x05C0DA, 0x057E78, 0x053E03,
+            0x04FF5F, 0x04C273, 0x048726, 0x044D64,
+            0x041518, 0x03DE30, 0x03A89B, 0x037448,
+            0x03412A, 0x030F32, 0x02DE52, 0x02AE80,
+            0x027FB0, 0x0251D6, 0x0224EA, 0x01F8E2,
+            0x01CDB4, 0x01A359, 0x0179C9, 0x0150FC,
+            0x0128EB, 0x010190, 0x00DAE4, 0x00B4E1,
+            0x008F82, 0x006AC1, 0x004699, 0x002305};
+        
+        struct curve_point
+        {
+            int32_t db;     /* S15.16 format */
+            int32_t offset; /* S15.16 format */
+        } db_curve[5];
+        
+        /** Set up the shape of the compression curve first as decibel
+            values */
+        /* db_curve[0] = bottom of knee
+                   [1] = threshold
+                   [2] = top of knee
+                   [3] = 0 db input
+                   [4] = ~+12db input (2 bits clipping overhead)
+           Only the fields assigned below are used by the calculations;
+           [0].offset, [1].offset and [3].db are filled in just for logging */
+        
+        db_curve[1].db = threshold << 16;
+        if (soft_knee)
+        {
+            /* bottom of knee is 3dB below the threshold for soft knee*/
+            db_curve[0].db = db_curve[1].db - (3 << 16);
+            /* top of knee is 3dB above the threshold for soft knee */
+            db_curve[2].db = db_curve[1].db + (3 << 16);
+            if (ratio)
+                /* offset = -3db * (ratio - 1) / ratio */
+                db_curve[2].offset = (int32_t)((long long)(-3 << 16)
+                    * (ratio - 1) / ratio);
+            else
+                /* offset = -3db for hard limit */
+                db_curve[2].offset = (-3 << 16);
+        }
+        else
+        {
+            /* bottom of knee is at the threshold for hard knee */
+            db_curve[0].db = threshold << 16;
+            /* top of knee is at the threshold for hard knee */
+            db_curve[2].db = threshold << 16;
+            db_curve[2].offset = 0;
+        }
+        
+        /* Calculate 0db and ~+12db offsets */
+        db_curve[4].db = 0xC0A8C; /* db of 2 bits clipping (~12.04) */
+        if (ratio)
+        {
+            /* offset = threshold * (ratio - 1) / ratio */
+            db_curve[3].offset = (int32_t)((long long)(threshold << 16)
+                * (ratio - 1) / ratio);
+            db_curve[4].offset = (int32_t)((long long)-db_curve[4].db
+                * (ratio - 1) / ratio) + db_curve[3].offset;
+        }
+        else
+        {
+            /* offset = threshold for hard limit */
+            db_curve[3].offset = (threshold << 16);
+            db_curve[4].offset = -db_curve[4].db + db_curve[3].offset;
+        }
+        
+        /** Now set up the comp_curve table with compression offsets in the
+            form of gain factors in S7.24 format.
+            fp_factor() is handed 16 fractional bits and its result is
+            shifted left by 8 -- presumably it returns a linear factor with
+            the same 16 fractional bits; confirm in fixedpoint.h */
+        /* comp_curve[0] is 0 (-infinity db) input */
+        comp_curve[0] = UNITY;
+        /* comp_curve[1 to 63] are intermediate compression values
+           corresponding to the 6 MSB of the input values of a non-clipped
+           signal */
+        for (int i = 1; i < 64; i++)
+        {
+            /* db constants are stored as positive numbers;
+               make them negative here */
+            int32_t this_db = -db[i];
+            
+            /* no compression below the knee */
+            if (this_db <= db_curve[0].db)
+                comp_curve[i] = UNITY;
+            
+            /* if soft knee and below top of knee,
+               interpolate along soft knee slope
+               (the knee spans 6 dB, hence the division by 6) */
+            else if (soft_knee && (this_db <= db_curve[2].db))
+                comp_curve[i] = fp_factor(fp_mul(
+                    ((this_db - db_curve[0].db) / 6),
+                    db_curve[2].offset, 16), 16) << 8;
+            
+            /* interpolate along ratio slope above the knee
+               (db_curve[1].db is non-zero here because active requires
+               threshold < 0) */
+            else
+                comp_curve[i] = fp_factor(fp_mul(
+                    fp_div((db_curve[1].db - this_db), db_curve[1].db, 16),
+                    db_curve[3].offset, 16), 16) << 8;
+        }
+        /* comp_curve[64] is the compression level of a maximum level,
+           non-clipped signal */
+        comp_curve[64] = fp_factor(db_curve[3].offset, 16) << 8;
+        
+        /* comp_curve[65] is the compression level of a maximum level,
+           clipped signal */
+        comp_curve[65] = fp_factor(db_curve[4].offset, 16) << 8;
+        
+#if defined(ROCKBOX_HAS_LOGF) && defined(LOGF_ENABLE)
+        logf("\n   *** Compression Offsets ***");
+        /* some settings for display only, not used in calculations */
+        db_curve[0].offset = 0;
+        db_curve[1].offset = 0;
+        db_curve[3].db = 0;
+        
+        for (int i = 0; i <= 4; i++)
+        {
+            logf("Curve[%d]: db: % 6.2f\toffset: % 6.2f", i,
+                (float)db_curve[i].db / (1 << 16),
+                (float)db_curve[i].offset / (1 << 16));
+        }
+        
+        logf("\nGain factors:");
+        for (int i = 1; i <= 65; i++)
+        {
+            debugf("%02d: %.6f  ", i, (float)comp_curve[i] / UNITY);
+            if (i % 4 == 0) debugf("\n");
+        }
+        debugf("\n");
+#endif
+        
+        /* if using auto peak, then makeup gain is max offset -
+           .1dB headroom (0x199A is ~0.1 in S15.16) */
+        comp_makeup_gain = auto_gain ?
+            fp_factor(-(db_curve[3].offset) - 0x199A, 16) << 8 : UNITY;
+        logf("Makeup gain:\t%.6f", (float)comp_makeup_gain / UNITY);
+
+        /* calculate per-sample gain change at a rate of 10db over the
+           release time: 0xAF0BB2 is ~0.6838 in S7.24, i.e. 1 - 10^(-10/20).
+           NOTE(review): divides by release -- assumes the release-time
+           setting can never produce 0 samples; confirm the settings range
+         */
+        comp_rel_slope = 0xAF0BB2 / release;
+        logf("Release slope:\t%.6f", (float)comp_rel_slope / UNITY);
+        
+        release_gain = UNITY;
+    }
+
+    return active;
+}
+
+/** GET COMPRESSION GAIN
+ *  Returns the required gain factor in S7.24 format in order to compress the
+ *  sample in accordance with the compression curve.  Always 1 or less.
+ *  data->frac_bits is the number of fractional bits in sample; the sample's
+ *  magnitude is rescaled to 15 fractional bits before comp_curve[] is used.
+ *  Returns -1 (invalid) if that magnitude is 1 << 17 or more, i.e. beyond
+ *  2 bits of clipping overhead.
+ */
+static inline int32_t get_compression_gain(struct dsp_data *data,
+                                           int32_t sample)
+{
+    const int frac_bits_offset = data->frac_bits - 15;
+    
+    /* sample must be positive; -(sample + 1) cannot overflow, even for the
+       most negative int32_t */
+    if (sample < 0)
+        sample = -(sample + 1);
+        
+    /* shift sample into 15 frac bit range */
+    if (frac_bits_offset > 0)
+        sample >>= frac_bits_offset;
+    if (frac_bits_offset < 0)
+        sample <<= -frac_bits_offset;
+    
+    /* normal case: sample isn't clipped */
+    if (sample < (1 << 15))
+    {
+        /* index is 6 MSB, rem is 9 LSB; rem is moved up by 22 bits so that
+           rem / (1 << 31) is the position between two curve points */
+        int index = sample >> 9;
+        int32_t rem = (sample & 0x1FF) << 22;
+        
+        /* interpolate from the compression curve:
+            higher gain - ((rem / (1 << 31)) * (higher gain - lower gain));
+            index is at most 63 here, so index + 1 reaches comp_curve[64] */
+        return comp_curve[index] - (FRACMUL(rem,
+            (comp_curve[index] - comp_curve[index + 1])));
+    }
+    /* sample is somewhat clipped, up to 2 bits of overhead */
+    if (sample < (1 << 17))
+    {
+        /* straight interpolation:
+            higher gain - ((clipped portion of sample * 4/3
+            / (1 << 31)) * (higher gain - lower gain)) */
+        return comp_curve[64] - (FRACMUL(((sample - (1 << 15)) / 3) << 16,
+            (comp_curve[64] - comp_curve[65])));
+    }
+    
+    /* sample is too clipped, return invalid value */
+    return -1;
+}
+
+/** COMPRESSOR PROCESS
+ *  Changes the gain of the samples according to the compressor curve.
+ *  count is the number of samples to process per channel, data supplies
+ *  num_channels and frac_bits, and buf[] holds the per-channel sample
+ *  pointers; the samples are modified in place.
+ *  NOTE(review): buf[1] is read and advanced even when num_channels is 1 --
+ *  assumes the caller always supplies a valid buf[1]; confirm.
+ */
+void compressor_process(int count, struct dsp_data *data, int32_t *buf[])
+{
+    const int num_chan = data->num_channels;
+    int32_t *in_buf[2] = {buf[0], buf[1]};
+    
+    while (count-- > 0)
+    {
+        int ch;
+        /* use lowest (most compressed) gain factor of the output buffer
+           sample pair for both samples (mono is also handled correctly here)
+         */
+        int32_t sample_gain = UNITY;
+        for (ch = 0; ch < num_chan; ch++)
+        {
+            int32_t this_gain = get_compression_gain(data, *in_buf[ch]);
+            if (this_gain < sample_gain)
+                sample_gain = this_gain;
+        }
+        
+        /* perform release slope; skip if no compression and no release slope
+         */
+        if ((sample_gain != UNITY) || (release_gain != UNITY))
+        {
+            /* if larger offset than previous slope, start new release slope
+             */
+            if ((sample_gain <= release_gain) && (sample_gain > 0))
+            {
+                release_gain = sample_gain;
+            }
+            else
+            /* keep sloping towards unity gain (and ignore invalid value):
+               the -1 returned for an over-clipped sample fails the
+               sample_gain > 0 test above and so lands here */
+            {
+                release_gain += comp_rel_slope;
+                if (release_gain > UNITY)
+                {
+                    release_gain = UNITY;
+                }
+            }
+        }
+        
+        /* total gain factor is the product of release gain and makeup gain,
+           but avoid computation if possible */
+        int32_t total_gain = ((release_gain == UNITY) ? comp_makeup_gain :
+            (comp_makeup_gain == UNITY) ? release_gain :
+                FRACMUL_SHL(release_gain, comp_makeup_gain, 7));
+        
+        /* Implement the compressor: apply total gain factor (if any) to the
+           output buffer sample pair/mono sample */
+        if (total_gain != UNITY)
+        {
+            for (ch = 0; ch < num_chan; ch++)
+            {
+                *in_buf[ch] = FRACMUL_SHL(total_gain, *in_buf[ch], 7);
+            }
+        }
+        in_buf[0]++;
+        in_buf[1]++;
+    }
+}
+
+void compressor_reset(void)
+{
+    release_gain = UNITY; /* unity: no release slope is in progress */
+}
--- a/lib/rbcodec/dsp/compressor.h
+++ b/lib/rbcodec/dsp/compressor.h
@ -0,0 +1,29 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2009 Jeffrey Goode
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#ifndef COMPRESSOR_H
+#define COMPRESSOR_H
+
+void compressor_process(int count, struct dsp_data *data, int32_t *buf[]);
+bool compressor_update(void);
+void compressor_reset(void);
+
+#endif /* COMPRESSOR_H */
--- a/lib/rbcodec/dsp/dsp.c
+++ b/lib/rbcodec/dsp/dsp.c
--- a/lib/rbcodec/dsp/dsp.h
+++ b/lib/rbcodec/dsp/dsp.h
@ -0,0 +1,125 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2005 Miika Pekkarinen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#ifndef _DSP_H
+#define _DSP_H
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+#define NATIVE_FREQUENCY       44100
+
+enum
+{
+    STEREO_INTERLEAVED = 0,
+    STEREO_NONINTERLEAVED,
+    STEREO_MONO,
+    STEREO_NUM_MODES,       /* count of the modes listed above */
+};
+
+enum
+{
+    CODEC_IDX_AUDIO = 0,
+    CODEC_IDX_VOICE,
+};
+
+enum
+{
+    DSP_MYDSP = 1,          /* numbering starts at 1, not 0 */
+    DSP_SET_FREQUENCY,
+    DSP_SWITCH_FREQUENCY,
+    DSP_SET_SAMPLE_DEPTH,
+    DSP_SET_STEREO_MODE,
+    DSP_RESET,
+    DSP_FLUSH,
+    DSP_SET_TRACK_GAIN,
+    DSP_SET_ALBUM_GAIN,
+    DSP_SET_TRACK_PEAK,
+    DSP_SET_ALBUM_PEAK,
+    DSP_CROSSFEED
+};
+
+
+/****************************************************************************
+ * NOTE: Any assembly routines that use these structures must be updated
+ * if current data members are moved or changed.
+ */
+struct resample_data
+{
+    uint32_t delta;                     /* 00h */
+    uint32_t phase;                     /* 04h */
+    int32_t last_sample[2];             /* 08h */
+                                        /* 10h - offset past the last member */
+};
+
+/* This is for passing needed data to external dsp routines. If another
+ * dsp parameter needs to be passed, add to the end of the structure
+ * and remove from dsp_config.
+ * If another function type becomes assembly/external and requires dsp
+ * config info, add a pointer parameter of type "struct dsp_data *".
+ * If removing something from other than the end, reserve the spot or
+ * else update every implementation for every target.
+ * Be sure to add the offset of the new member for easy viewing as well. :)
+ * It is the first member of dsp_config and all members can be accessed
+ * through the main aggregate but this is intended to make a safe haven
+ * for these items whereas the c part can be rearranged at will. dsp_data
+ * could even be moved within dsp_config without disturbing the order.
+ */
+struct dsp_data
+{
+    int output_scale;                   /* 00h - shift count read by the sample_output_* asm */
+    int num_channels;                   /* 04h - channel loop bound in compressor_process */
+    struct resample_data resample_data; /* 08h */
+    int32_t clip_min;                   /* 18h */
+    int32_t clip_max;                   /* 1ch */
+    int32_t gain;                       /* 20h - Note that this is in S8.23 format. */
+    int frac_bits;                      /* 24h - fractional bits per sample (see get_compression_gain) */
+                                        /* 28h - offset past the last member */
+};
+
+struct dsp_config;
+
+int dsp_process(struct dsp_config *dsp, char *dest,
+                const char *src[], int count);
+int dsp_input_count(struct dsp_config *dsp, int count);
+int dsp_output_count(struct dsp_config *dsp, int count);
+intptr_t dsp_configure(struct dsp_config *dsp, int setting,
+                       intptr_t value);
+int get_replaygain_mode(bool have_track_gain, bool have_album_gain);
+void dsp_set_replaygain(void);
+void dsp_set_crossfeed(bool enable);
+void dsp_set_crossfeed_direct_gain(int gain);
+void dsp_set_crossfeed_cross_params(long lf_gain, long hf_gain,
+                                    long cutoff);
+void dsp_set_eq(bool enable);
+void dsp_set_eq_precut(int precut);
+void dsp_set_eq_coefs(int band);
+void dsp_dither_enable(bool enable);
+void dsp_timestretch_enable(bool enable);
+bool dsp_timestretch_available(void);
+void sound_set_pitch(int32_t r);
+int32_t sound_get_pitch(void);
+void dsp_set_timestretch(int32_t percent);
+int32_t dsp_get_timestretch(void);
+int dsp_callback(int msg, intptr_t param);
+void dsp_set_compressor(void);
+
+#endif
--- a/lib/rbcodec/dsp/dsp_arm.S
+++ b/lib/rbcodec/dsp/dsp_arm.S
@ -0,0 +1,561 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006-2007 Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+ #include "config.h"
+
+/****************************************************************************
+ *  void channels_process_sound_chan_mono(int count, int32_t *buf[])
+ */
+
+#include "config.h"
+
+    .section .icode, "ax", %progbits
+    .align  2
+    .global channels_process_sound_chan_mono
+    .type   channels_process_sound_chan_mono, %function
+channels_process_sound_chan_mono:
+    @ input: r0 = count (0 is not special-cased; presumably >= 1), r1 = buf
+    stmfd   sp!, { r4, lr }            @ save r4 and the return address
+                                       @
+    ldmia   r1, { r1, r2 }             @ r1 = buf[0], r2 = buf[1]
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    beq     .mono_singlesample         @ Zero? Only one sample!
+                                       @
+.monoloop:                             @ two samples per iteration
+    ldmia   r1, { r3, r4 }             @ r3, r4 = Li0, Li1
+    ldmia   r2, { r12, r14 }           @ r12, r14 = Ri0, Ri1
+    mov     r3, r3, asr #1             @ Mo0 = Li0 / 2 + Ri0 / 2
+    mov     r4, r4, asr #1             @ Mo1 = Li1 / 2 + Ri1 / 2
+    add     r12, r3, r12, asr #1       @ r12 = Mo0
+    add     r14, r4, r14, asr #1       @ r14 = Mo1
+    subs    r0, r0, #2                 @ count -= 2; flags are used below
+    stmia   r1!, { r12, r14 }          @ store Mo0, Mo1
+    stmia   r2!, { r12, r14 }          @ store Mo0, Mo1
+    bgt     .monoloop                  @ at least two samples left
+                                       @
+    ldmpc   cond=lt, regs=r4           @ if count was even, we're done
+                                       @
+.mono_singlesample:                    @ one sample left (odd count)
+    ldr     r3, [r1]                   @ r3 = Ls
+    ldr     r12, [r2]                  @ r12 = Rs
+    mov     r3, r3, asr #1             @ Mo = Ls / 2 + Rs / 2
+    add     r12, r3, r12, asr #1       @ r12 = Mo
+    str     r12, [r1]                  @ store Mo
+    str     r12, [r2]                  @ store Mo
+                                       @
+    ldmpc   regs=r4                    @ return (ldmpc is a macro defined elsewhere)
+    .size   channels_process_sound_chan_mono, \
+                .-channels_process_sound_chan_mono
+
+/****************************************************************************
+ * void channels_process_sound_chan_custom(int count, int32_t *buf[])
+ */
+    .section .icode, "ax", %progbits
+    .align  2
+    .global channels_process_sound_chan_custom
+    .type   channels_process_sound_chan_custom, %function
+channels_process_sound_chan_custom:
+    stmfd   sp!, { r4-r10, lr }        @ input: r0 = count, r1 = buf
+    
+    ldr     r3, =dsp_sw_gain           @ addresses of the two externally
+    ldr     r4, =dsp_sw_cross          @ defined coefficients
+
+    ldmia   r1, { r1, r2 }             @ r1 = buf[0], r2 = buf[1]
+    ldr     r3, [r3]                   @ r3 = dsp_sw_gain
+    ldr     r4, [r4]                   @ r4 = dsp_sw_cross
+    
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    beq     .custom_single_sample      @ Zero? Only one sample!
+    
+.custom_loop:                          @ two samples per iteration
+    ldmia   r1, { r5, r6 }             @ r5 = Li0, r6 = Li1
+    ldmia   r2, { r7, r8 }             @ r7 = Ri0, r8 = Ri1
+
+    subs    r0, r0, #2                 @ flags are used by bgt/ldmpc below
+
+    smull   r9, r10, r5, r3            @ Lc0 = Li0*gain
+    smull   r12, r14, r7, r3           @ Rc0 = Ri0*gain
+    smlal   r9, r10, r7, r4            @ Lc0 += Ri0*cross
+    smlal   r12, r14, r5, r4           @ Rc0 += Li0*cross
+    
+    mov     r9, r9, lsr #31            @ Convert to s0.31
+    mov     r12, r12, lsr #31          @ (64-bit sum shifted right by 31)
+    orr     r5, r9, r10, asl #1        @ r5 = Lc0
+    orr     r7, r12, r14, asl #1       @ r7 = Rc0
+    
+    smull   r9, r10, r6, r3            @ Lc1 = Li1*gain
+    smull   r12, r14, r8, r3           @ Rc1 = Ri1*gain
+    smlal   r9, r10, r8, r4            @ Lc1 += Ri1*cross
+    smlal   r12, r14, r6, r4           @ Rc1 += Li1*cross
+    
+    mov     r9, r9, lsr #31            @ Convert to s0.31
+    mov     r12, r12, lsr #31
+    orr     r6, r9, r10, asl #1        @ r6 = Lc1
+    orr     r8, r12, r14, asl #1       @ r8 = Rc1
+    
+    stmia   r1!, { r5, r6 }            @ Store Lc0, Lc1
+    stmia   r2!, { r7, r8 }            @ Store Rc0, Rc1
+
+    bgt     .custom_loop               @ at least two samples left
+    
+    ldmpc   cond=lt, regs=r4-r10       @ < 0? even count
+    
+.custom_single_sample:
+    ldr     r5, [r1]                   @ handle odd sample
+    ldr     r7, [r2]
+
+    smull   r9, r10, r5, r3            @ Lc0 = Li0*gain
+    smull   r12, r14, r7, r3           @ Rc0 = Ri0*gain
+    smlal   r9, r10, r7, r4            @ Lc0 += Ri0*cross
+    smlal   r12, r14, r5, r4           @ Rc0 += Li0*cross
+
+    mov     r9, r9, lsr #31            @ Convert to s0.31
+    mov     r12, r12, lsr #31
+    orr     r5, r9, r10, asl #1
+    orr     r7, r12, r14, asl #1
+
+    str     r5, [r1]                   @ Store Lc0
+    str     r7, [r2]                   @ Store Rc0
+
+    ldmpc   regs=r4-r10                @ return (ldmpc is a macro defined elsewhere)
+    .size   channels_process_sound_chan_custom, \
+                .-channels_process_sound_chan_custom
+
+/****************************************************************************
+ *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
+ */
+    .section .icode, "ax", %progbits
+    .align  2
+    .global channels_process_sound_chan_karaoke
+    .type   channels_process_sound_chan_karaoke, %function
+channels_process_sound_chan_karaoke:
+    @ input: r0 = count (0 is not special-cased; presumably >= 1), r1 = buf
+    stmfd   sp!, { r4, lr }            @ save r4 and the return address
+                                       @
+    ldmia   r1, { r1, r2 }             @ r1 = buf[0], r2 = buf[1]
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    beq     .karaoke_singlesample      @ Zero? Only one sample!
+                                       @
+.karaokeloop:                          @ two samples per iteration
+    ldmia   r1, { r3, r4 }             @ r3, r4  = Li0, Li1
+    ldmia   r2, { r12, r14 }           @ r12, r14 = Ri0, Ri1
+    mov     r3, r3, asr #1             @ Lo0 = Li0 / 2 - Ri0 / 2
+    mov     r4, r4, asr #1             @ Lo1 = Li1 / 2 - Ri1 / 2
+    sub     r3, r3, r12, asr #1        @ r3 = Lo0
+    sub     r4, r4, r14, asr #1        @ r4 = Lo1
+    rsb     r12, r3, #0                @ Ro0 = -Lo0 = Ri0 / 2 - Li0 / 2
+    rsb     r14, r4, #0                @ Ro1 = -Lo1 = Ri1 / 2 - Li1 / 2
+    subs    r0, r0, #2                 @ count -= 2; flags are used below
+    stmia   r1!, { r3, r4 }            @ store Lo0, Lo1
+    stmia   r2!, { r12, r14 }          @ store Ro0, Ro1
+    bgt     .karaokeloop               @ at least two samples left
+                                       @
+    ldmpc   cond=lt, regs=r4           @ if count was even, we're done
+                                       @
+.karaoke_singlesample:                 @ one sample left (odd count)
+    ldr     r3, [r1]                   @ r3 = Li
+    ldr     r12, [r2]                  @ r12 = Ri
+    mov     r3, r3, asr #1             @ Lo = Li / 2 - Ri / 2
+    sub     r3, r3, r12, asr #1        @ r3 = Lo
+    rsb     r12, r3, #0                @ Ro = -Lo = Ri / 2 - Li / 2
+    str     r3, [r1]                   @ store Lo
+    str     r12, [r2]                  @ store Ro
+                                       @
+    ldmpc   regs=r4                    @ return (ldmpc is a macro defined elsewhere)
+    .size   channels_process_sound_chan_karaoke, \
+                .-channels_process_sound_chan_karaoke
+
+#if ARM_ARCH < 6
+/****************************************************************************
+ *  void sample_output_mono(int count, struct dsp_data *data,
+ *                          const int32_t *src[], int16_t *dst)
+ */
+    .section .icode, "ax", %progbits
+    .align  2
+    .global sample_output_mono
+    .type   sample_output_mono, %function
+sample_output_mono:
+    @ input: r0 = count, r1 = data, r2 = src, r3 = dst
+    stmfd   sp!, { r4-r6, lr }
+
+    ldr     r1, [r1]                   @ r1 = data->output_scale
+    ldr     r2, [r2]                   @ r2 = src[0]
+
+    mov     r4, #1
+    mov     r4, r4, lsl r1             @ r4 = 1 << (scale-1)
+    mov     r4, r4, lsr #1             @ (rounding term added before the shift)
+    mvn     r14, #0x8000               @ r14 = 0xffff7fff, needed for
+                                       @ clipping and masking
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    beq     .som_singlesample          @ Zero? Only one sample!
+
+.somloop:
+    ldmia   r2!, { r5, r6 }            @ two input samples per iteration
+    add     r5, r5, r4                 @ r5 = (r5 + 1<<(scale-1)) >> scale
+    mov     r5, r5, asr r1
+    mov     r12, r5, asr #15           @ ne below when r5 does not fit in
+    teq     r12, r12, asr #31          @ a signed 16-bit value
+    eorne   r5, r14, r5, asr #31       @ Clip (-32768...+32767)
+    add     r6, r6, r4
+    mov     r6, r6, asr r1             @ r6 = (r6 + 1<<(scale-1)) >> scale
+    mov     r12, r6, asr #15
+    teq     r12, r12, asr #31
+    eorne   r6, r14, r6, asr #31       @ Clip (-32768...+32767)
+    
+    and     r5, r5, r14, lsr #16       @ keep the low 16 bits (mask 0xffff)
+    and     r6, r6, r14, lsr #16
+    orr     r5, r5, r5, lsl #16        @ first sample into both halfwords
+    orr     r6, r6, r6, lsl #16        @ second sample into both halfwords
+    stmia   r3!, { r5, r6 }
+    
+    subs    r0, r0, #2
+    bgt     .somloop     
+       
+    ldmpc   cond=lt, regs=r4-r6        @ even 'count'? return
+
+.som_singlesample:
+    ldr     r5, [r2]                   @ do odd sample
+    add     r5, r5, r4
+    mov     r5, r5, asr r1
+    mov     r12, r5, asr #15
+    teq     r12, r12, asr #31
+    eorne   r5, r14, r5, asr #31
+
+    and     r5, r5, r14, lsr #16       @ pack 2 halfwords into 1 word
+    orr     r5, r5, r5, lsl #16
+    str     r5, [r3]
+
+    ldmpc   regs=r4-r6
+    .size   sample_output_mono, .-sample_output_mono
+    
+/****************************************************************************
+ * void sample_output_stereo(int count, struct dsp_data *data,
+ *                           const int32_t *src[], int16_t *dst)
+ */
+    .section .icode, "ax", %progbits
+    .align  2
+    .global sample_output_stereo
+    .type   sample_output_stereo, %function
+sample_output_stereo:
+    @ input: r0 = count, r1 = data, r2 = src, r3 = dst
+    stmfd   sp!, { r4-r9, lr }
+
+    ldr     r1, [r1]                   @ r1 = data->output_scale
+    ldmia   r2, { r2, r5 }             @ r2 = src[0], r5 = src[1]
+
+    mov     r4, #1
+    mov     r4, r4, lsl r1             @ r4 = 1 << (scale-1)
+    mov     r4, r4, lsr #1             @ (rounding term added before the shift)
+    
+    mvn     r14, #0x8000               @ r14 = 0xffff7fff, needed for
+                                       @ clipping and masking
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    beq     .sos_singlesample          @ Zero? Only one sample!
+
+.sosloop:
+    ldmia   r2!, { r6, r7 }            @ 2 left
+    ldmia   r5!, { r8, r9 }            @ 2 right
+
+    add     r6, r6, r4                 @ r6 = (r6 + 1<<(scale-1)) >> scale
+    mov     r6, r6, asr r1
+    mov     r12, r6, asr #15           @ ne below when r6 does not fit in
+    teq     r12, r12, asr #31          @ a signed 16-bit value
+    eorne   r6, r14, r6, asr #31       @ Clip (-32768...+32767)
+    add     r7, r7, r4
+    mov     r7, r7, asr r1             @ r7 = (r7 + 1<<(scale-1)) >> scale
+    mov     r12, r7, asr #15
+    teq     r12, r12, asr #31
+    eorne   r7, r14, r7, asr #31       @ Clip (-32768...+32767)
+    
+    add     r8, r8, r4                 @ r8 = (r8 + 1<<(scale-1)) >> scale
+    mov     r8, r8, asr r1
+    mov     r12, r8, asr #15
+    teq     r12, r12, asr #31
+    eorne   r8, r14, r8, asr #31       @ Clip (-32768...+32767)
+    add     r9, r9, r4                 @ r9 = (r9 + 1<<(scale-1)) >> scale
+    mov     r9, r9, asr r1
+    mov     r12, r9, asr #15
+    teq     r12, r12, asr #31
+    eorne   r9, r14, r9, asr #31       @ Clip (-32768...+32767)
+    
+    and     r6, r6, r14, lsr #16       @ pack first 2 halfwords into 1 word
+    orr     r8, r6, r8, asl #16        @ left in low half, right in high half
+    and     r7, r7, r14, lsr #16       @ pack last 2 halfwords into 1 word
+    orr     r9, r7, r9, asl #16
+
+    stmia   r3!, { r8, r9 }
+
+    subs    r0, r0, #2
+    bgt     .sosloop
+
+    ldmpc   cond=lt, regs=r4-r9        @ even 'count'? return
+
+.sos_singlesample:    
+    ldr     r6, [r2]                   @ left odd sample
+    ldr     r8, [r5]                   @ right odd sample
+
+    add     r6, r6, r4                 @ r6 = (r6 + 1<<(scale-1)) >> scale
+    mov     r6, r6, asr r1
+    mov     r12, r6, asr #15
+    teq     r12, r12, asr #31
+    eorne   r6, r14, r6, asr #31       @ Clip (-32768...+32767)
+    add     r8, r8, r4                 @ r8 = (r8 + 1<<(scale-1)) >> scale
+    mov     r8, r8, asr r1
+    mov     r12, r8, asr #15
+    teq     r12, r12, asr #31
+    eorne   r8, r14, r8, asr #31       @ Clip (-32768...+32767)
+    
+    and     r6, r6, r14, lsr #16       @ pack 2 halfwords into 1 word
+    orr     r8, r6, r8, asl #16
+
+    str     r8, [r3]
+
+    ldmpc   regs=r4-r9
+    .size   sample_output_stereo, .-sample_output_stereo
+#endif /* ARM_ARCH < 6 */    
+
+/****************************************************************************
+ * void apply_crossfeed(int count, int32_t* src[])
+ *
+ * Processes count stereo samples in place in src[0] (left) and src[1]
+ * (right). Each output sample is gain*x[n] plus the opposite channel's
+ * delayed input passed through a first-order filter (coefs b0, b1, a1).
+ *
+ * crossfeed_data as accessed by this routine (byte offsets):
+ *     0: direct gain      4: b0      8: b1      12: a1
+ *    16: four filter history words (kept in r8-r11)
+ *    32: delay line, 13*4*2 = 104 bytes of left/right word pairs
+ *   136: current delay line pointer
+ *
+ * NOTE(review): the loop only exits when the count reaches exactly zero
+ * after a decrement, so callers presumably always pass count >= 1; confirm.
+ * NOTE(review): ldmpc is a macro defined outside this section; presumably
+ * it pops the listed registers together with pc (i.e. returns); confirm.
+ */
+    .section .text
+    .global apply_crossfeed 
+apply_crossfeed:
+    @ unfortunately, we ended up in a bit of a register squeeze here, and need
+    @ to keep the count on the stack :/
+    stmdb   sp!, { r4-r11, lr }        @ stack modified regs
+    ldmia   r1, { r2-r3 }              @ r2 = src[0], r3 = src[1]
+    
+    ldr     r1, =crossfeed_data
+    ldmia   r1!, { r4-r11 }            @ load direct gain and filter data
+                                       @ r1 now points at the delay line
+    mov     r12, r0                    @ better to ldm delay + count later
+    add     r0, r1, #13*4*2            @ calculate end of delay
+    stmdb   sp!, { r0, r12 }           @ stack end of delay adr and count
+                                       @ [sp] = end address, [sp, #4] = count
+    ldr     r0, [r1, #13*4*2]          @ fetch current delay line address
+
+    /* Register usage in loop:
+     * r0 = &delay[index][0], r1 = accumulator high, r2 = src[0], r3 = src[1],
+     * r4 = direct gain, r5-r7 = b0, b1, a1 (filter coefs),
+     * r8-r11 = filter history, r12 = temp, r14 = accumulator low
+     */
+.cfloop:
+    smull   r14, r1, r6, r8            @ acc = b1*dr[n - 1]
+    smlal   r14, r1, r7, r9            @ acc += a1*y_l[n - 1]
+    ldr     r8, [r0, #4]               @ r8 = dr[n]
+    smlal   r14, r1, r5, r8            @ acc += b0*dr[n]
+    mov     r9, r1, lsl #1             @ fix format for filter history
+    ldr     r12, [r2]                  @ load left input
+    smlal   r14, r1, r4, r12           @ acc += gain*x_l[n]
+    mov     r1, r1, lsl #1             @ fix format
+    str     r1, [r2], #4               @ save result
+
+    smull   r14, r1, r6, r10           @ acc = b1*dl[n - 1]
+    smlal   r14, r1, r7, r11           @ acc += a1*y_r[n - 1]
+    ldr     r10, [r0]                  @ r10 = dl[n]
+    str     r12, [r0], #4              @ save left input to delay line
+    smlal   r14, r1, r5, r10           @ acc += b0*dl[n]
+    mov     r11, r1, lsl #1            @ fix format for filter history
+    ldr     r12, [r3]                  @ load right input
+    smlal   r14, r1, r4, r12           @ acc += gain*x_r[n]
+    str     r12, [r0], #4              @ save right input to delay line
+    mov     r1, r1, lsl #1             @ fix format
+    ldmia   sp, { r12, r14 }           @ fetch delay line end addr and count from stack
+    str     r1, [r3], #4               @ save result
+
+    cmp     r0, r12                    @ need to wrap to start of delay?
+    subeq   r0, r0, #13*4*2            @ wrap back delay line ptr to start
+ 
+    subs    r14, r14, #1               @ are we finished?
+    strne   r14, [sp, #4]              @ nope, save count back to stack
+    bne     .cfloop
+    
+    @ save data back to struct
+    ldr     r12, =crossfeed_data + 4*4
+    stmia   r12, { r8-r11 }            @ save filter history
+    str     r0, [r12, #30*4]           @ save delay line index
+                                       @ (4*4 + 30*4 = byte offset 136)
+    add     sp, sp, #8                 @ remove temp variables from stack
+    ldmpc   regs=r4-r11
+    .size   apply_crossfeed, .-apply_crossfeed
+
+/****************************************************************************
+ * int dsp_downsample(int count, struct dsp_data *data,
+ *                    int32_t *src[], int32_t *dst[])
+ *
+ * Linear-interpolating resampler. For every channel, the position
+ * pos = phase >> 16 is advanced by delta per output sample and
+ *     s[pos - 1] + frac*(s[pos] - s[pos - 1]),  frac = phase & 0xffff
+ * is written to dst until pos >= count. For pos == 0, s[-1] is the
+ * last_sample value saved by the previous call.
+ *
+ * Fields of *data as accessed here (byte offsets): 4 = num_channels,
+ * 8 = resample_data.delta, 12 = resample_data.phase, 16 = last_sample[].
+ *
+ * Channels are processed from the highest index down to 0. Returns the
+ * number of samples written to dst[0]; phase is stored back reduced by
+ * count << 16.
+ *
+ * NOTE(review): ldmpc is a macro defined outside this section; presumably
+ * it pops the listed registers together with pc (i.e. returns); confirm.
+ */
+    .section    .text
+    .global     dsp_downsample
+dsp_downsample:
+    stmdb   sp!, { r4-r11, lr }     @ stack modified regs
+    ldmib   r1, { r5-r6 }           @ r5 = num_channels,r6 = resample_data.delta
+    sub     r5, r5, #1              @ pre-decrement num_channels for use
+    add     r4, r1, #12             @ r4 = &resample_data.phase
+    mov     r12, #0xff
+    orr     r12, r12, #0xff00       @ r12 = 0xffff
+.dschannel_loop:
+    ldr     r1, [r4]                @ r1 = resample_data.phase
+    ldr     r7, [r2, r5, lsl #2]    @ r7 = s = src[ch - 1]
+    ldr     r8, [r3, r5, lsl #2]    @ r8 = d = dst[ch - 1]
+    add     r9, r4, #4              @ r9 = &last_sample[0]
+    ldr     r10, [r9, r5, lsl #2]   @ r10 = last_sample[ch - 1]
+    sub     r11, r0, #1             @ r11 = count - 1
+    ldr     r14, [r7, r11, lsl #2]  @ load last sample in s[] ...
+    str     r14, [r9, r5, lsl #2]   @ and write as next frame's last_sample
+    movs    r9, r1, lsr #16         @ r9 = pos = phase >> 16
+    ldreq   r11, [r7]               @ if pos = 0, load src[0] and jump into loop
+    beq     .dsuse_last_start       @ (r10 still holds last_sample)
+    cmp     r9, r0                  @ if pos >= count, we're already done
+    bge     .dsloop_skip
+
+    @ Register usage in loop:
+    @ r0 = count, r1 = phase, r4 = &resample_data.phase, r5 = cur_channel,
+    @ r6 = delta, r7 = s, r8 = d, r9 = pos, r10 = s[pos - 1], r11 = s[pos]
+.dsloop:
+    add     r9, r7, r9, lsl #2      @ r9 = &s[pos]
+    ldmda   r9, { r10, r11 }        @ r10 = s[pos - 1], r11 = s[pos]
+.dsuse_last_start:
+    sub     r11, r11, r10           @ r11 = diff = s[pos] - s[pos - 1]
+    @ keep frac in lower bits to take advantage of multiplier early termination
+    and     r9, r1, r12             @ frac = phase & 0xffff
+    smull   r9, r14, r11, r9        @ r14:r9 = diff*frac (64-bit product)
+    add     r1, r1, r6              @ phase += delta
+    add     r10, r10, r9, lsr #16   @ r10 = out = s[pos - 1] + frac*diff
+    add     r10, r10, r14, lsl #16  @ (product >> 16, from both halves)
+    str     r10, [r8], #4           @ *d++ = out
+    mov     r9, r1, lsr #16         @ pos = phase >> 16
+    cmp     r9, r0                  @ pos < count?
+    blt     .dsloop                 @ yup, do more samples
+.dsloop_skip:
+    subs    r5, r5, #1
+    bpl     .dschannel_loop         @ if (--ch) >= 0, do another channel
+    sub     r1, r1, r0, lsl #16     @ wrap phase back to start
+    str     r1, [r4]                @ store back
+    ldr     r1, [r3]                @ r1 = dst[0]
+    sub     r8, r8, r1              @ d - dst[0] (bytes written to channel 0)
+    mov     r0, r8, lsr #2          @ convert bytes->samples
+    ldmpc   regs=r4-r11             @ ... and we're out
+    .size   dsp_downsample, .-dsp_downsample
+
+/****************************************************************************
+ * int dsp_upsample(int count, struct dsp_data *dsp,
+ *                  int32_t *src[], int32_t *dst[])
+ *
+ * Linear-interpolating resampler for output rates above the input rate.
+ * The fractional phase and delta are kept in the upper halfword of a
+ * register so that the carry flag of the addition signals a step to the
+ * next source sample. Each output is s[pos - 1] + frac*(s[pos] - s[pos - 1]).
+ * For pos == 0, s[-1] is the last_sample value saved by the previous call.
+ *
+ * Fields of *dsp as accessed here (byte offsets): 4 = num_channels,
+ * 8 = resample_data.delta, 12 = resample_data.phase, 16 = last_sample[].
+ *
+ * Channels are processed from the highest index down to 0. Returns the
+ * number of samples written to dst[0]; only the 16-bit fraction of the
+ * phase is stored back.
+ *
+ * NOTE(review): when the starting pos is nonzero the source pointer is
+ * advanced by pos but the sample counter in r0 is not reduced, so the inner
+ * loop would consume count further samples. Presumably pos is always 0 on
+ * entry because only the fraction is stored back; confirm.
+ * NOTE(review): ldmpc is a macro defined outside this section; presumably
+ * it pops the listed registers together with pc (i.e. returns); confirm.
+ */
+    .section    .text
+    .global     dsp_upsample
+dsp_upsample:
+    stmfd   sp!, { r4-r11, lr }     @ stack modified regs
+    ldmib   r1, { r5-r6 }           @ r5 = num_channels,r6 = resample_data.delta
+    sub     r5, r5, #1              @ pre-decrement num_channels for use
+    add     r4, r1, #12             @ r4 = &resample_data.phase
+    mov     r6, r6, lsl #16         @ we'll use carry to detect pos increments
+    stmfd   sp!, { r0, r4 }         @ stack count and &resample_data.phase
+                                    @ (both registers are reused in the loop)
+.uschannel_loop:
+    ldr     r12, [r4]               @ r12 = resample_data.phase
+    ldr     r7, [r2, r5, lsl #2]    @ r7 = s = src[ch - 1]
+    ldr     r8, [r3, r5, lsl #2]    @ r8 = d = dst[ch - 1]
+    add     r9, r4, #4              @ r9 = &last_sample[0]
+    mov     r1, r12, lsl #16        @ we'll use carry to detect pos increments
+    sub     r11, r0, #1             @ r11 = count - 1
+    ldr     r14, [r7, r11, lsl #2]  @ load last sample in s[] ...
+    ldr     r10, [r9, r5, lsl #2]   @ r10 = last_sample[ch - 1]
+    str     r14, [r9, r5, lsl #2]   @ and write as next frame's last_sample
+    movs    r14, r12, lsr #16       @ pos = resample_data.phase >> 16
+    beq     .usstart_0              @ pos = 0
+    cmp     r14, r0                 @ if pos >= count, we're already done
+    bge     .usloop_skip
+    add     r7, r7, r14, lsl #2     @ r7 = &s[pos]
+    ldr     r10, [r7, #-4]          @ r10 = s[pos - 1]
+    b       .usstart_0
+
+    @ Register usage in loop:
+    @ r0 = samples left, r1 = phase << 16, r4 = frac (&resample_data.phase is
+    @ kept on the stack), r5 = cur_channel, r6 = delta << 16, r7 = s, r8 = d,
+    @ r9 = diff, r10 = s[pos - 1], r11 = s[pos]
+.usloop_1:
+    mov     r10, r11                @ r10 = previous sample
+.usstart_0:
+    ldr     r11, [r7], #4           @ r11 = next sample
+    mov     r4, r1, lsr #16         @ r4 = frac = phase >> 16
+    sub     r9, r11, r10            @ r9 = diff = s[pos] - s[pos - 1]
+.usloop_0:
+    smull   r12, r14, r4, r9        @ r14:r12 = frac*diff (64-bit product)
+    adds    r1, r1, r6              @ phase += delta << 16
+    mov     r4, r1, lsr #16         @ r4 = frac = phase >> 16
+    add     r14, r10, r14, lsl #16
+    add     r14, r14, r12, lsr #16  @ r14 = out = s[pos - 1] + frac*diff
+    str     r14, [r8], #4           @ *d++ = out
+    bcc     .usloop_0               @ if carry is set, pos is incremented
+    subs    r0, r0, #1              @ if count > 0, do another sample
+    bgt     .usloop_1
+.usloop_skip:
+    subs    r5, r5, #1
+    ldmfd   sp, { r0, r4 }          @ reload count and &resample_data.phase
+    bpl     .uschannel_loop         @ if (--ch) >= 0, do another channel
+    mov     r1, r1, lsr #16         @ wrap phase back to start of next frame
+    ldr     r2, [r3]                @ r2 = dst[0]
+    str     r1, [r4]                @ store phase
+    sub     r8, r8, r2              @ d - dst[0] (bytes written to channel 0)
+    mov     r0, r8, lsr #2          @ convert bytes->samples
+    add     sp, sp, #8              @ adjust stack for temp variables
+    ldmpc   regs=r4-r11             @ ... and we're out
+    .size       dsp_upsample, .-dsp_upsample
+
+/****************************************************************************
+ *  void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
+ *
+ * Multiplies, in place, the first count samples of each of the
+ * data->num_channels buffers in buf[] by data->gain. Each result is the
+ * 64-bit product shifted right by 23 bits (low 32 bits kept).
+ *
+ * Fields of *data as accessed here (byte offsets): 4 = num_channels,
+ * 32 = gain.
+ *
+ * The inner loop handles two samples per iteration; a trailing odd sample
+ * is handled at .dag_singlesample.
+ *
+ * NOTE(review): count == 0 is not special-cased - the inner loop would
+ * still run once. Presumably callers always pass count >= 1; confirm.
+ * NOTE(review): ldmpc is a macro defined outside this section; presumably
+ * it pops the listed registers together with pc (i.e. returns); confirm.
+ */
+    .section .icode, "ax", %progbits
+    .align  2
+    .global dsp_apply_gain
+    .type   dsp_apply_gain, %function
+dsp_apply_gain:
+    @ input: r0 = count, r1 = data, r2 = buf[]
+    stmfd   sp!, { r4-r8, lr }
+
+    ldr     r3, [r1,  #4]           @ r3 = data->num_channels
+    ldr     r4, [r1, #32]           @ r4 = data->gain
+
+.dag_outerloop:
+    ldr     r1, [r2], #4            @ r1 = next channel buffer, advance buf[]
+    subs    r12, r0, #1             @ r12 = count - 1
+    beq     .dag_singlesample       @ Zero? Only one sample!
+
+.dag_innerloop:
+    ldmia   r1, { r5, r6 }          @ load r5, r6 from r1
+    smull   r7, r8, r5, r4          @ r7 = FRACMUL_SHL(r5, r4, 8)
+    smull   r14, r5, r6, r4         @ r14 = FRACMUL_SHL(r6, r4, 8)
+    subs    r12, r12, #2            @ two samples done; flags used below
+    mov     r7, r7, lsr #23         @ assemble (product >> 23) from the
+    mov     r14, r14, lsr #23       @ low and high words of each product
+    orr     r7, r7, r8, asl #9
+    orr     r14, r14, r5, asl #9
+    stmia   r1!, { r7, r14 }        @ save r7, r14 to [r1] and increment r1
+    bgt     .dag_innerloop          @ end of inner loop
+
+    blt     .dag_evencount          @ < 0? even count
+
+.dag_singlesample:
+    ldr     r5, [r1]                @ handle odd sample
+    smull   r7, r8, r5, r4          @ r7 = FRACMUL_SHL(r5, r4, 8)
+    mov     r7, r7, lsr #23
+    orr     r7, r7, r8, asl #9
+    str     r7, [r1]
+
+.dag_evencount:
+    subs    r3, r3, #1              @ one channel done
+    bgt     .dag_outerloop          @ end of outer loop
+               
+    ldmpc   regs=r4-r8
+    .size   dsp_apply_gain, .-dsp_apply_gain
--- a/lib/rbcodec/dsp/dsp_arm_v6.S
+++ b/lib/rbcodec/dsp/dsp_arm_v6.S
@ -0,0 +1,127 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2010 Michael Sevakis
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ *  void sample_output_mono(int count, struct dsp_data *data,
+ *                          const int32_t *src[], int16_t *dst)
+ *
+ * ARMv6 version. Each 32-bit sample of src[0] is rounded (saturating add
+ * of 1 << (scale - 1)), shifted right by data->output_scale and saturated
+ * to 16 signed bits. The result is duplicated into both halfwords of one
+ * 32-bit output word, so every input sample produces one stereo frame.
+ *
+ * The main loop converts two samples per iteration; a trailing odd sample
+ * is handled at label 2.
+ *
+ * NOTE(review): count == 0 is not special-cased - the main loop would still
+ * run once. Presumably callers always pass count >= 1; confirm.
+ */
+    .section .text, "ax", %progbits
+    .align  2
+    .global sample_output_mono
+    .type   sample_output_mono, %function
+sample_output_mono:
+    @ input: r0 = count, r1 = data, r2 = src, r3 = dst
+    stmfd   sp!, { r4, lr }            @
+                                       @
+    ldr     r1, [r1]                   @ r1 = data->output_scale
+    ldr     r2, [r2]                   @ r2 = src[0]
+                                       @
+    mov     r4, #1                     @ r4 = 1 << (scale - 1)
+    mov     r4, r4, lsl r1             @
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    mov     r4, r4, lsr #1             @
+    beq     2f                         @ Zero? Only one sample!
+                                       @
+1:                                     @
+    ldmia   r2!, { r12, r14 }          @ load Mi0, Mi1
+    qadd    r12, r12, r4               @ round, scale, saturate and
+    qadd    r14, r14, r4               @ pack Mi0 to So0, Mi1 to So1
+    mov     r12, r12, asr r1           @
+    mov     r14, r14, asr r1           @
+    ssat    r12, #16, r12              @
+    ssat    r14, #16, r14              @
+    pkhbt   r12, r12, r12, asl #16     @ same sample in both halfwords
+    pkhbt   r14, r14, r14, asl #16     @
+    subs    r0, r0, #2                 @
+    stmia   r3!, { r12, r14 }          @ store So0, So1
+    bgt     1b                         @
+                                       @
+    ldmltfd sp!, { r4, pc }            @ if count was even, we're done
+                                       @
+2:                                     @
+    ldr     r12, [r2]                  @ round, scale, saturate
+    qadd    r12, r12, r4               @ and pack Mi to So
+    mov     r12, r12, asr r1           @
+    ssat    r12, #16, r12              @
+    pkhbt   r12, r12, r12, asl #16     @
+    str     r12, [r3]                  @ store So
+                                       @
+    ldmfd   sp!, { r4, pc }            @
+    .size   sample_output_mono, .-sample_output_mono
+
+/****************************************************************************
+ * void sample_output_stereo(int count, struct dsp_data *data,
+ *                           const int32_t *src[], int16_t *dst)
+ *
+ * ARMv6 version. Each 32-bit sample of src[0] (left) and src[1] (right) is
+ * rounded (saturating add of 1 << (scale - 1)), shifted right by
+ * data->output_scale and saturated to 16 signed bits. One 32-bit word is
+ * stored to dst per stereo frame, with the left sample in bits 0-15 and the
+ * right sample in bits 16-31.
+ *
+ * The main loop converts two frames per iteration; a trailing odd frame is
+ * handled at label 2.
+ *
+ * NOTE(review): count == 0 is not special-cased - the main loop would still
+ * run once. Presumably callers always pass count >= 1; confirm.
+ */
+    .section .text, "ax", %progbits
+    .align  2
+    .global sample_output_stereo
+    .type   sample_output_stereo, %function
+sample_output_stereo:
+    @ input: r0 = count, r1 = data, r2 = src, r3 = dst
+    stmfd   sp!, { r4-r7, lr }         @
+                                       @
+    ldr     r1, [r1]                   @ r1 = data->output_scale
+    ldmia   r2, { r2, r4 }             @ r2 = src[0], r4 = src[1]
+                                       @
+    mov     r5, #1                     @ r5 = 1 << (scale - 1)
+    mov     r5, r5, lsl r1             @
+    subs    r0, r0, #1                 @ odd: end at 0; even: end at -1
+    mov     r5, r5, lsr #1             @
+    beq     2f                         @ Zero? Only one sample!
+                                       @
+1:                                     @
+    ldmia   r2!, { r6, r7 }            @ r6, r7 = Li0, Li1
+    ldmia   r4!, { r12, r14 }          @ r12, r14 = Ri0, Ri1
+    qadd    r6, r6, r5                 @ round, scale, saturate and pack
+    qadd    r7, r7, r5                 @ Li0 and Ri0 into So0,
+    qadd    r12, r12, r5               @ Li1 and Ri1 into So1
+    qadd    r14, r14, r5               @
+    mov     r6, r6, asr r1             @
+    mov     r7, r7, asr r1             @
+    mov     r12, r12, asr r1           @
+    mov     r14, r14, asr r1           @
+    ssat    r6, #16, r6                @
+    ssat    r12, #16, r12              @
+    ssat    r7, #16, r7                @
+    ssat    r14, #16, r14              @
+    pkhbt   r6, r6, r12, asl #16       @ r6 = Li0 | (Ri0 << 16)
+    pkhbt   r7, r7, r14, asl #16       @ r7 = Li1 | (Ri1 << 16)
+    subs    r0, r0, #2                 @
+    stmia   r3!, { r6, r7 }            @ store So0, So1
+    bgt     1b                         @
+                                       @
+    ldmltfd sp!, { r4-r7, pc }         @ if count was even, we're done
+                                       @
+2:                                     @
+    ldr     r6, [r2]                   @ r6 = Li
+    ldr     r12, [r4]                  @ r12 = Ri
+    qadd    r6, r6, r5                 @ round, scale, saturate
+    qadd    r12, r12, r5               @ and pack Li and Ri into So
+    mov     r6, r6, asr r1             @
+    mov     r12, r12, asr r1           @
+    ssat    r6, #16, r6                @
+    ssat    r12, #16, r12              @
+    pkhbt   r6, r6, r12, asl #16       @ r6 = Li | (Ri << 16)
+    str     r6, [r3]                   @ store So
+                                       @
+    ldmfd   sp!, { r4-r7, pc }         @
+    .size   sample_output_stereo, .-sample_output_stereo
--- a/lib/rbcodec/dsp/dsp_asm.h
+++ b/lib/rbcodec/dsp/dsp_asm.h
@ -0,0 +1,86 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include <config.h>
+
+#ifndef _DSP_ASM_H
+#define _DSP_ASM_H
+
+/* Selects which DSP routines have an assembly implementation on the target
+ * CPU (one DSP_HAVE_ASM_* macro per routine) and declares prototypes for
+ * the selected routines.
+ *
+ * NOTE(review): the prototypes use int32_t, int16_t and struct dsp_data,
+ * none of which is declared in this header. Unless config.h provides them,
+ * this header presumably has to be included after the header that does;
+ * confirm.
+ */
+
+/* Set the appropriate #defines based on CPU or whatever matters */
+#if defined(CPU_ARM)
+#define DSP_HAVE_ASM_APPLY_GAIN
+#define DSP_HAVE_ASM_RESAMPLING
+#define DSP_HAVE_ASM_CROSSFEED
+#define DSP_HAVE_ASM_SOUND_CHAN_MONO
+#define DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
+#define DSP_HAVE_ASM_SOUND_CHAN_KARAOKE
+#define DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO
+#define DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
+#elif defined (CPU_COLDFIRE)
+#define DSP_HAVE_ASM_APPLY_GAIN
+#define DSP_HAVE_ASM_RESAMPLING
+#define DSP_HAVE_ASM_CROSSFEED
+#define DSP_HAVE_ASM_SOUND_CHAN_MONO
+#define DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
+#define DSP_HAVE_ASM_SOUND_CHAN_KARAOKE
+#define DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO
+#define DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
+#endif /* CPU_ARM / CPU_COLDFIRE */
+
+/* Declare prototypes based upon what's #defined above */
+#ifdef DSP_HAVE_ASM_CROSSFEED
+void apply_crossfeed(int count, int32_t *buf[]);
+#endif /* DSP_HAVE_ASM_CROSSFEED */
+
+#ifdef DSP_HAVE_ASM_APPLY_GAIN
+void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[]);
+#endif /* DSP_HAVE_ASM_APPLY_GAIN */
+
+#ifdef DSP_HAVE_ASM_RESAMPLING
+int dsp_upsample(int count, struct dsp_data *data,
+                 const int32_t *src[], int32_t *dst[]);
+int dsp_downsample(int count, struct dsp_data *data,
+                   const int32_t *src[], int32_t *dst[]);
+#endif /* DSP_HAVE_ASM_RESAMPLING */
+
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_MONO
+void channels_process_sound_chan_mono(int count, int32_t *buf[]);
+#endif /* DSP_HAVE_ASM_SOUND_CHAN_MONO */
+
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
+void channels_process_sound_chan_custom(int count, int32_t *buf[]);
+#endif /* DSP_HAVE_ASM_SOUND_CHAN_CUSTOM */
+
+#ifdef DSP_HAVE_ASM_SOUND_CHAN_KARAOKE
+void channels_process_sound_chan_karaoke(int count, int32_t *buf[]);
+#endif /* DSP_HAVE_ASM_SOUND_CHAN_KARAOKE */
+
+#ifdef DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
+void sample_output_stereo(int count, struct dsp_data *data,
+                          const int32_t *src[], int16_t *dst);
+#endif /* DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO */
+
+#ifdef DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO
+void sample_output_mono(int count, struct dsp_data *data,
+                        const int32_t *src[], int16_t *dst);
+#endif /* DSP_HAVE_ASM_SAMPLE_OUTPUT_MONO */
+
+#endif /* _DSP_ASM_H */
--- a/lib/rbcodec/dsp/dsp_cf.S
+++ b/lib/rbcodec/dsp/dsp_cf.S
@ -0,0 +1,611 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 Thom Johansen
+ * Portions Copyright (C) 2007 Michael Sevakis
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
+ *
+ * ColdFire version. Multiplies, in place, the first count samples of each
+ * of the data->num_channels buffers in buf[] by data->gain using the MAC
+ * unit. The loop is software-pipelined: the product of sample n is started
+ * while the result of sample n-1 is read back and stored.
+ *
+ * Fields of *data as accessed here (byte offsets): 4 = num_channels,
+ * 32 = gain. After the 20-byte register save area, the arguments are at
+ * 24(%sp) = count, 28(%sp) = data, 32(%sp) = buf.
+ *
+ * NOTE(review): the load that accompanies each mac.l fetches one word
+ * beyond the last sample of the buffer; the value is never used. Confirm
+ * that the buffers may be over-read by one word.
+ */
+    .section    .text
+    .align      2
+    .global     dsp_apply_gain
+dsp_apply_gain:
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  |
+    movem.l     28(%sp), %a0-%a1        | %a0 = data,
+                                        | %a1 = buf
+    move.l      4(%a0), %d1             | %d1 = data->num_channels
+    move.l      32(%a0), %a0            | %a0 = data->gain (in s8.23)
+10: | channel loop                      |
+    move.l      24(%sp), %d0            | %d0 = count
+    move.l      -4(%a1, %d1.l*4), %a2   | %a2 = s = buf[ch-1]
+    move.l      %a2, %a3                | %a3 = d = s
+    move.l      (%a2)+, %d2             | %d2 = *s++,
+    mac.l       %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1)
+    subq.l      #1, %d0                 | --count > 0 ? : effectively n++
+    ble.b       30f | loop done         | no? finish up
+20: | loop                              |
+    move.l      %accext01, %d4          | fetch S(n-1)[7:0]
+    movclr.l    %acc0, %d3              | fetch S(n-1)[40:8] in %d3[31:0]
+    asl.l       #8, %d3                 | *s++ = (S(n-1)[40:8] << 8) | S(n-1)[7:0]
+    mac.l       %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1)
+    move.b      %d4, %d3                |
+    move.l      %d3, (%a3)+             |
+    subq.l      #1, %d0                 | --count > 0 ? : effectively n++
+    bgt.b       20b | loop              | yes? do more samples
+30: | loop done                         |
+    move.l      %accext01, %d4          | fetch S(n-1)[7:0]
+    movclr.l    %acc0, %d3              | fetch S(n-1)[40:8] in %d3[31:0]
+    asl.l       #8, %d3                 | *s = (S(n-1)[40:8] << 8) | S(n-1)[7:0]
+    move.b      %d4, %d3                |
+    move.l      %d3, (%a3)              |
+    subq.l      #1, %d1                 | next channel
+    bgt.b       10b | channel loop      |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup stack
+    rts                                 |
+    .size       dsp_apply_gain,.-dsp_apply_gain
+
+/****************************************************************************
+ * void apply_crossfeed(int count, int32_t *buf[])
+ *
+ * ColdFire version. Processes count stereo samples in place in buf[0]
+ * (left) and buf[1] (right). Each output sample is gain*x[n] plus the
+ * opposite channel's delayed input passed through a first-order filter
+ * (coefs b0, b1, a1). The outputs of one iteration are written back at the
+ * top of the next one (or after the loop for the last sample).
+ *
+ * crossfeed_data as accessed by this routine (byte offsets):
+ *     0: direct gain      4: b0      8: b1      12: a1
+ *    16: four filter history words (kept in %d0-%d3)
+ *    32: delay line, 104 bytes of left/right longword pairs
+ *   136: current delay line pointer (also the wrap limit address)
+ *
+ * After the 44-byte register save area, the arguments are at
+ * 48(%sp) = count and 52(%sp) = buf.
+ */
+    .section    .text
+    .align      2
+    .global     apply_crossfeed 
+apply_crossfeed:
+    lea.l       -44(%sp), %sp           |
+    movem.l     %d2-%d7/%a2-%a6, (%sp)  | save all regs
+    movem.l     48(%sp), %d7/%a4        | %d7 = count, %a4 = src
+    movem.l     (%a4), %a4-%a5          | %a4 = src[0], %a5 = src[1]
+    lea.l       crossfeed_data, %a1     | %a1 = &crossfeed_data
+    move.l      (%a1)+, %d6             | %d6 = direct gain
+    movem.l     12(%a1), %d0-%d3        | fetch filter history samples
+    move.l      132(%a1), %a0           | fetch delay line address
+    movem.l     (%a1), %a1-%a3          | load filter coefs
+    lea.l       crossfeed_data+136, %a6 | %a6 = delay line wrap limit
+    bra.b       20f | loop start        | go to loop start point
+    /* Register usage in loop:
+     * %a0 = delay_p, %a1..%a3 = b0, b1, a1 (filter coefs),
+     * %a4 = buf[0], %a5 = buf[1],
+     * %a6 = delay line pointer wrap limit,
+     * %d0..%d3 = history
+     * %d4..%d5 = temp.
+     * %d6 = direct gain,
+     * %d7 = count
+     */
+10: | loop                              |
+    movclr.l    %acc0, %d4              | write outputs
+    move.l      %d4, (%a4)+             | .
+    movclr.l    %acc1, %d5              | .
+    move.l      %d5, (%a5)+             | .
+20: | loop start                        |
+    mac.l       %a2, %d0, (%a0)+, %d0, %acc0 | %acc0  = b1*dl[n - 1], %d0 = dl[n]
+    mac.l       %a1, %d0             , %acc0 | %acc0 += b0*dl[n]
+    mac.l       %a3, %d1, (%a5),  %d5, %acc0 | %acc0 += a1*y_r[n - 1], load R
+    mac.l       %a2, %d2, (%a0)+, %d2, %acc1 | %acc1  = b1*dr[n - 1], %d2 = dr[n]
+    mac.l       %a1, %d2             , %acc1 | %acc1 += b0*dr[n]
+    mac.l       %a3, %d3, (%a4),  %d4, %acc1 | %acc1 += a1*y_l[n - 1], load L
+    movem.l     %d4-%d5, -8(%a0)        | save left & right inputs to delay line
+    move.l      %acc0, %d3              | get filtered delayed left sample (y_l[n])
+    move.l      %acc1, %d1              | get filtered delayed right sample (y_r[n])
+    mac.l       %d6, %d4, %acc0         | %acc0 += gain*x_l[n]
+    mac.l       %d6, %d5, %acc1         | %acc1 += gain*x_r[n]
+    cmp.l       %a6, %a0                | wrap %a0 if passed end
+    bhs.b       30f | wrap buffer       |
+    .word       0x51fb | tpf.l          | trap the buffer wrap
+                                        | (swallows the lea below as operand)
+30: | wrap buffer                       | ...fwd taken branches more costly
+    lea.l       -104(%a0), %a0          | wrap it up
+    subq.l      #1, %d7                 | --count > 0 ?
+    bgt.b       10b | loop              | yes? do more
+    movclr.l    %acc0, %d4              | write last outputs
+    move.l      %d4, (%a4)              | .
+    movclr.l    %acc1, %d5              | .
+    move.l      %d5, (%a5)              | .
+    lea.l       crossfeed_data+16, %a1  | save data back to struct
+    movem.l     %d0-%d3, (%a1)          | ...history
+    move.l      %a0, 120(%a1)           | ...delay_p
+    movem.l     (%sp), %d2-%d7/%a2-%a6  | restore all regs
+    lea.l       44(%sp), %sp            |
+    rts                                 |
+    .size       apply_crossfeed,.-apply_crossfeed 
+
+/****************************************************************************
+ * int dsp_downsample(int count, struct dsp_data *data,
+ *                    int32_t *src[], int32_t *dst[])
+ *
+ * ColdFire version. Linear-interpolating resampler. For every channel, the
+ * position pos = phase >> 16 is advanced by delta per output sample and
+ * s[pos-1] + frac*(s[pos] - s[pos-1]) is written to dst until pos >= count.
+ * For pos == 0, s[-1] is the last_sample value saved by the previous call.
+ *
+ * Fields of *data as accessed here (byte offsets): 4 = num_channels,
+ * 8 = resample_data.delta, 12 = resample_data.phase, 16 = last_sample[].
+ * After the 40-byte register save area, the arguments are at
+ * 44(%sp) = count, 48(%sp) = data, 52(%sp) = src, 56(%sp) = dst.
+ *
+ * Channels are processed from the highest index down to 0. Returns the
+ * number of samples written to dst[0]; phase is stored back reduced by
+ * count << 16.
+ */
+    .section    .text
+    .align      2
+    .global     dsp_downsample
+dsp_downsample:
+    lea.l       -40(%sp), %sp           | save non-clobberables
+    movem.l     %d2-%d7/%a2-%a5, (%sp)  |
+    movem.l     44(%sp), %d2/%a0-%a2    | %d2 = count
+                                        | %a0 = data
+                                        | %a1 = src
+                                        | %a2 = dst
+    movem.l     4(%a0), %d3-%d4         | %d3 = ch = data->num_channels
+                                        | %d4 = delta = data->resample_data.delta
+    moveq.l     #16, %d7                | %d7 = shift
+10: | channel loop                      |
+    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
+    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
+    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
+    lea.l       12(%a0, %d3.l*4), %a5   | %a5 = &data->resample_data.last_sample[ch-1]
+    move.l      (%a5), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
+    move.l      -4(%a3, %d2.l*4), (%a5) | data->resample_data.last_sample[ch-1] = s[count-1]
+    move.l      %d5, %d6                | %d6 = pos = phase >> 16
+    lsr.l       %d7, %d6                |
+    cmp.l       %d2, %d6                | past end of samples?
+    bge.b       40f | skip resample loop| yes? skip loop
+    tst.l       %d6                     | need last sample of prev. frame?
+    bne.b       20f | resample loop     | no? start main loop
+    move.l      (%a3, %d6.l*4), %d1     | %d1 = s[pos]
+    bra.b       30f | resample start last | start with last (last in %d0)
+20: | resample loop                     |
+    lea.l       -4(%a3, %d6.l*4), %a5   | load s[pos-1] and s[pos]
+    movem.l     (%a5), %d0-%d1          |
+30: | resample start last               |
+    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
+    move.l      %d0, %acc0              | %acc0 = previous sample
+    move.l      %d5, %d0                | frac = (phase << 16) >> 1
+    lsl.l       %d7, %d0                |
+    lsr.l       #1, %d0                 |
+    mac.l       %d0, %d1, %acc0         | %acc0 += frac * diff
+    add.l       %d4, %d5                | phase += delta
+    move.l      %d5, %d6                | pos = phase >> 16
+    lsr.l       %d7, %d6                |
+    movclr.l    %acc0, %d0              |
+    move.l      %d0, (%a4)+             | *d++ = %d0
+    cmp.l       %d2, %d6                | pos < count?
+    blt.b       20b | resample loop     | yes? continue resampling
+40: | skip resample loop                |
+    subq.l      #1, %d3                 | ch > 0?
+    bgt.b       10b | channel loop      | yes? process next channel
+    lsl.l       %d7, %d2                | wrap phase to start of next frame
+    sub.l       %d2, %d5                | data->resample_data.phase =
+    move.l      %d5, 12(%a0)            | ... phase - (count << 16)
+    move.l      %a4, %d0                | return d - d[0]
+    sub.l       (%a2), %d0              |
+    asr.l       #2, %d0                 | convert bytes->samples
+    movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
+    lea.l       40(%sp), %sp            | cleanup stack
+    rts                                 | buh-bye
+    .size       dsp_downsample,.-dsp_downsample
+
+/****************************************************************************
+ * int dsp_upsample(int count, struct dsp_data *dsp,
+ *                  const int32_t *src[], int32_t *dst[])
+ *
+ * Upsample by linear interpolation between consecutive source samples.
+ *
+ * Fields read relative to the data pointer (second argument):
+ *   4(data)      channel count
+ *   8(data)      resample delta (phase increment per output sample)
+ *   12(data)     resample phase (reloaded for each channel, stored back once
+ *                after the last channel)
+ *   16(data)...  last_sample[] - final source sample of the previous call,
+ *                one entry per channel
+ *
+ * Channels are processed from the highest index down to 0. Delta and the
+ * fractional part of the phase are kept in the high word of their registers
+ * so that the carry out of "phase += delta" signals a step to the next
+ * source sample.
+ *
+ * Returns the number of samples written, computed from how far the dst[0]
+ * write pointer advanced.
+ */
+    .section    .text
+    .align      2
+    .global     dsp_upsample
+dsp_upsample:
+    lea.l       -40(%sp), %sp           | save non-clobberables
+    movem.l     %d2-%d7/%a2-%a5, (%sp)  |
+    movem.l     44(%sp), %d2/%a0-%a2    | %d2 = count
+                                        | %a0 = data
+                                        | %a1 = src
+                                        | %a2 = dst
+    movem.l      4(%a0), %d3-%d4        | %d3 = ch = channels
+                                        | %d4 = delta = data->resample_data.delta
+    swap        %d4                     | swap delta to high word to use...
+                                        | ...carries to increment position
+10: | channel loop                      |
+    move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
+    move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
+    lea.l       12(%a0, %d3.l*4), %a4   | %a4 = &data->resample_data.last_sample[ch-1]
+    lea.l       -4(%a3, %d2.l*4), %a5   | %a5 = src_end = &s[count-1]
+    move.l      (%a4), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
+    move.l      (%a5), (%a4)            | data->resample_data.last_sample[ch-1] = s[count-1]
+    move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
+    move.l      (%a3)+, %d1             | fetch first sample - might throw this...
+                                        | ...away later but we'll be preincremented
+    move.l      %d1, %d6                | save sample value
+    sub.l       %d0, %d1                | %d1 = diff = s[0] - last
+    swap        %d5                     | swap phase to high word to use
+                                        | carries to increment position
+    move.l      %d5, %d7                | %d7 = pos = phase >> 16
+    clr.w       %d5                     | (%d5 keeps only the fraction, high word)
+    eor.l       %d5, %d7                | pos == 0? (eor leaves just the low word)
+    beq.b       40f | loop start        | yes? start loop
+    cmp.l       %d2, %d7                | past end of samples?
+    bge.b       50f | skip resample loop| yes? go to next channel and collect info
+    lea.l       (%a3, %d7.l*4), %a3     | %a3 = s = &s[pos+1]
+    movem.l     -8(%a3), %d0-%d1        | %d0 = s[pos-1], %d1 = s[pos]
+    move.l      %d1, %d6                | save sample value
+    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
+    bra.b       40f | loop start        |
+20: | next sample loop                  |
+    move.l      %d6, %d0                | move previous sample to %d0
+    move.l      (%a3)+, %d1             | fetch next sample
+    move.l      %d1, %d6                | save sample value
+    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
+30: | same sample loop                  |
+    movclr.l    %acc0, %d7              | %d7 = result
+    move.l      %d7, (%a4)+             | *d++ = %d7
+40: | loop start                        |
+    lsr.l       #1, %d5                 | make phase into frac
+    move.l      %d0, %acc0              | %acc0 = s[pos-1]
+    mac.l       %d1, %d5, %acc0         | %acc0 += diff * frac
+    lsl.l       #1, %d5                 | restore frac to phase
+    add.l       %d4, %d5                | phase += delta
+    bcc.b       30b | same sample loop  | no carry? stay between the same samples
+    cmp.l       %a5, %a3                | src <= src_end?
+    bls.b       20b | next sample loop  | yes? continue resampling
+    movclr.l    %acc0, %d7              | %d7 = result
+    move.l      %d7, (%a4)+             | *d++ = %d7
+50: | skip resample loop                |
+    subq.l      #1, %d3                 | ch > 0?
+    bgt.b       10b | channel loop      | yes? process next channel
+    swap        %d5                     | wrap phase to start of next frame
+    move.l      %d5, 12(%a0)            | ...and save in data->resample_data.phase
+    move.l      %a4, %d0                | return d - d[0]
+    sub.l       (%a2), %d0              |
+    movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
+    asr.l       #2, %d0                 | convert bytes->samples
+    lea.l       40(%sp), %sp            | cleanup stack
+    rts                                 | buh-bye
+    .size       dsp_upsample,.-dsp_upsample
+
+/****************************************************************************
+ * void channels_process_sound_chan_mono(int count, int32_t *buf[])
+ *
+ * Mix left and right channels 50/50 into a center channel.
+ *
+ * The result is written back in place to both buf[0] and buf[1].
+ *
+ * The loop is software pipelined: each mac also loads the next input
+ * sample, so the read pointers run one sample ahead of the write pointers.
+ * NOTE(review): as a consequence one longword beyond the last of the count
+ * samples is loaded (and discarded) from each channel, and one output
+ * sample is stored even when count <= 1.
+ */
+    .section    .text
+    .align      2
+    .global     channels_process_sound_chan_mono
+channels_process_sound_chan_mono:
+    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  |
+    movem.l     (%a0), %a0-%a1          | get channel pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
+    move.l      #0x40000000, %d3        | %d3 = 0.5
+    move.l      (%a0)+, %d1             | prime the input registers
+    move.l      (%a1)+, %d2             |
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | acc0 = l/2, load next l
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc0 | acc0 += r/2, load next r
+    subq.l      #1, %d0                 | first sample is already in flight
+    ble.s       20f | loop done         |
+10: | loop                              |
+    movclr.l    %acc0, %d4              | L = R = l/2 + r/2
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | start the next sample...
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc0 | ...before storing this one
+    move.l      %d4, (%a2)+             | output to original buffer
+    move.l      %d4, (%a3)+             |
+    subq.l      #1, %d0                 |
+    bgt.s       10b | loop              |
+20: | loop done                         |
+    movclr.l    %acc0, %d4              | output last sample
+    move.l      %d4, (%a2)              |
+    move.l      %d4, (%a3)              |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_mono, \
+                .-channels_process_sound_chan_mono
+
+/****************************************************************************
+ * void channels_process_sound_chan_custom(int count, int32_t *buf[])
+ *
+ * Apply stereo width (narrowing/expanding) effect.
+ *
+ * For every sample pair, in place:
+ *   L = l * dsp_sw_gain + r * dsp_sw_cross
+ *   R = r * dsp_sw_gain + l * dsp_sw_cross
+ * dsp_sw_gain and dsp_sw_cross are external symbols read once on entry.
+ *
+ * The loop is software pipelined: input loads are attached to the mac
+ * instructions, so the read pointers run one sample ahead of the writes.
+ * NOTE(review): as a consequence one longword beyond the last of the count
+ * samples is loaded (and discarded) from each channel, and one output pair
+ * is stored even when count <= 1.
+ */
+    .section    .text
+    .align      2
+    .global     channels_process_sound_chan_custom
+channels_process_sound_chan_custom:
+    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    lea.l       -28(%sp), %sp           | save registers
+    movem.l     %d2-%d6/%a2-%a3, (%sp)  |
+    movem.l     (%a0), %a0-%a1          | get channel pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
+    move.l      dsp_sw_gain, %d3        | load straight (mid) gain
+    move.l      dsp_sw_cross, %d4       | load cross (side) gain
+    move.l      (%a0)+, %d1             | prime the input registers
+    move.l      (%a1)+, %d2             |
+    mac.l       %d1, %d3             , %acc0 |  L = l*gain + r*cross
+    mac.l       %d1, %d4, (%a0)+, %d1, %acc1 |  R = r*gain + l*cross
+    mac.l       %d2, %d4             , %acc0 |
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc1 |
+    subq.l      #1, %d0                 | first sample pair is already in flight
+    ble.b       20f | loop done         |
+10: | loop                              |
+    movclr.l    %acc0, %d5              | fetch finished L
+    movclr.l    %acc1, %d6              | fetch finished R
+    mac.l       %d1, %d3             , %acc0 |  L = l*gain + r*cross
+    mac.l       %d1, %d4, (%a0)+, %d1, %acc1 |  R = r*gain + l*cross
+    mac.l       %d2, %d4             , %acc0 |
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc1 |
+    move.l      %d5, (%a2)+             | output to original buffers
+    move.l      %d6, (%a3)+             |
+    subq.l      #1, %d0                 |
+    bgt.s       10b | loop              |
+20: | loop done                         |
+    movclr.l    %acc0, %d5              | output last sample
+    movclr.l    %acc1, %d6              |
+    move.l      %d5, (%a2)              |
+    move.l      %d6, (%a3)              |
+    movem.l     (%sp), %d2-%d6/%a2-%a3  | restore registers
+    lea.l       28(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_custom, \
+                .-channels_process_sound_chan_custom
+
+/****************************************************************************
+ *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
+ *
+ *  Separate channels into side channels.
+ *
+ *  For every sample pair, in place:
+ *    L = l/2 - r/2
+ *    R = -L
+ *
+ *  The loop is software pipelined: input loads are attached to the
+ *  mac/msac instructions, so the read pointers run one sample ahead of the
+ *  writes.
+ *  NOTE(review): as a consequence one longword beyond the last of the count
+ *  samples is loaded (and discarded) from each channel, and one output pair
+ *  is stored even when count <= 1.
+ */
+    .section    .text
+    .align      2
+    .global     channels_process_sound_chan_karaoke
+channels_process_sound_chan_karaoke:
+    movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  |
+    movem.l     (%a0), %a0-%a1          | get channel src pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
+    move.l      #0x40000000, %d3        | %d3 = 0.5
+    move.l      (%a0)+, %d1             | prime the input registers
+    move.l      (%a1)+, %d2             |
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2
+    msac.l      %d2, %d3, (%a1)+, %d2, %acc0 |
+    subq.l      #1, %d0                 | first sample pair is already in flight
+    ble.b       20f | loop done         |
+10: | loop                              |
+    movclr.l    %acc0, %d4              | fetch finished L
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2
+    msac.l      %d2, %d3, (%a1)+, %d2, %acc0 |
+    move.l      %d4, (%a2)+             | output L
+    neg.l       %d4                     | R = -L = -(l/2 - r/2) = r/2 - l/2
+    move.l      %d4, (%a3)+             | output R
+    subq.l      #1, %d0                 |
+    bgt.s       10b | loop              |
+20: | loop done                         |
+    movclr.l    %acc0, %d4              | output last sample
+    move.l      %d4, (%a2)              |
+    neg.l       %d4                     | R = -L = -(l/2 - r/2) = r/2 - l/2
+    move.l      %d4, (%a3)              |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_karaoke, \
+                .-channels_process_sound_chan_karaoke
+
+/****************************************************************************
+ * void sample_output_stereo(int count, struct dsp_data *data,
+ *                           const int32_t *src[], int16_t *dst)
+ *
+ * Framework based on the ubiquitous Rockbox line transfer logic for
+ * Coldfire CPUs.
+ *
+ * Does emac clamping and scaling (which proved faster than the usual
+ * checks and branches - even single test clamping) and writes using
+ * line burst transfers. Also better than writing a single L-R pair per
+ * loop but a good deal more code.
+ *
+ * Attempting bursting during reads is rather futile since the source and
+ * destination alignments rarely agree and too much complication will
+ * slow us up. The parallel loads seem to do a bit better at least until
+ * a pcm buffer can always give line aligned chunk and then aligning the
+ * dest can then imply the source is aligned if the source buffers are.
+ * For now longword alignment is assumed of both the source and dest.
+ *
+ * Each output longword holds one stereo frame: the L result in the high
+ * word and the R result in the low word. Every sample is computed as
+ * 0x8000 + sample * (1 << (16 - scale)) and its most significant 16 bits are
+ * kept; scale is the first longword of *data.
+ *
+ * The output is produced in three phases: single longwords up to the first
+ * 16-byte line boundary of dst, whole lines of four frames, then single
+ * longwords for whatever remains. %macsr is saved on entry and restored on
+ * exit.
+ *
+ */
+    .section   .text
+    .align      2
+    .global    sample_output_stereo
+sample_output_stereo:
+    lea.l       -48(%sp), %sp             | save registers
+    move.l      %macsr, %d1               | do it now as at many lines will
+    movem.l     %d1-%d7/%a2-%a6, (%sp)    | be the far more common condition
+    move.l      #0x80, %macsr             | put emac unit in signed int mode
+    movem.l     52(%sp), %a0-%a2/%a4      | %a0 = count, %a1 = data,
+                                          | %a2 = src, %a4 = dst
+    lea.l       (%a4, %a0.l*4), %a0       | %a0 = end address
+    move.l      (%a1), %d1                | %a1 = multiplier: (1 << (16 - scale))
+    sub.l       #16, %d1                  |
+    neg.l       %d1                       |
+    moveq.l     #1, %d0                   |
+    asl.l       %d1, %d0                  |
+    move.l      %d0, %a1                  |
+    move.l      #0x8000, %a6              | %a6 = rounding term
+    movem.l     (%a2), %a2-%a3            | get L/R channel pointers
+    moveq.l     #28, %d0                  | %d0 = second line bound
+    add.l       %a4, %d0                  |
+    and.l       #0xfffffff0, %d0          |
+    cmp.l       %a0, %d0                  | at least a full line?
+    bhi.w       40f | long loop 1 start   | no? do as trailing longwords
+    sub.l       #16, %d0                  | %d0 = first line bound
+    cmp.l       %a4, %d0                  | any leading longwords?
+    bls.b       20f | line loop start     | no? start line loop
+10: | long loop 0                         |
+    move.l      (%a2)+, %d1               | read longword from L and R
+    move.l      %a6, %acc0                | preload both accumulators with
+    move.l      %acc0, %acc1              | the rounding term
+    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | shift L to high word
+    mac.l       %d2, %a1, %acc1           | shift R to high word
+    movclr.l    %acc0, %d1                | get possibly saturated results
+    movclr.l    %acc1, %d2                |
+    swap        %d2                       | move R to low word
+    move.w      %d2, %d1                  | interleave MS 16 bits of each
+    move.l      %d1, (%a4)+               | ...and write both
+    cmp.l       %a4, %d0                  | reached the first line bound?
+    bhi.b       10b | long loop 0         | no? keep going
+20: | line loop start                     |
+    lea.l       -12(%a0), %a5             | %a5 = at or just before last line bound
+30: | line loop                           |
+    move.l      (%a3)+, %d4               | get next 4 R samples and scale
+    move.l      %a6, %acc0                | preload all four accumulators
+    move.l      %acc0, %acc1              | with the rounding term
+    move.l      %acc1, %acc2              |
+    move.l      %acc2, %acc3              |
+    mac.l       %d4, %a1, (%a3)+, %d5, %acc0 | with saturation
+    mac.l       %d5, %a1, (%a3)+, %d6, %acc1 |
+    mac.l       %d6, %a1, (%a3)+, %d7, %acc2 |
+    mac.l       %d7, %a1, (%a2)+, %d0, %acc3 | (also loads the first L sample)
+    lea.l       16(%a4), %a4              | increment dest here, mitigate stalls
+    movclr.l    %acc0, %d4                | obtain R results
+    movclr.l    %acc1, %d5                |
+    movclr.l    %acc2, %d6                |
+    movclr.l    %acc3, %d7                |
+    move.l      %a6, %acc0                | preload the rounding term again
+    move.l      %acc0, %acc1              |
+    move.l      %acc1, %acc2              |
+    move.l      %acc2, %acc3              |
+    mac.l       %d0, %a1, (%a2)+, %d1, %acc0 | get next 4 L samples and scale
+    mac.l       %d1, %a1, (%a2)+, %d2, %acc1 | with saturation
+    mac.l       %d2, %a1, (%a2)+, %d3, %acc2 |
+    mac.l       %d3, %a1             , %acc3 |
+    swap        %d4                       | a) interleave most significant...
+    swap        %d5                       |
+    swap        %d6                       |
+    swap        %d7                       |
+    movclr.l    %acc0, %d0                | obtain L results
+    movclr.l    %acc1, %d1                |
+    movclr.l    %acc2, %d2                |
+    movclr.l    %acc3, %d3                |
+    move.w      %d4, %d0                  | a) ... 16 bits of L and R
+    move.w      %d5, %d1                  |
+    move.w      %d6, %d2                  |
+    move.w      %d7, %d3                  |
+    movem.l     %d0-%d3, -16(%a4)         | write four stereo samples
+    cmp.l       %a4, %a5                  | room for another full line?
+    bhi.b       30b | line loop           | yes? do it
+40: | long loop 1 start                   |
+    cmp.l       %a4, %a0                  | any longwords left?
+    bls.b       60f | output end          | no? stop
+50: | long loop 1                         |
+    move.l      (%a2)+, %d1               | handle trailing longwords
+    move.l      %a6, %acc0                |
+    move.l      %acc0, %acc1              |
+    mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | the same way as leading ones
+    mac.l       %d2, %a1, %acc1           |
+    movclr.l    %acc0, %d1                |
+    movclr.l    %acc1, %d2                |
+    swap        %d2                       |
+    move.w      %d2, %d1                  |
+    move.l      %d1, (%a4)+               |
+    cmp.l       %a4, %a0                  | reached the end address?
+    bhi.b       50b                       | long loop 1
+60: | output end                          |
+    movem.l     (%sp), %d1-%d7/%a2-%a6    | restore registers
+    move.l      %d1, %macsr               | ...including the caller's macsr
+    lea.l       48(%sp), %sp              | cleanup
+    rts                                   |
+    .size      sample_output_stereo, .-sample_output_stereo
+
+/****************************************************************************
+ * void sample_output_mono(int count, struct dsp_data *data,
+ *                         const int32_t *src[], int16_t *dst)
+ *
+ * Same treatment as sample_output_stereo but for one channel.
+ *
+ * Only src[0] is read. Each scaled sample's most significant 16 bits are
+ * duplicated into both halves of the output longword, so the destination
+ * still receives one L/R frame per source sample. %macsr is saved on entry
+ * and restored on exit.
+ */
+    .section   .text
+    .align      2
+    .global    sample_output_mono
+sample_output_mono:
+    lea.l       -32(%sp), %sp             | save registers
+    move.l      %macsr, %d1               | do it now as at many lines will
+    movem.l     %d1-%d5/%a2-%a4, (%sp)    | be the far more common condition
+    move.l      #0x80, %macsr             | put emac unit in signed int mode
+    movem.l     36(%sp), %a0-%a3          | %a0 = count, %a1 = data,
+                                          | %a2 = src, %a3 = dst
+    lea.l       (%a3, %a0.l*4), %a0       | %a0 = end address
+    move.l      (%a1), %d1                | %d5 = multiplier: (1 << (16 - scale))
+    sub.l       #16, %d1                  |
+    neg.l       %d1                       |
+    moveq.l     #1, %d5                   |
+    asl.l       %d1, %d5                  |
+    move.l      #0x8000, %a4              | %a4 = rounding term
+    movem.l     (%a2), %a2                | get source channel pointer
+    moveq.l     #28, %d0                  | %d0 = second line bound
+    add.l       %a3, %d0                  |
+    and.l       #0xfffffff0, %d0          |
+    cmp.l       %a0, %d0                  | at least a full line?
+    bhi.w       40f | long loop 1 start   | no? do as trailing longwords
+    sub.l       #16, %d0                  | %d0 = first line bound
+    cmp.l       %a3, %d0                  | any leading longwords?
+    bls.b       20f | line loop start     | no? start line loop
+10: | long loop 0                         |
+    move.l      (%a2)+, %d1               | read longword from source channel
+    move.l      %a4, %acc0                | preload rounding term
+    mac.l       %d1, %d5, %acc0           | shift sample to high word
+    movclr.l    %acc0, %d1                | get possibly saturated results
+    move.l      %d1, %d2                  |
+    swap        %d2                       | copy high word into low word
+    move.w      %d2, %d1                  | duplicate single channel into
+    move.l      %d1, (%a3)+               | L and R
+    cmp.l       %a3, %d0                  | reached the first line bound?
+    bhi.b       10b | long loop 0         | no? keep going
+20: | line loop start                     |
+    lea.l       -12(%a0), %a1             | %a1 = at or just before last line bound
+30: | line loop                           |
+    move.l      (%a2)+, %d0               | get next 4 samples and scale
+    move.l      %a4, %acc0                | preload all four accumulators
+    move.l      %acc0, %acc1              | with the rounding term
+    move.l      %acc1, %acc2              |
+    move.l      %acc2, %acc3              |
+    mac.l       %d0, %d5, (%a2)+, %d1, %acc0 | with saturation
+    mac.l       %d1, %d5, (%a2)+, %d2, %acc1 |
+    mac.l       %d2, %d5, (%a2)+, %d3, %acc2 |
+    mac.l       %d3, %d5             , %acc3 |
+    lea.l       16(%a3), %a3              | increment dest here, mitigate stalls
+    movclr.l    %acc0, %d0                | obtain results
+    movclr.l    %acc1, %d1                |
+    movclr.l    %acc2, %d2                |
+    movclr.l    %acc3, %d3                |
+    move.l      %d0, %d4                  | duplicate single channel
+    swap        %d4                       | into L and R
+    move.w      %d4, %d0                  |
+    move.l      %d1, %d4                  |
+    swap        %d4                       |
+    move.w      %d4, %d1                  |
+    move.l      %d2, %d4                  |
+    swap        %d4                       |
+    move.w      %d4, %d2                  |
+    move.l      %d3, %d4                  |
+    swap        %d4                       |
+    move.w      %d4, %d3                  |
+    movem.l     %d0-%d3, -16(%a3)         | write four stereo samples
+    cmp.l       %a3, %a1                  | room for another full line?
+    bhi.b       30b | line loop           | yes? do it
+40: | long loop 1 start                   |
+    cmp.l       %a3, %a0                  | any longwords left?
+    bls.b       60f | output end          | no? stop
+50: | long loop 1                         |
+    move.l      (%a2)+, %d1               | handle trailing longwords
+    move.l      %a4, %acc0                |
+    mac.l       %d1, %d5, %acc0           | the same way as leading ones
+    movclr.l    %acc0, %d1                |
+    move.l      %d1, %d2                  |
+    swap        %d2                       |
+    move.w      %d2, %d1                  |
+    move.l      %d1, (%a3)+               |
+    cmp.l       %a3, %a0                  | reached the end address?
+    bhi.b       50b | long loop 1         |
+60: | output end                          |
+    movem.l     (%sp), %d1-%d5/%a2-%a4    | restore registers
+    move.l      %d1, %macsr               | ...including the caller's macsr
+    lea.l       32(%sp), %sp              | cleanup
+    rts                                   |
+    .size      sample_output_mono, .-sample_output_mono
--- a/lib/rbcodec/dsp/eq.c
+++ b/lib/rbcodec/dsp/eq.c
@ -0,0 +1,268 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006-2007 Thom Johansen 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include <inttypes.h>
+#include "config.h"
+#include "fixedpoint.h"
+#include "fracmul.h"
+#include "eq.h"
+#include "replaygain.h"
+
+/**
+ * Compute the coefficients of a first order shelving filter. The result
+ * cannot be fed to eq_filter() as is.
+ * @param cutoff midpoint frequency of the shelf, in the format described for
+ * eq_pk_coefs.
+ * @param A shelf gain/attenuation in tenths of a decibel. Max value is 24 dB.
+ * @param low true selects a low-shelf, false a high-shelf filter.
+ * @param c receives three coefficients: b0/a0, b1/a0 and -a1/a0, in s4.27
+ * format.
+ */
+void filter_shelf_coefs(unsigned long cutoff, long A, bool low, int32_t *c)
+{
+    const long gain = get_replaygain_int(A*5) << 4; /* 10^(db/40), s3.28 */
+    long cs;
+    const long sn = fp_sincos(cutoff/2, &cs);
+
+    /* One of sin/cos gets multiplied and divided by the gain, the other is
+     * only scaled down by 3 bits: sin is the weighted term for a low shelf,
+     * cos for a high shelf. */
+    const long weighted = low ? sn : cs;
+    const long plain = (low ? cs : sn) >> 3;
+    const int32_t over_gain = fp_div(weighted, gain, 25);
+    const int32_t times_gain = FRACMUL(weighted, gain);
+
+    int32_t b0, b1, a0, a1; /* s3.28 */
+
+    b0 = times_gain + plain;          /* 0.25 .. 4.10 */
+    a0 = over_gain + plain;           /* 0.25 .. 4.10 */
+    if (low) {
+        b1 = times_gain - plain;      /* -1 .. 3.98 */
+        a1 = over_gain - plain;       /* -1 .. 3.98 */
+    } else {
+        b1 = plain - times_gain;      /* -3.98 .. 1 */
+        a1 = plain - over_gain;       /* -3.98 .. 1 */
+    }
+
+    /* Normalise by a0; a1 is stored negated. */
+    const int32_t inv_a0 = fp_div(1, a0, 57); /* 0.24 .. 3.98, s2.29 */
+    c[0] = FRACMUL_SHL(b0, inv_a0, 1);        /* 0.063 .. 15.85 */
+    c[1] = FRACMUL_SHL(b1, inv_a0, 1);        /* -15.85 .. 15.85 */
+    c[2] = -FRACMUL_SHL(a1, inv_a0, 1);       /* -1 .. 1 */
+}
+
+#ifdef HAVE_SW_TONE_CONTROLS
+/**
+ * Build one second order section out of a low-shelf and a high-shelf first
+ * order filter plus an overall gain.
+ * @param cutoff_low midpoint frequency of the low shelf. See eq_pk_coefs for
+ * the format.
+ * @param cutoff_high midpoint frequency of the high shelf.
+ * @param A_low low-shelf gain/attenuation in tenths of a decibel. Max value
+ * is 24 dB.
+ * @param A_high high-shelf gain/attenuation in tenths of a decibel. Max value
+ * is 24 dB.
+ * @param A additional overall gain in tenths of a decibel.
+ * @param c receives five coefficients in s4.27 format.
+ */
+void filter_bishelf_coefs(unsigned long cutoff_low, unsigned long cutoff_high,
+                          long A_low, long A_high, long A, int32_t *c)
+{
+    const long gain = get_replaygain_int(A*10) << 7; /* 10^(db/20), s0.31 */
+    int32_t lo[3], hi[3];
+
+    filter_shelf_coefs(cutoff_low, A_low, true, lo);
+    filter_shelf_coefs(cutoff_high, A_high, false, hi);
+
+    /* The overall gain is folded into the low-shelf numerator only. */
+    const int32_t lo_b0 = FRACMUL(gain, lo[0]);
+    const int32_t lo_b1 = FRACMUL(gain, lo[1]);
+    const int32_t hi_b0 = hi[0], hi_b1 = hi[1];
+    const int32_t lo_a = lo[2], hi_a = hi[2];
+
+    /* Cascade the two first order sections into a single second order one
+     * that eq_filter() can run. The resulting coefficients span a really
+     * wide numerical range, so the fixed point format used here only works
+     * for the cutoff frequencies selected in dsp.c.
+     */
+    c[0] = FRACMUL_SHL(lo_b0, hi_b0, 4);
+    c[1] = FRACMUL_SHL(lo_b0, hi_b1, 4) + FRACMUL_SHL(lo_b1, hi_b0, 4);
+    c[2] = FRACMUL_SHL(lo_b1, hi_b1, 4);
+    c[3] = lo_a + hi_a;
+    c[4] = -FRACMUL_SHL(lo_a, hi_a, 4);
+}
+#endif
+
+/* Coef calculation taken from Audio-EQ-Cookbook.txt by Robert Bristow-Johnson.
+ * Slightly faster calculation can be done by deriving forms which use tan()
+ * instead of cos() and sin(), but the latter are far easier to use when doing
+ * fixed point math, and performance is not a big point in the calculation part.
+ * All the 'a' filter coefficients are negated so we can use only additions
+ * in the filtering equation.
+ */
+
+/**
+ * Compute the coefficients of a second order peaking filter section.
+ * @param cutoff centre frequency, from 0 to 0x80000000, where 0 represents
+ * 0 Hz and 0x80000000 represents the Nyquist frequency (samplerate/2).
+ * @param Q Q factor multiplied by ten. Lower bound is artificially set
+ * at 0.5.
+ * @param db gain/attenuation at the peak frequency in tenths of a decibel.
+ * Max value is 24 dB.
+ * @param c receives five coefficients (b0, b1, b2, -a1, -a2, all divided by
+ * a0) in s3.28 format.
+ */
+void eq_pk_coefs(unsigned long cutoff, unsigned long Q, long db, int32_t *c)
+{
+    const long unity = 1 << 28; /* 1.0 in s3.28 */
+    const long A = get_replaygain_int(db*5) << 5; /* 10^(db/40), s2.29 */
+    long cs;
+    const long sn = fp_sincos(cutoff, &cs);
+    const long alpha = sn/(2*Q)*10 >> 1; /* s1.30 */
+    const long alpha_over_A = fp_div(alpha, A, 27);
+    const long alpha_times_A = FRACMUL(alpha, A);
+
+    /* Unnormalised coefficients, all s3.28; possible numerical ranges are
+     * given next to each one. b1 and a1 are the same value. */
+    const int32_t a1 = -2*(cs >> 3);              /* [-2 .. 2] */
+    const int32_t b1 = a1;                        /* [-2 .. 2] */
+    const int32_t b0 = unity + alpha_times_A;     /* [1 .. 5] */
+    const int32_t b2 = unity - alpha_times_A;     /* [-3 .. 1] */
+    const int32_t a0 = unity + alpha_over_A;      /* [1 .. 5] */
+    const int32_t a2 = unity - alpha_over_A;      /* [-3 .. 1] */
+
+    /* 1/a0 is roughly [0.2 .. 1], but never reaches 1 completely */
+    const long inv_a0 = fp_div(1, a0, 59); /* s0.31 */
+
+    c[0] = FRACMUL(b0, inv_a0);          /* [0.25 .. 4] */
+    c[1] = FRACMUL(b1, inv_a0);          /* [-2 .. 2] */
+    c[2] = FRACMUL(b2, inv_a0);          /* [-2.4 .. 1] */
+    c[3] = FRACMUL(-a1, inv_a0);         /* [-2 .. 2] */
+    c[4] = FRACMUL(-a2, inv_a0);         /* [-0.6 .. 1] */
+}
+
+/**
+ * Compute the coefficients of a second order low-shelf filter section.
+ * Parameters are the same as for eq_pk_coefs, but the coefficients are
+ * produced in s5.26 fixed point.
+ */
+void eq_ls_coefs(unsigned long cutoff, unsigned long Q, long db, int32_t *c)
+{
+    const long unity = 1 << 25; /* 1.0 in s6.25 */
+    const long root_A = get_replaygain_int(db*5/2) << 2; /* 10^(db/80), s5.26 */
+    const long A = FRACMUL_SHL(root_A, root_A, 8); /* s2.29 */
+    long cs;
+    const long sn = fp_sincos(cutoff, &cs);
+    const long alpha = sn/(2*Q)*10 >> 1; /* s1.30 */
+    const long A_plus_1 = (A >> 4) + unity;
+    const long A_minus_1 = (A >> 4) - unity;
+    const long A_plus_1_cos = FRACMUL(A_plus_1, cs);
+    const long A_minus_1_cos = FRACMUL(A_minus_1, cs);
+    const long beta = 2*FRACMUL(root_A, alpha);
+
+    /* Unnormalised coefficients, all s6.25 */
+    const int32_t b0 =
+        FRACMUL_SHL(A, A_plus_1 - A_minus_1_cos + beta, 2); /* [0.1 .. 40] */
+    const int32_t b1 =
+        FRACMUL_SHL(A, A_minus_1 - A_plus_1_cos, 3);        /* [-16 .. 63.4] */
+    const int32_t b2 =
+        FRACMUL_SHL(A, A_plus_1 - A_minus_1_cos - beta, 2); /* [0 .. 31.7] */
+    const int32_t a0 = A_plus_1 + A_minus_1_cos + beta;     /* [0.5 .. 10] */
+    const int32_t a1 = -2*(A_minus_1 + A_plus_1_cos);       /* [-16 .. 4] */
+    const int32_t a2 = A_plus_1 + A_minus_1_cos - beta;     /* [0 .. 8] */
+
+    /* 1/a0 lies in [0.1 .. 1.99] */
+    const long inv_a0 = fp_div(1, a0, 55);   /* s1.30 */
+
+    c[0] = FRACMUL_SHL(b0, inv_a0, 2);       /* [0.06 .. 15.9] */
+    c[1] = FRACMUL_SHL(b1, inv_a0, 2);       /* [-2 .. 31.7] */
+    c[2] = FRACMUL_SHL(b2, inv_a0, 2);       /* [0 .. 15.9] */
+    c[3] = FRACMUL_SHL(-a1, inv_a0, 2);      /* [-2 .. 2] */
+    c[4] = FRACMUL_SHL(-a2, inv_a0, 2);      /* [0 .. 1] */
+}
+
+/**
+ * Compute the coefficients of a second order high-shelf filter section.
+ * Parameters are the same as for eq_pk_coefs, but the coefficients are
+ * produced in s5.26 fixed point.
+ */
+void eq_hs_coefs(unsigned long cutoff, unsigned long Q, long db, int32_t *c)
+{
+    const long unity = 1 << 25; /* 1.0 in s6.25 */
+    const long root_A = get_replaygain_int(db*5/2) << 2; /* 10^(db/80), s5.26 */
+    const long A = FRACMUL_SHL(root_A, root_A, 8); /* s2.29 */
+    long cs;
+    const long sn = fp_sincos(cutoff, &cs);
+    const long alpha = sn/(2*Q)*10 >> 1; /* s1.30 */
+    const long A_plus_1 = (A >> 4) + unity;
+    const long A_minus_1 = (A >> 4) - unity;
+    const long A_plus_1_cos = FRACMUL(A_plus_1, cs);
+    const long A_minus_1_cos = FRACMUL(A_minus_1, cs);
+    const long beta = 2*FRACMUL(root_A, alpha);
+
+    /* Unnormalised coefficients, all s6.25 */
+    const int32_t b0 =
+        FRACMUL_SHL(A, A_plus_1 + A_minus_1_cos + beta, 2); /* [0.1 .. 40] */
+    const int32_t b1 =
+        -FRACMUL_SHL(A, A_minus_1 + A_plus_1_cos, 3);       /* [-63.5 .. 16] */
+    const int32_t b2 =
+        FRACMUL_SHL(A, A_plus_1 + A_minus_1_cos - beta, 2); /* [0 .. 32] */
+    const int32_t a0 = A_plus_1 - A_minus_1_cos + beta;     /* [0.5 .. 10] */
+    const int32_t a1 = 2*(A_minus_1 - A_plus_1_cos);        /* [-4 .. 16] */
+    const int32_t a2 = A_plus_1 - A_minus_1_cos - beta;     /* [0 .. 8] */
+
+    /* 1/a0 lies in [0.1 .. 1.99] */
+    const long inv_a0 = fp_div(1, a0, 55);   /* s1.30 */
+
+    c[0] = FRACMUL_SHL(b0, inv_a0, 2);       /* [0 .. 16] */
+    c[1] = FRACMUL_SHL(b1, inv_a0, 2);       /* [-31.7 .. 2] */
+    c[2] = FRACMUL_SHL(b2, inv_a0, 2);       /* [0 .. 16] */
+    c[3] = FRACMUL_SHL(-a1, inv_a0, 2);      /* [-2 .. 2] */
+    c[4] = FRACMUL_SHL(-a2, inv_a0, 2);      /* [0 .. 1] */
+}
+
+/* We realise the filters as a second order direct form 1 structure. Direct
+ * form 1 was chosen because of better numerical properties for fixed point
+ * implementations.
+ */
+
+#if (!defined(CPU_COLDFIRE) && !defined(CPU_ARM))
+/**
+ * Generic C implementation of the second order direct form 1 filter; it is
+ * compiled only when neither CPU_COLDFIRE nor CPU_ARM is defined.
+ * Samples are filtered in place.
+ * @param x array of per-channel sample buffers; x[c][0..num-1] is read and
+ * overwritten with the filtered output.
+ * @param f filter coefficients and per-channel history. The history is
+ * updated so that consecutive calls continue the same stream.
+ * @param num number of samples per channel.
+ * @param channels number of channels to process; indexes f->history, so it
+ * must not exceed the first dimension of that array.
+ * @param shift left shift applied to the 64-bit accumulator before its upper
+ * 32 bits are taken as the output sample.
+ */
+void eq_filter(int32_t **x, struct eqfilter *f, unsigned num,
+               unsigned channels, unsigned shift)
+{
+    unsigned c, i;
+    long long acc;
+
+    /* Direct form 1 filtering code.
+       y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
+       where y[] is output and x[] is input.
+     */
+
+    for (c = 0; c < channels; c++) {
+        /* hist[0], hist[1] = x[i-1], x[i-2]; hist[2], hist[3] = y[i-1], y[i-2] */
+        int32_t *hist = f->history[c];
+
+        for (i = 0; i < num; i++) {
+            const int32_t in = x[c][i];
+
+            acc  = (long long) in * f->coefs[0];
+            acc += (long long) hist[0] * f->coefs[1];
+            acc += (long long) hist[1] * f->coefs[2];
+            acc += (long long) hist[2] * f->coefs[3];
+            acc += (long long) hist[3] * f->coefs[4];
+            hist[1] = hist[0];
+            hist[0] = in;
+            hist[3] = hist[2];
+            /* acc is frequently negative and left-shifting a negative signed
+               value is undefined behaviour in C, so shift the unsigned bit
+               pattern instead. The same 32 bits of the accumulator are
+               selected as with a signed shift. */
+            x[c][i] = (int32_t)(uint32_t)(((uint64_t)acc << shift) >> 32);
+            hist[2] = x[c][i];
+        }
+    }
+}
+#endif
+
--- a/lib/rbcodec/dsp/eq.h
+++ b/lib/rbcodec/dsp/eq.h
@ -0,0 +1,50 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006-2007 Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#ifndef _EQ_H
+#define _EQ_H
+
+#include <inttypes.h>
+#include <stdbool.h>
+
+/* These depend on the fixed point formats used by the different filter types
+   and need to be changed when they change.
+ */
+#define FILTER_BISHELF_SHIFT 5
+#define EQ_PEAK_SHIFT 4
+#define EQ_SHELF_SHIFT 6
+
+struct eqfilter {            /* state of one second order filter section */
+    int32_t coefs[5];        /* Order is b0, b1, b2, a1, a2 */
+    int32_t history[2][4];   /* per channel: x[i-1], x[i-2], y[i-1], y[i-2] */
+};
+
+void filter_shelf_coefs(unsigned long cutoff, long A, bool low, int32_t *c);
+void filter_bishelf_coefs(unsigned long cutoff_low, unsigned long cutoff_high,
+                          long A_low, long A_high, long A, int32_t *c);
+void eq_pk_coefs(unsigned long cutoff, unsigned long Q, long db, int32_t *c);
+void eq_ls_coefs(unsigned long cutoff, unsigned long Q, long db, int32_t *c);
+void eq_hs_coefs(unsigned long cutoff, unsigned long Q, long db, int32_t *c);
+void eq_filter(int32_t **x, struct eqfilter *f, unsigned num,
+               unsigned channels, unsigned shift);
+
+#endif
+
--- a/lib/rbcodec/dsp/eq_arm.S
+++ b/lib/rbcodec/dsp/eq_arm.S
@ -0,0 +1,89 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006-2007 Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+
+/* uncomment this to make filtering calculate lower bits after shifting.
+ * without this, "shift" of the lower bits will be lost here.
+ */
+/* #define HIGH_PRECISION */
+
+/*
+ * void eq_filter(int32_t **x, struct eqfilter *f, unsigned num,
+ *                unsigned channels, unsigned shift)
+ */
+#if CONFIG_CPU == PP5002
+    .section    .icode,"ax",%progbits
+#else
+    .text
+#endif
+    .global eq_filter
+eq_filter:
+    ldr r12, [sp]             @ get shift parameter
+    stmdb sp!, { r0-r11, lr } @ save all params and clobbered regs 
+    ldmia r1!, { r4-r8 }      @ load coefs
+    mov r10, r1               @ loop prelude expects filter struct addr in r10
+    /* The ldmia writeback advanced r1 past the five coefs, so r10 now holds
+     * the address of the history words that follow them in the struct.
+     * The stmdb left the argument registers at the bottom of the frame:
+     * [sp] = x, [sp, #4] = f (slot reused below for the history pointer),
+     * [sp, #8] = num, [sp, #12] = channels.
+     */
+
+.filterloop:
+    ldr r9, [sp]            @ get pointer to this channels data
+    add r0, r9, #4
+    str r0, [sp]            @ save back pointer to next channels data
+    ldr r9, [r9]            @ r9 = x[]
+    ldr r14, [sp, #8]       @ r14 = numsamples
+    ldmia r10, { r0-r3 }    @ load history, r10 should be filter struct addr
+    str r10, [sp, #4]       @ save it for loop end
+
+    /* r0-r3 = history, r4-r8 = coefs, r9 = x[], r10..r11 = accumulator,
+     * r12 = shift amount, r14 = number of samples.
+     */
+.loop:
+    /* Direct form 1 filtering code.
+     * y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
+     * where y[] is output and x[] is input. This is performed out of order to
+     * reuse registers, we're pretty short on regs.
+     */
+    smull r10, r11, r6, r1     @ acc = b2*x[i - 2]
+    mov r1, r0                 @ fix input history
+    smlal r10, r11, r5, r0     @ acc += b1*x[i - 1]
+    ldr r0, [r9]               @ load input and fix history in same operation
+    smlal r10, r11, r7, r2     @ acc += a1*y[i - 1]
+    smlal r10, r11, r8, r3     @ acc += a2*y[i - 2]
+    smlal r10, r11, r4, r0     @ acc += b0*x[i] /* avoid stall on arm9*/
+    mov r3, r2                 @ fix output history
+    mov r2, r11, asl r12       @ get upper part of result and shift left
+#ifdef HIGH_PRECISION
+    rsb r11, r12, #32          @ get shift amount for lower part
+    orr r2, r2, r10, lsr r11   @ then mix in correctly shifted lower part
+#endif
+    str r2, [r9], #4           @ save result
+    subs r14, r14, #1          @ are we done with this channel?
+    bne .loop
+
+    /* r10 was clobbered as accumulator; reload the history pointer. The
+     * stmia writeback then leaves r10 at the next channel's history.
+     */
+    ldr r10, [sp, #4]          @ load filter struct pointer
+    stmia r10!, { r0-r3 }      @ save back history
+    ldr r11, [sp, #12]         @ load number of channels
+    subs r11, r11, #1          @ all channels processed?
+    strne r11, [sp, #12]
+    bne .filterloop
+
+    /* Skip the four saved argument slots (r0-r3). ldmpc is a macro defined
+     * outside this file - presumably it pops r4-r11 and returns; confirm.
+     */
+    add sp, sp, #16            @ compensate for temp storage
+    ldmpc regs=r4-r11
+
--- a/lib/rbcodec/dsp/eq_cf.S
+++ b/lib/rbcodec/dsp/eq_cf.S
@ -0,0 +1,91 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006-2007 Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/* uncomment this to make filtering calculate lower bits after shifting.
+ * without this, "shift" - 1 of the lower bits will be lost here.
+ */
+/* #define HIGH_PRECISION */
+
+/*
+ * void eq_filter(int32_t **x, struct eqfilter *f, unsigned num,
+ *                unsigned channels, unsigned shift)
+ */
+    .text
+    .global eq_filter
+eq_filter:
+    lea.l (-11*4, %sp), %sp 
+    movem.l %d2-%d7/%a2-%a6, (%sp)    | save clobbered regs
+    move.l (11*4+8, %sp), %a5         | fetch filter structure address
+    move.l (11*4+20, %sp), %d7        | load shift count
+    subq.l #1, %d7                    | EMAC gives us one free shift
+#ifdef HIGH_PRECISION
+    moveq.l #8, %d6
+    sub.l %d7, %d6                    | shift for lower part of accumulator
+#endif
+    movem.l (%a5), %a0-%a4            | load coefs
+    lea.l (5*4, %a5), %a5             | point to filter history
+
+    /* The arguments sit above the 11 saved registers: x at 11*4+4, f at
+     * 11*4+8, num at 11*4+12, channels at 11*4+16 and shift at 11*4+20.
+     * The x and channels slots are modified in place as loop state.
+     */
+.filterloop:
+    move.l (11*4+4, %sp), %a6         | load input channel pointer
+    addq.l #4, (11*4+4, %sp)          | point x to next channel
+    move.l (%a6), %a6
+    move.l (11*4+12, %sp), %d5        | number of samples
+    movem.l (%a5), %d0-%d3            | load filter history
+
+    /* d0-d3 = history, d4 = temp, d5 = sample count, d6 = lower shift amount,
+     * d7 = upper shift amount, a0-a4 = coefs, a5 = history pointer, a6 = x[]
+     */
+.loop:
+    /* Direct form 1 filtering code. We assume DSP has put EMAC in frac mode.
+     * y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2],
+     * where y[] is output and x[] is input. This is performed out of order
+     * to do parallel load of input value.
+     */
+    mac.l %a2, %d1, %acc0               | acc = b2*x[i - 2]
+    move.l %d0, %d1                     | fix input history
+    mac.l %a1, %d0, (%a6), %d0, %acc0   | acc += b1*x[i - 1], x[i] -> d0
+    mac.l %a0, %d0, %acc0               | acc += b0*x[i]
+    mac.l %a3, %d2, %acc0               | acc += a1*y[i - 1]
+    mac.l %a4, %d3, %acc0               | acc += a2*y[i - 2]
+    move.l %d2, %d3                     | fix output history
+#ifdef HIGH_PRECISION
+    move.l %accext01, %d2               | fetch lower part of accumulator
+    move.b %d2, %d4                     | clear upper three bytes
+    lsr.l %d6, %d4                      | shift lower bits
+#endif
+    movclr.l %acc0, %d2                 | fetch upper part of result
+    asl.l %d7, %d2                      | restore fixed point format
+#ifdef HIGH_PRECISION
+    /* NOTE(review): the or.l below leaves the combined value in d4, yet d2
+     * is what gets stored and kept as y[i - 1], so the lower bits appear to
+     * be discarded. d4 is also only written with move.b above - confirm its
+     * upper bytes are really clear. Verify before enabling HIGH_PRECISION.
+     */
+    or.l %d2, %d4                       | combine lower and upper parts
+#endif
+    move.l %d2, (%a6)+                  | save result
+    subq.l #1, %d5                      | are we done with this channel?
+    jne .loop
+    
+    movem.l %d0-%d3, (%a5)              | save history back to struct
+    lea.l (4*4, %a5), %a5               | point to next channel's history
+    subq.l #1, (11*4+16, %sp)           | count down the remaining channels
+    jne .filterloop
+
+    movem.l (%sp), %d2-%d7/%a2-%a6
+    lea.l (11*4, %sp), %sp
+    rts
+
--- a/lib/rbcodec/dsp/eqs/Acoustic.cfg
+++ b/lib/rbcodec/dsp/eqs/Acoustic.cfg
@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 45
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 45
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 10
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 15
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 30
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 20
--- a/lib/rbcodec/dsp/eqs/Bass.cfg
+++ b/lib/rbcodec/dsp/eqs/Bass.cfg
@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 50
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 50
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 35
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 15
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 5
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: -5
--- a/lib/rbcodec/dsp/eqs/Classical.cfg
+++ b/lib/rbcodec/dsp/eqs/Classical.cfg
@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 50
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 50
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 40
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: -20
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 10
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 20
--- a/lib/rbcodec/dsp/eqs/Default.cfg
+++ b/lib/rbcodec/dsp/eqs/Default.cfg
@ -0,0 +1,17 @@
+eq enabled: off
+eq precut: 0
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 0
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 0
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 0
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 0
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 0
--- a/lib/rbcodec/dsp/eqs/Disco.cfg
+++ b/lib/rbcodec/dsp/eqs/Disco.cfg
@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 45
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 30
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 10
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 45
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 25
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 10
--- a/lib/rbcodec/dsp/eqs/Electronic.cfg
+++ b/lib/rbcodec/dsp/eqs/Electronic.cfg
@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 55
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 45
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 5
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 25
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 15
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 55
--- a/lib/rbcodec/dsp/eqs/Hip-Hop.cfg
+++ b/lib/rbcodec/dsp/eqs/Hip-Hop.cfg
@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 65
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 65
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 25
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: -10
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 15
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 35
--- a/lib/rbcodec/dsp/eqs/Jazz.cfg
+++ b/lib/rbcodec/dsp/eqs/Jazz.cfg
@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 60
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 40
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 15
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: -25
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 5
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 60
--- a/lib/rbcodec/dsp/eqs/Lounge.cfg
+++ b/lib/rbcodec/dsp/eqs/Lounge.cfg
@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 20
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: -25
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 5
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 20
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: -15
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 15
--- a/lib/rbcodec/dsp/eqs/Pop.cfg
+++ b/lib/rbcodec/dsp/eqs/Pop.cfg
@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 50
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: -10
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 5
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 50
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 15
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: -10
--- a/lib/rbcodec/dsp/eqs/R&B.cfg
+++ b/lib/rbcodec/dsp/eqs/R&B.cfg
@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 45
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 35
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 45
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 5
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 25
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 30
--- a/lib/rbcodec/dsp/eqs/Rock.cfg
+++ b/lib/rbcodec/dsp/eqs/Rock.cfg
@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 45
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: 25
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 10
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 0
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 20
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 45
--- a/lib/rbcodec/dsp/eqs/Vocal.cfg
+++ b/lib/rbcodec/dsp/eqs/Vocal.cfg
@ -0,0 +1,17 @@
+eq enabled: on
+eq precut: 45
+eq band 0 cutoff: 60
+eq band 0 q: 7
+eq band 0 gain: -45
+eq band 1 cutoff: 200
+eq band 1 q: 10
+eq band 1 gain: 5
+eq band 2 cutoff: 800
+eq band 2 q: 10
+eq band 2 gain: 45
+eq band 3 cutoff: 4000
+eq band 3 q: 10
+eq band 3 gain: 20
+eq band 4 cutoff: 12000
+eq band 4 q: 7
+eq band 4 gain: 0
--- a/lib/rbcodec/dsp/tdspeed.c
+++ b/lib/rbcodec/dsp/tdspeed.c
@ -0,0 +1,450 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by Nicolas Pitre <nico@cam.org>
+ * Copyright (C) 2006-2007 by Stéphane Doyon <s.doyon@videotron.ca>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include "sound.h"
+#include "core_alloc.h"
+#include "system.h"
+#include "tdspeed.h"
+#include "settings.h"
+
+#define assert(cond)
+
+#define MIN_RATE 8000
+#define MAX_RATE 48000 /* double buffer for double rate */
+#define MINFREQ 100
+
+#define FIXED_BUFSIZE 3072 /* 48KHz factor 3.0 */
+
+static int32_t** dsp_src;
+static int handles[4];
+static int32_t *overlap_buffer[2] = { NULL, NULL };
+static int32_t *outbuf[2] = { NULL, NULL };
+
+/* Buflib move callback shared by the two output buffers.
+ *
+ * current - address the buffer is being moved from
+ * new     - address the buffer is being moved to
+ *
+ * Always returns BUFLIB_CB_OK.
+ */
+static int move_callback(int handle, void* current, void* new)
+{
+    (void)handle;
+
+    /* both output buffers use this callback; tell them apart by address */
+    int ch = (current == outbuf[0]) ? 0 : 1;
+
+    /* outbuf[] has to follow the buffer even while dsp_src is still NULL
+       (i.e. before the first tdspeed_doit()), otherwise it is left dangling */
+    outbuf[ch] = new;
+
+    /* NOTE(review): assumes the array recorded by tdspeed_doit() still
+       points at outbuf[] when the move happens - confirm with the caller */
+    if (dsp_src)
+        dsp_src[ch] = new;
+
+    return BUFLIB_CB_OK;
+}
+
+/* callbacks registered for the output buffers in tdspeed_init() */
+static struct buflib_callbacks ops = {
+    .move_callback = move_callback,
+    .shrink_callback = NULL,
+};
+
+/* Buflib move callback shared by the two overlap buffers.
+ *
+ * current - address the buffer is being moved from
+ * new     - address the buffer is being moved to
+ *
+ * Always returns BUFLIB_CB_OK.
+ */
+static int ovl_move_callback(int handle, void* current, void* new)
+{
+    (void)handle;
+
+    /* both overlap buffers use this callback; tell them apart by address */
+    int ch = (current == overlap_buffer[0]) ? 0 : 1;
+
+    /* Track the new address unconditionally; whether tdspeed_doit() has
+       already run (dsp_src != NULL) says nothing about the overlap buffers */
+    overlap_buffer[ch] = new;
+
+    /* NOTE(review): the ovl_buff[] copies taken by tdspeed_config() are not
+       refreshed here and go stale after a move - needs a follow-up */
+    return BUFLIB_CB_OK;
+}
+
+/* callbacks registered for the overlap buffers in tdspeed_init() */
+static struct buflib_callbacks ovl_ops = {
+    .move_callback = ovl_move_callback,
+    .shrink_callback = NULL,
+};
+
+
+static struct tdspeed_state_s   /* set up by tdspeed_config(), used by tdspeed_apply() */
+{
+    bool stereo;            /* stereo flag as passed to tdspeed_config() */
+    int32_t shift_max;      /* maximum displacement on a frame */
+    int32_t src_step;       /* source window pace */
+    int32_t dst_step;       /* destination window pace, 1 << dst_order */
+    int32_t dst_order;      /* power of two for dst_step */
+    int32_t ovl_shift;      /* overlap buffer frame shift */
+    int32_t ovl_size;       /* overlap buffer used size */
+    int32_t ovl_space;      /* overlap buffer size, capped at FIXED_BUFSIZE */
+    int32_t *ovl_buff[2];   /* overlap buffer; both entries alias when mono */
+} tdspeed_state;
+
+/* Allocate one buffer of count samples and record its handle in
+ * handles[idx]. Returns the data pointer, or NULL when the allocation
+ * failed so that tdspeed_config() can detect the missing buffer. */
+static int32_t *tdspeed_alloc_buf(unsigned idx, const char *name,
+                                  size_t count, struct buflib_callbacks *cb)
+{
+    handles[idx] = core_alloc_ex(name, count * sizeof(int32_t), cb);
+
+    /* only positive handles are treated as valid, as in tdspeed_finish() */
+    if (handles[idx] <= 0)
+        return NULL;
+
+    return core_get_data(handles[idx]);
+}
+
+/* Allocate whichever of the overlap and output buffers do not exist yet.
+ * Does nothing unless timestretch is enabled in the global settings. A
+ * buffer whose allocation fails is left NULL. */
+void tdspeed_init(void)
+{
+    if (!global_settings.timestretch_enabled)
+        return;
+
+    /* Allocate buffers */
+    if (overlap_buffer[0] == NULL)
+        overlap_buffer[0] = tdspeed_alloc_buf(0, "tdspeed ovl left",
+                                              FIXED_BUFSIZE, &ovl_ops);
+
+    if (overlap_buffer[1] == NULL)
+        overlap_buffer[1] = tdspeed_alloc_buf(1, "tdspeed ovl right",
+                                              FIXED_BUFSIZE, &ovl_ops);
+
+    if (outbuf[0] == NULL)
+        outbuf[0] = tdspeed_alloc_buf(2, "tdspeed left",
+                                      TDSPEED_OUTBUFSIZE, &ops);
+
+    if (outbuf[1] == NULL)
+        outbuf[1] = tdspeed_alloc_buf(3, "tdspeed right",
+                                      TDSPEED_OUTBUFSIZE, &ops);
+}
+
+/* Release every buffer that is currently allocated and forget the cached
+ * data pointers. */
+void tdspeed_finish(void)
+{
+    unsigned idx;
+
+    for (idx = 0; idx < ARRAYLEN(handles); idx++)
+    {
+        if (handles[idx] <= 0)
+            continue;   /* nothing allocated in this slot */
+
+        core_free(handles[idx]);
+        handles[idx] = 0;
+    }
+
+    overlap_buffer[0] = NULL;
+    overlap_buffer[1] = NULL;
+    outbuf[0] = NULL;
+    outbuf[1] = NULL;
+}
+
+/* Set up the stretch state for the given sample rate and speed factor.
+ *
+ * Returns false, leaving the state untouched, when a buffer is missing,
+ * when factor equals PITCH_SPEED_100, or when samplerate or factor lie
+ * outside [MIN_RATE, MAX_RATE] / [STRETCH_MIN, STRETCH_MAX]. Returns true
+ * once the state has been initialised.
+ */
+bool tdspeed_config(int samplerate, bool stereo, int32_t factor)
+{
+    struct tdspeed_state_s *st = &tdspeed_state;
+
+    /* all four buffers must exist */
+    if (!overlap_buffer[0] || !overlap_buffer[1] || !outbuf[0] || !outbuf[1])
+        return false;
+
+    /* nothing to do at 100%, and everything else must be within range */
+    if (factor == PITCH_SPEED_100 ||
+        samplerate < MIN_RATE || samplerate > MAX_RATE ||
+        factor < STRETCH_MIN || factor > STRETCH_MAX)
+        return false;
+
+    st->stereo = stereo;
+
+    /* nominal destination window, made shorter when speeding up */
+    int32_t step = samplerate / MINFREQ;
+
+    if (factor > PITCH_SPEED_100)
+        step = step * PITCH_SPEED_100 / factor;
+
+    /* smallest power of two that is strictly greater than step */
+    int32_t order = 1;
+
+    for (step >>= 1; step != 0; step >>= 1)
+        order++;
+
+    st->dst_order = order;
+    st->dst_step = (1 << st->dst_order);
+    st->src_step = st->dst_step * factor / PITCH_SPEED_100;
+
+    if (st->dst_step > st->src_step)
+        st->shift_max = st->dst_step;
+    else
+        st->shift_max = st->src_step;
+
+    /* same frame size computation as in tdspeed_apply() */
+    int src_frame_sz = st->shift_max + st->dst_step;
+
+    if (st->dst_step > st->src_step)
+        src_frame_sz += st->dst_step - st->src_step;
+
+    st->ovl_space = ((src_frame_sz - 2) / st->src_step) * st->src_step
+                        + src_frame_sz;
+
+    if (st->src_step > st->dst_step)
+        st->ovl_space += 2*st->src_step - st->dst_step;
+
+    if (st->ovl_space > FIXED_BUFSIZE)
+        st->ovl_space = FIXED_BUFSIZE;
+
+    /* start with an empty overlap buffer */
+    st->ovl_size = 0;
+    st->ovl_shift = 0;
+
+    /* mono makes both entries point at the first overlap buffer */
+    st->ovl_buff[0] = overlap_buffer[0];
+    st->ovl_buff[1] = stereo ? overlap_buffer[1] : st->ovl_buff[0];
+
+    return true;
+}
+
+/* Time-stretch data_len samples per channel from buf_in into buf_out.
+ *
+ * buf_out  - per-channel output pointers
+ * buf_in   - per-channel input pointers; mono is recognised by both entries
+ *            being the same pointer
+ * data_len - input length in samples
+ * last     - 0: more input follows, unprocessed data is kept in the overlap
+ *               buffer for the next call
+ *            -1: internal call on the overlap buffer, only the frame shift
+ *               is remembered
+ *            other: final call, the remaining data is copied to the output
+ * out_size - output capacity in samples; only referenced by the asserts,
+ *            which are compiled out in this file
+ *
+ * Returns the number of samples written per channel.
+ */
+static int tdspeed_apply(int32_t *buf_out[2], int32_t *buf_in[2],
+                         int data_len, int last, int out_size)
+{
+    struct tdspeed_state_s *st = &tdspeed_state;
+    int32_t *dest[2];
+    int32_t next_frame, prev_frame, src_frame_sz;
+    bool stereo = buf_in[0] != buf_in[1];
+
+    assert(stereo == st->stereo);
+
+    src_frame_sz = st->shift_max + st->dst_step;
+
+    if (st->dst_step > st->src_step)
+        src_frame_sz += st->dst_step - st->src_step;
+
+    /* deal with overlap data first, if any */
+    if (st->ovl_size)
+    {
+        int32_t have, copy, steps;
+        have = st->ovl_size;
+
+        if (st->ovl_shift > 0)
+            have -= st->ovl_shift;
+
+        /* append just enough data to have all of the overlap buffer consumed */
+        steps = (have - 1) / st->src_step;
+        copy = steps * st->src_step + src_frame_sz - have;
+
+        if (copy < src_frame_sz - st->dst_step)
+            copy += st->src_step;  /* one more step to allow for pregap data */
+
+        if (copy > data_len)
+            copy = data_len;
+
+        assert(st->ovl_size + copy <= FIXED_BUFSIZE);
+        memcpy(st->ovl_buff[0] + st->ovl_size, buf_in[0],
+               copy * sizeof(int32_t));
+
+        if (stereo)
+            memcpy(st->ovl_buff[1] + st->ovl_size, buf_in[1],
+                   copy * sizeof(int32_t));
+
+        if (!last && have + copy < src_frame_sz)
+        {
+            /* still not enough to process at least one frame */
+            st->ovl_size += copy;
+            return 0;
+        }
+
+        /* recursively call ourselves to process the overlap buffer */
+        have = st->ovl_size;
+        st->ovl_size = 0;
+
+        if (copy == data_len)
+        {
+            /* the whole input fitted into the overlap buffer, so the
+               recursive call finishes the job with the caller's 'last' */
+            assert(have + copy <= FIXED_BUFSIZE);
+            return tdspeed_apply(buf_out, st->ovl_buff, have+copy, last,
+                               out_size);
+        }
+
+        assert(have + copy <= FIXED_BUFSIZE);
+        int i = tdspeed_apply(buf_out, st->ovl_buff, have+copy, -1, out_size);
+
+        dest[0] = buf_out[0] + i;
+        dest[1] = buf_out[1] + i;
+
+        /* readjust pointers to account for data already consumed */
+        next_frame = copy - src_frame_sz + st->src_step;
+        prev_frame = next_frame - st->ovl_shift;
+    }
+    else
+    {
+        dest[0] = buf_out[0];
+        dest[1] = buf_out[1];
+
+        next_frame = prev_frame = 0;
+
+        if (st->ovl_shift > 0)
+            next_frame += st->ovl_shift;
+        else
+            prev_frame += -st->ovl_shift;
+    }
+
+    st->ovl_shift = 0;
+
+    /* process all complete frames */
+    while (data_len - next_frame >= src_frame_sz)
+    {
+        /* find the frame overlap with the smallest sum of absolute
+           differences, sampling every INC2-th value at every INC1-th shift */
+        int const INC1 = 8;
+        int const INC2 = 32;
+
+        int64_t min_delta = INT64_MAX;  /* most positive */
+        int shift = 0;
+
+        /* Power of 2 of a 28bit number requires 56bits, can accumulate
+           256times in a 64bit variable. */
+        assert(st->dst_step / INC2 <= 256);
+        assert(next_frame + st->shift_max - 1 + st->dst_step - 1 < data_len);
+        assert(prev_frame + st->dst_step - 1 < data_len);
+
+        for (int i = 0; i < st->shift_max; i += INC1)
+        {
+            int64_t delta = 0;
+
+            int32_t *curr = buf_in[0] + next_frame + i;
+            int32_t *prev = buf_in[0] + prev_frame;
+
+            for (int j = 0; j < st->dst_step; j += INC2, curr += INC2, prev += INC2)
+            {
+                int32_t diff = *curr - *prev;
+                delta += abs(diff);
+
+                /* already no better than the best candidate: give up early */
+                if (delta >= min_delta)
+                    goto skip;
+            }
+
+            if (stereo)
+            {
+                curr = buf_in[1] + next_frame + i;
+                prev = buf_in[1] + prev_frame;
+
+                for (int j = 0; j < st->dst_step; j += INC2, curr += INC2, prev += INC2)
+                {
+                    int32_t diff = *curr - *prev;
+                    delta += abs(diff);
+
+                    if (delta >= min_delta)
+                        goto skip;
+                }
+            }
+
+            min_delta = delta;
+            shift = i;
+skip:;
+        }
+
+        /* overlap fading-out previous frame with fading-in current frame */
+        int32_t *curr = buf_in[0] + next_frame + shift;
+        int32_t *prev = buf_in[0] + prev_frame;
+
+        int32_t *d = dest[0];
+
+        assert(next_frame + shift + st->dst_step - 1 < data_len);
+        assert(prev_frame + st->dst_step - 1 < data_len);
+        assert(dest[0] - buf_out[0] + st->dst_step - 1 < out_size);
+
+        /* linear crossfade: the weights i and j always sum to dst_step,
+           which is 1 << dst_order */
+        for (int i = 0, j = st->dst_step; j; i++, j--)
+        {
+            *d++ = (*curr++ * (int64_t)i +
+                    *prev++ * (int64_t)j) >> st->dst_order;
+        }
+
+        dest[0] = d;
+
+        if (stereo)
+        {
+            curr = buf_in[1] + next_frame + shift;
+            prev = buf_in[1] + prev_frame;
+
+            d = dest[1];
+
+            for (int i = 0, j = st->dst_step; j; i++, j--)
+            {
+                assert(d < buf_out[1] + out_size);
+
+                *d++ = (*curr++ * (int64_t)i +
+                        *prev++ * (int64_t)j) >> st->dst_order;
+            }
+
+            dest[1] = d;
+        }
+
+        /* adjust pointers for next frame */
+        prev_frame = next_frame + shift + st->dst_step;
+        next_frame += st->src_step;
+
+        /* here next_frame - prev_frame = src_step - dst_step - shift */
+        assert(next_frame - prev_frame == st->src_step - st->dst_step - shift);
+    }
+
+    /* now deal with remaining partial frames */
+    if (last == -1)
+    {
+        /* special overlap buffer processing: remember frame shift only */
+        st->ovl_shift = next_frame - prev_frame;
+    }
+    else if (last != 0)
+    {
+        /* last call: purge all remaining data to output buffer */
+        int i = data_len - prev_frame;
+
+        assert(dest[0] + i <= buf_out[0] + out_size);
+        memcpy(dest[0], buf_in[0] + prev_frame, i * sizeof(int32_t));
+
+        dest[0] += i;
+
+        if (stereo)
+        {
+            assert(dest[1] + i <= buf_out[1] + out_size);
+            memcpy(dest[1], buf_in[1] + prev_frame, i * sizeof(int32_t));
+            dest[1] += i;
+        }
+    }
+    else
+    {
+        /* preserve remaining data + needed overlap data for next call */
+        st->ovl_shift = next_frame - prev_frame;
+        int i = (st->ovl_shift < 0) ? next_frame : prev_frame;
+        st->ovl_size = data_len - i;
+
+        /* buf_in is the overlap buffer itself when we got here through the
+           recursive call above, so source and destination may overlap:
+           memmove is required, memcpy would be undefined */
+        assert(st->ovl_size <= FIXED_BUFSIZE);
+        memmove(st->ovl_buff[0], buf_in[0] + i, st->ovl_size * sizeof(int32_t));
+
+        if (stereo)
+            memmove(st->ovl_buff[1], buf_in[1] + i, st->ovl_size * sizeof(int32_t));
+    }
+
+    return dest[0] - buf_out[0];
+}
+
+/* Upper bound on the output of one tdspeed_doit() call, in samples:
+ * the size of the output buffers. */
+long tdspeed_est_output_size(void)
+{
+    return TDSPEED_OUTBUFSIZE;
+}
+
+/* Scale an output size by src_step / dst_step after subtracting the samples
+ * already held in the overlap buffer. Never returns a negative value. */
+long tdspeed_est_input_size(long size)
+{
+    const struct tdspeed_state_s *st = &tdspeed_state;
+    long wanted = (size - st->ovl_size) * st->src_step / st->dst_step;
+
+    return (wanted < 0) ? 0 : wanted;
+}
+
+/* Stretch count samples from src[] into the output buffers and redirect
+ * src[0] and src[1] to them. Returns the number of samples produced. */
+int tdspeed_doit(int32_t *src[], int count)
+{
+    int32_t *dst[2] = { outbuf[0], outbuf[1] };
+
+    /* remembered so move_callback() can patch the caller's pointers */
+    dsp_src = src;
+
+    int produced = tdspeed_apply(dst, src, count, 0, TDSPEED_OUTBUFSIZE);
+
+    src[0] = outbuf[0];
+    src[1] = outbuf[1];
+
+    return produced;
+}
+
--- a/lib/rbcodec/dsp/tdspeed.h
+++ b/lib/rbcodec/dsp/tdspeed.h
@ -0,0 +1,49 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by Nicolas Pitre <nico@cam.org>
+ * Copyright (C) 2006-2007 by Stéphane Doyon <s.doyon@videotron.ca>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#ifndef _TDSPEED_H
+#define _TDSPEED_H
+
+#include "dsp.h"
+
+#define TDSPEED_OUTBUFSIZE 4096
+
+/* some #define functions to get the pitch, stretch and speed values based on */
+/* two known values.  Remember that params are alphabetical.                  */
+/* Each result is rounded to nearest. Arguments are parenthesized so that     */
+/* expressions such as GET_SPEED(a + b, c) expand correctly.                  */
+#define GET_SPEED(pitch, stretch) \
+    (((pitch) * (stretch) + PITCH_SPEED_100 / 2L) / PITCH_SPEED_100)
+#define GET_PITCH(speed, stretch) \
+    (((speed) * PITCH_SPEED_100 + (stretch) / 2L) / (stretch))
+#define GET_STRETCH(pitch, speed) \
+    (((speed) * PITCH_SPEED_100 + (pitch)   / 2L) / (pitch))
+
+void tdspeed_init(void);
+void tdspeed_finish(void);
+bool tdspeed_config(int samplerate, bool stereo, int32_t factor);
+long tdspeed_est_output_size(void);
+long tdspeed_est_input_size(long size);
+int tdspeed_doit(int32_t *src[], int count);
+
+#define STRETCH_MAX (250L * PITCH_SPEED_PRECISION) /* 250% */
+#define STRETCH_MIN (35L  * PITCH_SPEED_PRECISION) /* 35%  */
+
+#endif