Add codecs to librbcodec.

Change-Id: Id7f4717d51ed02d67cb9f9cb3c0ada4a81843f97 Reviewed-on: http://gerrit.rockbox.org/137 Reviewed-by: Nils Wallménius <nils@rockbox.org> Tested-by: Nils Wallménius <nils@rockbox.org>
2025-11-21 11:02:45 -05:00 · 2011-06-25 21:32:25 -04:00 · 2011-06-25 21:32:25 -04:00 · f40bfc9267
commit f40bfc9267
parent a0009907de
757 changed files with 122 additions and 122 deletions
--- a/lib/rbcodec/codecs/libwma/SOURCES
+++ b/lib/rbcodec/codecs/libwma/SOURCES
@ -0,0 +1,3 @@
+wmadeci.c
+wmafixed.c
+../lib/ffmpeg_bitstream.c
--- a/lib/rbcodec/codecs/libwma/libwma.make
+++ b/lib/rbcodec/codecs/libwma/libwma.make
@ -0,0 +1,18 @@
+#             __________               __   ___.
+#   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+#   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+#   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+#   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+#                     \/            \/     \/    \/            \/
+# $Id$
+#
+
+# libwma: static archive built from the sources listed in libwma/SOURCES
+WMALIB := $(CODECDIR)/libwma.a
+WMALIB_SRC := $(call preprocess, $(RBCODECLIB_DIR)/codecs/libwma/SOURCES)
+WMALIB_OBJ := $(call c2obj, $(WMALIB_SRC))
+OTHER_SRC += $(WMALIB_SRC)
+
+$(WMALIB): $(WMALIB_OBJ)
+	$(SILENT)$(shell rm -f $@)
+	$(call PRINTS,AR $(@F))$(AR) rcs $@ $^ >/dev/null
--- a/lib/rbcodec/codecs/libwma/types.h
+++ b/lib/rbcodec/codecs/libwma/types.h
@ -0,0 +1,5 @@
+#include <codecs/lib/codeclib.h>
+
+#define fixed32         int32_t
+#define fixed64         int64_t
+
--- a/lib/rbcodec/codecs/libwma/wmadata.h
+++ b/lib/rbcodec/codecs/libwma/wmadata.h
--- a/lib/rbcodec/codecs/libwma/wmadec.h
+++ b/lib/rbcodec/codecs/libwma/wmadec.h
@ -0,0 +1,181 @@
+/*
+ * WMA compatible decoder
+ * Copyright (c) 2002 The FFmpeg Project.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _WMADEC_H
+#define _WMADEC_H
+
+#include <codecs/libasf/asf.h>
+#include "ffmpeg_get_bits.h"
+#include "types.h"
+
+//#define TRACE
+/* size of blocks */
+#define BLOCK_MIN_BITS 7
+#define BLOCK_MAX_BITS 11
+#define BLOCK_MAX_SIZE (1 << BLOCK_MAX_BITS)
+
+#define BLOCK_NB_SIZES (BLOCK_MAX_BITS - BLOCK_MIN_BITS + 1)
+
+/* XXX: find exact max size */
+#define HIGH_BAND_MAX_SIZE 16
+
+#define NB_LSP_COEFS 10
+
+/* XXX: is it a suitable value ? */
+#define MAX_CODED_SUPERFRAME_SIZE 16384
+
+#define M_PI    3.14159265358979323846
+
+#define M_PI_F  0x3243f // in fixed 32 format
+#define TWO_M_PI_F  0x6487f   //in fixed 32
+
+#define MAX_CHANNELS 2
+
+#define NOISE_TAB_SIZE 8192
+
+#define LSP_POW_BITS 7
+
+
+#if (CONFIG_CPU == PP5022) || (CONFIG_CPU == PP5024) || (CONFIG_CPU == MCF5250)
+/* PP5022/24 and MCF5250 have 128KB of IRAM. 80KB are allocated for codecs */
+#define IBSS_ATTR_WMA_LARGE_IRAM IBSS_ATTR
+#define IBSS_ATTR_WMA_XL_IRAM
+#define ICONST_ATTR_WMA_XL_IRAM
+
+#elif defined(CPU_S5L870X)
+/* S5L870x has even more IRAM. Use it. */
+#define IBSS_ATTR_WMA_LARGE_IRAM IBSS_ATTR
+#define IBSS_ATTR_WMA_XL_IRAM    IBSS_ATTR
+#define ICONST_ATTR_WMA_XL_IRAM  ICONST_ATTR
+
+#else
+/* other PP's and MCF5249 have 96KB of IRAM */
+#define IBSS_ATTR_WMA_LARGE_IRAM
+#define IBSS_ATTR_WMA_XL_IRAM
+#define ICONST_ATTR_WMA_XL_IRAM
+
+#endif
+
+
+#define VLCBITS 7       /*7 is the lowest without glitching*/
+#define VLCMAX ((22+VLCBITS-1)/VLCBITS)
+
+#define EXPVLCBITS 7
+#define EXPMAX ((19+EXPVLCBITS-1)/EXPVLCBITS)
+
+#define HGAINVLCBITS 9
+#define HGAINMAX ((13+HGAINVLCBITS-1)/HGAINVLCBITS)
+
+
+typedef struct CoefVLCTable   /* read-only description of one VLC code table */
+{
+    int n;                               /* total number of codes */
+    const uint32_t *huffcodes;           /* VLC bit values (the code words) */
+    const uint8_t *huffbits;             /* VLC bit size (length in bits of each code word) */
+    const uint16_t *levels;              /* table to build run/level tables;
+                                          * NOTE(review): presumably consumed when filling
+                                          * run_table/level_table in WMADecodeContext - confirm */
+}
+CoefVLCTable;
+
+typedef struct WMADecodeContext   /* decoder state handed to the wma_decode_*() functions declared after this struct */
+{
+    GetBitContext gb;    /* bit reader state; type presumably comes from ffmpeg_get_bits.h - confirm */
+
+    int nb_block_sizes;  /* number of block sizes */
+
+    int sample_rate;
+    int nb_channels;
+    int bit_rate;
+    int version; /* 1 = 0x160 (WMAV1), 2 = 0x161 (WMAV2) */
+    int block_align;
+    int use_bit_reservoir;
+    int use_variable_block_len;
+    int use_exp_vlc;  /* exponent coding: 0 = lsp, 1 = vlc + delta */
+    int use_noise_coding; /* true if perceptual noise is added */
+    int byte_offset_bits;
+    VLC exp_vlc;
+    int exponent_sizes[BLOCK_NB_SIZES];           /* one entry per block size */
+    uint16_t exponent_bands[BLOCK_NB_SIZES][25];  /* per block size, room for 25 entries */
+    int high_band_start[BLOCK_NB_SIZES]; /* index of first coef in high band */
+    int coefs_start;               /* first coded coef */
+    int coefs_end[BLOCK_NB_SIZES]; /* max number of coded coefficients */
+    int exponent_high_sizes[BLOCK_NB_SIZES];      /* one entry per block size */
+    int exponent_high_bands[BLOCK_NB_SIZES][HIGH_BAND_MAX_SIZE]; /* per block size, up to HIGH_BAND_MAX_SIZE entries */
+    VLC hgain_vlc;
+
+    /* coded values in high bands */
+    int high_band_coded[MAX_CHANNELS][HIGH_BAND_MAX_SIZE];
+    int high_band_values[MAX_CHANNELS][HIGH_BAND_MAX_SIZE];
+
+    /* there are two possible tables for spectral coefficients */
+    VLC coef_vlc[2];
+    uint16_t *run_table[2];    /* presumably one per coef_vlc table - confirm */
+    uint16_t *level_table[2];  /* presumably one per coef_vlc table - confirm */
+    /* frame info */
+    int frame_len;       /* frame length in samples */
+    int frame_len_bits;  /* frame_len = 1 << frame_len_bits */
+
+    /* block info */
+    int reset_block_lengths;
+    int block_len_bits; /* log2 of current block length */
+    int next_block_len_bits; /* log2 of next block length */
+    int prev_block_len_bits; /* log2 of prev block length */
+    int block_len; /* block length in samples */
+    int block_num; /* block number in current frame */
+    int block_pos; /* current position in frame */
+    uint8_t ms_stereo; /* true if mid/side stereo mode */
+    uint8_t channel_coded[MAX_CHANNELS]; /* true if channel is coded */
+    int exponents_bsize[MAX_CHANNELS];      // log2 ratio frame/exp. length
+    fixed32 exponents[MAX_CHANNELS][BLOCK_MAX_SIZE] MEM_ALIGN_ATTR;
+    fixed32 max_exponent[MAX_CHANNELS];              /* one value per channel */
+    int16_t coefs1[MAX_CHANNELS][BLOCK_MAX_SIZE];    /* 16-bit values, stored inside the struct */
+    fixed32 (*coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];  /* pointer only: the per-channel array itself lives outside this struct */
+    fixed32 *windows[BLOCK_NB_SIZES];                /* one pointer per block size */
+    /* output buffer for one frame and the last for IMDCT windowing */
+    fixed32 (*frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE*2];
+
+    /* last frame info */
+    uint8_t last_superframe[MAX_CODED_SUPERFRAME_SIZE + 4] MEM_ALIGN_ATTR; /* padding added */
+    int last_bitoffset;
+    int last_superframe_len;
+    fixed32 *noise_table;
+    int noise_index;
+    fixed32 noise_mult; /* XXX: suppress that and integrate it in the noise array */
+    /* lsp_to_curve tables */
+    fixed32 lsp_cos_table[BLOCK_MAX_SIZE] MEM_ALIGN_ATTR;
+    void *lsp_pow_m_table1;  /* untyped here; element type is not visible in this header */
+    void *lsp_pow_m_table2;  /* untyped here; element type is not visible in this header */
+
+    /* State of current superframe decoding */
+    int bit_offset;
+    int nb_frames;
+    int current_frame;
+
+#ifdef TRACE
+
+    int frame_count;  /* only present when TRACE is defined */
+#endif
+}
+WMADecodeContext;
+
+int wma_decode_init(WMADecodeContext* s, asf_waveformatex_t *wfx);
+int wma_decode_superframe_init(WMADecodeContext* s,
+                               const uint8_t *buf, int buf_size);
+int wma_decode_superframe_frame(WMADecodeContext* s,
+                                const uint8_t *buf, int buf_size);
+#endif
--- a/lib/rbcodec/codecs/libwma/wmadeci.c
+++ b/lib/rbcodec/codecs/libwma/wmadeci.c
--- a/lib/rbcodec/codecs/libwma/wmafixed.c
+++ b/lib/rbcodec/codecs/libwma/wmafixed.c
@ -0,0 +1,223 @@
+/****************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ *
+ * Copyright (C) 2007 Michael Giacomelli
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "wmadec.h"
+#include "wmafixed.h"
+#include <codecs.h>
+
+/*
+ * Convert an integer to fixed64 (PRECISION64 fractional bits).
+ *
+ * The value is scaled arithmetically instead of copying bytes into a zeroed
+ * 64-bit word: the byte copy produced the same result for x >= 0 but did not
+ * sign-extend, so a negative x turned into a large positive fixed64, and it
+ * needed separate big- and little-endian paths.  A multiplication is used
+ * because left-shifting a negative value is undefined in C.
+ *
+ * x:       integer to convert.
+ * returns: x with PRECISION64 zero fraction bits appended.
+ */
+fixed64 IntTo64(int x){
+    return (fixed64)x * ((fixed64)1 << PRECISION64);
+}
+
+/*
+ * Extract the integer part of a fixed64 value, i.e. the 32 bits directly
+ * above the PRECISION64 fraction bits (bits 16..47).
+ *
+ * Done with an unsigned shift and truncation rather than assembling bytes:
+ * the byte version shifted a promoted int left by 24, which is undefined
+ * whenever the top byte is >= 0x80, and needed an endianness switch.
+ *
+ * x:       fixed64 value.
+ * returns: bits 16..47 of x reinterpreted as a signed int.
+ */
+int IntFrom64(fixed64 x)
+{
+    return (int)(uint32_t)((uint64_t)x >> PRECISION64);
+}
+
+/* Keep only the low 32 bits of a fixed64 value and hand them back as fixed32. */
+fixed32 Fixed32From64(fixed64 x)
+{
+    const fixed64 low_word = x & 0xFFFFFFFF;
+
+    return low_word;
+}
+
+/* Widen a fixed32 to fixed64; the numeric value is carried over unchanged. */
+fixed64 Fixed32To64(fixed32 x)
+{
+    fixed64 widened = x;
+
+    return widened;
+}
+
+/*
+    Not performance sensitive code here
+
+*/
+
+/*
+ * Fixed-point division x / y with PRECISION fraction bits.
+ *
+ * returns: 0 when x is 0 (checked first), 0x7fffffff when y is 0, otherwise
+ *          (x << PRECISION) / y evaluated in 64 bits and narrowed to fixed32.
+ */
+fixed32 fixdiv32(fixed32 x, fixed32 y)
+{
+    fixed64 numerator = x;
+
+    if (!x)
+        return 0;
+    if (!y)
+        return 0x7fffffff;   /* division by zero: largest positive fixed32 */
+
+    numerator <<= PRECISION;
+    return (fixed32)(numerator / y);
+}
+
+/*
+ * Fixed-point division x / y with PRECISION64 fraction bits.
+ *
+ * returns: 0 when x is 0 (checked first), 0x07ffffffffffffff when y is 0,
+ *          otherwise (x << PRECISION64) / y.
+ *
+ * NOTE(review): the divide-by-zero sentinel has a 0x07 top byte rather than
+ * 0x7f; possibly a typo for the largest positive value.  Kept as-is.
+ */
+fixed64 fixdiv64(fixed64 x, fixed64 y)
+{
+    fixed64 numerator = x;
+
+    if (!x)
+        return 0;
+    if (!y)
+        return 0x07ffffffffffffffLL;
+
+    numerator <<= PRECISION64;
+    return (fixed64)(numerator / y);
+}
+
+/*
+ * Square root of a fixed32 value.
+ *
+ * Runs a bit-by-bit integer square root over the raw 32-bit pattern of x
+ * (16 steps, trying bit positions 30, 28, ..., 0), then shifts the root left
+ * by PRECISION / 2 to put it back on the fixed32 scale.
+ *
+ * The steps are written as a loop instead of a function-local STEP() macro,
+ * which was never #undef'd and so stayed defined for the rest of the file.
+ * The arithmetic of each step is unchanged.
+ *
+ * x:       value to take the root of; it is reinterpreted as unsigned, so
+ *          negative inputs are not treated specially.
+ * returns: the root, as fixed32.
+ */
+fixed32 fixsqrt32(fixed32 x)
+{
+    unsigned long root = 0;
+    unsigned long rem = (unsigned long)x;
+    int k;
+
+    for (k = 15; k >= 0; k--) {
+        const unsigned long bit = 1UL << (k * 2);
+        const unsigned long trial = root + bit;
+
+        root >>= 1;
+        if (trial <= rem) {
+            rem -= trial;
+            root |= bit;
+        }
+    }
+
+    return (fixed32)(root << (PRECISION / 2));
+}
+
+
+     static const long cordic_circular_gain = 0xb2458939; /* 0.607252929: starting x
+      * component of the CORDIC vector in fsincos().
+      * NOTE(review): bit 31 of this constant is set, so once it is stored in
+      * the int32_t x of fsincos() it reads as about -0.607 in s0.31;
+      * presumably deliberate given the x = -x fix-ups done there - confirm. */
+     
+ /* Table of values of atan(2^-i) in 0.32 format fractions of pi where
+  * pi = 0xffffffff / 2, i.e. the same phase scale fsincos() accepts.
+  * Entry i is the angle step applied in CORDIC iteration i.  fsincos() runs
+  * 31 iterations (i = 0..30), so the final entry is never read. */
+ static const unsigned long atan_table[] = {
+     0x1fffffff, /* +0.785398163 (or pi/4) */
+     0x12e4051d, /* +0.463647609 */
+     0x09fb385b, /* +0.244978663 */
+     0x051111d4, /* +0.124354995 */
+     0x028b0d43, /* +0.062418810 */
+     0x0145d7e1, /* +0.031239833 */
+     0x00a2f61e, /* +0.015623729 */
+     0x00517c55, /* +0.007812341 */
+     0x0028be53, /* +0.003906230 */
+     0x00145f2e, /* +0.001953123 */
+     0x000a2f98, /* +0.000976562 */
+     0x000517cc, /* +0.000488281 */
+     0x00028be6, /* +0.000244141 */
+     0x000145f3, /* +0.000122070 */
+     0x0000a2f9, /* +0.000061035 */
+     0x0000517c, /* +0.000030518 */
+     0x000028be, /* +0.000015259 */
+     0x0000145f, /* +0.000007629 */
+     0x00000a2f, /* +0.000003815 */
+     0x00000517, /* +0.000001907 */
+     0x0000028b, /* +0.000000954 */
+     0x00000145, /* +0.000000477 */
+     0x000000a2, /* +0.000000238 */
+     0x00000051, /* +0.000000119 */
+     0x00000028, /* +0.000000060 */
+     0x00000014, /* +0.000000030 */
+     0x0000000a, /* +0.000000015 */
+     0x00000005, /* +0.000000007 */
+     0x00000002, /* +0.000000004 */
+     0x00000001, /* +0.000000002 */
+     0x00000000, /* +0.000000001 */
+     0x00000000, /* +0.000000000 */
+ };
+     
+ /**
+  * Implements sin and cos using CORDIC rotation.
+  *
+  * @param phase has range from 0 to 0xffffffff, representing 0 and
+  *        2*pi respectively.
+  *        NOTE(review): phase is an unsigned long but every quadrant
+  *        constant is derived from 0xffffffff, so where long is wider than
+  *        32 bits any larger value lands in the last (fourth quadrant)
+  *        branch - confirm callers never pass one.
+  * @param cos return address for cos; may be NULL, in which case only the
+  *        sine is produced.
+  * @return sin of phase. The value is accumulated in an int32_t, so it
+  *         spans the signed 32-bit range, representing -1 and 1
+  *         respectively (this equals LONG_MIN..LONG_MAX only where long is
+  *         32 bits wide).
+  *
+  *        Gives at least 24 bits precision (last 2-8 bits or so are probably off)
+  */
+
+ long fsincos(unsigned long phase, fixed32 *cos)
+ {
+     int32_t x, x1, y, y1;
+     unsigned long z, z1;
+     int i;
+
+     /* Setup initial vector: x starts at the CORDIC gain constant, y at 0,
+      * and z holds the phase still to be processed */
+     x = cordic_circular_gain;
+     y = 0;
+     z = phase;
+
+     /* The phase has to be somewhere between 0..pi for this to work right.
+      * 0xffffffff / 4 is a quarter turn (pi/2) on the phase scale. */
+     if (z < 0xffffffff / 4) {
+         /* z in first quadrant, z += pi/2 to correct */
+         x = -x;
+         z += 0xffffffff / 4;
+     } else if (z < 3 * (0xffffffff / 4)) {
+         /* z in second or third quadrant (pi/2 <= z < 3pi/2),
+          * z -= pi/2 to correct */
+         z -= 0xffffffff / 4;
+     } else {
+         /* z in fourth quadrant, z -= 3pi/2 to correct */
+         x = -x;
+         z -= 3 * (0xffffffff / 4);
+     }
+
+     /* Each iteration adds roughly 1-bit of extra precision */
+     for (i = 0; i < 31; i++) {
+         /* x1/y1 are the current vector scaled by 2^-i; z1 is atan(2^-i) */
+         x1 = x >> i;
+         y1 = y >> i;
+         z1 = atan_table[i];
+
+         /* Decide which direction to rotate the vector. Pivot point is pi/2:
+          * each step moves z towards it by z1 */
+         if (z >= 0xffffffff / 4) {
+             x -= y1;
+             y += x1;
+             z -= z1;
+         } else {
+             x += y1;
+             y -= x1;
+             z += z1;
+         }
+     }
+
+     /* x is the cosine and is only written out when the caller asked for it */
+     if (cos)
+         *cos = x;
+
+     return y;
+ }
--- a/lib/rbcodec/codecs/libwma/wmafixed.h
+++ b/lib/rbcodec/codecs/libwma/wmafixed.h
@ -0,0 +1,237 @@
+/****************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ *
+ * Copyright (C) 2007 Michael Giacomelli
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/*  fixed precision code.  We use a combination of Sign 15.16 and Sign.31
+     precision here.
+
+    The WMA decoder does not always follow this convention, and occasionally
+    renormalizes values to other formats in order to maximize precision.
+    However, only the two precisions above are provided in this file.
+
+*/
+
+#include "types.h"
+
+#define PRECISION       16
+#define PRECISION64     16
+
+
+#define fixtof64(x)       (float)((float)(x) / (float)(1 << PRECISION64))        //does not work on int64_t!
+#define ftofix32(x)       ((fixed32)((x) * (float)(1 << PRECISION) + ((x) < 0 ? -0.5 : 0.5)))  /* float -> fixed32; adds +/-0.5 before the cast so the result is rounded to nearest; evaluates x twice */
+#define itofix64(x)       (IntTo64(x))        /* int -> fixed64, via IntTo64() */
+#define itofix32(x)       ((x) << PRECISION)  /* int -> fixed32; NOTE(review): shifts x directly, which ISO C leaves undefined for negative x */
+#define fixtoi32(x)       ((x) >> PRECISION)  /* fixed32 -> int, dropping the PRECISION fraction bits */
+#define fixtoi64(x)       (IntFrom64(x))      /* fixed64 -> int, via IntFrom64() */
+
+
+/*fixed functions*/
+
+fixed64 IntTo64(int x);                    /* int -> fixed64 */
+int IntFrom64(fixed64 x);                  /* integer part (bits 16..47) of a fixed64 */
+fixed32 Fixed32From64(fixed64 x);          /* low 32 bits of a fixed64 */
+fixed64 Fixed32To64(fixed32 x);            /* widen a fixed32, value unchanged */
+fixed32 fixdiv32(fixed32 x, fixed32 y);    /* x / y; returns 0 if x == 0, 0x7fffffff if y == 0 */
+fixed64 fixdiv64(fixed64 x, fixed64 y);    /* x / y; returns 0 if x == 0, 0x07ffffffffffffff if y == 0 */
+fixed32 fixsqrt32(fixed32 x);              /* square root of a fixed32 */
+/* Sine and cosine by CORDIC rotation. phase 0..0xffffffff spans 0..2*pi;
+ * returns the sine and, when cos is non-NULL, stores the cosine there. */
+long fsincos(unsigned long phase, fixed32 *cos);
+
+
+#ifdef CPU_ARM
+
+/* Sign-15.16 format: fixmul32(x, y) is the 64-bit product of x and y (smull)
+ * shifted right by PRECISION.  The movs/adc pair feeds the last bit shifted
+ * out of the low word back in as a carry, so the result is rounded, whereas
+ * the portable C fixmul32() further down this header truncates.
+ * x and y are each evaluated once. */
+#define fixmul32(x, y)  \
+    ({ int32_t __hi;  \
+       uint32_t __lo;  \
+       int32_t __result;  \
+       asm ("smull   %0, %1, %3, %4\n\t"  \
+            "movs    %0, %0, lsr %5\n\t"  \
+            "adc    %2, %0, %1, lsl %6"  \
+            : "=&r" (__lo), "=&r" (__hi), "=r" (__result)  \
+            : "%r" (x), "r" (y),  \
+              "M" (PRECISION), "M" (32 - PRECISION)  \
+            : "cc");  \
+       __result;  \
+    })
+
+#elif defined(CPU_COLDFIRE)
+
+static inline int32_t fixmul32(int32_t x, int32_t y)
+{   /* Coldfire version of the 16.16 multiply.  The product is computed twice:
+     * once in EMAC accumulator 0 (read back into t1 as the higher half) and
+     * once with mulu.l (lower half, left in x).  The two halves are then
+     * spliced together in x, which is returned.
+     * NOTE(review): the lsr #1 on the accumulator value suggests the EMAC is
+     * expected to hold the product shifted left by one (fractional mode);
+     * that mode is configured outside this function - confirm. */
+#if PRECISION != 16
+#warning Coldfire fixmul32() only works for PRECISION == 16
+#endif
+    int32_t t1;
+    asm (
+        "mac.l   %[x], %[y], %%acc0  \n" // multiply
+        "mulu.l  %[y], %[x]      \n"     // get lower half, avoid emac stall
+        "movclr.l %%acc0, %[t1]  \n"     // get higher half
+        "lsr.l   #1, %[t1]       \n"     // shift the accumulator value right by one
+        "move.w  %[t1], %[x]     \n"     // low word of x = low word of t1
+        "swap    %[x]            \n"     // exchange the two 16-bit halves of x
+        : [t1] "=&d" (t1), [x] "+d" (x)
+        : [y] "d"  (y)
+    );
+    return x;
+}
+
+#else
+
+/*
+ * Portable 16.16 multiply: forms the full 64-bit product of x and y, drops
+ * the low PRECISION bits (no rounding) and narrows the result to fixed32.
+ */
+static inline fixed32 fixmul32(fixed32 x, fixed32 y)
+{
+    const fixed64 product = (fixed64)x * y;
+
+    return (fixed32)(product >> PRECISION);
+}
+
+#endif
+
+
+/*
+ * Helper functions for wma_window.
+ *
+ *
+ */
+
+#ifdef CPU_ARM
+static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
+                         const fixed32 *window, int n)
+{
+    /* Block sizes are always power of two.
+     *
+     * For each i in 0..n-1: dst[i] += hi32(data[i] * window[i]) << 1, where
+     * hi32 is the upper word of the 64-bit smull product.  Two elements are
+     * handled per pass, and the loop only exits when the counter reaches
+     * exactly zero, so n must be a positive multiple of two. */
+    asm volatile (
+        "0:"
+        "ldmia %[d]!, {r0, r1};"
+        "ldmia %[w]!, {r4, r5};"
+        /* consume the first data and window value so we can use those
+         * registers again */
+        "smull r8, r9, r0, r4;"
+        "ldmia %[dst], {r0, r4};"
+        "add   r0, r0, r9, lsl #1;"  /* *dst=*dst+(r9<<1)*/
+        "smull r8, r9, r1, r5;"
+        "add   r1, r4, r9, lsl #1;"
+        "stmia %[dst]!, {r0, r1};"
+        "subs  %[n], %[n], #2;"
+        "bne   0b;"
+        : [d] "+r" (data), [w] "+r" (window), [dst] "+r" (dst), [n] "+r" (n)
+        : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
+}
+
+static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
+                         int len)
+{
+    /* Block sizes are always power of two.
+     *
+     * For each i in 0..len-1: dst[i] = hi32(src0[i] * src1[len-1-i]) << 1,
+     * where hi32 is the upper word of the 64-bit smull product.  src1 is
+     * first moved to one past its last element and then read backwards with
+     * ldmdb while src0 and dst advance.  Two elements are handled per pass,
+     * and the loop only exits when the counter reaches exactly zero, so len
+     * must be a positive multiple of two. */
+    asm volatile (
+        "add   %[s1], %[s1], %[n], lsl #2;"
+        "0:"
+        "ldmia %[s0]!, {r0, r1};"
+        "ldmdb %[s1]!, {r4, r5};"
+        "smull r8, r9, r0, r5;"
+        "mov   r0, r9, lsl #1;"
+        "smull r8, r9, r1, r4;"
+        "mov   r1, r9, lsl #1;"
+        "stmia %[dst]!, {r0, r1};"
+        "subs  %[n], %[n], #2;"
+        "bne   0b;"
+        : [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len)
+        : : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
+}
+
+#elif defined(CPU_COLDFIRE)
+
+static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *data,
+                         const fixed32 *window, int n)
+{
+    /* Block sizes are always power of two. Smallest block is always way bigger
+     * than four too.
+     *
+     * For each i in 0..n-1: dst[i] += data[i] * window[i], with each product
+     * taken from an EMAC accumulator (acc0..acc3).  Four elements are handled
+     * per pass, and the loop only exits when the counter reaches exactly
+     * zero, so n must be a positive multiple of four.
+     * NOTE(review): the scaling of the value read back by movclr.l depends on
+     * the EMAC mode, which is configured outside this function - confirm. */
+    asm volatile (
+        "0:"
+        "movem.l (%[d]), %%d0-%%d3;"
+        "movem.l (%[w]), %%d4-%%d5/%%a0-%%a1;"
+        "mac.l %%d0, %%d4, %%acc0;"
+        "mac.l %%d1, %%d5, %%acc1;"
+        "mac.l %%d2, %%a0, %%acc2;"
+        "mac.l %%d3, %%a1, %%acc3;"
+        "lea.l (16, %[d]), %[d];"
+        "lea.l (16, %[w]), %[w];"
+        "movclr.l %%acc0, %%d0;"
+        "movclr.l %%acc1, %%d1;"
+        "movclr.l %%acc2, %%d2;"
+        "movclr.l %%acc3, %%d3;"
+        "movem.l (%[dst]), %%d4-%%d5/%%a0-%%a1;"
+        "add.l %%d4, %%d0;"
+        "add.l %%d5, %%d1;"
+        "add.l %%a0, %%d2;"
+        "add.l %%a1, %%d3;"
+        "movem.l %%d0-%%d3, (%[dst]);"
+        "lea.l (16, %[dst]), %[dst];"
+        "subq.l #4, %[n];"
+        "jne 0b;"
+        : [d] "+a" (data), [w] "+a" (window), [dst] "+a" (dst), [n] "+d" (n)
+        : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
+}
+
+static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1,
+                         int len)
+{
+    /* Block sizes are always power of two. Smallest block is always way bigger
+     * than four too.
+     *
+     * For each i in 0..len-1: dst[i] = src0[i] * src1[len-1-i], with each
+     * product taken from an EMAC accumulator (acc0..acc3).  src1 is first
+     * moved to its last group of four elements and then stepped backwards
+     * while src0 and dst advance; within a group the operands are paired in
+     * reverse order.  Four elements are handled per pass, and the loop only
+     * exits when the counter reaches exactly zero, so len must be a positive
+     * multiple of four.
+     * NOTE(review): the scaling of the value read back by movclr.l depends on
+     * the EMAC mode, which is configured outside this function - confirm. */
+    asm volatile (
+        "lea.l (-16, %[s1], %[n]*4), %[s1];"
+        "0:"
+        "movem.l (%[s0]), %%d0-%%d3;"
+        "movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;"
+        "mac.l %%d0, %%a1, %%acc0;"
+        "mac.l %%d1, %%a0, %%acc1;"
+        "mac.l %%d2, %%d5, %%acc2;"
+        "mac.l %%d3, %%d4, %%acc3;"
+        "lea.l (16, %[s0]), %[s0];"
+        "lea.l (-16, %[s1]), %[s1];"
+        "movclr.l %%acc0, %%d0;"
+        "movclr.l %%acc1, %%d1;"
+        "movclr.l %%acc2, %%d2;"
+        "movclr.l %%acc3, %%d3;"
+        "movem.l %%d0-%%d3, (%[dst]);"
+        "lea.l (16, %[dst]), %[dst];"
+        "subq.l #4, %[n];"
+        "jne 0b;"
+        : [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len)
+        : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
+}
+
+#else
+
+/*
+ * Portable fallback: for each i in 0..len-1 adds fixmul32b(src0[i], src1[i])
+ * to dst[i], walking the arrays front to back.
+ */
+static inline void vector_fmul_add_add(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len)
+{
+    int idx = 0;
+
+    while (idx < len) {
+        dst[idx] += fixmul32b(src0[idx], src1[idx]);
+        idx++;
+    }
+}
+
+/*
+ * Portable fallback: for each i in 0..len-1 stores
+ * fixmul32b(src0[i], src1[len-1-i]) in dst[i], i.e. src0 is read forwards
+ * while src1 is read from its last element backwards.
+ */
+static inline void vector_fmul_reverse(fixed32 *dst, const fixed32 *src0, const fixed32 *src1, int len)
+{
+    const int last = len - 1;
+    int idx = 0;
+
+    while (idx < len) {
+        dst[idx] = fixmul32b(src0[idx], src1[last - idx]);
+        idx++;
+    }
+}
+
+#endif