forked from len0rd/rockbox
		
	git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24715 a1c6a512-1295-4272-9138-f99709370657
		
			
				
	
	
		
			342 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			342 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /***************************************************************************
 | |
|  *             __________               __   ___.
 | |
|  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 | |
|  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 | |
|  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 | |
|  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 | |
|  *                     \/            \/     \/    \/            \/
 | |
|  *
 | |
|  * Copyright (C) 2005 by Pedro Vasconcelos
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU General Public License
 | |
|  * as published by the Free Software Foundation; either version 2
 | |
|  * of the License, or (at your option) any later version.
 | |
|  *
 | |
|  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 | |
|  * KIND, either express or implied.
 | |
|  *
 | |
|  ****************************************************************************/
 | |
| /* asm routines for wide math on the MCF5249 */
 | |
| 
 | |
| #if defined(CPU_COLDFIRE)
 | |
| 
 | |
/* Attribute requesting 16-byte alignment for the object it is attached to
   (the same boundary the vect_* routines in this file align their main
   loops to). */
#define LINE_ATTR   __attribute__ ((aligned (16)))
 | |
| 
 | |
| #ifndef _V_WIDE_MATH
 | |
| #define _V_WIDE_MATH
 | |
| 
 | |
| static inline int32_t MULT32(int32_t x, int32_t y) {
 | |
| 
 | |
|   asm volatile ("mac.l %[x], %[y], %%acc0;"    /* multiply & shift  */
 | |
|                 "movclr.l %%acc0, %[x];"       /* move & clear acc */
 | |
|                 "asr.l #1, %[x];"              /* no overflow test */
 | |
|                 : [x] "+&d" (x)
 | |
|                 : [y] "r" (y)
 | |
|                 : "cc");
 | |
|   return x;
 | |
| }
 | |
| 
 | |
| static inline int32_t MULT31(int32_t x, int32_t y) {
 | |
|   asm volatile ("mac.l %[x], %[y], %%acc0;" /* multiply */
 | |
|                 "movclr.l %%acc0, %[x];"    /* move and clear */
 | |
|                 : [x] "+&r" (x)
 | |
|                 : [y] "r" (y)
 | |
|                 : "cc");
 | |
|   return x;
 | |
| }
 | |
| 
 | |
| static inline int32_t MULT31_SHIFT15(int32_t x, int32_t y) {
 | |
|   int32_t r;
 | |
| 
 | |
|   asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply */
 | |
|                 "mulu.l %[y], %[x];"         /* get lower half, avoid emac stall */
 | |
|                 "movclr.l %%acc0, %[r];"     /* get higher half */
 | |
|                 "asl.l #8, %[r];"            /* hi<<16, plus one free */
 | |
|                 "asl.l #8, %[r];"
 | |
|                 "lsr.l #8, %[x];"            /* (unsigned)lo >> 15 */
 | |
|                 "lsr.l #7, %[x];"
 | |
|                 "or.l %[x], %[r];"           /* logical-or results */
 | |
|                 : [r] "=&d" (r), [x] "+d" (x)
 | |
|                 : [y] "d" (y)
 | |
|                 : "cc");
 | |
|   return r;
 | |
| }
 | |
| 
 | |
| static inline
 | |
| void XPROD31(int32_t  a, int32_t  b,
 | |
|              int32_t  t, int32_t  v,
 | |
|              int32_t *x, int32_t *y)
 | |
| {
 | |
|   asm volatile ("mac.l %[a], %[t], %%acc0;"
 | |
|                 "mac.l %[b], %[v], %%acc0;"
 | |
|                 "mac.l %[b], %[t], %%acc1;"
 | |
|                 "msac.l %[a], %[v], %%acc1;"
 | |
|                 "movclr.l %%acc0, %[a];"
 | |
|                 "move.l %[a], (%[x]);"
 | |
|                 "movclr.l %%acc1, %[a];"
 | |
|                 "move.l %[a], (%[y]);"
 | |
|                 : [a] "+&r" (a)
 | |
|                 : [x] "a" (x), [y] "a" (y),
 | |
|                   [b] "r" (b), [t] "r" (t), [v] "r" (v)
 | |
|                 : "cc", "memory");
 | |
| }
 | |
| 
 | |
| static inline
 | |
| void XNPROD31(int32_t  a, int32_t  b,
 | |
|               int32_t  t, int32_t  v,
 | |
|               int32_t *x, int32_t *y)
 | |
| {
 | |
|   asm volatile ("mac.l %[a], %[t], %%acc0;"
 | |
|                 "msac.l %[b], %[v], %%acc0;"
 | |
|                 "mac.l %[b], %[t], %%acc1;"
 | |
|                 "mac.l %[a], %[v], %%acc1;"
 | |
|                 "movclr.l %%acc0, %[a];"
 | |
|                 "move.l %[a], (%[x]);"
 | |
|                 "movclr.l %%acc1, %[a];"
 | |
|                 "move.l %[a], (%[y]);"
 | |
|                 : [a] "+&r" (a)
 | |
|                 : [x] "a" (x), [y] "a" (y),
 | |
|                   [b] "r" (b), [t] "r" (t), [v] "r" (v)
 | |
|                 : "cc", "memory");
 | |
| }
 | |
| 
 | |
#if 0    /* canonical Tremor definition, kept for reference */
#define XPROD32(_a, _b, _t, _v, _x, _y)         \
  { (_x)=MULT32(_a,_t)+MULT32(_b,_v);           \
    (_y)=MULT32(_b,_t)-MULT32(_a,_v); }
#endif

/* EMAC version of XPROD32:
     _x = (a*t + b*v) shifted right by one
     _y = (b*t - a*v) shifted right by one
   The sums are formed in the accumulators and shifted once afterwards,
   whereas the canonical version above shifts each product before adding.
   _x and _y must be lvalues; they are written as data-register outputs.
   The expansion is a single asm statement that already ends in ';'.

   This could lose the LSB by overflow, but I don't think it'll ever happen.
   If anyone thinks they can hear a bug caused by this, please try the above
   version. */
#define XPROD32(_a, _b, _t, _v, _x, _y)     \
  asm volatile ("mac.l %[a], %[t], %%acc0;" \
                "mac.l %[b], %[v], %%acc0;" \
                "mac.l %[b], %[t], %%acc1;" \
                "msac.l %[a], %[v], %%acc1;" \
                "movclr.l %%acc0, %[x];" \
                "asr.l #1, %[x];" \
                "movclr.l %%acc1, %[y];" \
                "asr.l #1, %[y];" \
                : [x] "=&d" (_x), [y] "=&d" (_y) \
                : [a] "r" (_a), [b] "r" (_b), \
                  [t] "r" (_t), [v] "r" (_v) \
                : "cc");
 | |
| 
 | |
/* Register-output variant of XPROD31:
     _x = a*t + b*v   (accumulator 0)
     _y = b*t - a*v   (accumulator 1)
   The results are written straight into the lvalues _x and _y (data
   registers) instead of being stored through pointers.  The expansion is a
   single asm statement that already ends in ';'. */
#define XPROD31_R(_a, _b, _t, _v, _x, _y)   \
  asm volatile ("mac.l %[a], %[t], %%acc0;" \
                "mac.l %[b], %[v], %%acc0;" \
                "mac.l %[b], %[t], %%acc1;" \
                "msac.l %[a], %[v], %%acc1;" \
                "movclr.l %%acc0, %[x];" \
                "movclr.l %%acc1, %[y];" \
                : [x] "=&d" (_x), [y] "=&d" (_y) \
                : [a] "r" (_a), [b] "r" (_b), \
                  [t] "r" (_t), [v] "r" (_v) \
                : "cc");
 | |
| 
 | |
/* Register-output variant of XNPROD31:
     _x = a*t - b*v   (accumulator 0)
     _y = b*t + a*v   (accumulator 1)
   The results are written straight into the lvalues _x and _y (data
   registers) instead of being stored through pointers.  The expansion is a
   single asm statement that already ends in ';'. */
#define XNPROD31_R(_a, _b, _t, _v, _x, _y)  \
  asm volatile ("mac.l %[a], %[t], %%acc0;" \
                "msac.l %[b], %[v], %%acc0;" \
                "mac.l %[b], %[t], %%acc1;" \
                "mac.l %[a], %[v], %%acc1;" \
                "movclr.l %%acc0, %[x];" \
                "movclr.l %%acc1, %[y];" \
                : [x] "=&d" (_x), [y] "=&d" (_y) \
                : [a] "r" (_a), [b] "r" (_b), \
                  [t] "r" (_t), [v] "r" (_v) \
                : "cc");
 | |
| 
 | |
| #ifndef _V_VECT_OPS
 | |
| #define _V_VECT_OPS
 | |
| 
 | |
| /* asm versions of vector operations for block.c, window.c */
 | |
| /* assumes MAC is initialized & accumulators cleared */
 | |
| static inline
 | |
| void vect_add(int32_t *x, const int32_t *y, int n)
 | |
| {
 | |
|   /* align to 16 bytes */
 | |
|   while(n>0 && (int)x&15) {
 | |
|     *x++ += *y++;
 | |
|     n--;
 | |
|   }
 | |
|   asm volatile ("bra 1f;"
 | |
|                 "0:"                          /* loop start */
 | |
|                 "movem.l (%[x]), %%d0-%%d3;"  /* fetch values */
 | |
|                 "movem.l (%[y]), %%a0-%%a3;"
 | |
|                 /* add */
 | |
|                 "add.l %%a0, %%d0;"
 | |
|                 "add.l %%a1, %%d1;"
 | |
|                 "add.l %%a2, %%d2;"
 | |
|                 "add.l %%a3, %%d3;"
 | |
|                 /* store and advance */
 | |
|                 "movem.l %%d0-%%d3, (%[x]);"
 | |
|                 "lea.l (4*4, %[x]), %[x];"
 | |
|                 "lea.l (4*4, %[y]), %[y];"
 | |
|                 "subq.l #4, %[n];"     /* done 4 elements */
 | |
|                 "1: cmpi.l #4, %[n];"
 | |
|                 "bge 0b;"
 | |
|                 : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
 | |
|                 : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
 | |
|                     "cc", "memory");
 | |
|   /* add final elements */
 | |
|   while (n>0) {
 | |
|     *x++ += *y++;
 | |
|     n--;
 | |
|   }
 | |
| }
 | |
| 
 | |
| static inline
 | |
| void vect_copy(int32_t *x, int32_t *y, int n)
 | |
| {
 | |
|   /* align to 16 bytes */
 | |
|   while(n>0 && (int)x&15) {
 | |
|     *x++ = *y++;
 | |
|     n--;
 | |
|   }
 | |
|   asm volatile ("bra 1f;"
 | |
|                 "0:"                                    /* loop start */
 | |
|                 "movem.l (%[y]), %%d0-%%d3;"            /* fetch values */
 | |
|                 "movem.l %%d0-%%d3, (%[x]);"            /* store */
 | |
|                 "lea.l (4*4, %[x]), %[x];"              /* advance */
 | |
|                 "lea.l (4*4, %[y]), %[y];"
 | |
|                 "subq.l #4, %[n];"                      /* done 4 elements */
 | |
|                 "1: cmpi.l #4, %[n];"
 | |
|                 "bge 0b;"
 | |
|                 : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
 | |
|                 : : "%d0", "%d1", "%d2", "%d3", "cc", "memory");
 | |
|   /* copy final elements */
 | |
|   while (n>0) {
 | |
|     *x++ = *y++;
 | |
|     n--;
 | |
|   }
 | |
| }
 | |
| 
 | |
| static inline
 | |
| void vect_mult_fw(int32_t *data, int32_t *window, int n)
 | |
| {
 | |
|   /* ensure data is aligned to 16-bytes */
 | |
|   while(n>0 && (int)data&15) {
 | |
|     *data = MULT31(*data, *window);
 | |
|     data++;
 | |
|     window++;
 | |
|     n--;
 | |
|   }
 | |
|   asm volatile ("movem.l (%[d]), %%d0-%%d3;"  /* loop start */
 | |
|                 "movem.l (%[w]), %%a0-%%a3;"  /* pre-fetch registers */
 | |
|                 "lea.l (4*4, %[w]), %[w];"
 | |
|                 "bra 1f;"               /* jump to loop condition */
 | |
|                 "0:" /* loop body */
 | |
|                 /* multiply and load next window values */
 | |
|                 "mac.l %%d0, %%a0, (%[w])+, %%a0, %%acc0;"
 | |
|                 "mac.l %%d1, %%a1, (%[w])+, %%a1, %%acc1;"
 | |
|                 "mac.l %%d2, %%a2, (%[w])+, %%a2, %%acc2;"
 | |
|                 "mac.l %%d3, %%a3, (%[w])+, %%a3, %%acc3;"
 | |
|                 "movclr.l %%acc0, %%d0;"  /* get the products */
 | |
|                 "movclr.l %%acc1, %%d1;"
 | |
|                 "movclr.l %%acc2, %%d2;"
 | |
|                 "movclr.l %%acc3, %%d3;"
 | |
|                 /* store and advance */
 | |
|                 "movem.l %%d0-%%d3, (%[d]);"
 | |
|                 "lea.l (4*4, %[d]), %[d];"
 | |
|                 "movem.l (%[d]), %%d0-%%d3;"
 | |
|                 "subq.l #4, %[n];"     /* done 4 elements */
 | |
|                 "1: cmpi.l #4, %[n];"
 | |
|                 "bge 0b;"
 | |
|                 /* multiply final elements */
 | |
|                 "tst.l %[n];"
 | |
|                 "beq 1f;"      /* n=0 */
 | |
|                 "mac.l %%d0, %%a0, %%acc0;"
 | |
|                 "movclr.l %%acc0, %%d0;"
 | |
|                 "move.l %%d0, (%[d])+;"
 | |
|                 "subq.l #1, %[n];"
 | |
|                 "beq 1f;"     /* n=1 */
 | |
|                 "mac.l %%d1, %%a1, %%acc0;"
 | |
|                 "movclr.l %%acc0, %%d1;"
 | |
|                 "move.l %%d1, (%[d])+;"
 | |
|                 "subq.l #1, %[n];"
 | |
|                 "beq 1f;"     /* n=2 */
 | |
|                 /* otherwise n = 3 */
 | |
|                 "mac.l %%d2, %%a2, %%acc0;"
 | |
|                 "movclr.l %%acc0, %%d2;"
 | |
|                 "move.l %%d2, (%[d])+;"
 | |
|                 "1:"
 | |
|                 : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
 | |
|                 : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
 | |
|                     "cc", "memory");
 | |
| }
 | |
| 
 | |
| static inline
 | |
| void vect_mult_bw(int32_t *data, int32_t *window, int n)
 | |
| {
 | |
|   /* ensure at least data is aligned to 16-bytes */
 | |
|   while(n>0 && (int)data&15) {
 | |
|     *data = MULT31(*data, *window);
 | |
|     data++;
 | |
|     window--;
 | |
|     n--;
 | |
|   }
 | |
|   asm volatile ("lea.l (-3*4, %[w]), %[w];"     /* loop start */
 | |
|                 "movem.l (%[d]), %%d0-%%d3;"    /* pre-fetch registers */
 | |
|                 "movem.l (%[w]), %%a0-%%a3;"
 | |
|                 "bra 1f;"               /* jump to loop condition */
 | |
|                 "0:" /* loop body */
 | |
|                 /* multiply and load next window value */
 | |
|                 "mac.l %%d0, %%a3, -(%[w]), %%a3, %%acc0;"
 | |
|                 "mac.l %%d1, %%a2, -(%[w]), %%a2, %%acc1;"
 | |
|                 "mac.l %%d2, %%a1, -(%[w]), %%a1, %%acc2;"
 | |
|                 "mac.l %%d3, %%a0, -(%[w]), %%a0, %%acc3;"
 | |
|                 "movclr.l %%acc0, %%d0;"  /* get the products */
 | |
|                 "movclr.l %%acc1, %%d1;"
 | |
|                 "movclr.l %%acc2, %%d2;"
 | |
|                 "movclr.l %%acc3, %%d3;"
 | |
|                 /* store and advance */
 | |
|                 "movem.l %%d0-%%d3, (%[d]);"
 | |
|                 "lea.l (4*4, %[d]), %[d];"
 | |
|                 "movem.l (%[d]), %%d0-%%d3;"
 | |
|                 "subq.l #4, %[n];"     /* done 4 elements */
 | |
|                 "1: cmpi.l #4, %[n];"
 | |
|                 "bge 0b;"
 | |
|                 /* multiply final elements */
 | |
|                 "tst.l %[n];"
 | |
|                 "beq 1f;"      /* n=0 */
 | |
|                 "mac.l %%d0, %%a3, %%acc0;"
 | |
|                 "movclr.l %%acc0, %%d0;"
 | |
|                 "move.l %%d0, (%[d])+;"
 | |
|                 "subq.l #1, %[n];"
 | |
|                 "beq 1f;"     /* n=1 */
 | |
|                 "mac.l %%d1, %%a2, %%acc0;"
 | |
|                 "movclr.l %%acc0, %%d1;"
 | |
|                 "move.l %%d1, (%[d])+;"
 | |
|                 "subq.l #1, %[n];"
 | |
|                 "beq 1f;"     /* n=2 */
 | |
|                 /* otherwise n = 3 */
 | |
|                 "mac.l %%d2, %%a1, %%acc0;"
 | |
|                 "movclr.l %%acc0, %%d2;"
 | |
|                 "move.l %%d2, (%[d])+;"
 | |
|                 "1:"
 | |
|                 : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
 | |
|                 : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
 | |
|                     "cc", "memory");
 | |
| }
 | |
| 
 | |
| #endif
 | |
| 
 | |
| #endif
 | |
| 
 | |
| #ifndef _V_CLIP_MATH
 | |
| #define _V_CLIP_MATH
 | |
| 
 | |
/* Clamp x to the signed 16-bit range [-32768, 32767].
   This is portable C and simple; why not use this as default? */
static inline int32_t CLIP_TO_15(register int32_t x) {
  const int32_t upper = 32767;
  const int32_t lower = -32768;

  if (x >= upper)
    return upper;
  if (x <= lower)
    return lower;
  return x;
}
 | |
| 
 | |
| #endif
 | |
| #else
 | |
/* CPU_COLDFIRE is not defined: LINE_ATTR expands to nothing, so no special
   alignment is requested. */
#define LINE_ATTR
 | |
| #endif
 | |
| 
 |