Sync to upstream libopus

Sync to commit bb4b6885a139644cf3ac14e7deda9f633ec2d93c

This brings in a bunch of optimizations to decode speed and memory usage. Allocations are switched from using the pseudostack to using the real stack. Enabled hacks to reduce stack usage. This should fix crashes on sansa clip, although some files will not play due to failing allocations in the codec buffer.

Speeds up decoding of the following test files:

                   H300 (cf)    C200 (arm7tdmi)    ipod classic (arm9e)
16 kbps (silk)     14.28 MHz    4.00 MHz           2.61 MHz
64 kbps (celt)      4.09 MHz    8.08 MHz           6.24 MHz
128 kbps (celt)     1.93 MHz    8.83 MHz           6.53 MHz

Change-Id: I851733a8a5824b61feb363a173091bc7e6629b58
2014-01-19 16:31:59 +01:00 · 2014-01-19 16:31:59 +01:00 · 9b7ec42403
commit 9b7ec42403
parent e557951c94
46 changed files with 1608 additions and 1051 deletions
--- a/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
+++ b/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
@ -65,10 +65,6 @@
      do{ (m).r = ADD32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
          (m).i = SUB32(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0)

-#   define C_MUL4(m,a,b) \
-      do{ (m).r = SHR32(SUB32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)),2); \
-          (m).i = SHR32(ADD32(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)),2); }while(0)
-
 #   define C_MULBYSCALAR( c, s ) \
      do{ (c).r =  S_MUL( (c).r , s ) ;\
          (c).i =  S_MUL( (c).i , s ) ; }while(0)
--- a/lib/rbcodec/codecs/libopus/celt/arch.h
+++ b/lib/rbcodec/codecs/libopus/celt/arch.h
@ -69,11 +69,9 @@ static OPUS_INLINE void _celt_fatal(const char *str, const char *file, int line)

 #define IMUL32(a,b) ((a)*(b))

-#define ABS(x) ((x) < 0 ? (-(x)) : (x))      /**< Absolute integer value. */
-#define ABS16(x) ((x) < 0 ? (-(x)) : (x))    /**< Absolute 16-bit value.  */
+#define ABS(x) ((x) < 0 ? (-(x)) : (x))
 #define MIN16(a,b) ((a) < (b) ? (a) : (b))   /**< Minimum 16-bit value.   */
 #define MAX16(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum 16-bit value.   */
-#define ABS32(x) ((x) < 0 ? (-(x)) : (x))    /**< Absolute 32-bit value.  */
 #define MIN32(a,b) ((a) < (b) ? (a) : (b))   /**< Minimum 32-bit value.   */
 #define MAX32(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum 32-bit value.   */
 #define IMIN(a,b) ((a) < (b) ? (a) : (b))   /**< Minimum int value.   */
@ -108,6 +106,13 @@ typedef opus_val32 celt_ener;
 #define SCALEIN(a)      (a)
 #define SCALEOUT(a)     (a)

+#define ABS16(x) ((x) < 0 ? (-(x)) : (x))
+#define ABS32(x) ((x) < 0 ? (-(x)) : (x))
+
+static OPUS_INLINE opus_int16 SAT16(opus_int32 x) {
+   return x > 32767 ? 32767 : x < -32768 ? -32768 : (opus_int16)x;
+}
+
 #ifdef FIXED_DEBUG
 #include "fixed_debug.h"
 #else
@ -139,6 +144,22 @@ typedef float celt_sig;
 typedef float celt_norm;
 typedef float celt_ener;

+#ifdef FLOAT_APPROX
+/* This code should reliably detect NaN/inf even when -ffast-math is used.
+   Assumes IEEE 754 format. */
+static OPUS_INLINE int celt_isnan(float x)
+{
+   union {float f; opus_uint32 i;} in;
+   in.f = x;
+   return ((in.i>>23)&0xFF)==0xFF && (in.i&0x007FFFFF)!=0;
+}
+#else
+#ifdef __FAST_MATH__
+#error Cannot build libopus with -ffast-math unless FLOAT_APPROX is defined. This could result in crashes on extreme (e.g. NaN) input
+#endif
+#define celt_isnan(x) ((x)!=(x))
+#endif
+
 #define Q15ONE 1.0f

 #define NORM_SCALING 1.f
@ -148,6 +169,10 @@ typedef float celt_ener;
 #define VERY_LARGE16 1e15f
 #define Q15_ONE ((opus_val16)1.f)

+/* This appears to be the same speed as C99's fabsf() but it's more portable. */
+#define ABS16(x) ((float)fabs(x))
+#define ABS32(x) ((float)fabs(x))
+
 #define QCONST16(x,bits) (x)
 #define QCONST32(x,bits) (x)

@ -186,6 +211,7 @@ typedef float celt_ener;
 #define MULT32_32_Q31(a,b)     ((a)*(b))

 #define MAC16_32_Q15(c,a,b)     ((c)+(a)*(b))
+#define MAC16_32_Q16(c,a,b)     ((c)+(a)*(b))

 #define MULT16_16_Q11_32(a,b)     ((a)*(b))
 #define MULT16_16_Q11(a,b)     ((a)*(b))
@ -203,6 +229,8 @@ typedef float celt_ener;
 #define SCALEIN(a)      ((a)*CELT_SIG_SCALE)
 #define SCALEOUT(a)     ((a)*(1/CELT_SIG_SCALE))

+#define SIG2WORD16(x) (x)
+
 #endif /* !FIXED_POINT */

 #ifndef GLOBAL_STACK_SIZE
--- a/lib/rbcodec/codecs/libopus/celt/arm/fixed_armv4.h
+++ b/lib/rbcodec/codecs/libopus/celt/arm/fixed_armv4.h
@ -68,6 +68,10 @@ static OPUS_INLINE opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
 #undef MAC16_32_Q15
 #define MAC16_32_Q15(c, a, b) ADD32(c, MULT16_32_Q15(a, b))

+/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add.
+    Result fits in 32 bits. */
+#undef MAC16_32_Q16
+#define MAC16_32_Q16(c, a, b) ADD32(c, MULT16_32_Q16(a, b))

 /** 32x32 multiplication, followed by a 31-bit shift right. Result fits in 32 bits */
 #undef MULT32_32_Q31
--- a/lib/rbcodec/codecs/libopus/celt/arm/fixed_armv5e.h
+++ b/lib/rbcodec/codecs/libopus/celt/arm/fixed_armv5e.h
@ -82,6 +82,23 @@ static OPUS_INLINE opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
 }
 #define MAC16_32_Q15(c, a, b) (MAC16_32_Q15_armv5e(c, a, b))

+/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add.
+    Result fits in 32 bits. */
+#undef MAC16_32_Q16
+static OPUS_INLINE opus_val32 MAC16_32_Q16_armv5e(opus_val32 c, opus_val16 a,
+ opus_val32 b)
+{
+  int res;
+  __asm__(
+      "#MAC16_32_Q16\n\t"
+      "smlawb %0, %1, %2, %3;\n"
+      : "=r"(res)
+      : "r"(b), "r"(a), "r"(c)
+  );
+  return res;
+}
+#define MAC16_32_Q16(c, a, b) (MAC16_32_Q16_armv5e(c, a, b))
+
 /** 16x16 multiply-add where the result fits in 32 bits */
 #undef MAC16_16
 static OPUS_INLINE opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a,
@ -113,4 +130,22 @@ static OPUS_INLINE opus_val32 MULT16_16_armv5e(opus_val16 a, opus_val16 b)
 }
 #define MULT16_16(a, b) (MULT16_16_armv5e(a, b))

+#ifdef OPUS_ARM_INLINE_MEDIA
+
+#undef SIG2WORD16
+static OPUS_INLINE opus_val16 SIG2WORD16_armv6(opus_val32 x)
+{
+   celt_sig res;
+   __asm__(
+       "#SIG2WORD16\n\t"
+       "ssat %0, #16, %1, ASR #12\n\t"
+       : "=r"(res)
+       : "r"(x+2048)
+   );
+   return EXTRACT16(res);
+}
+#define SIG2WORD16(x) (SIG2WORD16_armv6(x))
+
+#endif /* OPUS_ARM_INLINE_MEDIA */
+
 #endif
--- a/lib/rbcodec/codecs/libopus/celt/bands.c
+++ b/lib/rbcodec/codecs/libopus/celt/bands.c
@ -93,11 +93,11 @@ static int bitexact_log2tan(int isin,int icos)
 #if 0
 #ifdef FIXED_POINT
 /* Compute the amplitude (sqrt energy) in each of the bands */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int M)
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM)
 {
   int i, c, N;
   const opus_int16 *eBands = m->eBands;
-   N = M*m->shortMdctSize;
+   N = m->shortMdctSize<<LM;
   c=0; do {
      for (i=0;i<end;i++)
      {
@ -105,18 +105,23 @@ void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *band
         opus_val32 maxval=0;
         opus_val32 sum = 0;

-         j=M*eBands[i]; do {
-            maxval = MAX32(maxval, X[j+c*N]);
-            maxval = MAX32(maxval, -X[j+c*N]);
-         } while (++j<M*eBands[i+1]);
-
+         maxval = celt_maxabs32(&X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
         if (maxval > 0)
         {
-            int shift = celt_ilog2(maxval)-10;
-            j=M*eBands[i]; do {
-               sum = MAC16_16(sum, EXTRACT16(VSHR32(X[j+c*N],shift)),
-                                   EXTRACT16(VSHR32(X[j+c*N],shift)));
-            } while (++j<M*eBands[i+1]);
+            int shift = celt_ilog2(maxval) - 14 + (((m->logN[i]>>BITRES)+LM+1)>>1);
+            j=eBands[i]<<LM;
+            if (shift>0)
+            {
+               do {
+                  sum = MAC16_16(sum, EXTRACT16(SHR32(X[j+c*N],shift)),
+                        EXTRACT16(SHR32(X[j+c*N],shift)));
+               } while (++j<eBands[i+1]<<LM);
+            } else {
+               do {
+                  sum = MAC16_16(sum, EXTRACT16(SHL32(X[j+c*N],-shift)),
+                        EXTRACT16(SHL32(X[j+c*N],-shift)));
+               } while (++j<eBands[i+1]<<LM);
+            }
            /* We're adding one here to ensure the normalized band isn't larger than unity norm */
            bandE[i+c*m->nbEBands] = EPSILON+VSHR32(EXTEND32(celt_sqrt(sum)),-shift);
         } else {
@ -151,18 +156,16 @@ void normalise_bands(const CELTMode *m, const celt_sig * OPUS_RESTRICT freq, cel

 #else /* FIXED_POINT */
 /* Compute the amplitude (sqrt energy) in each of the bands */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int M)
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM)
 {
   int i, c, N;
   const opus_int16 *eBands = m->eBands;
-   N = M*m->shortMdctSize;
+   N = m->shortMdctSize<<LM;
   c=0; do {
      for (i=0;i<end;i++)
      {
-         int j;
-         opus_val32 sum = 1e-27f;
-         for (j=M*eBands[i];j<M*eBands[i+1];j++)
-            sum += X[j+c*N]*X[j+c*N];
+         opus_val32 sum;
+         sum = 1e-27f + celt_inner_prod(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
         bandE[i+c*m->nbEBands] = celt_sqrt(sum);
         /*printf ("%f ", bandE[i+c*m->nbEBands]);*/
      }
@ -192,74 +195,80 @@ void normalise_bands(const CELTMode *m, const celt_sig * OPUS_RESTRICT freq, cel

 /* De-normalise the energy to produce the synthesis from the unit-energy bands */
 void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
-      celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandLogE, int start, int end, int C, int M)
+      celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandLogE, int start,
+      int end, int M, int downsample, int silence)
 {
-   int i, c, N;
+   int i, N;
+   int bound;
+   celt_sig * OPUS_RESTRICT f;
+   const celt_norm * OPUS_RESTRICT x;
   const opus_int16 *eBands = m->eBands;
   N = M*m->shortMdctSize;
-   celt_assert2(C<=2, "denormalise_bands() not implemented for >2 channels");
-   c=0; do {
-      celt_sig * OPUS_RESTRICT f;
-      const celt_norm * OPUS_RESTRICT x;
-      f = freq+c*N;
-      x = X+c*N+M*eBands[start];
-      for (i=0;i<M*eBands[start];i++)
-         *f++ = 0;
-      for (i=start;i<end;i++)
-      {
-         int j, band_end;
-         opus_val16 g;
-         opus_val16 lg;
+   bound = M*eBands[end];
+   if (downsample!=1)
+      bound = IMIN(bound, N/downsample);
+   if (silence)
+   {
+      bound = 0;
+      start = end = 0;
+   }
+   f = freq;
+   x = X+M*eBands[start];
+   for (i=0;i<M*eBands[start];i++)
+      *f++ = 0;
+   for (i=start;i<end;i++)
+   {
+      int j, band_end;
+      opus_val16 g;
+      opus_val16 lg;
 #ifdef FIXED_POINT
-         int shift;
+      int shift;
 #endif
-         j=M*eBands[i];
-         band_end = M*eBands[i+1];
-         lg = ADD16(bandLogE[i+c*m->nbEBands], SHL16((opus_val16)eMeans[i],6));
+      j=M*eBands[i];
+      band_end = M*eBands[i+1];
+      lg = ADD16(bandLogE[i], SHL16((opus_val16)eMeans[i],6));
 #ifndef FIXED_POINT
-         g = celt_exp2(lg);
+      g = celt_exp2(lg);
 #else
-         /* Handle the integer part of the log energy */
-         shift = 16-(lg>>DB_SHIFT);
-         if (shift>31)
-         {
-            shift=0;
-            g=0;
-         } else {
-            /* Handle the fractional part. */
-            g = celt_exp2_frac(lg&((1<<DB_SHIFT)-1));
-         }
-         /* Handle extreme gains with negative shift. */
-         if (shift<0)
-         {
-            /* For shift < -2 we'd be likely to overflow, so we're capping
+      /* Handle the integer part of the log energy */
+      shift = 16-(lg>>DB_SHIFT);
+      if (shift>31)
+      {
+         shift=0;
+         g=0;
+      } else {
+         /* Handle the fractional part. */
+         g = celt_exp2_frac(lg&((1<<DB_SHIFT)-1));
+      }
+      /* Handle extreme gains with negative shift. */
+      if (shift<0)
+      {
+         /* For shift < -2 we'd be likely to overflow, so we're capping
               the gain here. This shouldn't happen unless the bitstream is
               already corrupted. */
-            if (shift < -2)
-            {
-               g = 32767;
-               shift = -2;
-            }
-            do {
-               *f++ = SHL32(MULT16_16(*x++, g), -shift);
-            } while (++j<band_end);
-         } else
+         if (shift < -2)
+         {
+            g = 32767;
+            shift = -2;
+         }
+         do {
+            *f++ = SHL32(MULT16_16(*x++, g), -shift);
+         } while (++j<band_end);
+      } else
 #endif
         /* Be careful of the fixed-point "else" just above when changing this code */
         do {
            *f++ = SHR32(MULT16_16(*x++, g), shift);
         } while (++j<band_end);
-      }
-      celt_assert(start <= end);
-      for (i=M*eBands[end];i<N;i++)
-         *f++ = 0;
-   } while (++c<C);
+   }
+   celt_assert(start <= end);
+   OPUS_CLEAR(&freq[bound], N-bound);
 }

 /* This prevents energy collapse for transients with multiple short MDCTs */
 void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
-      int start, int end, opus_val16 *logE, opus_val16 *prev1logE,
-      opus_val16 *prev2logE, int *pulses, opus_uint32 seed)
+      int start, int end, const opus_val16 *logE, const opus_val16 *prev1logE,
+      const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed)
 {
   int c, i, j, k;
   for (i=start;i<end;i++)
@ -274,7 +283,8 @@ void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_mas

      N0 = m->eBands[i+1]-m->eBands[i];
      /* depth in 1/8 bits */
-      depth = (1+pulses[i])/((m->eBands[i+1]-m->eBands[i])<<LM);
+      celt_assert(pulses[i]>=0);
+      depth = celt_udiv(1+pulses[i], (m->eBands[i+1]-m->eBands[i]))>>LM;

 #ifdef FIXED_POINT
      thresh32 = SHR32(celt_exp2(-SHL16(depth, 10-BITRES)),1);
@ -352,7 +362,7 @@ void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_mas
   }
 }

-static void intensity_stereo(const CELTMode *m, celt_norm *X, celt_norm *Y, const celt_ener *bandE, int bandID, int N)
+static void intensity_stereo(const CELTMode *m, celt_norm * OPUS_RESTRICT X, const celt_norm * OPUS_RESTRICT Y, const celt_ener *bandE, int bandID, int N)
 {
   int i = bandID;
   int j;
@ -372,25 +382,25 @@ static void intensity_stereo(const CELTMode *m, celt_norm *X, celt_norm *Y, cons
      celt_norm r, l;
      l = X[j];
      r = Y[j];
-      X[j] = MULT16_16_Q14(a1,l) + MULT16_16_Q14(a2,r);
+      X[j] = EXTRACT16(SHR32(MAC16_16(MULT16_16(a1, l), a2, r), 14));
      /* Side is not encoded, no need to calculate */
   }
 }

-static void stereo_split(celt_norm *X, celt_norm *Y, int N)
+static void stereo_split(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, int N)
 {
   int j;
   for (j=0;j<N;j++)
   {
-      celt_norm r, l;
-      l = MULT16_16_Q15(QCONST16(.70710678f,15), X[j]);
-      r = MULT16_16_Q15(QCONST16(.70710678f,15), Y[j]);
-      X[j] = l+r;
-      Y[j] = r-l;
+      opus_val32 r, l;
+      l = MULT16_16(QCONST16(.70710678f, 15), X[j]);
+      r = MULT16_16(QCONST16(.70710678f, 15), Y[j]);
+      X[j] = EXTRACT16(SHR32(ADD32(l, r), 15));
+      Y[j] = EXTRACT16(SHR32(SUB32(r, l), 15));
   }
 }

-static void stereo_merge(celt_norm *X, celt_norm *Y, opus_val16 mid, int N)
+static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, opus_val16 mid, int N)
 {
   int j;
   opus_val32 xp=0, side=0;
@ -411,8 +421,7 @@ static void stereo_merge(celt_norm *X, celt_norm *Y, opus_val16 mid, int N)
   Er = MULT16_16(mid2, mid2) + side + 2*xp;
   if (Er < QCONST32(6e-4f, 28) || El < QCONST32(6e-4f, 28))
   {
-      for (j=0;j<N;j++)
-         Y[j] = X[j];
+      OPUS_COPY(Y, X, N);
      return;
   }

@ -436,7 +445,7 @@ static void stereo_merge(celt_norm *X, celt_norm *Y, opus_val16 mid, int N)
   {
      celt_norm r, l;
      /* Apply mid scaling (side is already scaled) */
-      l = MULT16_16_Q15(mid, X[j]);
+      l = MULT16_16_P15(mid, X[j]);
      r = Y[j];
      X[j] = EXTRACT16(PSHR32(MULT16_16(lgain, SUB16(l,r)), kl+1));
      Y[j] = EXTRACT16(PSHR32(MULT16_16(rgain, ADD16(l,r)), kr+1));
@ -445,7 +454,7 @@ static void stereo_merge(celt_norm *X, celt_norm *Y, opus_val16 mid, int N)

 #if 0
 /* Decide whether we should spread the pulses in the current frame */
-int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
+int spreading_decision(const CELTMode *m, const celt_norm *X, int *average,
      int last_decision, int *hf_average, int *tapset_decision, int update_hf,
      int end, int C, int M)
 {
@ -466,7 +475,7 @@ int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
      {
         int j, N, tmp=0;
         int tcount[3] = {0,0,0};
-         celt_norm * OPUS_RESTRICT x = X+M*eBands[i]+c*N0;
+         const celt_norm * OPUS_RESTRICT x = X+M*eBands[i]+c*N0;
         N = M*(eBands[i+1]-eBands[i]);
         if (N<=8)
            continue;
@ -486,7 +495,7 @@ int spreading_decision(const CELTMode *m, celt_norm *X, int *average,

         /* Only include four last bands (8 kHz and up) */
         if (i>m->nbEBands-4)
-            hf_sum += 32*(tcount[1]+tcount[0])/N;
+            hf_sum += celt_udiv(32*(tcount[1]+tcount[0]), N);
         tmp = (2*tcount[2] >= N) + (2*tcount[1] >= N) + (2*tcount[0] >= N);
         sum += tmp*256;
         nbBands++;
@ -496,7 +505,7 @@ int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
   if (update_hf)
   {
      if (hf_sum)
-         hf_sum /= C*(4-m->nbEBands+end);
+         hf_sum = celt_udiv(hf_sum, C*(4-m->nbEBands+end));
      *hf_average = (*hf_average+hf_sum)>>1;
      hf_sum = *hf_average;
      if (*tapset_decision==2)
@ -512,7 +521,8 @@ int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
   }
   /*printf("%d %d %d\n", hf_sum, *hf_average, *tapset_decision);*/
   celt_assert(nbBands>0); /* end has to be non-zero */
-   sum /= nbBands;
+   celt_assert(sum>=0);
+   sum = celt_udiv(sum, nbBands);
   /* Recursive averaging */
   sum = (sum+*average)>>1;
   *average = sum;
@ -571,8 +581,7 @@ static void deinterleave_hadamard(celt_norm *X, int N0, int stride, int hadamard
         for (j=0;j<N0;j++)
            tmp[i*N0+j] = X[j*stride+i];
   }
-   for (j=0;j<N;j++)
-      X[j] = tmp[j];
+   OPUS_COPY(X, tmp, N);
   RESTORE_STACK;
 }

@ -595,8 +604,7 @@ static void interleave_hadamard(celt_norm *X, int N0, int stride, int hadamard)
         for (j=0;j<N0;j++)
            tmp[j*stride+i] = X[i*N0+j];
   }
-   for (j=0;j<N;j++)
-      X[j] = tmp[j];
+   OPUS_COPY(X, tmp, N);
   RESTORE_STACK;
 }

@ -607,11 +615,11 @@ void haar1(celt_norm *X, int N0, int stride)
   for (i=0;i<stride;i++)
      for (j=0;j<N0;j++)
      {
-         celt_norm tmp1, tmp2;
-         tmp1 = MULT16_16_Q15(QCONST16(.70710678f,15), X[stride*2*j+i]);
-         tmp2 = MULT16_16_Q15(QCONST16(.70710678f,15), X[stride*(2*j+1)+i]);
-         X[stride*2*j+i] = tmp1 + tmp2;
-         X[stride*(2*j+1)+i] = tmp1 - tmp2;
+         opus_val32 tmp1, tmp2;
+         tmp1 = MULT16_16(QCONST16(.70710678f,15), X[stride*2*j+i]);
+         tmp2 = MULT16_16(QCONST16(.70710678f,15), X[stride*(2*j+1)+i]);
+         X[stride*2*j+i] = EXTRACT16(PSHR32(ADD32(tmp1, tmp2), 15));
+         X[stride*(2*j+1)+i] = EXTRACT16(PSHR32(SUB32(tmp1, tmp2), 15));
      }
 }

@ -626,7 +634,8 @@ static int compute_qn(int N, int b, int offset, int pulse_cap, int stereo)
   /* The upper limit ensures that in a stereo split with itheta==16384, we'll
       always have enough bits left over to code at least one pulse in the
       side; otherwise it would collapse, since it doesn't get folded. */
-   qb = IMIN(b-pulse_cap-(4<<BITRES), (b+N2*offset)/N2);
+   qb = celt_sudiv(b+N2*offset, N2);
+   qb = IMIN(b-pulse_cap-(4<<BITRES), qb);

   qb = IMIN(8<<BITRES, qb);

@ -773,7 +782,8 @@ static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx,
            ec_dec_update(ec, fl, fl+fs, ft);
         }
      }
-      itheta = (opus_int32)itheta*16384/qn;
+      celt_assert(itheta>=0);
+      itheta = celt_udiv((opus_int32)itheta*16384, qn);
      if (encode && stereo)
      {
         if (itheta==0)
@ -1025,8 +1035,7 @@ static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X,
            fill &= cm_mask;
            if (!fill)
            {
-               for (j=0;j<N;j++)
-                  X[j] = 0;
+               OPUS_CLEAR(X, N);
            } else {
               if (lowband == NULL)
               {
@ -1088,7 +1097,7 @@ static unsigned quant_band(struct band_ctx *ctx, celt_norm *X,

   longBlocks = B0==1;

-   N_B /= B;
+   N_B = celt_udiv(N_B, B);

   /* Special case for one sample */
   if (N==1)
@ -1102,9 +1111,7 @@ static unsigned quant_band(struct band_ctx *ctx, celt_norm *X,

   if (lowband_scratch && lowband && (recombine || ((N_B&1) == 0 && tf_change<0) || B0>1))
   {
-      int j;
-      for (j=0;j<N;j++)
-         lowband_scratch[j] = lowband[j];
+      OPUS_COPY(lowband_scratch, lowband, N);
      lowband = lowband_scratch;
   }

@ -1432,7 +1439,7 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
      ctx.remaining_bits = remaining_bits;
      if (i <= codedBands-1)
      {
-         curr_balance = balance / IMIN(3, codedBands-i);
+         curr_balance = celt_sudiv(balance, IMIN(3, codedBands-i));
         b = IMAX(0, IMIN(16383, IMIN(remaining_bits+1,pulses[i]+curr_balance)));
      } else {
         b = 0;
--- a/lib/rbcodec/codecs/libopus/celt/bands.h
+++ b/lib/rbcodec/codecs/libopus/celt/bands.h
@ -41,7 +41,7 @@
 * @param X Spectrum
 * @param bandE Square root of the energy for each band (returned)
 */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int M);
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM);

 /*void compute_noise_energies(const CELTMode *m, const celt_sig *X, const opus_val16 *tonality, celt_ener *bandE);*/

@ -59,14 +59,15 @@ void normalise_bands(const CELTMode *m, const celt_sig * OPUS_RESTRICT freq, cel
 * @param bandE Square root of the energy for each band
 */
 void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
-      celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandE, int start, int end, int C, int M);
+      celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandE, int start,
+      int end, int M, int downsample, int silence);

 #define SPREAD_NONE       (0)
 #define SPREAD_LIGHT      (1)
 #define SPREAD_NORMAL     (2)
 #define SPREAD_AGGRESSIVE (3)

-int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
+int spreading_decision(const CELTMode *m, const celt_norm *X, int *average,
      int last_decision, int *hf_average, int *tapset_decision, int update_hf,
      int end, int C, int M);

@ -104,8 +105,8 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
      opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed);

 void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
-      int start, int end, opus_val16 *logE, opus_val16 *prev1logE,
-      opus_val16 *prev2logE, int *pulses, opus_uint32 seed);
+      int start, int end, const opus_val16 *logE, const opus_val16 *prev1logE,
+      const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed);

 opus_uint32 celt_lcg_rand(opus_uint32 seed);

--- a/lib/rbcodec/codecs/libopus/celt/celt.c
+++ b/lib/rbcodec/codecs/libopus/celt/celt.c
@ -54,6 +54,10 @@
 #define PACKAGE_VERSION "unknown"
 #endif

+#if defined(MIPSr1_ASM)
+#include "mips/celt_mipsr1.h"
+#endif
+

 int resampling_factor(opus_int32 rate)
 {
@ -86,6 +90,63 @@ int resampling_factor(opus_int32 rate)
 }

 #ifndef OVERRIDE_COMB_FILTER_CONST
+/* This version should be faster on ARM */
+#ifdef OPUS_ARM_ASM
+static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   opus_val32 x0, x1, x2, x3, x4;
+   int i;
+   x4 = SHL32(x[-T-2], 1);
+   x3 = SHL32(x[-T-1], 1);
+   x2 = SHL32(x[-T], 1);
+   x1 = SHL32(x[-T+1], 1);
+   for (i=0;i<N-4;i+=5)
+   {
+      opus_val32 t;
+      x0=SHL32(x[i-T+2],1);
+      t = MAC16_32_Q16(x[i], g10, x2);
+      t = MAC16_32_Q16(t, g11, ADD32(x1,x3));
+      t = MAC16_32_Q16(t, g12, ADD32(x0,x4));
+      y[i] = t;
+      x4=SHL32(x[i-T+3],1);
+      t = MAC16_32_Q16(x[i+1], g10, x1);
+      t = MAC16_32_Q16(t, g11, ADD32(x0,x2));
+      t = MAC16_32_Q16(t, g12, ADD32(x4,x3));
+      y[i+1] = t;
+      x3=SHL32(x[i-T+4],1);
+      t = MAC16_32_Q16(x[i+2], g10, x0);
+      t = MAC16_32_Q16(t, g11, ADD32(x4,x1));
+      t = MAC16_32_Q16(t, g12, ADD32(x3,x2));
+      y[i+2] = t;
+      x2=SHL32(x[i-T+5],1);
+      t = MAC16_32_Q16(x[i+3], g10, x4);
+      t = MAC16_32_Q16(t, g11, ADD32(x3,x0));
+      t = MAC16_32_Q16(t, g12, ADD32(x2,x1));
+      y[i+3] = t;
+      x1=SHL32(x[i-T+6],1);
+      t = MAC16_32_Q16(x[i+4], g10, x3);
+      t = MAC16_32_Q16(t, g11, ADD32(x2,x4));
+      t = MAC16_32_Q16(t, g12, ADD32(x1,x0));
+      y[i+4] = t;
+   }
+#ifdef CUSTOM_MODES
+   for (;i<N;i++)
+   {
+      opus_val32 t;
+      x0=SHL32(x[i-T+2],1);
+      t = MAC16_32_Q16(x[i], g10, x2);
+      t = MAC16_32_Q16(t, g11, ADD32(x1,x3));
+      t = MAC16_32_Q16(t, g12, ADD32(x0,x4));
+      y[i] = t;
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
+   }
+#endif
+}
+#else
 static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
      opus_val16 g10, opus_val16 g11, opus_val16 g12)
 {
@ -110,7 +171,9 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,

 }
 #endif
+#endif

+#ifndef OVERRIDE_comb_filter
 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
      opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
      const opus_val16 *window, int overlap)
@ -131,16 +194,19 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
         OPUS_MOVE(y, x, N);
      return;
   }
-   g00 = MULT16_16_Q15(g0, gains[tapset0][0]);
-   g01 = MULT16_16_Q15(g0, gains[tapset0][1]);
-   g02 = MULT16_16_Q15(g0, gains[tapset0][2]);
-   g10 = MULT16_16_Q15(g1, gains[tapset1][0]);
-   g11 = MULT16_16_Q15(g1, gains[tapset1][1]);
-   g12 = MULT16_16_Q15(g1, gains[tapset1][2]);
+   g00 = MULT16_16_P15(g0, gains[tapset0][0]);
+   g01 = MULT16_16_P15(g0, gains[tapset0][1]);
+   g02 = MULT16_16_P15(g0, gains[tapset0][2]);
+   g10 = MULT16_16_P15(g1, gains[tapset1][0]);
+   g11 = MULT16_16_P15(g1, gains[tapset1][1]);
+   g12 = MULT16_16_P15(g1, gains[tapset1][2]);
   x1 = x[-T1+1];
   x2 = x[-T1  ];
   x3 = x[-T1-1];
   x4 = x[-T1-2];
+   /* If the filter didn't change, we don't need the overlap */
+   if (g0==g1 && T0==T1 && tapset0==tapset1)
+      overlap=0;
   for (i=0;i<overlap;i++)
   {
      opus_val16 f;
@ -170,6 +236,7 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
   /* Compute the part with the constant filter. */
   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);
 }
+#endif /* OVERRIDE_comb_filter */

 const signed char tf_select_table[4][8] = {
      {0, -1, 0, -1,    0,-1, 0,-1},
--- a/lib/rbcodec/codecs/libopus/celt/celt.h
+++ b/lib/rbcodec/codecs/libopus/celt/celt.h
@ -134,7 +134,8 @@ int celt_decoder_get_size(int channels);

 int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels);

-int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec);
+int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum);

 #define celt_encoder_ctl opus_custom_encoder_ctl
 #define celt_decoder_ctl opus_custom_decoder_ctl
@ -205,10 +206,10 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
 void init_caps(const CELTMode *m,int *cap,int LM,int C);

 #ifdef RESYNTH
-void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, celt_sig * OPUS_RESTRICT scratch);
-
-void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X,
-      celt_sig * OPUS_RESTRICT out_mem[], int C, int LM);
+void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem);
+void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[],
+      opus_val16 *oldBandE, int start, int effEnd, int C, int CC, int isTransient,
+      int LM, int downsample, int silence);
 #endif

 #ifdef __cplusplus
--- a/lib/rbcodec/codecs/libopus/celt/celt_decoder.c
+++ b/lib/rbcodec/codecs/libopus/celt/celt_decoder.c
@ -51,6 +51,9 @@
 #include "celt_lpc.h"
 #include "vq.h"

+#if defined(SMALL_FOOTPRINT) && defined(FIXED_POINT)
+#define NORM_ALIASING_HACK
+#endif
 /**********************************************************************/
 /*                                                                    */
 /*                             DECODER                                */
@ -175,28 +178,24 @@ void opus_custom_decoder_destroy(CELTDecoder *st)
 }
 #endif /* CUSTOM_MODES */

-static OPUS_INLINE opus_val16 SIG2WORD16(celt_sig x)
-{
-#ifdef FIXED_POINT
-   x = PSHR32(x, SIG_SHIFT);
-   x = MAX32(x, -32768);
-   x = MIN32(x, 32767);
-   return EXTRACT16(x);
-#else
-   return (opus_val16)x;
-#endif
-}

 #ifndef RESYNTH
 static
 #endif
-void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, celt_sig * OPUS_RESTRICT scratch)
+void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef,
+      celt_sig *mem, int accum)
 {
   int c;
   int Nd;
   int apply_downsampling=0;
   opus_val16 coef0;
-
+   VARDECL(celt_sig, scratch);
+   SAVE_STACK;
+#ifndef FIXED_POINT
+   (void)accum;
+   celt_assert(accum==0);
+#endif
+   ALLOC(scratch, N, celt_sig);
   coef0 = coef[0];
   Nd = N/downsample;
   c=0; do {
@ -234,11 +233,24 @@ void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, c
         apply_downsampling=1;
      } else {
         /* Shortcut for the standard (non-custom modes) case */
-         for (j=0;j<N;j++)
+#ifdef FIXED_POINT
+         if (accum)
         {
-            celt_sig tmp = x[j] + m + VERY_SMALL;
-            m = MULT16_32_Q15(coef0, tmp);
-            y[j*C] = SCALEOUT(SIG2WORD16(tmp));
+            for (j=0;j<N;j++)
+            {
+               celt_sig tmp = x[j] + m + VERY_SMALL;
+               m = MULT16_32_Q15(coef0, tmp);
+               y[j*C] = SAT16(ADD32(y[j*C], SCALEOUT(SIG2WORD16(tmp))));
+            }
+         } else
+#endif
+         {
+            for (j=0;j<N;j++)
+            {
+               celt_sig tmp = x[j] + m + VERY_SMALL;
+               m = MULT16_32_Q15(coef0, tmp);
+               y[j*C] = SCALEOUT(SIG2WORD16(tmp));
+            }
         }
      }
      mem[c] = m;
@ -246,41 +258,94 @@ void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, c
      if (apply_downsampling)
      {
         /* Perform down-sampling */
-         for (j=0;j<Nd;j++)
-            y[j*C] = SCALEOUT(SIG2WORD16(scratch[j*downsample]));
+#ifdef FIXED_POINT
+         if (accum)
+         {
+            for (j=0;j<Nd;j++)
+               y[j*C] = SAT16(ADD32(y[j*C], SCALEOUT(SIG2WORD16(scratch[j*downsample]))));
+         } else
+#endif
+         {
+            for (j=0;j<Nd;j++)
+               y[j*C] = SCALEOUT(SIG2WORD16(scratch[j*downsample]));
+         }
      }
   } while (++c<C);
+   RESTORE_STACK;
 }

-/** Compute the IMDCT and apply window for all sub-frames and
-    all channels in a frame */
 #ifndef RESYNTH
 static
 #endif
-void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X,
-      celt_sig * OPUS_RESTRICT out_mem[], int C, int LM)
+void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[],
+      opus_val16 *oldBandE, int start, int effEnd, int C, int CC, int isTransient,
+      int LM, int downsample, int silence) /* Synthesis step: denormalise_bands()
+      fills freq from X and oldBandE, then clt_mdct_backward() is run once per
+      block b<B into out_syn[]. C channels are read from X and CC channels are
+      written to out_syn[]; the three branches below cover C!=CC and C==CC. */
 {
-   int b, c;
+   int c, i;
+   int M;
+   int b;
    int B;
-   int N;
+   int N, NB;
    int shift;
-   const int overlap = OVERLAP(mode);
+   int nbEBands;
+   int overlap;
+   VARDECL(celt_sig, freq);
+   SAVE_STACK;

-   if (shortBlocks)
+   overlap = mode->overlap;
+   nbEBands = mode->nbEBands;
+   N = mode->shortMdctSize<<LM; /* coefficients per channel: size of freq and stride between channels in X */
+   ALLOC(freq, N, celt_sig); /**< Interleaved signal MDCTs */
+   M = 1<<LM;
+
+   if (isTransient)
    {
-      B = shortBlocks;
-      N = mode->shortMdctSize;
+      B = M; /* transient: M blocks of shortMdctSize coefficients each */
+      NB = mode->shortMdctSize;
       shift = mode->maxLM;
    } else {
       B = 1;
-      N = mode->shortMdctSize<<LM;
+      NB = mode->shortMdctSize<<LM; /* otherwise a single block spanning all N coefficients */
       shift = mode->maxLM-LM;
    }
-   c=0; do {
-      /* IMDCT on the interleaved the sub-frames, overlap-add is performed by the IMDCT */
+
+   if (CC==2&&C==1)
    {
+      /* Copying a mono stream to two channels */
+      celt_sig *freq2;
+      denormalise_bands(mode, X, freq, oldBandE, start, effEnd, M,
+            downsample, silence);
+      /* Store a temporary copy in the output buffer because the IMDCT destroys its input. */
+      freq2 = out_syn[1]+overlap/2;
+      OPUS_COPY(freq2, freq, N);
       for (b=0;b<B;b++)
-         clt_mdct_backward(&mode->mdct, &X[b+c*N*B], out_mem[c]+N*b, mode->window, overlap, shift, B);
-   } while (++c<C);
+         clt_mdct_backward(&mode->mdct, &freq2[b], out_syn[0]+NB*b, mode->window, overlap, shift, B);
+      for (b=0;b<B;b++)
+         clt_mdct_backward(&mode->mdct, &freq[b], out_syn[1]+NB*b, mode->window, overlap, shift, B);
+   } else if (CC==1&&C==2)
+   {
+      /* Downmixing a stereo stream to mono */
+      celt_sig *freq2;
+      freq2 = out_syn[0]+overlap/2;
+      denormalise_bands(mode, X, freq, oldBandE, start, effEnd, M,
+            downsample, silence);
+      /* Use the output buffer as temp array before downmixing. */
+      denormalise_bands(mode, X+N, freq2, oldBandE+nbEBands, start, effEnd, M,
+            downsample, silence);
+      for (i=0;i<N;i++)
+         freq[i] = HALF32(ADD32(freq[i],freq2[i])); /* half of the sum of the two denormalised channels */
+      for (b=0;b<B;b++)
+         clt_mdct_backward(&mode->mdct, &freq[b], out_syn[0]+NB*b, mode->window, overlap, shift, B);
+   } else {
+      /* Normal case (mono or stereo) */
+      c=0; do {
+         denormalise_bands(mode, X+c*N, freq, oldBandE+c*nbEBands, start, effEnd, M,
+               downsample, silence);
+         for (b=0;b<B;b++)
+            clt_mdct_backward(&mode->mdct, &freq[b], out_syn[c]+NB*b, mode->window, overlap, shift, B);
+      } while (++c<CC);
+   }
+   RESTORE_STACK;
 }

 static void tf_decode(int start, int end, int isTransient, int *tf_res, int LM, ec_dec *dec)
@ -330,7 +395,23 @@ static void tf_decode(int start, int end, int isTransient, int *tf_res, int LM,
   pitch of 480 Hz. */
 #define PLC_PITCH_LAG_MIN (100)

-static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_RESTRICT pcm, int N, int LM)
+static int celt_plc_pitch_search(celt_sig *decode_mem[2], int C, int arch)
+{ /* Runs pitch_downsample() and pitch_search() on decode_mem and returns
+     PLC_PITCH_LAG_MAX minus the index reported by pitch_search(). The
+     temporary buffer is allocated here, between SAVE_STACK and RESTORE_STACK. */
+   int pitch_index;
+   VARDECL( opus_val16, lp_pitch_buf );
+   SAVE_STACK;
+   ALLOC( lp_pitch_buf, DECODE_BUFFER_SIZE>>1, opus_val16 ); /* DECODE_BUFFER_SIZE/2 entries, filled by pitch_downsample() */
+   pitch_downsample(decode_mem, lp_pitch_buf,
+         DECODE_BUFFER_SIZE, C, arch);
+   pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf,
+         DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX,
+         PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index, arch);
+   pitch_index = PLC_PITCH_LAG_MAX-pitch_index; /* convert the reported index into the value handed to the caller */
+   RESTORE_STACK;
+   return pitch_index;
+}
+
+static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
 {
   int c;
   int i;
@ -343,11 +424,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R
   int nbEBands;
   int overlap;
   int start;
-   int downsample;
   int loss_count;
   int noise_based;
   const opus_int16 *eBands;
-   VARDECL(celt_sig, scratch);
   SAVE_STACK;

   mode = st->mode;
@ -367,14 +446,15 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R

   loss_count = st->loss_count;
   start = st->start;
-   downsample = st->downsample;
   noise_based = loss_count >= 5 || start != 0;
-   ALLOC(scratch, noise_based?N*C:N, celt_sig);
   if (noise_based)
   {
      /* Noise-based PLC/CNG */
-      celt_sig *freq;
+#ifdef NORM_ALIASING_HACK
+      celt_norm *X;
+#else
      VARDECL(celt_norm, X);
+#endif
      opus_uint32 seed;
      opus_val16 *plcLogE;
      int end;
@ -383,10 +463,13 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R
      end = st->end;
      effEnd = IMAX(start, IMIN(end, mode->effEBands));

-      /* Share the interleaved signal MDCT coefficient buffer with the
-         deemphasis scratch buffer. */
-      freq = scratch;
+#ifdef NORM_ALIASING_HACK
+      /* This is an ugly hack that breaks aliasing rules and would be easily broken,
+         but it saves almost 4kB of stack. */
+      X = (celt_norm*)(out_syn[C-1]+overlap/2);
+#else
      ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
+#endif

      if (loss_count >= 5)
         plcLogE = backgroundLogE;
@ -421,20 +504,12 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R
      }
      st->rng = seed;

-      denormalise_bands(mode, X, freq, plcLogE, start, effEnd, C, 1<<LM);
-
-      c=0; do {
-         int bound = eBands[effEnd]<<LM;
-         if (downsample!=1)
-            bound = IMIN(bound, N/downsample);
-         for (i=bound;i<N;i++)
-            freq[c*N+i] = 0;
-      } while (++c<C);
      c=0; do {
         OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
               DECODE_BUFFER_SIZE-N+(overlap>>1));
      } while (++c<C);
-      compute_inv_mdcts(mode, 0, freq, out_syn, C, LM);
+
+      celt_synthesis(mode, X, out_syn, plcLogE, start, effEnd, C, C, 0, LM, st->downsample, 0);
   } else {
      /* Pitch-based PLC */
      const opus_val16 *window;
@ -445,15 +520,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R

      if (loss_count == 0)
      {
-         VARDECL( opus_val16, lp_pitch_buf );
-         ALLOC( lp_pitch_buf, DECODE_BUFFER_SIZE>>1, opus_val16 );
-         pitch_downsample(decode_mem, lp_pitch_buf,
-               DECODE_BUFFER_SIZE, C, st->arch);
-         pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf,
-               DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX,
-               PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index, st->arch);
-         pitch_index = PLC_PITCH_LAG_MAX-pitch_index;
-         st->last_pitch_index = pitch_index;
+         st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch);
      } else {
         pitch_index = st->last_pitch_index;
         fade = QCONST16(.8f,15);
@ -644,25 +711,23 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R
      } while (++c<C);
   }

-   deemphasis(out_syn, pcm, N, C, downsample,
-         mode->preemph, st->preemph_memD, scratch);
-
   st->loss_count = loss_count+1;

   RESTORE_STACK;
 }

-#define FREQ_X_BUF_SIZE (2*8*120) /* stereo * nbShortMdcts * shortMdctSize */
-static celt_sig s_freq[FREQ_X_BUF_SIZE] IBSS_ATTR MEM_ALIGN_ATTR; /* 7680 byte */
-static celt_norm s_X[FREQ_X_BUF_SIZE] IBSS_ATTR MEM_ALIGN_ATTR; /* 3840 byte */
-int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec)
+int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum)
 {
   int c, i, N;
   int spread_decision;
   opus_int32 bits;
   ec_dec _dec;
-   VARDECL(celt_sig, freq);
+#ifdef NORM_ALIASING_HACK
+   celt_norm *X;
+#else
   VARDECL(celt_norm, X);
+#endif
   VARDECL(int, fine_quant);
   VARDECL(int, pulses);
   VARDECL(int, cap);
@ -680,6 +745,8 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
   int intra_ener;
   const int CC = st->channels;
   int LM, M;
+   int start;
+   int end;
   int effEnd;
   int codedBands;
   int alloc_trim;
@ -706,11 +773,10 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
   nbEBands = mode->nbEBands;
   overlap = mode->overlap;
   eBands = mode->eBands;
+   start = st->start;
+   end = st->end;
   frame_size *= st->downsample;

-   c=0; do {
-      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap);
-   } while (++c<CC);
   lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC);
   oldBandE = lpc+CC*LPC_ORDER;
   oldLogE = oldBandE + 2*nbEBands;
@ -728,7 +794,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
         if (data0<0)
            return OPUS_INVALID_PACKET;
      }
-      st->end = IMAX(1, mode->effEBands-2*(data0>>5));
+      st->end = end = IMAX(1, mode->effEBands-2*(data0>>5));
      LM = (data0>>3)&0x3;
      C = 1 + ((data0>>2)&0x1);
      data++;
@ -755,14 +821,19 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
      return OPUS_BAD_ARG;

   N = M*mode->shortMdctSize;
+   c=0; do {
+      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap);
+      out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N;
+   } while (++c<CC);

-   effEnd = st->end;
+   effEnd = end;
   if (effEnd > mode->effEBands)
      effEnd = mode->effEBands;

   if (data == NULL || len<=1)
   {
-      celt_decode_lost(st, pcm, N, LM);
+      celt_decode_lost(st, N, LM);
+      deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
      RESTORE_STACK;
      return frame_size/st->downsample;
   }
@ -798,7 +869,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
   postfilter_gain = 0;
   postfilter_pitch = 0;
   postfilter_tapset = 0;
-   if (st->start==0 && tell+16 <= total_bits)
+   if (start==0 && tell+16 <= total_bits)
   {
      if(ec_dec_bit_logp(dec, 1))
      {
@ -829,11 +900,11 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
   /* Decode the global flags (first symbols in the stream) */
   intra_ener = tell+3<=total_bits ? ec_dec_bit_logp(dec, 3) : 0;
   /* Get band energies */
-   unquant_coarse_energy(mode, st->start, st->end, oldBandE,
+   unquant_coarse_energy(mode, start, end, oldBandE,
         intra_ener, dec, C, LM);

   ALLOC(tf_res, nbEBands, int);
-   tf_decode(st->start, st->end, isTransient, tf_res, LM, dec);
+   tf_decode(start, end, isTransient, tf_res, LM, dec);

   tell = ec_tell(dec);
   spread_decision = SPREAD_NORMAL;
@ -849,7 +920,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
   dynalloc_logp = 6;
   total_bits<<=BITRES;
   tell = ec_tell_frac(dec);
-   for (i=st->start;i<st->end;i++)
+   for (i=start;i<end;i++)
   {
      int width, quanta;
      int dynalloc_loop_logp;
@ -888,21 +959,28 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
   ALLOC(pulses, nbEBands, int);
   ALLOC(fine_priority, nbEBands, int);

-   codedBands = compute_allocation(mode, st->start, st->end, offsets, cap,
+   codedBands = compute_allocation(mode, start, end, offsets, cap,
         alloc_trim, &intensity, &dual_stereo, bits, &balance, pulses,
         fine_quant, fine_priority, C, LM, dec, 0, 0, 0);

-   unquant_fine_energy(mode, st->start, st->end, oldBandE, fine_quant, dec, C);
+   unquant_fine_energy(mode, start, end, oldBandE, fine_quant, dec, C);
+
+   c=0; do {
+      OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap/2);
+   } while (++c<CC);

   /* Decode fixed codebook */
   ALLOC(collapse_masks, C*nbEBands, unsigned char);
-   /**< Interleaved normalised MDCTs */
-   if (FREQ_X_BUF_SIZE >= C*N)
-      X = s_X;
-   else
-      ALLOC(X, C*N, celt_norm);

-   quant_all_bands(0, mode, st->start, st->end, X, C==2 ? X+N : NULL, collapse_masks,
+#ifdef NORM_ALIASING_HACK
+   /* This is an ugly hack that breaks aliasing rules and would be easily broken,
+      but it saves almost 4kB of stack. */
+   X = (celt_norm*)(out_syn[CC-1]+overlap/2);
+#else
+   ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
+#endif
+
+   quant_all_bands(0, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
         NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res,
         len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng);

@ -911,58 +989,20 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
      anti_collapse_on = ec_dec_bits(dec, 1);
   }

-   unquant_energy_finalise(mode, st->start, st->end, oldBandE,
+   unquant_energy_finalise(mode, start, end, oldBandE,
         fine_quant, fine_priority, len*8-ec_tell(dec), dec, C);

   if (anti_collapse_on)
      anti_collapse(mode, X, collapse_masks, LM, C, N,
-            st->start, st->end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
-
-   /**< Interleaved signal MDCTs */
-   if (FREQ_X_BUF_SIZE >= IMAX(CC,C)*N)
-      freq = s_freq;
-   else
-      ALLOC(freq, IMAX(CC,C)*N, celt_sig);
+            start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);

   if (silence)
   {
      for (i=0;i<C*nbEBands;i++)
         oldBandE[i] = -QCONST16(28.f,DB_SHIFT);
-      for (i=0;i<C*N;i++)
-         freq[i] = 0;
-   } else {
-      /* Synthesis */
-      denormalise_bands(mode, X, freq, oldBandE, st->start, effEnd, C, M);
-   }
-   c=0; do {
-      OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap/2);
-   } while (++c<CC);
-
-   c=0; do {
-      int bound = M*eBands[effEnd];
-      if (st->downsample!=1)
-         bound = IMIN(bound, N/st->downsample);
-      for (i=bound;i<N;i++)
-         freq[c*N+i] = 0;
-   } while (++c<C);
-
-   c=0; do {
-      out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N;
-   } while (++c<CC);
-
-   if (CC==2&&C==1)
-   {
-      for (i=0;i<N;i++)
-         freq[N+i] = freq[i];
-   }
-   if (CC==1&&C==2)
-   {
-      for (i=0;i<N;i++)
-         freq[i] = HALF32(ADD32(freq[i],freq[N+i]));
   }

-   /* Compute inverse MDCTs */
-   compute_inv_mdcts(mode, shortBlocks, freq, out_syn, CC, LM);
+   celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, CC, isTransient, LM, st->downsample, silence);

   c=0; do {
      st->postfilter_period=IMAX(st->postfilter_period, COMBFILTER_MINPERIOD);
@ -989,18 +1029,14 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
      st->postfilter_tapset_old = st->postfilter_tapset;
   }

-   if (C==1) {
-      for (i=0;i<nbEBands;i++)
-         oldBandE[nbEBands+i]=oldBandE[i];
-   }
+   if (C==1)
+      OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands);

   /* In case start or end were to change */
   if (!isTransient)
   {
-      for (i=0;i<2*nbEBands;i++)
-         oldLogE2[i] = oldLogE[i];
-      for (i=0;i<2*nbEBands;i++)
-         oldLogE[i] = oldBandE[i];
+      OPUS_COPY(oldLogE2, oldLogE, 2*nbEBands);
+      OPUS_COPY(oldLogE, oldBandE, 2*nbEBands);
      for (i=0;i<2*nbEBands;i++)
         backgroundLogE[i] = MIN16(backgroundLogE[i] + M*QCONST16(0.001f,DB_SHIFT), oldBandE[i]);
   } else {
@ -1009,12 +1045,12 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
   }
   c=0; do
   {
-      for (i=0;i<st->start;i++)
+      for (i=0;i<start;i++)
      {
         oldBandE[c*nbEBands+i]=0;
         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
      }
-      for (i=st->end;i<nbEBands;i++)
+      for (i=end;i<nbEBands;i++)
      {
         oldBandE[c*nbEBands+i]=0;
         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
@ -1022,8 +1058,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
   } while (++c<2);
   st->rng = dec->rng;

-   /* We reuse freq[] as scratch space for the de-emphasis */
-   deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, freq);
+   deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
   st->loss_count = 0;
   RESTORE_STACK;
   if (ec_tell(dec) > 8*len)
@ -1039,7 +1074,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
 #ifdef FIXED_POINT
 int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int16 * OPUS_RESTRICT pcm, int frame_size)
 {
-   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL);
+   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL, 0); /* thin wrapper: forwards everything with dec=NULL and accum=0 */
 }

 #ifndef DISABLE_FLOAT_API
@ -1056,7 +1091,7 @@ int opus_custom_decode_float(CELTDecoder * OPUS_RESTRICT st, const unsigned char
   N = frame_size;

   ALLOC(out, C*N, opus_int16);
-   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL);
+   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL, 0);
   if (ret>0)
      for (j=0;j<C*ret;j++)
         pcm[j]=out[j]*(1.f/32768.f);
@ -1070,7 +1105,7 @@ int opus_custom_decode_float(CELTDecoder * OPUS_RESTRICT st, const unsigned char

 int opus_custom_decode_float(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, float * OPUS_RESTRICT pcm, int frame_size)
 {
-   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL);
+   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL, 0); /* thin wrapper: forwards everything with dec=NULL and accum=0 */
 }

 int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int16 * OPUS_RESTRICT pcm, int frame_size)
@ -1086,7 +1121,7 @@ int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data
   N = frame_size;
   ALLOC(out, C*N, celt_sig);

-   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL);
+   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL, 0);

   if (ret>0)
      for (j=0;j<C*ret;j++)
--- a/lib/rbcodec/codecs/libopus/celt/cwrs.c
+++ b/lib/rbcodec/codecs/libopus/celt/cwrs.c
@ -460,10 +460,12 @@ void encode_pulses(const int *_y,int _n,int _k,ec_enc *_enc){
  ec_enc_uint(_enc,icwrs(_n,_y),CELT_PVQ_V(_n,_k));
 }

-static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
+static opus_val32 cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
  opus_uint32 p;
  int         s;
  int         k0;
+  opus_int16  val;
+  opus_val32  yy=0;
  celt_assert(_k>0);
  celt_assert(_n>1);
  while(_n>2){
@ -487,7 +489,9 @@ static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
      }
      else for(p=row[_k];p>_i;p=row[_k])_k--;
      _i-=p;
-      *_y++=(k0-_k+s)^s;
+      val=(k0-_k+s)^s;
+      *_y++=val;
+      yy=MAC16_16(yy,val,val);
    }
    /*Lots of dimensions case:*/
    else{
@ -507,7 +511,9 @@ static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
        do p=CELT_PVQ_U_ROW[--_k][_n];
        while(p>_i);
        _i-=p;
-        *_y++=(k0-_k+s)^s;
+        val=(k0-_k+s)^s;
+        *_y++=val;
+        yy=MAC16_16(yy,val,val);
      }
    }
    _n--;
@ -519,14 +525,19 @@ static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
  k0=_k;
  _k=(_i+1)>>1;
  if(_k)_i-=2*_k-1;
-  *_y++=(k0-_k+s)^s;
+  val=(k0-_k+s)^s;
+  *_y++=val;
+  yy=MAC16_16(yy,val,val);
  /*_n==1*/
  s=-(int)_i;
-  *_y=(_k+s)^s;
+  val=(_k+s)^s;
+  *_y=val;
+  yy=MAC16_16(yy,val,val);
+  return yy;
 }

-void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
-  cwrsi(_n,_k,ec_dec_uint(_dec,CELT_PVQ_V(_n,_k)),_y);
+opus_val32 decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
+  return cwrsi(_n,_k,ec_dec_uint(_dec,CELT_PVQ_V(_n,_k)),_y); /*hands back cwrsi()'s yy, which it accumulates with MAC16_16(yy,val,val) for every value stored in _y*/
 }

 #else /* SMALL_FOOTPRINT */
@ -591,8 +602,10 @@ static opus_uint32 ncwrs_urow(unsigned _n,unsigned _k,opus_uint32 *_u){
  _y: Returns the vector of pulses.
  _u: Must contain entries [0..._k+1] of row _n of U() on input.
      Its contents will be destructively modified.*/
-static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){
+static opus_val32 cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){
  int j;
+  opus_int16 val;
+  opus_val32 yy=0;
  celt_assert(_n>0);
  j=0;
  do{
@ -607,10 +620,13 @@ static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){
    while(p>_i)p=_u[--_k];
    _i-=p;
    yj-=_k;
-    _y[j]=(yj+s)^s;
+    val=(yj+s)^s;
+    _y[j]=val;
+    yy=MAC16_16(yy,val,val);
    uprev(_u,_k+2,0);
  }
  while(++j<_n);
+  return yy;
 }

 /*Returns the index of the given combination of K elements chosen from a set
@ -685,13 +701,15 @@ void encode_pulses(const int *_y,int _n,int _k,ec_enc *_enc){
  RESTORE_STACK;
 }

-void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
+/*Reads one codeword index from _dec with ec_dec_uint() and expands it into
+   the pulse vector _y via cwrsi(), using a temporary row u of _k+2 entries
+   that ncwrs_urow() fills in.
+  _k must be positive (asserted).
+  Return: The value cwrsi() returns, i.e. its yy accumulator, updated with
+           MAC16_16(yy,val,val) for every value stored in _y.*/
+opus_val32 decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
   VARDECL(opus_uint32,u);
+  /*Declared with cwrsi()'s return type so the result is passed through
+     unchanged instead of being converted to int and back.*/
+  opus_val32 ret;
   SAVE_STACK;
   celt_assert(_k>0);
   ALLOC(u,_k+2U,opus_uint32);
-  cwrsi(_n,_k,ec_dec_uint(_dec,ncwrs_urow(_n,_k,u)),_y,u);
+  ret = cwrsi(_n,_k,ec_dec_uint(_dec,ncwrs_urow(_n,_k,u)),_y,u);
   RESTORE_STACK;
+  return ret;
 }

 #endif /* SMALL_FOOTPRINT */
--- a/lib/rbcodec/codecs/libopus/celt/cwrs.h
+++ b/lib/rbcodec/codecs/libopus/celt/cwrs.h
@ -43,6 +43,6 @@ void get_required_bits(opus_int16 *bits, int N, int K, int frac);

 void encode_pulses(const int *_y, int N, int K, ec_enc *enc);

-void decode_pulses(int *_y, int N, int K, ec_dec *dec);
+opus_val32 decode_pulses(int *_y, int N, int K, ec_dec *dec);

 #endif /* CWRS_H */
--- a/lib/rbcodec/codecs/libopus/celt/entcode.c
+++ b/lib/rbcodec/codecs/libopus/celt/entcode.c
@ -62,6 +62,27 @@ int ec_ilog(opus_uint32 _v){
 }
 #endif

+#if 1
+/* This is a faster version of ec_tell_frac() that takes advantage
+   of the low (1/8 bit) resolution to use just a linear function
+   followed by a lookup to determine the exact transition thresholds.
+   Returns nbits_total scaled by 2^BITRES minus an estimate l built from
+   EC_ILOG(rng) and a 0..8 fractional step b derived from the top bits of rng. */
+opus_uint32 ec_tell_frac(ec_ctx *_this){
+  static const unsigned correction[8] = /* correction[k] is close to 2^(15+(k+1)/8); the last entry is capped at 65535 */
+    {35733, 38967, 42495, 46340,
+     50535, 55109, 60097, 65535};
+  opus_uint32 nbits;
+  opus_uint32 r;
+  int         l;
+  unsigned    b;
+  nbits=_this->nbits_total<<BITRES;
+  l=EC_ILOG(_this->rng);
+  r=_this->rng>>(l-16); /* presumably the top 16 bits of rng, assuming EC_ILOG() gives its bit length */
+  b = (r>>12)-8; /* linear first guess taken from the top 4 bits of r */
+  b += r>correction[b]; /* one more step when r is past the threshold stored for that guess */
+  l = (l<<3)+b; /* whole part in units of 1/8 plus the fractional step */
+  return nbits-l;
+}
+#else
 opus_uint32 ec_tell_frac(ec_ctx *_this){
  opus_uint32 nbits;
  opus_uint32 r;
@ -91,3 +112,42 @@ opus_uint32 ec_tell_frac(ec_ctx *_this){
  }
  return nbits-l;
 }
+#endif
+
+#ifdef USE_SMALL_DIV_TABLE
+/* Result of 2^32/(2*i+1), except for i=0.
+   Entry 0 holds 0xFFFFFFFF because 2^32 itself does not fit in 32 bits.
+   The 129 entries cover the odd divisors 1, 3, ..., 257; celt_udiv() indexes
+   the table with d>>t after working out t from the lowest set bit of d. */
+const opus_uint32 SMALL_DIV_TABLE[129] ICONST_ATTR = {
+   0xFFFFFFFF, 0x55555555, 0x33333333, 0x24924924,
+   0x1C71C71C, 0x1745D174, 0x13B13B13, 0x11111111,
+   0x0F0F0F0F, 0x0D79435E, 0x0C30C30C, 0x0B21642C,
+   0x0A3D70A3, 0x097B425E, 0x08D3DCB0, 0x08421084,
+   0x07C1F07C, 0x07507507, 0x06EB3E45, 0x06906906,
+   0x063E7063, 0x05F417D0, 0x05B05B05, 0x0572620A,
+   0x05397829, 0x05050505, 0x04D4873E, 0x04A7904A,
+   0x047DC11F, 0x0456C797, 0x04325C53, 0x04104104,
+   0x03F03F03, 0x03D22635, 0x03B5CC0E, 0x039B0AD1,
+   0x0381C0E0, 0x0369D036, 0x03531DEC, 0x033D91D2,
+   0x0329161F, 0x03159721, 0x03030303, 0x02F14990,
+   0x02E05C0B, 0x02D02D02, 0x02C0B02C, 0x02B1DA46,
+   0x02A3A0FD, 0x0295FAD4, 0x0288DF0C, 0x027C4597,
+   0x02702702, 0x02647C69, 0x02593F69, 0x024E6A17,
+   0x0243F6F0, 0x0239E0D5, 0x02302302, 0x0226B902,
+   0x021D9EAD, 0x0214D021, 0x020C49BA, 0x02040810,
+   0x01FC07F0, 0x01F44659, 0x01ECC07B, 0x01E573AC,
+   0x01DE5D6E, 0x01D77B65, 0x01D0CB58, 0x01CA4B30,
+   0x01C3F8F0, 0x01BDD2B8, 0x01B7D6C3, 0x01B20364,
+   0x01AC5701, 0x01A6D01A, 0x01A16D3F, 0x019C2D14,
+   0x01970E4F, 0x01920FB4, 0x018D3018, 0x01886E5F,
+   0x0183C977, 0x017F405F, 0x017AD220, 0x01767DCE,
+   0x01724287, 0x016E1F76, 0x016A13CD, 0x01661EC6,
+   0x01623FA7, 0x015E75BB, 0x015AC056, 0x01571ED3,
+   0x01539094, 0x01501501, 0x014CAB88, 0x0149539E,
+   0x01460CBC, 0x0142D662, 0x013FB013, 0x013C995A,
+   0x013991C2, 0x013698DF, 0x0133AE45, 0x0130D190,
+   0x012E025C, 0x012B404A, 0x01288B01, 0x0125E227,
+   0x01234567, 0x0120B470, 0x011E2EF3, 0x011BB4A4,
+   0x01194538, 0x0116E068, 0x011485F0, 0x0112358E,
+   0x010FEF01, 0x010DB20A, 0x010B7E6E, 0x010953F3,
+   0x01073260, 0x0105197F, 0x0103091B, 0x01010101
+};
+#endif
--- a/lib/rbcodec/codecs/libopus/celt/entcode.h
+++ b/lib/rbcodec/codecs/libopus/celt/entcode.h
@ -34,6 +34,12 @@
 # include <stddef.h>
 # include "ecintrin.h"

+extern const opus_uint32 SMALL_DIV_TABLE[129];
+
+#ifdef OPUS_ARM_ASM
+#define USE_SMALL_DIV_TABLE
+#endif
+
 /*OPT: ec_window must be at least 32 bits, but if you have fast arithmetic on a
   larger type, you can speed up the decoder by using it here.*/
 typedef opus_uint32           ec_window;
@ -114,4 +120,33 @@ static OPUS_INLINE int ec_tell(ec_ctx *_this){
           rounding error is in the positive direction).*/
 opus_uint32 ec_tell_frac(ec_ctx *_this);

+/* Unsigned division n/d; d must be non-zero (asserted).
+   With USE_SMALL_DIV_TABLE and d<=256 the quotient is formed by multiplying
+   with a SMALL_DIV_TABLE entry and then adding a 0/1 correction, instead of
+   dividing; in every other case this is plain n/d.
+   Tested exhaustively for all n and for 1<=d<=256 */
+static OPUS_INLINE opus_uint32 celt_udiv(opus_uint32 n, opus_uint32 d) {
+   celt_assert(d>0);
+#ifdef USE_SMALL_DIV_TABLE
+   if (d>256)
+      return n/d;
+   else {
+      opus_uint32 t, q;
+      t = EC_ILOG(d&-d); /* d&-d isolates the lowest set bit of d; presumably t is its 1-based position */
+      q = (opus_uint64)SMALL_DIV_TABLE[d>>t]*(n>>(t-1))>>32; /* 64-bit product, upper 32 bits kept as the first estimate */
+      return q+(n-q*d >= d); /* add one when the remainder left by the estimate is still at least d */
+   }
+#else
+   return n/d;
+#endif
+}
+
+static OPUS_INLINE opus_int32 celt_sudiv(opus_int32 n, opus_int32 d) { /* Signed n/d for d>0 (asserted).
+   With USE_SMALL_DIV_TABLE the magnitude of n is divided by celt_udiv() and
+   the sign is put back afterwards; otherwise this is plain n/d. */
+   celt_assert(d>0);
+#ifdef USE_SMALL_DIV_TABLE
+   if (n<0)
+      return -(opus_int32)celt_udiv(-n, d); /* NOTE(review): -n overflows when n is the most negative opus_int32 -- confirm callers never pass it */
+   else
+      return celt_udiv(n, d);
+#else
+   return n/d;
+#endif
+}
+
 #endif
--- a/lib/rbcodec/codecs/libopus/celt/entdec.c
+++ b/lib/rbcodec/codecs/libopus/celt/entdec.c
@ -138,7 +138,7 @@ void ec_dec_init(ec_dec *_this,unsigned char *_buf,opus_uint32 _storage){

 unsigned ec_decode(ec_dec *_this,unsigned _ft){
  unsigned s;
-  _this->ext=_this->rng/_ft;
+  _this->ext=celt_udiv(_this->rng,_ft); /* rng/_ft through celt_udiv(), which may use the small-divisor table; result is kept in ext */
  s=(unsigned)(_this->val/_this->ext);
  return _ft-EC_MINI(s+1,_ft);
 }
--- a/lib/rbcodec/codecs/libopus/celt/entenc.c
+++ b/lib/rbcodec/codecs/libopus/celt/entenc.c
@ -127,7 +127,7 @@ void ec_enc_init(ec_enc *_this,unsigned char *_buf,opus_uint32 _size){

 void ec_encode(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _ft){
  opus_uint32 r;
-  r=_this->rng/_ft;
+  r=celt_udiv(_this->rng,_ft);
  if(_fl>0){
    _this->val+=_this->rng-IMUL32(r,(_ft-_fl));
    _this->rng=IMUL32(r,(_fh-_fl));
--- a/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
+++ b/lib/rbcodec/codecs/libopus/celt/fixed_generic.h
@ -113,7 +113,11 @@
 /** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
    b must fit in 31 bits.
    Result fits in 32 bits. */
-#define MAC16_32_Q15(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
+#define MAC16_32_Q15(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
+
+/** 16x32 multiplication, followed by a 16-bit shift right and 32-bit add.
+    Result fits in 32 bits */
+#define MAC16_32_Q16(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16)))

 #define MULT16_16_Q11_32(a,b) (SHR(MULT16_16((a),(b)),11))
 #define MULT16_16_Q11(a,b) (SHR(MULT16_16((a),(b)),11))
@ -131,4 +135,17 @@
 /** Divide a 32-bit value by a 32-bit value. Result fits in 32 bits */
 #define DIV32(a,b) (((opus_val32)(a))/((opus_val32)(b)))

+#if defined(MIPSr1_ASM)
+#include "mips/fixed_generic_mipsr1.h"
+#endif
+
+static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x)
+{ /* Turns a celt_sig sample into an opus_val16: shift down by SIG_SHIFT,
+     limit to the range [-32768, 32767], then narrow with EXTRACT16. */
+   x = PSHR32(x, SIG_SHIFT); /* presumably a rounding right shift -- confirm against the PSHR32 definition */
+   x = MAX32(x, -32768);
+   x = MIN32(x, 32767);
+   return EXTRACT16(x);
+}
+#define SIG2WORD16(x) (SIG2WORD16_generic(x))
+
 #endif
--- a/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
+++ b/lib/rbcodec/codecs/libopus/celt/kiss_fft.c
@ -45,73 +45,62 @@
   complex numbers.  It also delares the kf_ internal functions.
 */

-#if 0
 static void kf_bfly2(
                      kiss_fft_cpx * Fout,
-                     const size_t fstride,
-                     const kiss_fft_state *st,
                      int m,
-                     int N,
-                     int mm
+                     int N
                     )
 {
    kiss_fft_cpx * Fout2;
-   const kiss_twiddle_cpx * tw1;
-   int i,j;
-   kiss_fft_cpx * Fout_beg = Fout;
-   for (i=0;i<N;i++)
+   int i;
+   (void)m; /* m is only read by the asserts and by the CUSTOM_MODES test below */
+#ifdef CUSTOM_MODES
+   if (m==1)
    {
-      Fout = Fout_beg + i*mm;
-      Fout2 = Fout + m;
-      tw1 = st->twiddles;
-      for(j=0;j<m;j++)
+      celt_assert(m==1);
+      for (i=0;i<N;i++) /* N independent butterflies on adjacent pairs Fout[0], Fout[1] */
       {
          kiss_fft_cpx t;
-         Fout->r = SHR32(Fout->r, 1);Fout->i = SHR32(Fout->i, 1);
-         Fout2->r = SHR32(Fout2->r, 1);Fout2->i = SHR32(Fout2->i, 1);
-         C_MUL (t,  *Fout2 , *tw1);
-         tw1 += fstride;
+         Fout2 = Fout + 1;
+         t = *Fout2; /* no twiddle multiplication in this case */
          C_SUB( *Fout2 ,  *Fout , t );
          C_ADDTO( *Fout ,  t );
-         ++Fout2;
-         ++Fout;
+         Fout += 2;
       }
-   }
-}
+   } else
 #endif
-
-static void ki_bfly2(
-                     kiss_fft_cpx * Fout,
-                     const size_t fstride,
-                     const kiss_fft_state *st,
-                     int m,
-                     int N,
-                     int mm
-                    )
-{
-   kiss_fft_cpx * Fout2;
-   const kiss_twiddle_cpx * tw1;
-   kiss_fft_cpx t;
-   int i,j;
-   kiss_fft_cpx * Fout_beg = Fout;
-   for (i=0;i<N;i++)
    {
-      Fout = Fout_beg + i*mm;
-      Fout2 = Fout + m;
-      tw1 = st->twiddles;
-      for(j=0;j<m;j++)
+      opus_val16 tw;
+      tw = QCONST16(0.7071067812f, 15); /* 0.7071067812 is sqrt(1/2), the only non-trivial twiddle magnitude used below */
+      /* We know that m==4 here because the radix-2 is just after a radix-4 */
+      celt_assert(m==4);
+      for (i=0;i<N;i++)
       {
-         C_MULC (t,  *Fout2 , *tw1);
-         tw1 += fstride;
-         C_SUB( *Fout2 ,  *Fout , t );
-         C_ADDTO( *Fout ,  t );
-         ++Fout2;
-         ++Fout;
+         kiss_fft_cpx t;
+         Fout2 = Fout + 4;
+         t = Fout2[0]; /* first pair: used as is */
+         C_SUB( Fout2[0] ,  Fout[0] , t );
+         C_ADDTO( Fout[0] ,  t );
+
+         t.r = S_MUL(Fout2[1].r+Fout2[1].i, tw); /* t = Fout2[1]*tw*(1-j), assuming S_MUL(x,tw) scales x by tw */
+         t.i = S_MUL(Fout2[1].i-Fout2[1].r, tw);
+         C_SUB( Fout2[1] ,  Fout[1] , t );
+         C_ADDTO( Fout[1] ,  t );
+
+         t.r = Fout2[2].i; /* t = Fout2[2]*(-j): swap the parts and negate the new imaginary part */
+         t.i = -Fout2[2].r;
+         C_SUB( Fout2[2] ,  Fout[2] , t );
+         C_ADDTO( Fout[2] ,  t );
+
+         t.r = S_MUL(Fout2[3].i-Fout2[3].r, tw); /* t = Fout2[3]*tw*(-1-j), under the same S_MUL assumption */
+         t.i = S_MUL(-Fout2[3].i-Fout2[3].r, tw);
+         C_SUB( Fout2[3] ,  Fout[3] , t );
+         C_ADDTO( Fout[3] ,  t );
+         Fout += 8; /* each pass consumes 8 values: Fout[0..3] paired with Fout[4..7] */
       }
    }
 }

-#if 0
 static void kf_bfly4(
                     kiss_fft_cpx * Fout,
                     const size_t fstride,
@ -121,93 +110,69 @@ static void kf_bfly4(
                     int mm
                    )
 {
-   const kiss_twiddle_cpx *tw1,*tw2,*tw3;
-   kiss_fft_cpx scratch[6];
-   const size_t m2=2*m;
-   const size_t m3=3*m;
-   int i, j;
+   int i;

-   kiss_fft_cpx * Fout_beg = Fout;
-   for (i=0;i<N;i++)
+   if (m==1)
   {
-      Fout = Fout_beg + i*mm;
-      tw3 = tw2 = tw1 = st->twiddles;
-      for (j=0;j<m;j++)
+      /* Degenerate case where all the twiddles are 1. */
+      for (i=0;i<N;i++)
      {
-         C_MUL4(scratch[0],Fout[m] , *tw1 );
-         C_MUL4(scratch[1],Fout[m2] , *tw2 );
-         C_MUL4(scratch[2],Fout[m3] , *tw3 );
+         kiss_fft_cpx scratch0, scratch1;

-         Fout->r = PSHR32(Fout->r, 2);
-         Fout->i = PSHR32(Fout->i, 2);
-         C_SUB( scratch[5] , *Fout, scratch[1] );
-         C_ADDTO(*Fout, scratch[1]);
-         C_ADD( scratch[3] , scratch[0] , scratch[2] );
-         C_SUB( scratch[4] , scratch[0] , scratch[2] );
-         C_SUB( Fout[m2], *Fout, scratch[3] );
-         tw1 += fstride;
-         tw2 += fstride*2;
-         tw3 += fstride*3;
-         C_ADDTO( *Fout , scratch[3] );
+         C_SUB( scratch0 , *Fout, Fout[2] );
+         C_ADDTO(*Fout, Fout[2]);
+         C_ADD( scratch1 , Fout[1] , Fout[3] );
+         C_SUB( Fout[2], *Fout, scratch1 );
+         C_ADDTO( *Fout , scratch1 );
+         C_SUB( scratch1 , Fout[1] , Fout[3] );

-         Fout[m].r = scratch[5].r + scratch[4].i;
-         Fout[m].i = scratch[5].i - scratch[4].r;
-         Fout[m3].r = scratch[5].r - scratch[4].i;
-         Fout[m3].i = scratch[5].i + scratch[4].r;
-         ++Fout;
+         Fout[1].r = scratch0.r + scratch1.i;
+         Fout[1].i = scratch0.i - scratch1.r;
+         Fout[3].r = scratch0.r - scratch1.i;
+         Fout[3].i = scratch0.i + scratch1.r;
+         Fout+=4;
+      }
+   } else {
+      int j;
+      kiss_fft_cpx scratch[6];
+      const kiss_twiddle_cpx *tw1,*tw2,*tw3;
+      const int m2=2*m;
+      const int m3=3*m;
+      kiss_fft_cpx * Fout_beg = Fout;
+      for (i=0;i<N;i++)
+      {
+         Fout = Fout_beg + i*mm;
+         tw3 = tw2 = tw1 = st->twiddles;
+         /* m is guaranteed to be a multiple of 4. */
+         for (j=0;j<m;j++)
+         {
+            C_MUL(scratch[0],Fout[m] , *tw1 );
+            C_MUL(scratch[1],Fout[m2] , *tw2 );
+            C_MUL(scratch[2],Fout[m3] , *tw3 );
+
+            C_SUB( scratch[5] , *Fout, scratch[1] );
+            C_ADDTO(*Fout, scratch[1]);
+            C_ADD( scratch[3] , scratch[0] , scratch[2] );
+            C_SUB( scratch[4] , scratch[0] , scratch[2] );
+            C_SUB( Fout[m2], *Fout, scratch[3] );
+            tw1 += fstride;
+            tw2 += fstride*2;
+            tw3 += fstride*3;
+            C_ADDTO( *Fout , scratch[3] );
+
+            Fout[m].r = scratch[5].r + scratch[4].i;
+            Fout[m].i = scratch[5].i - scratch[4].r;
+            Fout[m3].r = scratch[5].r - scratch[4].i;
+            Fout[m3].i = scratch[5].i + scratch[4].r;
+            ++Fout;
+         }
      }
   }
 }
-#endif

-static void ki_bfly4(
-                     kiss_fft_cpx * Fout,
-                     const size_t fstride,
-                     const kiss_fft_state *st,
-                     int m,
-                     int N,
-                     int mm
-                    )
-{
-   const kiss_twiddle_cpx *tw1,*tw2,*tw3;
-   kiss_fft_cpx scratch[6];
-   const size_t m2=2*m;
-   const size_t m3=3*m;
-   int i, j;
-
-   kiss_fft_cpx * Fout_beg = Fout;
-   for (i=0;i<N;i++)
-   {
-      Fout = Fout_beg + i*mm;
-      tw3 = tw2 = tw1 = st->twiddles;
-      for (j=0;j<m;j++)
-      {
-         C_MULC(scratch[0],Fout[m] , *tw1 );
-         C_MULC(scratch[1],Fout[m2] , *tw2 );
-         C_MULC(scratch[2],Fout[m3] , *tw3 );
-
-         C_SUB( scratch[5] , *Fout, scratch[1] );
-         C_ADDTO(*Fout, scratch[1]);
-         C_ADD( scratch[3] , scratch[0] , scratch[2] );
-         C_SUB( scratch[4] , scratch[0] , scratch[2] );
-         C_SUB( Fout[m2], *Fout, scratch[3] );
-         tw1 += fstride;
-         tw2 += fstride*2;
-         tw3 += fstride*3;
-         C_ADDTO( *Fout , scratch[3] );
-
-         Fout[m].r = scratch[5].r - scratch[4].i;
-         Fout[m].i = scratch[5].i + scratch[4].r;
-         Fout[m3].r = scratch[5].r + scratch[4].i;
-         Fout[m3].i = scratch[5].i - scratch[4].r;
-         ++Fout;
-      }
-   }
-}

 #ifndef RADIX_TWO_ONLY

-#if 0
 static void kf_bfly3(
                     kiss_fft_cpx * Fout,
                     const size_t fstride,
@ -225,14 +190,19 @@ static void kf_bfly3(
   kiss_twiddle_cpx epi3;

   kiss_fft_cpx * Fout_beg = Fout;
+#ifdef FIXED_POINT
+   epi3.r = -16384;
+   epi3.i = -28378;
+#else
   epi3 = st->twiddles[fstride*m];
+#endif
   for (i=0;i<N;i++)
   {
      Fout = Fout_beg + i*mm;
      tw1=tw2=st->twiddles;
+      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
      k=m;
      do {
-         C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);

         C_MUL(scratch[1],Fout[m] , *tw1);
         C_MUL(scratch[2],Fout[m2] , *tw2);
@ -259,59 +229,9 @@ static void kf_bfly3(
      } while(--k);
   }
 }
-#endif

-static void ki_bfly3(
-                     kiss_fft_cpx * Fout,
-                     const size_t fstride,
-                     const kiss_fft_state *st,
-                     int m,
-                     int N,
-                     int mm
-                    )
-{
-   int i, k;
-   const size_t m2 = 2*m;
-   const kiss_twiddle_cpx *tw1,*tw2;
-   kiss_fft_cpx scratch[5];
-   kiss_twiddle_cpx epi3;

-   kiss_fft_cpx * Fout_beg = Fout;
-   epi3 = st->twiddles[fstride*m];
-   for (i=0;i<N;i++)
-   {
-      Fout = Fout_beg + i*mm;
-      tw1=tw2=st->twiddles;
-      k=m;
-      do{
-
-         C_MULC(scratch[1],Fout[m] , *tw1);
-         C_MULC(scratch[2],Fout[m2] , *tw2);
-
-         C_ADD(scratch[3],scratch[1],scratch[2]);
-         C_SUB(scratch[0],scratch[1],scratch[2]);
-         tw1 += fstride;
-         tw2 += fstride*2;
-
-         Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
-         Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
-
-         C_MULBYSCALAR( scratch[0] , -epi3.i );
-
-         C_ADDTO(*Fout,scratch[3]);
-
-         Fout[m2].r = Fout[m].r + scratch[0].i;
-         Fout[m2].i = Fout[m].i - scratch[0].r;
-
-         Fout[m].r -= scratch[0].i;
-         Fout[m].i += scratch[0].r;
-
-         ++Fout;
-      }while(--k);
-   }
-}
-
-#if 0
+#ifndef OVERRIDE_kf_bfly5
 static void kf_bfly5(
                     kiss_fft_cpx * Fout,
                     const size_t fstride,
@ -324,13 +244,19 @@ static void kf_bfly5(
   kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
   int i, u;
   kiss_fft_cpx scratch[13];
-   const kiss_twiddle_cpx * twiddles = st->twiddles;
   const kiss_twiddle_cpx *tw;
   kiss_twiddle_cpx ya,yb;
   kiss_fft_cpx * Fout_beg = Fout;

-   ya = twiddles[fstride*m];
-   yb = twiddles[fstride*2*m];
+#ifdef FIXED_POINT
+   ya.r = 10126;
+   ya.i = -31164;
+   yb.r = -26510;
+   yb.i = -19261;
+#else
+   ya = st->twiddles[fstride*m];
+   yb = st->twiddles[fstride*2*m];
+#endif
   tw=st->twiddles;

   for (i=0;i<N;i++)
@ -342,8 +268,8 @@ static void kf_bfly5(
      Fout3=Fout0+3*m;
      Fout4=Fout0+4*m;

+      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
      for ( u=0; u<m; ++u ) {
-         C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
         scratch[0] = *Fout0;

         C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
@ -380,75 +306,8 @@ static void kf_bfly5(
      }
   }
 }
-#endif
+#endif /* OVERRIDE_kf_bfly5 */

-static void ki_bfly5(
-                     kiss_fft_cpx * Fout,
-                     const size_t fstride,
-                     const kiss_fft_state *st,
-                     int m,
-                     int N,
-                     int mm
-                    )
-{
-   kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
-   int i, u;
-   kiss_fft_cpx scratch[13];
-   const kiss_twiddle_cpx * twiddles = st->twiddles;
-   const kiss_twiddle_cpx *tw;
-   kiss_twiddle_cpx ya,yb;
-   kiss_fft_cpx * Fout_beg = Fout;
-
-   ya = twiddles[fstride*m];
-   yb = twiddles[fstride*2*m];
-   tw=st->twiddles;
-
-   for (i=0;i<N;i++)
-   {
-      Fout = Fout_beg + i*mm;
-      Fout0=Fout;
-      Fout1=Fout0+m;
-      Fout2=Fout0+2*m;
-      Fout3=Fout0+3*m;
-      Fout4=Fout0+4*m;
-
-      for ( u=0; u<m; ++u ) {
-         scratch[0] = *Fout0;
-
-         C_MULC(scratch[1] ,*Fout1, tw[u*fstride]);
-         C_MULC(scratch[2] ,*Fout2, tw[2*u*fstride]);
-         C_MULC(scratch[3] ,*Fout3, tw[3*u*fstride]);
-         C_MULC(scratch[4] ,*Fout4, tw[4*u*fstride]);
-
-         C_ADD( scratch[7],scratch[1],scratch[4]);
-         C_SUB( scratch[10],scratch[1],scratch[4]);
-         C_ADD( scratch[8],scratch[2],scratch[3]);
-         C_SUB( scratch[9],scratch[2],scratch[3]);
-
-         Fout0->r += scratch[7].r + scratch[8].r;
-         Fout0->i += scratch[7].i + scratch[8].i;
-
-         scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
-         scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
-
-         scratch[6].r = -S_MUL(scratch[10].i,ya.i) - S_MUL(scratch[9].i,yb.i);
-         scratch[6].i =  S_MUL(scratch[10].r,ya.i) + S_MUL(scratch[9].r,yb.i);
-
-         C_SUB(*Fout1,scratch[5],scratch[6]);
-         C_ADD(*Fout4,scratch[5],scratch[6]);
-
-         scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
-         scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
-         scratch[12].r =  S_MUL(scratch[10].i,yb.i) - S_MUL(scratch[9].i,ya.i);
-         scratch[12].i = -S_MUL(scratch[10].r,yb.i) + S_MUL(scratch[9].r,ya.i);
-
-         C_ADD(*Fout2,scratch[11],scratch[12]);
-         C_SUB(*Fout3,scratch[11],scratch[12]);
-
-         ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
-      }
-   }
-}

 #endif

@ -496,6 +355,9 @@ static
 int kf_factor(int n,opus_int16 * facbuf)
 {
    int p=4;
+    int i;
+    int stages=0;
+    int nbak = n;

    /*factor out powers of 4, powers of 2, then any remaining primes */
    do {
@ -517,9 +379,30 @@ int kf_factor(int n,opus_int16 * facbuf)
        {
           return 0;
        }
-        *facbuf++ = p;
-        *facbuf++ = n;
+        facbuf[2*stages] = p;
+        if (p==2 && stages > 1)
+        {
+           facbuf[2*stages] = 4;
+           facbuf[2] = 2;
+        }
+        stages++;
    } while (n > 1);
+    n = nbak;
+    /* Reverse the order to get the radix 4 at the end, so we can use the
+       fast degenerate case. It turns out that reversing the order also
+       improves the noise behaviour. */
+    for (i=0;i<stages/2;i++)
+    {
+       int tmp;
+       tmp = facbuf[2*i];
+       facbuf[2*i] = facbuf[2*(stages-i-1)];
+       facbuf[2*(stages-i-1)] = tmp;
+    }
+    for (i=0;i<stages;i++)
+    {
+        n /= facbuf[2*i];
+        facbuf[2*i+1] = n;
+    }
    return 1;
 }

@ -563,14 +446,20 @@ kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,  co
        kiss_twiddle_cpx *twiddles;

        st->nfft=nfft;
-#ifndef FIXED_POINT
+#ifdef FIXED_POINT
+        st->scale_shift = celt_ilog2(st->nfft);
+        if (st->nfft == 1<<st->scale_shift)
+           st->scale = Q15ONE;
+        else
+           st->scale = (1073741824+st->nfft/2)/st->nfft>>(15-st->scale_shift);
+#else
        st->scale = 1.f/nfft;
 #endif
        if (base != NULL)
        {
           st->twiddles = base->twiddles;
           st->shift = 0;
-           while (nfft<<st->shift != base->nfft && st->shift < 32)
+           while (st->shift < 32 && nfft<<st->shift != base->nfft)
              st->shift++;
           if (st->shift>=32)
              goto fail;
@ -614,8 +503,7 @@ void opus_fft_free(const kiss_fft_state *cfg)

 #endif /* CUSTOM_MODES */

-#if 0
-void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
+void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout)
 {
    int m2, m;
    int p;
@ -627,17 +515,6 @@ void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fou
    /* st->shift can be -1 */
    shift = st->shift>0 ? st->shift : 0;

-    celt_assert2 (fin != fout, "In-place FFT not supported");
-    /* Bit-reverse the input */
-    for (i=0;i<st->nfft;i++)
-    {
-       fout[st->bitrev[i]] = fin[i];
-#ifndef FIXED_POINT
-       fout[st->bitrev[i]].r *= st->scale;
-       fout[st->bitrev[i]].i *= st->scale;
-#endif
-    }
-
    fstride[0] = 1;
    L=0;
    do {
@ -656,7 +533,7 @@ void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fou
       switch (st->factors[2*i])
       {
       case 2:
-          kf_bfly2(fout,fstride[i]<<shift,st,m, fstride[i], m2);
+          kf_bfly2(fout, m, fstride[i]);
          break;
       case 4:
          kf_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2);
@ -673,57 +550,44 @@ void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fou
       m = m2;
    }
 }
+
+#if 0
+void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
+{
+   int i;
+   opus_val16 scale;
+#ifdef FIXED_POINT
+   /* Allows us to scale with MULT16_32_Q16(), which is faster than
+      MULT16_32_Q15() on ARM. */
+   int scale_shift = st->scale_shift-1;
+#endif
+   scale = st->scale;
+
+   celt_assert2 (fin != fout, "In-place FFT not supported");
+   /* Bit-reverse the input */
+   for (i=0;i<st->nfft;i++)
+   {
+      kiss_fft_cpx x = fin[i];
+      fout[st->bitrev[i]].r = SHR32(MULT16_32_Q16(scale, x.r), scale_shift);
+      fout[st->bitrev[i]].i = SHR32(MULT16_32_Q16(scale, x.i), scale_shift);
+   }
+   opus_fft_impl(st, fout);
+}
 #endif

+
+#ifdef TEST_UNIT_DFT_C
 void opus_ifft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
 {
-   int m2, m;
-   int p;
-   int L;
-   int fstride[MAXFACTORS];
   int i;
-   int shift;
-
-   /* st->shift can be -1 */
-   shift = st->shift>0 ? st->shift : 0;
   celt_assert2 (fin != fout, "In-place FFT not supported");
   /* Bit-reverse the input */
   for (i=0;i<st->nfft;i++)
      fout[st->bitrev[i]] = fin[i];
-
-   fstride[0] = 1;
-   L=0;
-   do {
-      p = st->factors[2*L];
-      m = st->factors[2*L+1];
-      fstride[L+1] = fstride[L]*p;
-      L++;
-   } while(m!=1);
-   m = st->factors[2*L-1];
-   for (i=L-1;i>=0;i--)
-   {
-      if (i!=0)
-         m2 = st->factors[2*i-1];
-      else
-         m2 = 1;
-      switch (st->factors[2*i])
-      {
-      case 2:
-         ki_bfly2(fout,fstride[i]<<shift,st,m, fstride[i], m2);
-         break;
-      case 4:
-         ki_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2);
-         break;
-#ifndef RADIX_TWO_ONLY
-      case 3:
-         ki_bfly3(fout,fstride[i]<<shift,st,m, fstride[i], m2);
-         break;
-      case 5:
-         ki_bfly5(fout,fstride[i]<<shift,st,m, fstride[i], m2);
-         break;
-#endif
-      }
-      m = m2;
-   }
+   for (i=0;i<st->nfft;i++)
+      fout[i].i = -fout[i].i;
+   opus_fft_impl(st, fout);
+   for (i=0;i<st->nfft;i++)
+      fout[i].i = -fout[i].i;
 }
-
+#endif
--- a/lib/rbcodec/codecs/libopus/celt/kiss_fft.h
+++ b/lib/rbcodec/codecs/libopus/celt/kiss_fft.h
@ -79,8 +79,9 @@ typedef struct {

 typedef struct kiss_fft_state{
    int nfft;
-#ifndef FIXED_POINT
-    kiss_fft_scalar scale;
+    opus_val16 scale;
+#ifdef FIXED_POINT
+    int scale_shift;
 #endif
    int shift;
    opus_int16 factors[2*MAXFACTORS];
@ -128,14 +129,10 @@ kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem);
    f[k].r and f[k].i
 * */
 void opus_fft(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
+void opus_ifft(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);

-#if defined(CPU_COLDFIRE)
-#define IFFT_ICODE ICODE_ATTR
-#else
-#define IFFT_ICODE
-#endif
-
-void opus_ifft(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout) IFFT_ICODE;
+void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout);
+void opus_ifft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout);

 void opus_fft_free(const kiss_fft_state *cfg);

--- a/lib/rbcodec/codecs/libopus/celt/mdct.c
+++ b/lib/rbcodec/codecs/libopus/celt/mdct.c
@ -53,18 +53,20 @@
 #include "mathops.h"
 #include "stack_alloc.h"

+#if defined(MIPSr1_ASM)
+#include "mips/mdct_mipsr1.h"
+#endif
+
+
 #ifdef CUSTOM_MODES

 int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
 {
   int i;
-   int N4;
   kiss_twiddle_scalar *trig;
-#if defined(FIXED_POINT)
+   int shift;
   int N2=N>>1;
-#endif
   l->n = N;
-   N4 = N>>2;
   l->maxshift = maxshift;
   for (i=0;i<=maxshift;i++)
   {
@ -77,17 +79,28 @@ int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
         return 0;
 #endif
   }
-   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N4+1)*sizeof(kiss_twiddle_scalar));
+   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar));
   if (l->trig==NULL)
     return 0;
-   /* We have enough points that sine isn't necessary */
+   for (shift=0;shift<=maxshift;shift++)
+   {
+      /* We have enough points that sine isn't necessary */
 #if defined(FIXED_POINT)
-   for (i=0;i<=N4;i++)
-      trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N));
+#if 1
+      for (i=0;i<N2;i++)
+         trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N));
 #else
-   for (i=0;i<=N4;i++)
-      trig[i] = (kiss_twiddle_scalar)cos(2*PI*i/N);
+      for (i=0;i<N2;i++)
+         trig[i] = (kiss_twiddle_scalar)MAX32(-32767,MIN32(32767,floor(.5+32768*cos(2*M_PI*(i+.125)/N))));
 #endif
+#else
+      for (i=0;i<N2;i++)
+         trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N);
+#endif
+      trig += N2;
+      N2 >>= 1;
+      N >>= 1;
+   }
   return 1;
 }

@ -103,27 +116,37 @@ void clt_mdct_clear(mdct_lookup *l)

 #if 0
 /* Forward MDCT trashes the input array */
+#ifndef OVERRIDE_clt_mdct_forward
 void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
      const opus_val16 *window, int overlap, int shift, int stride)
 {
   int i;
   int N, N2, N4;
-   kiss_twiddle_scalar sine;
   VARDECL(kiss_fft_scalar, f);
-   VARDECL(kiss_fft_scalar, f2);
+   VARDECL(kiss_fft_cpx, f2);
+   const kiss_fft_state *st = l->kfft[shift];
+   const kiss_twiddle_scalar *trig;
+   opus_val16 scale;
+#ifdef FIXED_POINT
+   /* Allows us to scale with MULT16_32_Q16(), which is faster than
+      MULT16_32_Q15() on ARM. */
+   int scale_shift = st->scale_shift-1;
+#endif
   SAVE_STACK;
+   scale = st->scale;
+
   N = l->n;
-   N >>= shift;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
   N2 = N>>1;
   N4 = N>>2;
+
   ALLOC(f, N2, kiss_fft_scalar);
-   ALLOC(f2, N2, kiss_fft_scalar);
-   /* sin(x) ~= x here */
-#ifdef FIXED_POINT
-   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
-#else
-   sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
-#endif
+   ALLOC(f2, N4, kiss_fft_cpx);

   /* Consider the input to be composed of four blocks: [a, b, c, d] */
   /* Window, shuffle, fold */
@ -168,125 +191,131 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
   /* Pre-rotation */
   {
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      const kiss_twiddle_scalar *t = &trig[0];
      for(i=0;i<N4;i++)
      {
+         kiss_fft_cpx yc;
+         kiss_twiddle_scalar t0, t1;
         kiss_fft_scalar re, im, yr, yi;
-         re = yp[0];
-         im = yp[1];
-         yr = -S_MUL(re,t[i<<shift])  -  S_MUL(im,t[(N4-i)<<shift]);
-         yi = -S_MUL(im,t[i<<shift])  +  S_MUL(re,t[(N4-i)<<shift]);
-         /* works because the cos is nearly one */
-         *yp++ = yr + S_MUL(yi,sine);
-         *yp++ = yi - S_MUL(yr,sine);
+         t0 = t[i];
+         t1 = t[N4+i];
+         re = *yp++;
+         im = *yp++;
+         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
+         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
+         yc.r = yr;
+         yc.i = yi;
+         yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift);
+         yc.i = PSHR32(MULT16_32_Q16(scale, yc.i), scale_shift);
+         f2[st->bitrev[i]] = yc;
      }
   }

-   /* N/4 complex FFT, down-scales by 4/N */
-   opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2);
+   /* N/4 complex FFT, does not downscale anymore */
+   opus_fft_impl(st, f2);

   /* Post-rotate */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
-      const kiss_fft_scalar * OPUS_RESTRICT fp = f2;
+      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      const kiss_twiddle_scalar *t = &trig[0];
      /* Temp pointers to make it really clear to the compiler what we're doing */
      for(i=0;i<N4;i++)
      {
         kiss_fft_scalar yr, yi;
-         yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]);
-         yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]);
-         /* works because the cos is nearly one */
-         *yp1 = yr - S_MUL(yi,sine);
-         *yp2 = yi + S_MUL(yr,sine);;
-         fp += 2;
+         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
+         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
+         *yp1 = yr;
+         *yp2 = yi;
+         fp++;
         yp1 += 2*stride;
         yp2 -= 2*stride;
      }
   }
   RESTORE_STACK;
 }
+#endif /* OVERRIDE_clt_mdct_forward */
 #endif

+#ifndef OVERRIDE_clt_mdct_backward
 void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
      const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
 {
   int i;
   int N, N2, N4;
-   kiss_twiddle_scalar sine;
-/*   VARDECL(kiss_fft_scalar, f2);
-   SAVE_STACK; */
+   const kiss_twiddle_scalar *trig;
+
   N = l->n;
-   N >>= shift;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
   N2 = N>>1;
   N4 = N>>2;
-/*   ALLOC(f2, N2, kiss_fft_scalar); */
-   kiss_fft_scalar f2[N2]; /* worst case 3840b */
-   /* sin(x) ~= x here */
-#ifdef FIXED_POINT
-   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
-#else
-   sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
-#endif

   /* Pre-rotate */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
-      kiss_fft_scalar * OPUS_RESTRICT yp = f2;
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
+      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
+      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
      for(i=0;i<N4;i++)
      {
+         int rev;
         kiss_fft_scalar yr, yi;
-         yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
-         yi =  -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
-         /* works because the cos is nearly one */
-         *yp++ = yr - S_MUL(yi,sine);
-         *yp++ = yi + S_MUL(yr,sine);
+         rev = *bitrev++;
+         yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
+         yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
+         /* We swap real and imag because we use an FFT instead of an IFFT. */
+         yp[2*rev+1] = yr;
+         yp[2*rev] = yi;
+         /* Storing the pre-rotation directly in the bitrev order. */
         xp1+=2*stride;
         xp2-=2*stride;
      }
   }

-   /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
-   opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
+   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)));

   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
      it in-place. */
   {
-      kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
-      kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      kiss_fft_scalar * yp0 = out+(overlap>>1);
+      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
+      const kiss_twiddle_scalar *t = &trig[0];
      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
         middle pair will be computed twice. */
      for(i=0;i<(N4+1)>>1;i++)
      {
         kiss_fft_scalar re, im, yr, yi;
         kiss_twiddle_scalar t0, t1;
-         re = yp0[0];
-         im = yp0[1];
-         t0 = t[i<<shift];
-         t1 = t[(N4-i)<<shift];
+         /* We swap real and imag because we're using an FFT instead of an IFFT. */
+         re = yp0[1];
+         im = yp0[0];
+         t0 = t[i];
+         t1 = t[N4+i];
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t0) - S_MUL(im,t1);
-         yi = S_MUL(im,t0) + S_MUL(re,t1);
-         re = yp1[0];
-         im = yp1[1];
-         /* works because the cos is nearly one */
-         yp0[0] = -(yr - S_MUL(yi,sine));
-         yp1[1] = yi + S_MUL(yr,sine);
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
+         /* We swap real and imag because we're using an FFT instead of an IFFT. */
+         re = yp1[1];
+         im = yp1[0];
+         yp0[0] = yr;
+         yp1[1] = yi;

-         t0 = t[(N4-i-1)<<shift];
-         t1 = t[(i+1)<<shift];
+         t0 = t[(N4-i-1)];
+         t1 = t[(N2-i-1)];
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t0) - S_MUL(im,t1);
-         yi = S_MUL(im,t0) + S_MUL(re,t1);
-         /* works because the cos is nearly one */
-         yp1[0] = -(yr - S_MUL(yi,sine));
-         yp0[1] = yi + S_MUL(yr,sine);
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
+         yp1[0] = yr;
+         yp0[1] = yi;
         yp0 += 2;
         yp1 -= 2;
      }
@ -310,5 +339,5 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
         wp2--;
      }
   }
-/*   RESTORE_STACK; */
 }
+#endif /* OVERRIDE_clt_mdct_backward */
--- a/lib/rbcodec/codecs/libopus/celt/modes.h
+++ b/lib/rbcodec/codecs/libopus/celt/modes.h
@ -39,14 +39,6 @@

 #define MAX_PERIOD 1024

-#ifndef OVERLAP
-#define OVERLAP(mode) ((mode)->overlap)
-#endif
-
-#ifndef FRAMESIZE
-#define FRAMESIZE(mode) ((mode)->mdctSize)
-#endif
-
 typedef struct {
   int size;
   const opus_int16 *index;
--- a/lib/rbcodec/codecs/libopus/celt/pitch.c
+++ b/lib/rbcodec/codecs/libopus/celt/pitch.c
@ -252,15 +252,15 @@ void
 #endif
 celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
 {
-   int i,j;
+   int i;
   /*The EDSP version requires that max_pitch is at least 1, and that _x is
      32-bit aligned.
     Since it's hard to put asserts in assembly, put them here.*/
-   celt_assert(max_pitch>0);
-   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
 #ifdef FIXED_POINT
   opus_val32 maxcorr=1;
 #endif
+   celt_assert(max_pitch>0);
+   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
   for (i=0;i<max_pitch-3;i+=4)
   {
      opus_val32 sum[4]={0,0,0,0};
@ -279,9 +279,8 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr
   /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
   for (;i<max_pitch;i++)
   {
-      opus_val32 sum = 0;
-      for (j=0;j<len;j++)
-         sum = MAC16_16(sum, _x[j],_y[i+j]);
+      opus_val32 sum;
+      sum = celt_inner_prod(_x, _y+i, len);
      xcorr[i] = sum;
 #ifdef FIXED_POINT
      maxcorr = MAX32(maxcorr, sum);
@ -361,12 +360,17 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
 #endif
   for (i=0;i<max_pitch>>1;i++)
   {
-      opus_val32 sum=0;
+      opus_val32 sum;
      xcorr[i] = 0;
      if (abs(i-2*best_pitch[0])>2 && abs(i-2*best_pitch[1])>2)
         continue;
+#ifdef FIXED_POINT
+      sum = 0;
      for (j=0;j<len>>1;j++)
         sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
+#else
+      sum = celt_inner_prod(x_lp, y+i, len>>1);
+#endif
      xcorr[i] = MAX32(-1, sum);
 #ifdef FIXED_POINT
      maxcorr = MAX32(maxcorr, sum);
@ -457,7 +461,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
      opus_val16 g1;
      opus_val16 cont=0;
      opus_val16 thresh;
-      T1 = (2*T0+k)/(2*k);
+      T1 = celt_udiv(2*T0+k, 2*k);
      if (T1 < minperiod)
         break;
      /* Look for another strong correlation at T1b */
@ -469,7 +473,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
            T1b = T0+T1;
      } else
      {
-         T1b = (2*second_check[k]*T0+k)/(2*k);
+         T1b = celt_udiv(2*second_check[k]*T0+k, 2*k);
      }
      dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
      xy += xy2;
@ -514,13 +518,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
      pg = SHR32(frac_div32(best_xy,best_yy+1),16);

   for (k=0;k<3;k++)
-   {
-      int T1 = T+k-1;
-      xy = 0;
-      for (i=0;i<N;i++)
-         xy = MAC16_16(xy, x[i], x[i-T1]);
-      xcorr[k] = xy;
-   }
+      xcorr[k] = celt_inner_prod(x, x-(T+k-1), N);
   if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0]))
      offset = 1;
   else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2]))
--- a/lib/rbcodec/codecs/libopus/celt/pitch.h
+++ b/lib/rbcodec/codecs/libopus/celt/pitch.h
@ -41,8 +41,12 @@
 #include "x86/pitch_sse.h"
 #endif

+#if defined(MIPSr1_ASM)
+#include "mips/pitch_mipsr1.h"
+#endif
+
 #if defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
-# include "arm/pitch_arm.h"
+//# include "arm/pitch_arm.h"
 #endif

 void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,
@ -141,6 +145,18 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
 }
 #endif

+#ifndef OVERRIDE_CELT_INNER_PROD
+static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{
+   int i;
+   opus_val32 xy=0;
+   for (i=0;i<N;i++)
+      xy = MAC16_16(xy, x[i], y[i]);
+   return xy;
+}
+#endif
+
 #ifdef FIXED_POINT
 opus_val32
 #else
--- a/lib/rbcodec/codecs/libopus/celt/rate.c
+++ b/lib/rbcodec/codecs/libopus/celt/rate.c
@ -333,7 +333,7 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end,
      /*Figure out how many left-over bits we would be adding to this band.
        This can include bits we've stolen back from higher, skipped bands.*/
      left = total-psum;
-      percoeff = left/(m->eBands[codedBands]-m->eBands[start]);
+      percoeff = celt_udiv(left, m->eBands[codedBands]-m->eBands[start]);
      left -= (m->eBands[codedBands]-m->eBands[start])*percoeff;
      rem = IMAX(left-(m->eBands[j]-m->eBands[start]),0);
      band_width = m->eBands[codedBands]-m->eBands[j];
@ -414,7 +414,7 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end,

   /* Allocate the remaining bits */
   left = total-psum;
-   percoeff = left/(m->eBands[codedBands]-m->eBands[start]);
+   percoeff = celt_udiv(left, m->eBands[codedBands]-m->eBands[start]);
   left -= (m->eBands[codedBands]-m->eBands[start])*percoeff;
   for (j=start;j<codedBands;j++)
      bits[j] += ((int)percoeff*(m->eBands[j+1]-m->eBands[j]));
@ -465,7 +465,8 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end,
            offset += NClogN>>3;

         /* Divide with rounding */
-         ebits[j] = IMAX(0, (bits[j] + offset + (den<<(BITRES-1))) / (den<<BITRES));
+         ebits[j] = IMAX(0, (bits[j] + offset + (den<<(BITRES-1))));
+         ebits[j] = celt_udiv(ebits[j], den)>>BITRES;

         /* Make sure not to bust */
         if (C*ebits[j] > (bits[j]>>BITRES))
--- a/lib/rbcodec/codecs/libopus/celt/stack_alloc.h
+++ b/lib/rbcodec/codecs/libopus/celt/stack_alloc.h
@ -116,9 +116,11 @@
 #else

 #ifdef CELT_C
+char *scratch_ptr=0; /* assigned by ALLOC_STACK from opus_alloc_scratch() when global_stack is still 0 */
 char *global_stack=0;
 #else
 extern char *global_stack;
+extern char *scratch_ptr; /* defined in the translation unit that defines CELT_C */
 #endif /* CELT_C */

 #ifdef ENABLE_VALGRIND
@ -140,8 +142,12 @@ extern char *global_stack_top;

 #define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1))
 #define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/sizeof(char)),(stack)+=(size)*(sizeof(type)/sizeof(char)),(type*)((stack)-(size)*(sizeof(type)/sizeof(char))))
+#if 0 /* Set this to 1 to instrument pseudostack usage */
+#define RESTORE_STACK (printf("%ld %s:%d\n", global_stack-scratch_ptr, __FILE__, __LINE__),global_stack = _saved_stack) /* prints global_stack-scratch_ptr with file:line, then restores. NOTE(review): the pointer difference is passed to %ld -- confirm the types match on the target */
+#else
 #define RESTORE_STACK (global_stack = _saved_stack)
-#define ALLOC_STACK char *_saved_stack; (global_stack = (global_stack==0) ? opus_alloc_scratch(GLOBAL_STACK_SIZE) : global_stack); _saved_stack = global_stack;
+#endif
+#define ALLOC_STACK char *_saved_stack; (global_stack = (global_stack==0) ? (scratch_ptr=opus_alloc_scratch(GLOBAL_STACK_SIZE)) : global_stack); _saved_stack = global_stack; /* first use allocates the scratch buffer and also records its start in scratch_ptr */

 #endif /* ENABLE_VALGRIND */

--- a/lib/rbcodec/codecs/libopus/celt/static_modes_fixed.h
+++ b/lib/rbcodec/codecs/libopus/celt/static_modes_fixed.h
@ -341,84 +341,84 @@ static const kiss_twiddle_cpx fft_twiddles48000_960[480] ICONST_ATTR = {
 #ifndef FFT_BITREV480
 #define FFT_BITREV480
 static const opus_int16 fft_bitrev480[480] = {
-0, 120, 240, 360, 30, 150, 270, 390, 60, 180, 300, 420, 90, 210, 330,
-450, 15, 135, 255, 375, 45, 165, 285, 405, 75, 195, 315, 435, 105, 225,
-345, 465, 5, 125, 245, 365, 35, 155, 275, 395, 65, 185, 305, 425, 95,
-215, 335, 455, 20, 140, 260, 380, 50, 170, 290, 410, 80, 200, 320, 440,
-110, 230, 350, 470, 10, 130, 250, 370, 40, 160, 280, 400, 70, 190, 310,
-430, 100, 220, 340, 460, 25, 145, 265, 385, 55, 175, 295, 415, 85, 205,
-325, 445, 115, 235, 355, 475, 1, 121, 241, 361, 31, 151, 271, 391, 61,
-181, 301, 421, 91, 211, 331, 451, 16, 136, 256, 376, 46, 166, 286, 406,
-76, 196, 316, 436, 106, 226, 346, 466, 6, 126, 246, 366, 36, 156, 276,
-396, 66, 186, 306, 426, 96, 216, 336, 456, 21, 141, 261, 381, 51, 171,
-291, 411, 81, 201, 321, 441, 111, 231, 351, 471, 11, 131, 251, 371, 41,
-161, 281, 401, 71, 191, 311, 431, 101, 221, 341, 461, 26, 146, 266, 386,
-56, 176, 296, 416, 86, 206, 326, 446, 116, 236, 356, 476, 2, 122, 242,
-362, 32, 152, 272, 392, 62, 182, 302, 422, 92, 212, 332, 452, 17, 137,
-257, 377, 47, 167, 287, 407, 77, 197, 317, 437, 107, 227, 347, 467, 7,
-127, 247, 367, 37, 157, 277, 397, 67, 187, 307, 427, 97, 217, 337, 457,
-22, 142, 262, 382, 52, 172, 292, 412, 82, 202, 322, 442, 112, 232, 352,
-472, 12, 132, 252, 372, 42, 162, 282, 402, 72, 192, 312, 432, 102, 222,
-342, 462, 27, 147, 267, 387, 57, 177, 297, 417, 87, 207, 327, 447, 117,
-237, 357, 477, 3, 123, 243, 363, 33, 153, 273, 393, 63, 183, 303, 423,
-93, 213, 333, 453, 18, 138, 258, 378, 48, 168, 288, 408, 78, 198, 318,
-438, 108, 228, 348, 468, 8, 128, 248, 368, 38, 158, 278, 398, 68, 188,
-308, 428, 98, 218, 338, 458, 23, 143, 263, 383, 53, 173, 293, 413, 83,
-203, 323, 443, 113, 233, 353, 473, 13, 133, 253, 373, 43, 163, 283, 403,
-73, 193, 313, 433, 103, 223, 343, 463, 28, 148, 268, 388, 58, 178, 298,
-418, 88, 208, 328, 448, 118, 238, 358, 478, 4, 124, 244, 364, 34, 154,
-274, 394, 64, 184, 304, 424, 94, 214, 334, 454, 19, 139, 259, 379, 49,
-169, 289, 409, 79, 199, 319, 439, 109, 229, 349, 469, 9, 129, 249, 369,
-39, 159, 279, 399, 69, 189, 309, 429, 99, 219, 339, 459, 24, 144, 264,
-384, 54, 174, 294, 414, 84, 204, 324, 444, 114, 234, 354, 474, 14, 134,
-254, 374, 44, 164, 284, 404, 74, 194, 314, 434, 104, 224, 344, 464, 29,
-149, 269, 389, 59, 179, 299, 419, 89, 209, 329, 449, 119, 239, 359, 479,
+0, 96, 192, 288, 384, 32, 128, 224, 320, 416, 64, 160, 256, 352, 448,
+8, 104, 200, 296, 392, 40, 136, 232, 328, 424, 72, 168, 264, 360, 456,
+16, 112, 208, 304, 400, 48, 144, 240, 336, 432, 80, 176, 272, 368, 464,
+24, 120, 216, 312, 408, 56, 152, 248, 344, 440, 88, 184, 280, 376, 472,
+4, 100, 196, 292, 388, 36, 132, 228, 324, 420, 68, 164, 260, 356, 452,
+12, 108, 204, 300, 396, 44, 140, 236, 332, 428, 76, 172, 268, 364, 460,
+20, 116, 212, 308, 404, 52, 148, 244, 340, 436, 84, 180, 276, 372, 468,
+28, 124, 220, 316, 412, 60, 156, 252, 348, 444, 92, 188, 284, 380, 476,
+1, 97, 193, 289, 385, 33, 129, 225, 321, 417, 65, 161, 257, 353, 449,
+9, 105, 201, 297, 393, 41, 137, 233, 329, 425, 73, 169, 265, 361, 457,
+17, 113, 209, 305, 401, 49, 145, 241, 337, 433, 81, 177, 273, 369, 465,
+25, 121, 217, 313, 409, 57, 153, 249, 345, 441, 89, 185, 281, 377, 473,
+5, 101, 197, 293, 389, 37, 133, 229, 325, 421, 69, 165, 261, 357, 453,
+13, 109, 205, 301, 397, 45, 141, 237, 333, 429, 77, 173, 269, 365, 461,
+21, 117, 213, 309, 405, 53, 149, 245, 341, 437, 85, 181, 277, 373, 469,
+29, 125, 221, 317, 413, 61, 157, 253, 349, 445, 93, 189, 285, 381, 477,
+2, 98, 194, 290, 386, 34, 130, 226, 322, 418, 66, 162, 258, 354, 450,
+10, 106, 202, 298, 394, 42, 138, 234, 330, 426, 74, 170, 266, 362, 458,
+18, 114, 210, 306, 402, 50, 146, 242, 338, 434, 82, 178, 274, 370, 466,
+26, 122, 218, 314, 410, 58, 154, 250, 346, 442, 90, 186, 282, 378, 474,
+6, 102, 198, 294, 390, 38, 134, 230, 326, 422, 70, 166, 262, 358, 454,
+14, 110, 206, 302, 398, 46, 142, 238, 334, 430, 78, 174, 270, 366, 462,
+22, 118, 214, 310, 406, 54, 150, 246, 342, 438, 86, 182, 278, 374, 470,
+30, 126, 222, 318, 414, 62, 158, 254, 350, 446, 94, 190, 286, 382, 478,
+3, 99, 195, 291, 387, 35, 131, 227, 323, 419, 67, 163, 259, 355, 451,
+11, 107, 203, 299, 395, 43, 139, 235, 331, 427, 75, 171, 267, 363, 459,
+19, 115, 211, 307, 403, 51, 147, 243, 339, 435, 83, 179, 275, 371, 467,
+27, 123, 219, 315, 411, 59, 155, 251, 347, 443, 91, 187, 283, 379, 475,
+7, 103, 199, 295, 391, 39, 135, 231, 327, 423, 71, 167, 263, 359, 455,
+15, 111, 207, 303, 399, 47, 143, 239, 335, 431, 79, 175, 271, 367, 463,
+23, 119, 215, 311, 407, 55, 151, 247, 343, 439, 87, 183, 279, 375, 471,
+31, 127, 223, 319, 415, 63, 159, 255, 351, 447, 95, 191, 287, 383, 479,
 };
 #endif

 #ifndef FFT_BITREV240
 #define FFT_BITREV240
 static const opus_int16 fft_bitrev240[240] = {
-0, 60, 120, 180, 15, 75, 135, 195, 30, 90, 150, 210, 45, 105, 165,
-225, 5, 65, 125, 185, 20, 80, 140, 200, 35, 95, 155, 215, 50, 110,
-170, 230, 10, 70, 130, 190, 25, 85, 145, 205, 40, 100, 160, 220, 55,
-115, 175, 235, 1, 61, 121, 181, 16, 76, 136, 196, 31, 91, 151, 211,
-46, 106, 166, 226, 6, 66, 126, 186, 21, 81, 141, 201, 36, 96, 156,
-216, 51, 111, 171, 231, 11, 71, 131, 191, 26, 86, 146, 206, 41, 101,
-161, 221, 56, 116, 176, 236, 2, 62, 122, 182, 17, 77, 137, 197, 32,
-92, 152, 212, 47, 107, 167, 227, 7, 67, 127, 187, 22, 82, 142, 202,
-37, 97, 157, 217, 52, 112, 172, 232, 12, 72, 132, 192, 27, 87, 147,
-207, 42, 102, 162, 222, 57, 117, 177, 237, 3, 63, 123, 183, 18, 78,
-138, 198, 33, 93, 153, 213, 48, 108, 168, 228, 8, 68, 128, 188, 23,
-83, 143, 203, 38, 98, 158, 218, 53, 113, 173, 233, 13, 73, 133, 193,
-28, 88, 148, 208, 43, 103, 163, 223, 58, 118, 178, 238, 4, 64, 124,
-184, 19, 79, 139, 199, 34, 94, 154, 214, 49, 109, 169, 229, 9, 69,
-129, 189, 24, 84, 144, 204, 39, 99, 159, 219, 54, 114, 174, 234, 14,
-74, 134, 194, 29, 89, 149, 209, 44, 104, 164, 224, 59, 119, 179, 239,
+0, 48, 96, 144, 192, 16, 64, 112, 160, 208, 32, 80, 128, 176, 224,
+4, 52, 100, 148, 196, 20, 68, 116, 164, 212, 36, 84, 132, 180, 228,
+8, 56, 104, 152, 200, 24, 72, 120, 168, 216, 40, 88, 136, 184, 232,
+12, 60, 108, 156, 204, 28, 76, 124, 172, 220, 44, 92, 140, 188, 236,
+1, 49, 97, 145, 193, 17, 65, 113, 161, 209, 33, 81, 129, 177, 225,
+5, 53, 101, 149, 197, 21, 69, 117, 165, 213, 37, 85, 133, 181, 229,
+9, 57, 105, 153, 201, 25, 73, 121, 169, 217, 41, 89, 137, 185, 233,
+13, 61, 109, 157, 205, 29, 77, 125, 173, 221, 45, 93, 141, 189, 237,
+2, 50, 98, 146, 194, 18, 66, 114, 162, 210, 34, 82, 130, 178, 226,
+6, 54, 102, 150, 198, 22, 70, 118, 166, 214, 38, 86, 134, 182, 230,
+10, 58, 106, 154, 202, 26, 74, 122, 170, 218, 42, 90, 138, 186, 234,
+14, 62, 110, 158, 206, 30, 78, 126, 174, 222, 46, 94, 142, 190, 238,
+3, 51, 99, 147, 195, 19, 67, 115, 163, 211, 35, 83, 131, 179, 227,
+7, 55, 103, 151, 199, 23, 71, 119, 167, 215, 39, 87, 135, 183, 231,
+11, 59, 107, 155, 203, 27, 75, 123, 171, 219, 43, 91, 139, 187, 235,
+15, 63, 111, 159, 207, 31, 79, 127, 175, 223, 47, 95, 143, 191, 239,
 };
 #endif

 #ifndef FFT_BITREV120
 #define FFT_BITREV120
 static const opus_int16 fft_bitrev120[120] = {
-0, 30, 60, 90, 15, 45, 75, 105, 5, 35, 65, 95, 20, 50, 80,
-110, 10, 40, 70, 100, 25, 55, 85, 115, 1, 31, 61, 91, 16, 46,
-76, 106, 6, 36, 66, 96, 21, 51, 81, 111, 11, 41, 71, 101, 26,
-56, 86, 116, 2, 32, 62, 92, 17, 47, 77, 107, 7, 37, 67, 97,
-22, 52, 82, 112, 12, 42, 72, 102, 27, 57, 87, 117, 3, 33, 63,
-93, 18, 48, 78, 108, 8, 38, 68, 98, 23, 53, 83, 113, 13, 43,
-73, 103, 28, 58, 88, 118, 4, 34, 64, 94, 19, 49, 79, 109, 9,
-39, 69, 99, 24, 54, 84, 114, 14, 44, 74, 104, 29, 59, 89, 119,
+0, 24, 48, 72, 96, 8, 32, 56, 80, 104, 16, 40, 64, 88, 112,
+4, 28, 52, 76, 100, 12, 36, 60, 84, 108, 20, 44, 68, 92, 116,
+1, 25, 49, 73, 97, 9, 33, 57, 81, 105, 17, 41, 65, 89, 113,
+5, 29, 53, 77, 101, 13, 37, 61, 85, 109, 21, 45, 69, 93, 117,
+2, 26, 50, 74, 98, 10, 34, 58, 82, 106, 18, 42, 66, 90, 114,
+6, 30, 54, 78, 102, 14, 38, 62, 86, 110, 22, 46, 70, 94, 118,
+3, 27, 51, 75, 99, 11, 35, 59, 83, 107, 19, 43, 67, 91, 115,
+7, 31, 55, 79, 103, 15, 39, 63, 87, 111, 23, 47, 71, 95, 119,
 };
 #endif

 #ifndef FFT_BITREV60
 #define FFT_BITREV60
 static const opus_int16 fft_bitrev60[60] = {
-0, 15, 30, 45, 5, 20, 35, 50, 10, 25, 40, 55, 1, 16, 31,
-46, 6, 21, 36, 51, 11, 26, 41, 56, 2, 17, 32, 47, 7, 22,
-37, 52, 12, 27, 42, 57, 3, 18, 33, 48, 8, 23, 38, 53, 13,
-28, 43, 58, 4, 19, 34, 49, 9, 24, 39, 54, 14, 29, 44, 59,
+0, 12, 24, 36, 48, 4, 16, 28, 40, 52, 8, 20, 32, 44, 56,
+1, 13, 25, 37, 49, 5, 17, 29, 41, 53, 9, 21, 33, 45, 57,
+2, 14, 26, 38, 50, 6, 18, 30, 42, 54, 10, 22, 34, 46, 58,
+3, 15, 27, 39, 51, 7, 19, 31, 43, 55, 11, 23, 35, 47, 59,
 };
 #endif

@ -426,8 +426,10 @@ static const opus_int16 fft_bitrev60[60] = {
 #define FFT_STATE48000_960_0
 static const kiss_fft_state fft_state48000_960_0 ICONST_ATTR = {
 480,    /* nfft */
+17476,	/* scale: 17476 / 2^(15+8) ~= 1/480 = 1/nfft; NOTE(review): presumably the FFT normalisation factor -- confirm against its use */
+8,      /* scale_shift: 8 here, then 7, 6, 5 in the 240/120/60-point states */
 -1,     /* shift */
-{4, 120, 4, 30, 2, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, }, /* factors */
+{5, 96, 3, 32, 4, 8, 2, 4, 4, 1, 0, 0, 0, 0, 0, 0, },	/* factors: (factor, remaining length) pairs, 5*3*4*2*4 = 480, zero-padded */
 fft_bitrev480,  /* bitrev */
 fft_twiddles48000_960,  /* bitrev */
 };
@ -437,8 +439,10 @@ fft_twiddles48000_960,  /* bitrev */
 #define FFT_STATE48000_960_1
 static const kiss_fft_state fft_state48000_960_1 ICONST_ATTR = {
 240,    /* nfft */
+17476,	/* scale: 17476 / 2^(15+7) ~= 1/240 = 1/nfft; NOTE(review): presumably the FFT normalisation factor -- confirm against its use */
+7,      /* scale_shift */
 1,      /* shift */
-{4, 60, 4, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, },   /* factors */
+{5, 48, 3, 16, 4, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, },	/* factors: (factor, remaining length) pairs, 5*3*4*4 = 240, zero-padded */
 fft_bitrev240,  /* bitrev */
 fft_twiddles48000_960,  /* bitrev */
 };
@ -448,8 +452,10 @@ fft_twiddles48000_960,  /* bitrev */
 #define FFT_STATE48000_960_2
 static const kiss_fft_state fft_state48000_960_2 ICONST_ATTR = {
 120,    /* nfft */
+17476,	/* scale: 17476 / 2^(15+6) ~= 1/120 = 1/nfft; NOTE(review): presumably the FFT normalisation factor -- confirm against its use */
+6,      /* scale_shift */
 2,      /* shift */
-{4, 30, 2, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, },   /* factors */
+{5, 24, 3, 8, 2, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, },	/* factors: (factor, remaining length) pairs, 5*3*2*4 = 120, zero-padded */
 fft_bitrev120,  /* bitrev */
 fft_twiddles48000_960,  /* bitrev */
 };
@ -459,8 +465,10 @@ fft_twiddles48000_960,  /* bitrev */
 #define FFT_STATE48000_960_3
 static const kiss_fft_state fft_state48000_960_3 ICONST_ATTR = {
 60,     /* nfft */
+17476,	/* scale: 17476 / 2^(15+5) ~= 1/60 = 1/nfft; NOTE(review): presumably the FFT normalisation factor -- confirm against its use */
+5,      /* scale_shift */
 3,      /* shift */
-{4, 15, 3, 5, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
+{5, 12, 3, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },	/* factors: (factor, remaining length) pairs, 5*3*4 = 60, zero-padded */
 fft_bitrev60,   /* bitrev */
 fft_twiddles48000_960,  /* bitrev */
 };
@ -470,104 +478,368 @@ fft_twiddles48000_960,  /* bitrev */

 #ifndef MDCT_TWIDDLES960
 #define MDCT_TWIDDLES960
-static const opus_val16 mdct_twiddles960[481] ICONST_ATTR = {
-32767, 32767, 32767, 32767, 32766,
-32763, 32762, 32759, 32757, 32753,
-32751, 32747, 32743, 32738, 32733,
-32729, 32724, 32717, 32711, 32705,
-32698, 32690, 32683, 32676, 32667,
-32658, 32650, 32640, 32631, 32620,
-32610, 32599, 32588, 32577, 32566,
-32554, 32541, 32528, 32515, 32502,
-32487, 32474, 32459, 32444, 32429,
-32413, 32397, 32381, 32364, 32348,
-32331, 32313, 32294, 32277, 32257,
-32239, 32219, 32200, 32180, 32159,
-32138, 32118, 32096, 32074, 32051,
-32029, 32006, 31984, 31960, 31936,
-31912, 31888, 31863, 31837, 31812,
-31786, 31760, 31734, 31707, 31679,
-31652, 31624, 31596, 31567, 31539,
-31508, 31479, 31450, 31419, 31388,
-31357, 31326, 31294, 31262, 31230,
-31198, 31164, 31131, 31097, 31063,
-31030, 30994, 30959, 30924, 30889,
-30853, 30816, 30779, 30743, 30705,
-30668, 30629, 30592, 30553, 30515,
-30475, 30435, 30396, 30356, 30315,
-30274, 30233, 30191, 30149, 30107,
-30065, 30022, 29979, 29936, 29891,
-29847, 29803, 29758, 29713, 29668,
-29622, 29577, 29529, 29483, 29436,
-29390, 29341, 29293, 29246, 29197,
-29148, 29098, 29050, 29000, 28949,
-28899, 28848, 28797, 28746, 28694,
-28642, 28590, 28537, 28485, 28432,
-28378, 28324, 28271, 28217, 28162,
-28106, 28051, 27995, 27940, 27884,
-27827, 27770, 27713, 27657, 27598,
-27540, 27481, 27423, 27365, 27305,
-27246, 27187, 27126, 27066, 27006,
-26945, 26883, 26822, 26760, 26698,
-26636, 26574, 26510, 26448, 26383,
-26320, 26257, 26191, 26127, 26062,
-25997, 25931, 25866, 25800, 25734,
-25667, 25601, 25533, 25466, 25398,
-25330, 25262, 25194, 25125, 25056,
-24987, 24917, 24848, 24778, 24707,
-24636, 24566, 24495, 24424, 24352,
-24280, 24208, 24135, 24063, 23990,
-23917, 23842, 23769, 23695, 23622,
-23546, 23472, 23398, 23322, 23246,
-23171, 23095, 23018, 22942, 22866,
-22788, 22711, 22634, 22557, 22478,
-22400, 22322, 22244, 22165, 22085,
-22006, 21927, 21846, 21766, 21687,
-21606, 21524, 21443, 21363, 21282,
-21199, 21118, 21035, 20954, 20870,
-20788, 20705, 20621, 20538, 20455,
-20371, 20286, 20202, 20118, 20034,
-19947, 19863, 19777, 19692, 19606,
-19520, 19434, 19347, 19260, 19174,
-19088, 18999, 18911, 18825, 18737,
-18648, 18560, 18472, 18384, 18294,
-18205, 18116, 18025, 17936, 17846,
-17757, 17666, 17576, 17485, 17395,
-17303, 17212, 17122, 17030, 16937,
-16846, 16755, 16662, 16569, 16477,
-16385, 16291, 16198, 16105, 16012,
-15917, 15824, 15730, 15636, 15541,
-15447, 15352, 15257, 15162, 15067,
-14973, 14875, 14781, 14685, 14589,
-14493, 14396, 14300, 14204, 14107,
-14010, 13914, 13815, 13718, 13621,
-13524, 13425, 13328, 13230, 13133,
-13033, 12935, 12836, 12738, 12638,
-12540, 12441, 12341, 12241, 12142,
-12044, 11943, 11843, 11744, 11643,
-11542, 11442, 11342, 11241, 11139,
-11039, 10939, 10836, 10736, 10635,
-10534, 10431, 10330, 10228, 10127,
-10024, 9921, 9820, 9718, 9614,
-9512, 9410, 9306, 9204, 9101,
-8998, 8895, 8791, 8689, 8585,
-8481, 8377, 8274, 8171, 8067,
-7962, 7858, 7753, 7650, 7545,
-7441, 7336, 7231, 7129, 7023,
-6917, 6813, 6709, 6604, 6498,
-6393, 6288, 6182, 6077, 5973,
-5867, 5760, 5656, 5549, 5445,
-5339, 5232, 5127, 5022, 4914,
-4809, 4703, 4596, 4490, 4384,
-4278, 4171, 4065, 3958, 3852,
-3745, 3640, 3532, 3426, 3318,
-3212, 3106, 2998, 2891, 2786,
-2679, 2570, 2465, 2358, 2251,
-2143, 2037, 1929, 1823, 1715,
-1609, 1501, 1393, 1287, 1180,
-1073, 964, 858, 751, 644,
-535, 429, 322, 214, 107,
-0, };
+static const opus_val16 mdct_twiddles960[1800] ICONST_ATTR = { /* 1800 = 960 + 480 + 240 + 120: four consecutive sub-tables, each starting at 32767 */
+32767, 32767, 32767, 32766, 32765,
+32763, 32761, 32759, 32756, 32753,
+32750, 32746, 32742, 32738, 32733,
+32728, 32722, 32717, 32710, 32704,
+32697, 32690, 32682, 32674, 32666,
+32657, 32648, 32639, 32629, 32619,
+32609, 32598, 32587, 32576, 32564,
+32552, 32539, 32526, 32513, 32500,
+32486, 32472, 32457, 32442, 32427,
+32411, 32395, 32379, 32362, 32345,
+32328, 32310, 32292, 32274, 32255,
+32236, 32217, 32197, 32177, 32157,
+32136, 32115, 32093, 32071, 32049,
+32027, 32004, 31981, 31957, 31933,
+31909, 31884, 31859, 31834, 31809,
+31783, 31756, 31730, 31703, 31676,
+31648, 31620, 31592, 31563, 31534,
+31505, 31475, 31445, 31415, 31384,
+31353, 31322, 31290, 31258, 31226,
+31193, 31160, 31127, 31093, 31059,
+31025, 30990, 30955, 30920, 30884,
+30848, 30812, 30775, 30738, 30701,
+30663, 30625, 30587, 30548, 30509,
+30470, 30430, 30390, 30350, 30309,
+30269, 30227, 30186, 30144, 30102,
+30059, 30016, 29973, 29930, 29886,
+29842, 29797, 29752, 29707, 29662,
+29616, 29570, 29524, 29477, 29430,
+29383, 29335, 29287, 29239, 29190,
+29142, 29092, 29043, 28993, 28943,
+28892, 28842, 28791, 28739, 28688,
+28636, 28583, 28531, 28478, 28425,
+28371, 28317, 28263, 28209, 28154,
+28099, 28044, 27988, 27932, 27876,
+27820, 27763, 27706, 27648, 27591,
+27533, 27474, 27416, 27357, 27298,
+27238, 27178, 27118, 27058, 26997,
+26936, 26875, 26814, 26752, 26690,
+26628, 26565, 26502, 26439, 26375,
+26312, 26247, 26183, 26119, 26054,
+25988, 25923, 25857, 25791, 25725,
+25658, 25592, 25524, 25457, 25389,
+25322, 25253, 25185, 25116, 25047,
+24978, 24908, 24838, 24768, 24698,
+24627, 24557, 24485, 24414, 24342,
+24270, 24198, 24126, 24053, 23980,
+23907, 23834, 23760, 23686, 23612,
+23537, 23462, 23387, 23312, 23237,
+23161, 23085, 23009, 22932, 22856,
+22779, 22701, 22624, 22546, 22468,
+22390, 22312, 22233, 22154, 22075,
+21996, 21916, 21836, 21756, 21676,
+21595, 21515, 21434, 21352, 21271,
+21189, 21107, 21025, 20943, 20860,
+20777, 20694, 20611, 20528, 20444,
+20360, 20276, 20192, 20107, 20022,
+19937, 19852, 19767, 19681, 19595,
+19509, 19423, 19336, 19250, 19163,
+19076, 18988, 18901, 18813, 18725,
+18637, 18549, 18460, 18372, 18283,
+18194, 18104, 18015, 17925, 17835,
+17745, 17655, 17565, 17474, 17383,
+17292, 17201, 17110, 17018, 16927,
+16835, 16743, 16650, 16558, 16465,
+16372, 16279, 16186, 16093, 15999,
+15906, 15812, 15718, 15624, 15529,
+15435, 15340, 15245, 15150, 15055,
+14960, 14864, 14769, 14673, 14577,
+14481, 14385, 14288, 14192, 14095,
+13998, 13901, 13804, 13706, 13609,
+13511, 13414, 13316, 13218, 13119,
+13021, 12923, 12824, 12725, 12626,
+12527, 12428, 12329, 12230, 12130,
+12030, 11930, 11831, 11730, 11630,
+11530, 11430, 11329, 11228, 11128,
+11027, 10926, 10824, 10723, 10622,
+10520, 10419, 10317, 10215, 10113,
+10011, 9909, 9807, 9704, 9602,
+9499, 9397, 9294, 9191, 9088,
+8985, 8882, 8778, 8675, 8572,
+8468, 8364, 8261, 8157, 8053,
+7949, 7845, 7741, 7637, 7532,
+7428, 7323, 7219, 7114, 7009,
+6905, 6800, 6695, 6590, 6485,
+6380, 6274, 6169, 6064, 5958,
+5853, 5747, 5642, 5536, 5430,
+5325, 5219, 5113, 5007, 4901,
+4795, 4689, 4583, 4476, 4370,
+4264, 4157, 4051, 3945, 3838,
+3732, 3625, 3518, 3412, 3305,
+3198, 3092, 2985, 2878, 2771,
+2664, 2558, 2451, 2344, 2237,
+2130, 2023, 1916, 1809, 1702,
+1594, 1487, 1380, 1273, 1166,
+1059, 952, 844, 737, 630,
+523, 416, 308, 201, 94,
+-13, -121, -228, -335, -442,
+-550, -657, -764, -871, -978,
+-1086, -1193, -1300, -1407, -1514,
+-1621, -1728, -1835, -1942, -2049,
+-2157, -2263, -2370, -2477, -2584,
+-2691, -2798, -2905, -3012, -3118,
+-3225, -3332, -3439, -3545, -3652,
+-3758, -3865, -3971, -4078, -4184,
+-4290, -4397, -4503, -4609, -4715,
+-4821, -4927, -5033, -5139, -5245,
+-5351, -5457, -5562, -5668, -5774,
+-5879, -5985, -6090, -6195, -6301,
+-6406, -6511, -6616, -6721, -6826,
+-6931, -7036, -7140, -7245, -7349,
+-7454, -7558, -7663, -7767, -7871,
+-7975, -8079, -8183, -8287, -8390,
+-8494, -8597, -8701, -8804, -8907,
+-9011, -9114, -9217, -9319, -9422,
+-9525, -9627, -9730, -9832, -9934,
+-10037, -10139, -10241, -10342, -10444,
+-10546, -10647, -10748, -10850, -10951,
+-11052, -11153, -11253, -11354, -11455,
+-11555, -11655, -11756, -11856, -11955,
+-12055, -12155, -12254, -12354, -12453,
+-12552, -12651, -12750, -12849, -12947,
+-13046, -13144, -13242, -13340, -13438,
+-13536, -13633, -13731, -13828, -13925,
+-14022, -14119, -14216, -14312, -14409,
+-14505, -14601, -14697, -14793, -14888,
+-14984, -15079, -15174, -15269, -15364,
+-15459, -15553, -15647, -15741, -15835,
+-15929, -16023, -16116, -16210, -16303,
+-16396, -16488, -16581, -16673, -16766,
+-16858, -16949, -17041, -17133, -17224,
+-17315, -17406, -17497, -17587, -17678,
+-17768, -17858, -17948, -18037, -18127,
+-18216, -18305, -18394, -18483, -18571,
+-18659, -18747, -18835, -18923, -19010,
+-19098, -19185, -19271, -19358, -19444,
+-19531, -19617, -19702, -19788, -19873,
+-19959, -20043, -20128, -20213, -20297,
+-20381, -20465, -20549, -20632, -20715,
+-20798, -20881, -20963, -21046, -21128,
+-21210, -21291, -21373, -21454, -21535,
+-21616, -21696, -21776, -21856, -21936,
+-22016, -22095, -22174, -22253, -22331,
+-22410, -22488, -22566, -22643, -22721,
+-22798, -22875, -22951, -23028, -23104,
+-23180, -23256, -23331, -23406, -23481,
+-23556, -23630, -23704, -23778, -23852,
+-23925, -23998, -24071, -24144, -24216,
+-24288, -24360, -24432, -24503, -24574,
+-24645, -24716, -24786, -24856, -24926,
+-24995, -25064, -25133, -25202, -25270,
+-25339, -25406, -25474, -25541, -25608,
+-25675, -25742, -25808, -25874, -25939,
+-26005, -26070, -26135, -26199, -26264,
+-26327, -26391, -26455, -26518, -26581,
+-26643, -26705, -26767, -26829, -26891,
+-26952, -27013, -27073, -27133, -27193,
+-27253, -27312, -27372, -27430, -27489,
+-27547, -27605, -27663, -27720, -27777,
+-27834, -27890, -27946, -28002, -28058,
+-28113, -28168, -28223, -28277, -28331,
+-28385, -28438, -28491, -28544, -28596,
+-28649, -28701, -28752, -28803, -28854,
+-28905, -28955, -29006, -29055, -29105,
+-29154, -29203, -29251, -29299, -29347,
+-29395, -29442, -29489, -29535, -29582,
+-29628, -29673, -29719, -29764, -29808,
+-29853, -29897, -29941, -29984, -30027,
+-30070, -30112, -30154, -30196, -30238,
+-30279, -30320, -30360, -30400, -30440,
+-30480, -30519, -30558, -30596, -30635,
+-30672, -30710, -30747, -30784, -30821,
+-30857, -30893, -30929, -30964, -30999,
+-31033, -31068, -31102, -31135, -31168,
+-31201, -31234, -31266, -31298, -31330,
+-31361, -31392, -31422, -31453, -31483,
+-31512, -31541, -31570, -31599, -31627,
+-31655, -31682, -31710, -31737, -31763,
+-31789, -31815, -31841, -31866, -31891,
+-31915, -31939, -31963, -31986, -32010,
+-32032, -32055, -32077, -32099, -32120,
+-32141, -32162, -32182, -32202, -32222,
+-32241, -32260, -32279, -32297, -32315,
+-32333, -32350, -32367, -32383, -32399,
+-32415, -32431, -32446, -32461, -32475,
+-32489, -32503, -32517, -32530, -32542,
+-32555, -32567, -32579, -32590, -32601,
+-32612, -32622, -32632, -32641, -32651,
+-32659, -32668, -32676, -32684, -32692,
+-32699, -32706, -32712, -32718, -32724,
+-32729, -32734, -32739, -32743, -32747,
+-32751, -32754, -32757, -32760, -32762,
+-32764, -32765, -32767, -32767, -32767,
+32767, 32767, 32765, 32761, 32756,
+32750, 32742, 32732, 32722, 32710,
+32696, 32681, 32665, 32647, 32628,
+32608, 32586, 32562, 32538, 32512,
+32484, 32455, 32425, 32393, 32360,
+32326, 32290, 32253, 32214, 32174,
+32133, 32090, 32046, 32001, 31954,
+31906, 31856, 31805, 31753, 31700,
+31645, 31588, 31530, 31471, 31411,
+31349, 31286, 31222, 31156, 31089,
+31020, 30951, 30880, 30807, 30733,
+30658, 30582, 30504, 30425, 30345,
+30263, 30181, 30096, 30011, 29924,
+29836, 29747, 29656, 29564, 29471,
+29377, 29281, 29184, 29086, 28987,
+28886, 28784, 28681, 28577, 28471,
+28365, 28257, 28147, 28037, 27925,
+27812, 27698, 27583, 27467, 27349,
+27231, 27111, 26990, 26868, 26744,
+26620, 26494, 26367, 26239, 26110,
+25980, 25849, 25717, 25583, 25449,
+25313, 25176, 25038, 24900, 24760,
+24619, 24477, 24333, 24189, 24044,
+23898, 23751, 23602, 23453, 23303,
+23152, 22999, 22846, 22692, 22537,
+22380, 22223, 22065, 21906, 21746,
+21585, 21423, 21261, 21097, 20933,
+20767, 20601, 20434, 20265, 20096,
+19927, 19756, 19584, 19412, 19239,
+19065, 18890, 18714, 18538, 18361,
+18183, 18004, 17824, 17644, 17463,
+17281, 17098, 16915, 16731, 16546,
+16361, 16175, 15988, 15800, 15612,
+15423, 15234, 15043, 14852, 14661,
+14469, 14276, 14083, 13889, 13694,
+13499, 13303, 13107, 12910, 12713,
+12515, 12317, 12118, 11918, 11718,
+11517, 11316, 11115, 10913, 10710,
+10508, 10304, 10100, 9896, 9691,
+9486, 9281, 9075, 8869, 8662,
+8455, 8248, 8040, 7832, 7623,
+7415, 7206, 6996, 6787, 6577,
+6366, 6156, 5945, 5734, 5523,
+5311, 5100, 4888, 4675, 4463,
+4251, 4038, 3825, 3612, 3399,
+3185, 2972, 2758, 2544, 2330,
+2116, 1902, 1688, 1474, 1260,
+1045, 831, 617, 402, 188,
+-27, -241, -456, -670, -885,
+-1099, -1313, -1528, -1742, -1956,
+-2170, -2384, -2598, -2811, -3025,
+-3239, -3452, -3665, -3878, -4091,
+-4304, -4516, -4728, -4941, -5153,
+-5364, -5576, -5787, -5998, -6209,
+-6419, -6629, -6839, -7049, -7258,
+-7467, -7676, -7884, -8092, -8300,
+-8507, -8714, -8920, -9127, -9332,
+-9538, -9743, -9947, -10151, -10355,
+-10558, -10761, -10963, -11165, -11367,
+-11568, -11768, -11968, -12167, -12366,
+-12565, -12762, -12960, -13156, -13352,
+-13548, -13743, -13937, -14131, -14324,
+-14517, -14709, -14900, -15091, -15281,
+-15470, -15659, -15847, -16035, -16221,
+-16407, -16593, -16777, -16961, -17144,
+-17326, -17508, -17689, -17869, -18049,
+-18227, -18405, -18582, -18758, -18934,
+-19108, -19282, -19455, -19627, -19799,
+-19969, -20139, -20308, -20475, -20642,
+-20809, -20974, -21138, -21301, -21464,
+-21626, -21786, -21946, -22105, -22263,
+-22420, -22575, -22730, -22884, -23037,
+-23189, -23340, -23490, -23640, -23788,
+-23935, -24080, -24225, -24369, -24512,
+-24654, -24795, -24934, -25073, -25211,
+-25347, -25482, -25617, -25750, -25882,
+-26013, -26143, -26272, -26399, -26526,
+-26651, -26775, -26898, -27020, -27141,
+-27260, -27379, -27496, -27612, -27727,
+-27841, -27953, -28065, -28175, -28284,
+-28391, -28498, -28603, -28707, -28810,
+-28911, -29012, -29111, -29209, -29305,
+-29401, -29495, -29587, -29679, -29769,
+-29858, -29946, -30032, -30118, -30201,
+-30284, -30365, -30445, -30524, -30601,
+-30677, -30752, -30825, -30897, -30968,
+-31038, -31106, -31172, -31238, -31302,
+-31365, -31426, -31486, -31545, -31602,
+-31658, -31713, -31766, -31818, -31869,
+-31918, -31966, -32012, -32058, -32101,
+-32144, -32185, -32224, -32262, -32299,
+-32335, -32369, -32401, -32433, -32463,
+-32491, -32518, -32544, -32568, -32591,
+-32613, -32633, -32652, -32669, -32685,
+-32700, -32713, -32724, -32735, -32744,
+-32751, -32757, -32762, -32766, -32767,
+32767, 32764, 32755, 32741, 32720,
+32694, 32663, 32626, 32583, 32535,
+32481, 32421, 32356, 32286, 32209,
+32128, 32041, 31948, 31850, 31747,
+31638, 31523, 31403, 31278, 31148,
+31012, 30871, 30724, 30572, 30415,
+30253, 30086, 29913, 29736, 29553,
+29365, 29172, 28974, 28771, 28564,
+28351, 28134, 27911, 27684, 27452,
+27216, 26975, 26729, 26478, 26223,
+25964, 25700, 25432, 25159, 24882,
+24601, 24315, 24026, 23732, 23434,
+23133, 22827, 22517, 22204, 21886,
+21565, 21240, 20912, 20580, 20244,
+19905, 19563, 19217, 18868, 18516,
+18160, 17802, 17440, 17075, 16708,
+16338, 15964, 15588, 15210, 14829,
+14445, 14059, 13670, 13279, 12886,
+12490, 12093, 11693, 11291, 10888,
+10482, 10075, 9666, 9255, 8843,
+8429, 8014, 7597, 7180, 6760,
+6340, 5919, 5496, 5073, 4649,
+4224, 3798, 3372, 2945, 2517,
+2090, 1661, 1233, 804, 375,
+-54, -483, -911, -1340, -1768,
+-2197, -2624, -3052, -3479, -3905,
+-4330, -4755, -5179, -5602, -6024,
+-6445, -6865, -7284, -7702, -8118,
+-8533, -8946, -9358, -9768, -10177,
+-10584, -10989, -11392, -11793, -12192,
+-12589, -12984, -13377, -13767, -14155,
+-14541, -14924, -15305, -15683, -16058,
+-16430, -16800, -17167, -17531, -17892,
+-18249, -18604, -18956, -19304, -19649,
+-19990, -20329, -20663, -20994, -21322,
+-21646, -21966, -22282, -22595, -22904,
+-23208, -23509, -23806, -24099, -24387,
+-24672, -24952, -25228, -25499, -25766,
+-26029, -26288, -26541, -26791, -27035,
+-27275, -27511, -27741, -27967, -28188,
+-28405, -28616, -28823, -29024, -29221,
+-29412, -29599, -29780, -29957, -30128,
+-30294, -30455, -30611, -30761, -30906,
+-31046, -31181, -31310, -31434, -31552,
+-31665, -31773, -31875, -31972, -32063,
+-32149, -32229, -32304, -32373, -32437,
+-32495, -32547, -32594, -32635, -32671,
+-32701, -32726, -32745, -32758, -32766,
+32767, 32754, 32717, 32658, 32577,
+32473, 32348, 32200, 32029, 31837,
+31624, 31388, 31131, 30853, 30553,
+30232, 29891, 29530, 29148, 28746,
+28324, 27883, 27423, 26944, 26447,
+25931, 25398, 24847, 24279, 23695,
+23095, 22478, 21846, 21199, 20538,
+19863, 19174, 18472, 17757, 17030,
+16291, 15541, 14781, 14010, 13230,
+12441, 11643, 10837, 10024, 9204,
+8377, 7545, 6708, 5866, 5020,
+4171, 3319, 2464, 1608, 751,
+-107, -965, -1822, -2678, -3532,
+-4383, -5232, -6077, -6918, -7754,
+-8585, -9409, -10228, -11039, -11843,
+-12639, -13426, -14204, -14972, -15730,
+-16477, -17213, -17937, -18648, -19347,
+-20033, -20705, -21363, -22006, -22634,
+-23246, -23843, -24423, -24986, -25533,
+-26062, -26573, -27066, -27540, -27995,
+-28431, -28848, -29245, -29622, -29979,
+-30315, -30630, -30924, -31197, -31449,
+-31679, -31887, -32074, -32239, -32381,
+-32501, -32600, -32675, -32729, -32759,
+};
 #endif

 static const CELTMode mode48000_960_120 ICONST_ATTR = {
--- a/lib/rbcodec/codecs/libopus/celt/vq.c
+++ b/lib/rbcodec/codecs/libopus/celt/vq.c
@ -37,19 +37,27 @@
 #include "os_support.h"
 #include "bands.h"
 #include "rate.h"
+#include "pitch.h"

+#if defined(MIPSr1_ASM)
+#include "mips/vq_mipsr1.h"
+#endif
+
+#ifndef OVERRIDE_vq_exp_rotation1 /* this definition is compiled only when OVERRIDE_vq_exp_rotation1 is not defined */
 static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_val16 s)
 {
    int i;
+   opus_val16 ms; /* holds NEG16(s) so the former subtraction can be written as a MAC16_16 */
    celt_norm *Xptr;
    Xptr = X;
+   ms = NEG16(s); /* computed once, used by both passes below */
    for (i=0;i<len-stride;i++)
    {
       celt_norm x1, x2;
       x1 = Xptr[0];
       x2 = Xptr[stride];
-      Xptr[stride] = EXTRACT16(SHR32(MULT16_16(c,x2) + MULT16_16(s,x1), 15));
-      *Xptr++      = EXTRACT16(SHR32(MULT16_16(c,x1) - MULT16_16(s,x2), 15));
+      Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2),  s, x1), 15)); /* c*x2 + s*x1; PSHR32 replaces the old SHR32 (presumably a rounding shift -- confirm) */
+      *Xptr++      = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15)); /* c*x1 plus ms*x2, standing in for the old c*x1 - s*x2 */
    }
    Xptr = &X[len-2*stride-1];
    for (i=len-2*stride-1;i>=0;i--)
@ -57,10 +65,11 @@ static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_
       celt_norm x1, x2;
       x1 = Xptr[0];
       x2 = Xptr[stride];
-      Xptr[stride] = EXTRACT16(SHR32(MULT16_16(c,x2) + MULT16_16(s,x1), 15));
-      *Xptr--      = EXTRACT16(SHR32(MULT16_16(c,x1) - MULT16_16(s,x2), 15));
+      Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2),  s, x1), 15)); /* same update as the forward pass, walking backwards */
+      *Xptr--      = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15)); /* same update as the forward pass, walking backwards */
    }
 }
+#endif /* OVERRIDE_vq_exp_rotation1 */

 static void exp_rotation(celt_norm *X, int len, int dir, int stride, int K, int spread)
 {
@ -91,7 +100,7 @@ static void exp_rotation(celt_norm *X, int len, int dir, int stride, int K, int
   }
   /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for
      extract_collapse_mask().*/
-   len /= stride;
+   len = celt_udiv(len, stride);
   for (i=0;i<stride;i++)
   {
      if (dir < 0)
@ -140,13 +149,15 @@ static unsigned extract_collapse_mask(int *iy, int N, int B)
      return 1;
   /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for
      exp_rotation().*/
-   N0 = N/B;
+   N0 = celt_udiv(N, B);
   collapse_mask = 0;
   i=0; do {
      int j;
+      unsigned tmp=0;
      j=0; do {
-         collapse_mask |= (iy[i*N0+j]!=0)<<i;
+         tmp |= iy[i*N0+j];
      } while (++j<N0);
+      collapse_mask |= (tmp!=0)<<i;
   } while (++i<B);
   return collapse_mask;
 }
@ -322,47 +333,34 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc
 unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
      ec_dec *dec, opus_val16 gain)
 {
-   int i;
   opus_val32 Ryy;
   unsigned collapse_mask;
-/*   VARDECL(int, iy);
-   SAVE_STACK; */
-
-   /* the difference between the last two values of eband5ms shifted by maxLM
-      which is 22 << 3 with the static mode */
-   int iy[176];
+   VARDECL(int, iy); /* decoded pulse vector; its N entries are obtained by ALLOC(iy, N, int) below */
+   SAVE_STACK; /* paired with RESTORE_STACK before the return */

   celt_assert2(K>0, "alg_unquant() needs at least one pulse");
   celt_assert2(N>1, "alg_unquant() needs at least two dimensions");
-/*   ALLOC(iy, N, int); */
-   decode_pulses(iy, N, K, dec);
-   Ryy = 0;
-   i=0;
-   do {
-      Ryy = MAC16_16(Ryy, iy[i], iy[i]);
-   } while (++i < N);
+   ALLOC(iy, N, int); /* sized by N rather than by a fixed worst case */
+   Ryy = decode_pulses(iy, N, K, dec); /* fills iy; the return value is passed on to normalise_residual as Ryy (presumably the sum of iy[i]*iy[i] -- confirm in decode_pulses) */
   normalise_residual(iy, X, N, Ryy, gain);
   exp_rotation(X, N, -1, B, K, spread);
   collapse_mask = extract_collapse_mask(iy, N, B);
-/*   RESTORE_STACK; */
+   RESTORE_STACK; /* iy is not used past this point */
   return collapse_mask;
 }

+#ifndef OVERRIDE_renormalise_vector
 void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
 {
   int i;
 #ifdef FIXED_POINT
   int k;
 #endif
-   opus_val32 E = EPSILON;
+   opus_val32 E;
   opus_val16 g;
   opus_val32 t;
-   celt_norm *xptr = X;
-   for (i=0;i<N;i++)
-   {
-      E = MAC16_16(E, *xptr, *xptr);
-      xptr++;
-   }
+   celt_norm *xptr;
+   E = EPSILON + celt_inner_prod(X, X, N);
 #ifdef FIXED_POINT
   k = celt_ilog2(E)>>1;
 #endif
@ -377,8 +375,9 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
   }
   /*return celt_sqrt(E);*/
 }
+#endif /* OVERRIDE_renormalise_vector */

-int stereo_itheta(celt_norm *X, celt_norm *Y, int stereo, int N)
+int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N)
 {
   int i;
   int itheta;
@ -397,14 +396,8 @@ int stereo_itheta(celt_norm *X, celt_norm *Y, int stereo, int N)
         Eside = MAC16_16(Eside, s, s);
      }
   } else {
-      for (i=0;i<N;i++)
-      {
-         celt_norm m, s;
-         m = X[i];
-         s = Y[i];
-         Emid = MAC16_16(Emid, m, m);
-         Eside = MAC16_16(Eside, s, s);
-      }
+      Emid += celt_inner_prod(X, X, N);
+      Eside += celt_inner_prod(Y, Y, N);
   }
   mid = celt_sqrt(Emid);
   side = celt_sqrt(Eside);
--- a/lib/rbcodec/codecs/libopus/celt/vq.h
+++ b/lib/rbcodec/codecs/libopus/celt/vq.h
@ -65,6 +65,6 @@ unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,

 void renormalise_vector(celt_norm *X, int N, opus_val16 gain);

-int stereo_itheta(celt_norm *X, celt_norm *Y, int stereo, int N);
+int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N);

 #endif /* VQ_H */
--- a/lib/rbcodec/codecs/libopus/config.h
+++ b/lib/rbcodec/codecs/libopus/config.h
@ -9,7 +9,8 @@
 #define OPUS_BUILD

 /* alloc stuff */
-#define NONTHREADSAFE_PSEUDOSTACK
+#define VAR_ARRAYS
+#define NORM_ALIASING_HACK

 #define OVERRIDE_OPUS_ALLOC
 #define OVERRIDE_OPUS_FREE
@ -40,6 +41,7 @@
 #endif

 #if defined(CPU_ARM)
+#define OPUS_ARM_ASM
 #if ARM_ARCH == 4
 #define OPUS_ARM_INLINE_ASM
 #elif ARM_ARCH > 4
--- a/lib/rbcodec/codecs/libopus/opus.c
+++ b/lib/rbcodec/codecs/libopus/opus.c
@ -168,6 +168,27 @@ static int parse_size(const unsigned char *data, opus_int32 len, opus_int16 *siz
   }
 }

+int opus_packet_get_samples_per_frame(const unsigned char *data, /* packet bytes; only data[0] is examined */
+      opus_int32 Fs) /* rate the result is scaled by; the return value is a sample count expressed as a fraction of Fs */
+{
+   int audiosize;
+   if (data[0]&0x80) /* top bit of the first byte is set */
+   {
+      audiosize = ((data[0]>>3)&0x3); /* 2-bit size code taken from bits 3-4 */
+      audiosize = (Fs<<audiosize)/400; /* code 0..3 gives Fs/400, Fs/200, Fs/100, Fs/50 (up to integer rounding) */
+   } else if ((data[0]&0x60) == 0x60) /* top bit clear, bits 5 and 6 both set */
+   {
+      audiosize = (data[0]&0x08) ? Fs/50 : Fs/100; /* bit 3 selects the longer (Fs/50) or the shorter (Fs/100) frame */
+   } else {
+      audiosize = ((data[0]>>3)&0x3); /* same 2-bit size code as in the first branch */
+      if (audiosize == 3)
+         audiosize = Fs*60/1000; /* code 3 is special-cased because it is not a power-of-two multiple of Fs/100 */
+      else
+         audiosize = (Fs<<audiosize)/100; /* code 0..2 gives Fs/100, Fs/50, Fs/25 (up to integer rounding) */
+   }
+   return audiosize;
+}
+
 int opus_packet_parse_impl(const unsigned char *data, opus_int32 len,
      int self_delimited, unsigned char *out_toc,
      const unsigned char *frames[48], opus_int16 size[48],
--- a/lib/rbcodec/codecs/libopus/opus_decoder.c
+++ b/lib/rbcodec/codecs/libopus/opus_decoder.c
@ -77,12 +77,6 @@ struct OpusDecoder {
   opus_uint32  rangeFinal;
 };

-#ifdef FIXED_POINT
-static OPUS_INLINE opus_int16 SAT16(opus_int32 x) {
-   return x > 32767 ? 32767 : x < -32768 ? -32768 : (opus_int16)x;
-}
-#endif
-

 int opus_decoder_get_size(int channels)
 {
@ -222,7 +216,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
   VARDECL(opus_val16, pcm_transition_silk);
   int pcm_transition_celt_size;
   VARDECL(opus_val16, pcm_transition_celt);
-   opus_val16 *pcm_transition = NULL; /* Silence false positive "may be used uninitialized" warning */
+   opus_val16 *pcm_transition=NULL;
   int redundant_audio_size;
   VARDECL(opus_val16, redundant_audio);

@ -237,6 +231,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
   int F2_5, F5, F10, F20;
   const opus_val16 *window;
   opus_uint32 redundant_rng = 0;
+   int celt_accum;
   ALLOC_STACK;

   silk_dec = (char*)st+st->silk_dec_offset;
@ -302,6 +297,14 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
      }
   }

+   /* In fixed-point, we can tell CELT to do the accumulation on top of the
+      SILK PCM buffer. This saves some stack space. */
+#ifdef FIXED_POINT
+   celt_accum = (mode != MODE_CELT_ONLY) && (frame_size >= F10);
+#else
+   celt_accum = 0;
+#endif
+
   pcm_transition_silk_size = ALLOC_NONE;
   pcm_transition_celt_size = ALLOC_NONE;
   if (data!=NULL && st->prev_mode > 0 && (
@ -332,14 +335,20 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
   }

   /* Don't allocate any memory when in CELT-only mode */
-   pcm_silk_size = (mode != MODE_CELT_ONLY) ? IMAX(F10, frame_size)*st->channels : ALLOC_NONE;
+   pcm_silk_size = (mode != MODE_CELT_ONLY && !celt_accum) ? IMAX(F10, frame_size)*st->channels : ALLOC_NONE;
   ALLOC(pcm_silk, pcm_silk_size, opus_int16);

   /* SILK processing */
   if (mode != MODE_CELT_ONLY)
   {
      int lost_flag, decoded_samples;
-      opus_int16 *pcm_ptr = pcm_silk;
+      opus_int16 *pcm_ptr;
+#ifdef FIXED_POINT
+      if (celt_accum)
+         pcm_ptr = pcm;
+      else
+#endif
+         pcm_ptr = pcm_silk;

      if (st->prev_mode==MODE_CELT_ONLY)
         silk_InitDecoder( silk_dec );
@ -469,7 +478,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
   {
      celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0));
      celt_decode_with_ec(celt_dec, data+len, redundancy_bytes,
-                          redundant_audio, F5, NULL);
+                          redundant_audio, F5, NULL, 0);
      celt_decoder_ctl(celt_dec, OPUS_GET_FINAL_RANGE(&redundant_rng));
   }

@ -484,25 +493,28 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
         celt_decoder_ctl(celt_dec, OPUS_RESET_STATE);
      /* Decode CELT */
      celt_ret = celt_decode_with_ec(celt_dec, decode_fec ? NULL : data,
-                                     len, pcm, celt_frame_size, &dec);
+                                     len, pcm, celt_frame_size, &dec, celt_accum);
   } else {
      unsigned char silence[2] = {0xFF, 0xFF};
-      for (i=0;i<frame_size*st->channels;i++)
-         pcm[i] = 0;
+      if (!celt_accum)
+      {
+         for (i=0;i<frame_size*st->channels;i++)
+            pcm[i] = 0;
+      }
      /* For hybrid -> SILK transitions, we let the CELT MDCT
         do a fade-out by decoding a silence frame */
      if (st->prev_mode == MODE_HYBRID && !(redundancy && celt_to_silk && st->prev_redundancy) )
      {
         celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0));
-         celt_decode_with_ec(celt_dec, silence, 2, pcm, F2_5, NULL);
+         celt_decode_with_ec(celt_dec, silence, 2, pcm, F2_5, NULL, celt_accum);
      }
   }

-   if (mode != MODE_CELT_ONLY)
+   if (mode != MODE_CELT_ONLY && !celt_accum)
   {
 #ifdef FIXED_POINT
      for (i=0;i<frame_size*st->channels;i++)
-         pcm[i] = SAT16(pcm[i] + pcm_silk[i]);
+         pcm[i] = SAT16(ADD32(pcm[i], pcm_silk[i]));
 #else
      for (i=0;i<frame_size*st->channels;i++)
         pcm[i] = pcm[i] + (opus_val16)((1.f/32768.f)*pcm_silk[i]);
@ -521,7 +533,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
      celt_decoder_ctl(celt_dec, OPUS_RESET_STATE);
      celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0));

-      celt_decode_with_ec(celt_dec, data+len, redundancy_bytes, redundant_audio, F5, NULL);
+      celt_decode_with_ec(celt_dec, data+len, redundancy_bytes, redundant_audio, F5, NULL, 0);
      celt_decoder_ctl(celt_dec, OPUS_GET_FINAL_RANGE(&redundant_rng));
      smooth_fade(pcm+st->channels*(frame_size-F2_5), redundant_audio+st->channels*F2_5,
                  pcm+st->channels*(frame_size-F2_5), F2_5, st->channels, window, st->Fs);
@ -717,6 +729,7 @@ int opus_decode_float(OpusDecoder *st, const unsigned char *data,
 {
   VARDECL(opus_int16, out);
   int ret, i;
+   int nb_samples;
   ALLOC_STACK;

   if(frame_size<=0)
@ -724,6 +737,14 @@ int opus_decode_float(OpusDecoder *st, const unsigned char *data,
      RESTORE_STACK;
      return OPUS_BAD_ARG;
   }
+   if (data != NULL && len > 0 && !decode_fec)
+   {
+      nb_samples = opus_decoder_get_nb_samples(st, data, len);
+      if (nb_samples>0)
+         frame_size = IMIN(frame_size, nb_samples);
+      else
+         return OPUS_INVALID_PACKET;
+   }
   ALLOC(out, frame_size*st->channels, opus_int16);

   ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 0);
@ -744,6 +765,7 @@ int opus_decode(OpusDecoder *st, const unsigned char *data,
 {
   VARDECL(float, out);
   int ret, i;
+   int nb_samples;
   ALLOC_STACK;

   if(frame_size<=0)
@ -752,6 +774,14 @@ int opus_decode(OpusDecoder *st, const unsigned char *data,
      return OPUS_BAD_ARG;
   }

+   if (data != NULL && len > 0 && !decode_fec)
+   {
+      nb_samples = opus_decoder_get_nb_samples(st, data, len);
+      if (nb_samples>0)
+         frame_size = IMIN(frame_size, nb_samples);
+      else
+         return OPUS_INVALID_PACKET;
+   }
   ALLOC(out, frame_size*st->channels, float);

   ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 1);
@ -911,27 +941,6 @@ int opus_packet_get_bandwidth(const unsigned char *data)
   return bandwidth;
 }

-int opus_packet_get_samples_per_frame(const unsigned char *data,
-      opus_int32 Fs)
-{
-   int audiosize;
-   if (data[0]&0x80)
-   {
-      audiosize = ((data[0]>>3)&0x3);
-      audiosize = (Fs<<audiosize)/400;
-   } else if ((data[0]&0x60) == 0x60)
-   {
-      audiosize = (data[0]&0x08) ? Fs/50 : Fs/100;
-   } else {
-      audiosize = ((data[0]>>3)&0x3);
-      if (audiosize == 3)
-         audiosize = Fs*60/1000;
-      else
-         audiosize = (Fs<<audiosize)/100;
-   }
-   return audiosize;
-}
-
 int opus_packet_get_nb_channels(const unsigned char *data)
 {
   return (data[0]&0x4) ? 2 : 1;
--- a/lib/rbcodec/codecs/libopus/opus_defines.h
+++ b/lib/rbcodec/codecs/libopus/opus_defines.h
@ -454,14 +454,6 @@ extern "C" {
  * @hideinitializer */
 #define OPUS_GET_APPLICATION(x) OPUS_GET_APPLICATION_REQUEST, __opus_check_int_ptr(x)

-/** Gets the sampling rate the encoder or decoder was initialized with.
-  * This simply returns the <code>Fs</code> value passed to opus_encoder_init()
-  * or opus_decoder_init().
-  * @param[out] x <tt>opus_int32 *</tt>: Sampling rate of encoder or decoder.
-  * @hideinitializer
-  */
-#define OPUS_GET_SAMPLE_RATE(x) OPUS_GET_SAMPLE_RATE_REQUEST, __opus_check_int_ptr(x)
-
 /** Gets the total samples of delay added by the entire codec.
  * This can be queried by the encoder and then the provided number of samples can be
  * skipped on from the start of the decoder's output to provide time aligned input
@ -545,11 +537,6 @@ extern "C" {
  * @hideinitializer */
 #define OPUS_GET_LSB_DEPTH(x) OPUS_GET_LSB_DEPTH_REQUEST, __opus_check_int_ptr(x)

-/** Gets the duration (in samples) of the last packet successfully decoded or concealed.
-  * @param[out] x <tt>opus_int32 *</tt>: Number of samples (at current sampling rate).
-  * @hideinitializer */
-#define OPUS_GET_LAST_PACKET_DURATION(x) OPUS_GET_LAST_PACKET_DURATION_REQUEST, __opus_check_int_ptr(x)
-
 /** Configures the encoder's use of variable duration frames.
  * When variable duration is enabled, the encoder is free to use a shorter frame
  * size than the one requested in the opus_encode*() call.
@ -649,18 +636,6 @@ extern "C" {
  * @hideinitializer */
 #define OPUS_GET_FINAL_RANGE(x) OPUS_GET_FINAL_RANGE_REQUEST, __opus_check_uint_ptr(x)

-/** Gets the pitch of the last decoded frame, if available.
-  * This can be used for any post-processing algorithm requiring the use of pitch,
-  * e.g. time stretching/shortening. If the last frame was not voiced, or if the
-  * pitch was not coded in the frame, then zero is returned.
-  *
-  * This CTL is only implemented for decoder instances.
-  *
-  * @param[out] x <tt>opus_int32 *</tt>: pitch period at 48 kHz (or 0 if not available)
-  *
-  * @hideinitializer */
-#define OPUS_GET_PITCH(x) OPUS_GET_PITCH_REQUEST, __opus_check_int_ptr(x)
-
 /** Gets the encoder's configured bandpass or the decoder's last bandpass.
  * @see OPUS_SET_BANDWIDTH
  * @param[out] x <tt>opus_int32 *</tt>: Returns one of the following values:
@ -675,6 +650,14 @@ extern "C" {
  * @hideinitializer */
 #define OPUS_GET_BANDWIDTH(x) OPUS_GET_BANDWIDTH_REQUEST, __opus_check_int_ptr(x)

+/** Gets the sampling rate the encoder or decoder was initialized with.
+  * This simply returns the <code>Fs</code> value passed to opus_encoder_init()
+  * or opus_decoder_init().
+  * @param[out] x <tt>opus_int32 *</tt>: Sampling rate of encoder or decoder.
+  * @hideinitializer
+  */
+#define OPUS_GET_SAMPLE_RATE(x) OPUS_GET_SAMPLE_RATE_REQUEST, __opus_check_int_ptr(x)
+
 /**@}*/

 /** @defgroup opus_decoderctls Decoder related CTLs
@ -699,6 +682,23 @@ extern "C" {
  * @hideinitializer */
 #define OPUS_GET_GAIN(x) OPUS_GET_GAIN_REQUEST, __opus_check_int_ptr(x)

+/** Gets the duration (in samples) of the last packet successfully decoded or concealed.
+  * @param[out] x <tt>opus_int32 *</tt>: Number of samples (at current sampling rate).
+  * @hideinitializer */
+#define OPUS_GET_LAST_PACKET_DURATION(x) OPUS_GET_LAST_PACKET_DURATION_REQUEST, __opus_check_int_ptr(x)
+
+/** Gets the pitch of the last decoded frame, if available.
+  * This can be used for any post-processing algorithm requiring the use of pitch,
+  * e.g. time stretching/shortening. If the last frame was not voiced, or if the
+  * pitch was not coded in the frame, then zero is returned.
+  *
+  * This CTL is only implemented for decoder instances.
+  *
+  * @param[out] x <tt>opus_int32 *</tt>: pitch period at 48 kHz (or 0 if not available)
+  *
+  * @hideinitializer */
+#define OPUS_GET_PITCH(x) OPUS_GET_PITCH_REQUEST, __opus_check_int_ptr(x)
+
 /**@}*/

 /** @defgroup opus_libinfo Opus library information functions
--- a/lib/rbcodec/codecs/libopus/opus_private.h
+++ b/lib/rbcodec/codecs/libopus/opus_private.h
@ -86,10 +86,6 @@ typedef void (*downmix_func)(const void *, opus_val32 *, int, int, int, int, int
 void downmix_float(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);
 void downmix_int(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);

-int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
-                int bitrate, opus_val16 tonality, float *mem, int buffering,
-                downmix_func downmix);
-
 int encode_size(int size, unsigned char *data);

 opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs);
@ -104,7 +100,8 @@ opus_int32 compute_frame_size(const void *analysis_pcm, int frame_size,

 opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size,
      unsigned char *data, opus_int32 out_data_bytes, int lsb_depth,
-      const void *analysis_pcm, opus_int32 analysis_size, int c1, int c2, int analysis_channels, downmix_func downmix);
+      const void *analysis_pcm, opus_int32 analysis_size, int c1, int c2,
+      int analysis_channels, downmix_func downmix, int float_api);

 int opus_decode_native(OpusDecoder *st, const unsigned char *data, opus_int32 len,
      opus_val16 *pcm, int frame_size, int decode_fec, int self_delimited,
--- a/lib/rbcodec/codecs/libopus/silk/CNG.c
+++ b/lib/rbcodec/codecs/libopus/silk/CNG.c
@ -34,7 +34,7 @@ POSSIBILITY OF SUCH DAMAGE.

 /* Generates excitation for CNG LPC synthesis */
 static OPUS_INLINE void silk_CNG_exc(
-    opus_int32                       residual_Q10[],     /* O    CNG residual signal Q10                     */
+    opus_int32                       exc_Q10[],          /* O    CNG excitation signal Q10                   */
    opus_int32                       exc_buf_Q14[],      /* I    Random samples buffer Q10                   */
    opus_int32                       Gain_Q16,           /* I    Gain to apply                               */
    opus_int                         length,             /* I    Length                                      */
@ -55,7 +55,7 @@ static OPUS_INLINE void silk_CNG_exc(
        idx = (opus_int)( silk_RSHIFT( seed, 24 ) & exc_mask );
        silk_assert( idx >= 0 );
        silk_assert( idx <= CNG_BUF_MASK_MAX );
-        residual_Q10[ i ] = (opus_int16)silk_SAT16( silk_SMULWW( exc_buf_Q14[ idx ], Gain_Q16 >> 4 ) );
+        exc_Q10[ i ] = (opus_int16)silk_SAT16( silk_SMULWW( exc_buf_Q14[ idx ], Gain_Q16 >> 4 ) );
    }
    *rand_seed = seed;
 }
@ -85,7 +85,7 @@ void silk_CNG(
 )
 {
    opus_int   i, subfr;
-    opus_int32 sum_Q6, max_Gain_Q16;
+    opus_int32 sum_Q6, max_Gain_Q16, gain_Q16;
    opus_int16 A_Q12[ MAX_LPC_ORDER ];
    silk_CNG_struct *psCNG = &psDec->sCNG;
    SAVE_STACK;
@ -125,11 +125,20 @@ void silk_CNG(
    /* Add CNG when packet is lost or during DTX */
    if( psDec->lossCnt ) {
        VARDECL( opus_int32, CNG_sig_Q10 );
-
        ALLOC( CNG_sig_Q10, length + MAX_LPC_ORDER, opus_int32 );

        /* Generate CNG excitation */
-        silk_CNG_exc( CNG_sig_Q10 + MAX_LPC_ORDER, psCNG->CNG_exc_buf_Q14, psCNG->CNG_smth_Gain_Q16, length, &psCNG->rand_seed );
+		gain_Q16 = silk_SMULWW( psDec->sPLC.randScale_Q14, psDec->sPLC.prevGain_Q16[1] );
+		if( gain_Q16 >= (1 << 21) || psCNG->CNG_smth_Gain_Q16 > (1 << 23) ) {
+			gain_Q16 = silk_SMULTT( gain_Q16, gain_Q16 );
+			gain_Q16 = silk_SUB_LSHIFT32(silk_SMULTT( psCNG->CNG_smth_Gain_Q16, psCNG->CNG_smth_Gain_Q16 ), gain_Q16, 5 );
+			gain_Q16 = silk_LSHIFT32( silk_SQRT_APPROX( gain_Q16 ), 16 );
+		} else {
+			gain_Q16 = silk_SMULWW( gain_Q16, gain_Q16 );
+			gain_Q16 = silk_SUB_LSHIFT32(silk_SMULWW( psCNG->CNG_smth_Gain_Q16, psCNG->CNG_smth_Gain_Q16 ), gain_Q16, 5 );
+			gain_Q16 = silk_LSHIFT32( silk_SQRT_APPROX( gain_Q16 ), 8 );
+		}
+        silk_CNG_exc( CNG_sig_Q10 + MAX_LPC_ORDER, psCNG->CNG_exc_buf_Q14, gain_Q16, length, &psCNG->rand_seed );

        /* Convert CNG NLSF to filter representation */
        silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order );
@ -162,7 +171,7 @@ void silk_CNG(
            /* Update states */
            CNG_sig_Q10[ MAX_LPC_ORDER + i ] = silk_ADD_LSHIFT( CNG_sig_Q10[ MAX_LPC_ORDER + i ], sum_Q6, 4 );

-            frame[ i ] = silk_ADD_SAT16( frame[ i ], silk_RSHIFT_ROUND( sum_Q6, 6 ) );
+            frame[ i ] = silk_ADD_SAT16( frame[ i ], silk_RSHIFT_ROUND( CNG_sig_Q10[ MAX_LPC_ORDER + i ], 10 ) );
        }
        silk_memcpy( psCNG->CNG_synth_state, &CNG_sig_Q10[ length ], MAX_LPC_ORDER * sizeof( opus_int32 ) );
    } else {
--- a/lib/rbcodec/codecs/libopus/silk/PLC.c
+++ b/lib/rbcodec/codecs/libopus/silk/PLC.c
@ -165,6 +165,30 @@ static OPUS_INLINE void silk_PLC_update(
    psPLC->nb_subfr = psDec->nb_subfr;
 }

+static OPUS_INLINE void silk_PLC_energy(opus_int32 *energy1, opus_int *shift1, opus_int32 *energy2, opus_int *shift2, /* outputs: forwarded to silk_sum_sqr_shift for the first and the second scaled subframe */
+      const opus_int32 *exc_Q14, const opus_int32 *prevGain_Q10, int subfr_length, int nb_subfr) /* inputs: excitation buffer, one gain per scaled subframe, subframe length, subframe count */
+{
+    int i, k;
+    VARDECL( opus_int16, exc_buf );
+    opus_int16 *exc_buf_ptr;
+    SAVE_STACK;
+    ALLOC( exc_buf, 2*subfr_length, opus_int16 ); /* scratch holding two scaled subframes back to back */
+    /* Find random noise component */
+    /* Scale previous excitation signal */
+    exc_buf_ptr = exc_buf;
+    for( k = 0; k < 2; k++ ) { /* k = 0, 1 addresses subframes nb_subfr-2 and nb_subfr-1 of exc_Q14 */
+        for( i = 0; i < subfr_length; i++ ) {
+            exc_buf_ptr[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT(
+                silk_SMULWW( exc_Q14[ i + ( k + nb_subfr - 2 ) * subfr_length ], prevGain_Q10[ k ] ), 8 ) ); /* silk_SMULWW of the excitation sample and prevGain_Q10[k], shifted right by 8, then saturated by silk_SAT16 */
+        }
+        exc_buf_ptr += subfr_length;
+    }
+    /* Measure the energy of each of the two scaled subframes; the caller compares them and uses the lower-energy one as random noise generator */
+    silk_sum_sqr_shift( energy1, shift1, exc_buf,                  subfr_length );
+    silk_sum_sqr_shift( energy2, shift2, &exc_buf[ subfr_length ], subfr_length );
+    RESTORE_STACK; /* paired with SAVE_STACK above */
+}
+
 static OPUS_INLINE void silk_PLC_conceal(
    silk_decoder_state                  *psDec,             /* I/O Decoder state        */
    silk_decoder_control                *psDecCtrl,         /* I/O Decoder control      */
@ -177,19 +201,26 @@ static OPUS_INLINE void silk_PLC_conceal(
    opus_int32 energy1, energy2, *rand_ptr, *pred_lag_ptr;
    opus_int32 LPC_pred_Q10, LTP_pred_Q12;
    opus_int16 rand_scale_Q14;
-    opus_int16 *B_Q14, *exc_buf_ptr;
+    opus_int16 *B_Q14;
    opus_int32 *sLPC_Q14_ptr;
-    VARDECL( opus_int16, exc_buf );
    opus_int16 A_Q12[ MAX_LPC_ORDER ];
+#ifdef SMALL_FOOTPRINT
+    opus_int16 *sLTP;
+#else
    VARDECL( opus_int16, sLTP );
+#endif
    VARDECL( opus_int32, sLTP_Q14 );
    silk_PLC_struct *psPLC = &psDec->sPLC;
    opus_int32 prevGain_Q10[2];
    SAVE_STACK;

-    ALLOC( exc_buf, 2*psPLC->subfr_length, opus_int16 );
-    ALLOC( sLTP, psDec->ltp_mem_length, opus_int16 );
    ALLOC( sLTP_Q14, psDec->ltp_mem_length + psDec->frame_length, opus_int32 );
+#ifdef SMALL_FOOTPRINT
+    /* Ugly hack that breaks aliasing rules to save stack: put sLTP at the very end of sLTP_Q14. */
+    sLTP = ((opus_int16*)&sLTP_Q14[psDec->ltp_mem_length + psDec->frame_length])-psDec->ltp_mem_length;
+#else
+    ALLOC( sLTP, psDec->ltp_mem_length, opus_int16 );
+#endif

    prevGain_Q10[0] = silk_RSHIFT( psPLC->prevGain_Q16[ 0 ], 6);
    prevGain_Q10[1] = silk_RSHIFT( psPLC->prevGain_Q16[ 1 ], 6);
@ -198,19 +229,7 @@ static OPUS_INLINE void silk_PLC_conceal(
       silk_memset( psPLC->prevLPC_Q12, 0, sizeof( psPLC->prevLPC_Q12 ) );
    }

-    /* Find random noise component */
-    /* Scale previous excitation signal */
-    exc_buf_ptr = exc_buf;
-    for( k = 0; k < 2; k++ ) {
-        for( i = 0; i < psPLC->subfr_length; i++ ) {
-            exc_buf_ptr[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT(
-                silk_SMULWW( psDec->exc_Q14[ i + ( k + psPLC->nb_subfr - 2 ) * psPLC->subfr_length ], prevGain_Q10[ k ] ), 8 ) );
-        }
-        exc_buf_ptr += psPLC->subfr_length;
-    }
-    /* Find the subframe with lowest energy of the last two and use that as random noise generator */
-    silk_sum_sqr_shift( &energy1, &shift1, exc_buf,                         psPLC->subfr_length );
-    silk_sum_sqr_shift( &energy2, &shift2, &exc_buf[ psPLC->subfr_length ], psPLC->subfr_length );
+    silk_PLC_energy(&energy1, &shift1, &energy2, &shift2, psDec->exc_Q14, prevGain_Q10, psDec->subfr_length, psDec->nb_subfr);

    if( silk_RSHIFT( energy1, shift2 ) < silk_RSHIFT( energy2, shift1 ) ) {
        /* First sub-frame has lowest energy */
--- a/lib/rbcodec/codecs/libopus/silk/SigProc_FIX.h
+++ b/lib/rbcodec/codecs/libopus/silk/SigProc_FIX.h
@ -587,6 +587,11 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
 #include "arm/SigProc_FIX_armv5e.h"
 #endif

+#if defined(MIPSr1_ASM)
+#include "mips/sigproc_fix_mipsr1.h"
+#endif
+
+
 #ifdef  __cplusplus
 }
 #endif
--- a/lib/rbcodec/codecs/libopus/silk/code_signs.c
+++ b/lib/rbcodec/codecs/libopus/silk/code_signs.c
@ -76,7 +76,7 @@ void silk_encode_signs(
 /* Decodes signs of excitation */
 void silk_decode_signs(
    ec_dec                      *psRangeDec,                        /* I/O  Compressor data structure                   */
-    opus_int                    pulses[],                           /* I/O  pulse signal                                */
+    opus_int16                  pulses[],                           /* I/O  pulse signal                                */
    opus_int                    length,                             /* I    length of input                             */
    const opus_int              signalType,                         /* I    Signal type                                 */
    const opus_int              quantOffsetType,                    /* I    Quantization offset type                    */
@ -85,7 +85,7 @@ void silk_decode_signs(
 {
    opus_int         i, j, p;
    opus_uint8       icdf[ 2 ];
-    opus_int         *q_ptr;
+    opus_int16       *q_ptr;
    const opus_uint8 *icdf_ptr;

    icdf[ 1 ] = 0;
--- a/lib/rbcodec/codecs/libopus/silk/dec_API.c
+++ b/lib/rbcodec/codecs/libopus/silk/dec_API.c
@ -31,6 +31,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "API.h"
 #include "main.h"
 #include "stack_alloc.h"
+#include "os_support.h"

 /************************/
 /* Decoder Super Struct */
@ -90,7 +91,8 @@ opus_int silk_Decode(                                   /* O    Returns error co
    opus_int   i, n, decode_only_middle = 0, ret = SILK_NO_ERROR;
    opus_int32 nSamplesOutDec, LBRR_symbol;
    opus_int16 *samplesOut1_tmp[ 2 ];
-    VARDECL( opus_int16, samplesOut1_tmp_storage );
+    VARDECL( opus_int16, samplesOut1_tmp_storage1 );
+    VARDECL( opus_int16, samplesOut1_tmp_storage2 );
    VARDECL( opus_int16, samplesOut2_tmp );
    opus_int32 MS_pred_Q13[ 2 ] = { 0 };
    opus_int16 *resample_out_ptr;
@ -98,6 +100,7 @@ opus_int silk_Decode(                                   /* O    Returns error co
    silk_decoder_state *channel_state = psDec->channel_state;
    opus_int has_side;
    opus_int stereo_to_mono;
+    int delay_stack_alloc;
    SAVE_STACK;

    silk_assert( decControl->nChannelsInternal == 1 || decControl->nChannelsInternal == 2 );
@ -196,7 +199,7 @@ opus_int silk_Decode(                                   /* O    Returns error co
            for( i = 0; i < channel_state[ 0 ].nFramesPerPacket; i++ ) {
                for( n = 0; n < decControl->nChannelsInternal; n++ ) {
                    if( channel_state[ n ].LBRR_flags[ i ] ) {
-                        opus_int pulses[ MAX_FRAME_LENGTH ];
+                        opus_int16 pulses[ MAX_FRAME_LENGTH ];
                        opus_int condCoding;

                        if( decControl->nChannelsInternal == 2 && n == 0 ) {
@ -251,13 +254,22 @@ opus_int silk_Decode(                                   /* O    Returns error co
        psDec->channel_state[ 1 ].first_frame_after_reset = 1;
    }

-    ALLOC( samplesOut1_tmp_storage,
-           decControl->nChannelsInternal*(
-               channel_state[ 0 ].frame_length + 2 ),
+    /* Check if the temp buffer fits into the output PCM buffer. If it fits,
+       we can delay allocating the temp buffer until after the SILK peak stack
+       usage. We need to use a < and not a <= because of the two extra samples. */
+    delay_stack_alloc = decControl->internalSampleRate*decControl->nChannelsInternal
+          < decControl->API_sampleRate*decControl->nChannelsAPI;
+    ALLOC( samplesOut1_tmp_storage1, delay_stack_alloc ? ALLOC_NONE
+           : decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2 ),
           opus_int16 );
-    samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage;
-    samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage
-                           + channel_state[ 0 ].frame_length + 2;
+    if ( delay_stack_alloc )
+    {
+       samplesOut1_tmp[ 0 ] = samplesOut;
+       samplesOut1_tmp[ 1 ] = samplesOut + channel_state[ 0 ].frame_length + 2;
+    } else {
+       samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage1;
+       samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage1 + channel_state[ 0 ].frame_length + 2;
+    }

    if( lostFlag == FLAG_DECODE_NORMAL ) {
        has_side = !decode_only_middle;
@ -312,6 +324,15 @@ opus_int silk_Decode(                                   /* O    Returns error co
        resample_out_ptr = samplesOut;
    }

+    ALLOC( samplesOut1_tmp_storage2, delay_stack_alloc
+           ? decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2 )
+           : ALLOC_NONE,
+           opus_int16 );
+    if ( delay_stack_alloc ) {
+       OPUS_COPY(samplesOut1_tmp_storage2, samplesOut, decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2));
+       samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage2;
+       samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage2 + channel_state[ 0 ].frame_length + 2;
+    }
    for( n = 0; n < silk_min( decControl->nChannelsAPI, decControl->nChannelsInternal ); n++ ) {

        /* Resample decoded signal to API_sampleRate */
--- a/lib/rbcodec/codecs/libopus/silk/decode_core.c
+++ b/lib/rbcodec/codecs/libopus/silk/decode_core.c
@ -39,7 +39,7 @@ void silk_decode_core(
    silk_decoder_state          *psDec,                         /* I/O  Decoder state                               */
    silk_decoder_control        *psDecCtrl,                     /* I    Decoder control                             */
    opus_int16                  xq[],                           /* O    Decoded speech                              */
-    const opus_int              pulses[ MAX_FRAME_LENGTH ]      /* I    Pulse signal                                */
+    const opus_int16            pulses[ MAX_FRAME_LENGTH ]      /* I    Pulse signal                                */
 )
 {
    opus_int   i, k, lag = 0, start_idx, sLTP_buf_idx, NLSF_interpolation_flag, signalType;
@ -49,7 +49,7 @@ void silk_decode_core(
    opus_int32 LTP_pred_Q13, LPC_pred_Q10, Gain_Q10, inv_gain_Q31, gain_adj_Q16, rand_seed, offset_Q10;
    opus_int32 *pred_lag_ptr, *pexc_Q14, *pres_Q14;
    VARDECL( opus_int32, res_Q14 );
-/*    VARDECL( opus_int32, sLPC_Q14 ); */
+    VARDECL( opus_int32, sLPC_Q14 );
    SAVE_STACK;

    silk_assert( psDec->prev_gain_Q16 != 0 );
@ -57,8 +57,7 @@ void silk_decode_core(
    ALLOC( sLTP, psDec->ltp_mem_length, opus_int16 );
    ALLOC( sLTP_Q15, psDec->ltp_mem_length + psDec->frame_length, opus_int32 );
    ALLOC( res_Q14, psDec->subfr_length, opus_int32 );
-/*    ALLOC( sLPC_Q14, psDec->subfr_length + MAX_LPC_ORDER, opus_int32 ); */
-    opus_int32 sLPC_Q14[psDec->subfr_length + MAX_LPC_ORDER]; /* worst case is 80 + 16 */
+    ALLOC( sLPC_Q14, psDec->subfr_length + MAX_LPC_ORDER, opus_int32 );

    offset_Q10 = silk_Quantization_Offsets_Q10[ psDec->indices.signalType >> 1 ][ psDec->indices.quantOffsetType ];

--- a/lib/rbcodec/codecs/libopus/silk/decode_frame.c
+++ b/lib/rbcodec/codecs/libopus/silk/decode_frame.c
@ -47,13 +47,10 @@ opus_int silk_decode_frame(
 {
    VARDECL( silk_decoder_control, psDecCtrl );
    opus_int         L, mv_len, ret = 0;
-    VARDECL( opus_int, pulses );
    SAVE_STACK;

    L = psDec->frame_length;
    ALLOC( psDecCtrl, 1, silk_decoder_control );
-    ALLOC( pulses, (L + SHELL_CODEC_FRAME_LENGTH - 1) &
-                   ~(SHELL_CODEC_FRAME_LENGTH - 1), opus_int );
    psDecCtrl->LTP_scale_Q14 = 0;

    /* Safety checks */
@ -62,6 +59,9 @@ opus_int silk_decode_frame(
    if(   lostFlag == FLAG_DECODE_NORMAL ||
        ( lostFlag == FLAG_DECODE_LBRR && psDec->LBRR_flags[ psDec->nFramesDecoded ] == 1 ) )
    {
+        VARDECL( opus_int16, pulses );
+        ALLOC( pulses, (L + SHELL_CODEC_FRAME_LENGTH - 1) &
+                       ~(SHELL_CODEC_FRAME_LENGTH - 1), opus_int16 );
        /*********************************************/
        /* Decode quantization indices of side info  */
        /*********************************************/
@ -107,16 +107,16 @@ opus_int silk_decode_frame(
    silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) );
    silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) );

-    /****************************************************************/
-    /* Ensure smooth connection of extrapolated and good frames     */
-    /****************************************************************/
-    silk_PLC_glue_frames( psDec, pOut, L );
-
    /************************************************/
    /* Comfort noise generation / estimation        */
    /************************************************/
    silk_CNG( psDec, psDecCtrl, pOut, L );

+    /****************************************************************/
+    /* Ensure smooth connection of extrapolated and good frames     */
+    /****************************************************************/
+    silk_PLC_glue_frames( psDec, pOut, L );
+
    /* Update some decoder state variables */
    psDec->lagPrev = psDecCtrl->pitchL[ psDec->nb_subfr - 1 ];

--- a/lib/rbcodec/codecs/libopus/silk/decode_pulses.c
+++ b/lib/rbcodec/codecs/libopus/silk/decode_pulses.c
@ -36,7 +36,7 @@ POSSIBILITY OF SUCH DAMAGE.
 /*********************************************/
 void silk_decode_pulses(
    ec_dec                      *psRangeDec,                    /* I/O  Compressor data structure                   */
-    opus_int                    pulses[],                       /* O    Excitation signal                           */
+    opus_int16                  pulses[],                       /* O    Excitation signal                           */
    const opus_int              signalType,                     /* I    Sigtype                                     */
    const opus_int              quantOffsetType,                /* I    quantOffsetType                             */
    const opus_int              frame_length                    /* I    Frame length                                */
@ -44,7 +44,7 @@ void silk_decode_pulses(
 {
    opus_int   i, j, k, iter, abs_q, nLS, RateLevelIndex;
    opus_int   sum_pulses[ MAX_NB_SHELL_BLOCKS ], nLshifts[ MAX_NB_SHELL_BLOCKS ];
-    opus_int   *pulses_ptr;
+    opus_int16 *pulses_ptr;
    const opus_uint8 *cdf_ptr;

    /*********************/
@ -84,7 +84,7 @@ void silk_decode_pulses(
        if( sum_pulses[ i ] > 0 ) {
            silk_shell_decoder( &pulses[ silk_SMULBB( i, SHELL_CODEC_FRAME_LENGTH ) ], psRangeDec, sum_pulses[ i ] );
        } else {
-            silk_memset( &pulses[ silk_SMULBB( i, SHELL_CODEC_FRAME_LENGTH ) ], 0, SHELL_CODEC_FRAME_LENGTH * sizeof( opus_int ) );
+            silk_memset( &pulses[ silk_SMULBB( i, SHELL_CODEC_FRAME_LENGTH ) ], 0, SHELL_CODEC_FRAME_LENGTH * sizeof( pulses[0] ) );
        }
    }

--- a/lib/rbcodec/codecs/libopus/silk/macros.h
+++ b/lib/rbcodec/codecs/libopus/silk/macros.h
@ -79,17 +79,24 @@ POSSIBILITY OF SUCH DAMAGE.
                                        (( (a) & ((b)^0x80000000) & 0x80000000) ? silk_int32_MIN : (a)-(b)) :    \
                                        ((((a)^0x80000000) & (b)  & 0x80000000) ? silk_int32_MAX : (a)-(b)) )

-#include "ecintrin.h"
+#if defined(MIPSr1_ASM)
+#include "mips/macros_mipsr1.h"
+#endif

+#include "ecintrin.h"
+#ifndef OVERRIDE_silk_CLZ16
 static OPUS_INLINE opus_int32 silk_CLZ16(opus_int16 in16)
 {
    return 32 - EC_ILOG(in16<<16|0x8000);
 }
+#endif

+#ifndef OVERRIDE_silk_CLZ32
 static OPUS_INLINE opus_int32 silk_CLZ32(opus_int32 in32)
 {
    return in32 ? 32 - EC_ILOG(in32) : 32;
 }
+#endif

 /* Row based */
 #define matrix_ptr(Matrix_base_adr, row, column, N) \
--- a/lib/rbcodec/codecs/libopus/silk/main.h
+++ b/lib/rbcodec/codecs/libopus/silk/main.h
@ -116,7 +116,7 @@ void silk_encode_signs(
 /* Decodes signs of excitation */
 void silk_decode_signs(
    ec_dec                      *psRangeDec,                        /* I/O  Compressor data structure                   */
-    opus_int                    pulses[],                           /* I/O  pulse signal                                */
+    opus_int16                  pulses[],                           /* I/O  pulse signal                                */
    opus_int                    length,                             /* I    length of input                             */
    const opus_int              signalType,                         /* I    Signal type                                 */
    const opus_int              quantOffsetType,                    /* I    Quantization offset type                    */
@ -161,7 +161,7 @@ void silk_shell_encoder(

 /* Shell decoder, operates on one shell code frame of 16 pulses */
 void silk_shell_decoder(
-    opus_int                    *pulses0,                       /* O    data: nonnegative pulse amplitudes          */
+    opus_int16                  *pulses0,                       /* O    data: nonnegative pulse amplitudes          */
    ec_dec                      *psRangeDec,                    /* I/O  Compressor data structure                   */
    const opus_int              pulses4                         /* I    number of pulses per pulse-subframe         */
 );
@ -397,13 +397,13 @@ void silk_decode_core(
    silk_decoder_state          *psDec,                         /* I/O  Decoder state                               */
    silk_decoder_control        *psDecCtrl,                     /* I    Decoder control                             */
    opus_int16                  xq[],                           /* O    Decoded speech                              */
-    const opus_int              pulses[ MAX_FRAME_LENGTH ]      /* I    Pulse signal                                */
+    const opus_int16            pulses[ MAX_FRAME_LENGTH ]      /* I    Pulse signal                                */
 );

 /* Decode quantization indices of excitation (Shell coding) */
 void silk_decode_pulses(
    ec_dec                      *psRangeDec,                    /* I/O  Compressor data structure                   */
-    opus_int                    pulses[],                       /* O    Excitation signal                           */
+    opus_int16                  pulses[],                       /* O    Excitation signal                           */
    const opus_int              signalType,                     /* I    Sigtype                                     */
    const opus_int              quantOffsetType,                /* I    quantOffsetType                             */
    const opus_int              frame_length                    /* I    Frame length                                */
--- a/lib/rbcodec/codecs/libopus/silk/resampler_private_IIR_FIR.c
+++ b/lib/rbcodec/codecs/libopus/silk/resampler_private_IIR_FIR.c
@ -72,13 +72,10 @@ void silk_resampler_private_IIR_FIR(
    silk_resampler_state_struct *S = (silk_resampler_state_struct *)SS;
    opus_int32 nSamplesIn;
    opus_int32 max_index_Q16, index_increment_Q16;
-/*    VARDECL( opus_int16, buf );
-    SAVE_STACK; */
+    VARDECL( opus_int16, buf );
+    SAVE_STACK;

-/*    ALLOC( buf, 2 * S->batchSize + RESAMPLER_ORDER_FIR_12, opus_int16 ); */
-
-    /* worst case = 2*16*10+8 = 328 * 2 = 656bytes */
-    opus_int16 buf[2 * S->batchSize + RESAMPLER_ORDER_FIR_12];
+    ALLOC( buf, 2 * S->batchSize + RESAMPLER_ORDER_FIR_12, opus_int16 );

    /* Copy buffered samples to start of buffer */
    silk_memcpy( buf, S->sFIR.i16, RESAMPLER_ORDER_FIR_12 * sizeof( opus_int16 ) );
@ -106,5 +103,5 @@ void silk_resampler_private_IIR_FIR(

    /* Copy last part of filtered signal to the state for the next call */
    silk_memcpy( S->sFIR.i16, &buf[ nSamplesIn << 1 ], RESAMPLER_ORDER_FIR_12 * sizeof( opus_int16 ) );
-/*    RESTORE_STACK; */
+    RESTORE_STACK;
 }
--- a/lib/rbcodec/codecs/libopus/silk/shell_coder.c
+++ b/lib/rbcodec/codecs/libopus/silk/shell_coder.c
@ -60,8 +60,8 @@ static OPUS_INLINE void encode_split(
 #endif

 static OPUS_INLINE void decode_split(
-    opus_int                    *p_child1,      /* O    pulse amplitude of first child subframe     */
-    opus_int                    *p_child2,      /* O    pulse amplitude of second child subframe    */
+    opus_int16                  *p_child1,      /* O    pulse amplitude of first child subframe     */
+    opus_int16                  *p_child2,      /* O    pulse amplitude of second child subframe    */
    ec_dec                      *psRangeDec,    /* I/O  Compressor data structure                   */
    const opus_int              p,              /* I    pulse amplitude of current subframe         */
    const opus_uint8            *shell_table    /* I    table of shell cdfs                         */
@ -121,12 +121,12 @@ void silk_shell_encoder(

 /* Shell decoder, operates on one shell code frame of 16 pulses */
 void silk_shell_decoder(
-    opus_int                    *pulses0,                       /* O    data: nonnegative pulse amplitudes          */
+    opus_int16                  *pulses0,                       /* O    data: nonnegative pulse amplitudes          */
    ec_dec                      *psRangeDec,                    /* I/O  Compressor data structure                   */
    const opus_int              pulses4                         /* I    number of pulses per pulse-subframe         */
 )
 {
-    opus_int pulses3[ 2 ], pulses2[ 4 ], pulses1[ 8 ];
+    opus_int16 pulses3[ 2 ], pulses2[ 4 ], pulses1[ 8 ];

    /* this function operates on one shell code frame of 16 pulses */
    silk_assert( SHELL_CODEC_FRAME_LENGTH == 16 );
--- a/lib/rbcodec/codecs/libopus/silk/sum_sqr_shift.c
+++ b/lib/rbcodec/codecs/libopus/silk/sum_sqr_shift.c
@ -53,6 +53,7 @@ void silk_sum_sqr_shift(
            /* Scale down */
            nrg = (opus_int32)silk_RSHIFT_uint( (opus_uint32)nrg, 2 );
            shft = 2;
+            i+=2;
            break;
        }
    }
--- a/lib/rbcodec/codecs/opus.c
+++ b/lib/rbcodec/codecs/opus.c
@ -337,8 +337,6 @@ enum codec_status codec_run(void)
    param = ci->id3->elapsed;
    strtoffset = ci->id3->offset;

-    global_stack = 0;
-
 #if defined(CPU_COLDFIRE)
    /* EMAC rounding is disabled because of MULT16_32_Q15, which will be
       inaccurate with rounding in its current incarnation */