Subversion Repositories DashDisplay

Rev

Rev 2 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------    
  2. * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
  3. *    
  4. * $Date:        19. March 2015
  5. * $Revision:    V.1.4.5  
  6. *    
  7. * Project:          CMSIS DSP Library    
  8. * Title:            arm_cfft_radix4_q15.c    
  9. *    
  10. * Description:  This file has function definition of Radix-4 FFT & IFFT function and    
  11. *                               In-place bit reversal using bit reversal table    
  12. *    
  13. * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  14. *  
  15. * Redistribution and use in source and binary forms, with or without
  16. * modification, are permitted provided that the following conditions
  17. * are met:
  18. *   - Redistributions of source code must retain the above copyright
  19. *     notice, this list of conditions and the following disclaimer.
  20. *   - Redistributions in binary form must reproduce the above copyright
  21. *     notice, this list of conditions and the following disclaimer in
  22. *     the documentation and/or other materials provided with the
  23. *     distribution.
  24. *   - Neither the name of ARM LIMITED nor the names of its contributors
  25. *     may be used to endorse or promote products derived from this
  26. *     software without specific prior written permission.
  27. *
  28. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  29. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  30. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  31. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  32. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  33. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  34. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  35. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  36. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  37. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  38. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  39. * POSSIBILITY OF SUCH DAMAGE.    
  40. * -------------------------------------------------------------------- */
  41.  
  42. #include "arm_math.h"
  43.  
  44.  
  45. void arm_radix4_butterfly_q15(
  46.   q15_t * pSrc16,
  47.   uint32_t fftLen,
  48.   q15_t * pCoef16,
  49.   uint32_t twidCoefModifier);
  50.  
  51. void arm_radix4_butterfly_inverse_q15(
  52.   q15_t * pSrc16,
  53.   uint32_t fftLen,
  54.   q15_t * pCoef16,
  55.   uint32_t twidCoefModifier);
  56.  
  57. void arm_bitreversal_q15(
  58.   q15_t * pSrc,
  59.   uint32_t fftLen,
  60.   uint16_t bitRevFactor,
  61.   uint16_t * pBitRevTab);
  62.  
  63. /**    
  64.  * @ingroup groupTransforms    
  65.  */
  66.  
  67. /**    
  68.  * @addtogroup ComplexFFT    
  69.  * @{    
  70.  */
  71.  
  72.  
  73. /**    
  74.  * @details    
  75.  * @brief Processing function for the Q15 CFFT/CIFFT.  
  76.  * @deprecated Do not use this function.  It has been superseded by \ref arm_cfft_q15 and will be removed
  77.  * @param[in]      *S    points to an instance of the Q15 CFFT/CIFFT structure.  
  78.  * @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place.  
  79.  * @return none.  
  80.  *    
  81.  * \par Input and output formats:    
  82.  * \par    
  83.  * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.  
  84.  * Hence the output format is different for different FFT sizes.    
  85.  * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:  
  86.  * \par  
  87.  * \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"    
  88.  * \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"    
  89.  */
  90.  
  91. void arm_cfft_radix4_q15(
  92.   const arm_cfft_radix4_instance_q15 * S,
  93.   q15_t * pSrc)
  94. {
  95.   if(S->ifftFlag == 1u)
  96.   {
  97.     /*  Complex IFFT radix-4  */
  98.     arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle,
  99.                                      S->twidCoefModifier);
  100.   }
  101.   else
  102.   {
  103.     /*  Complex FFT radix-4  */
  104.     arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle,
  105.                              S->twidCoefModifier);
  106.   }
  107.  
  108.   if(S->bitReverseFlag == 1u)
  109.   {
  110.     /*  Bit Reversal */
  111.     arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
  112.   }
  113.  
  114. }
  115.  
  116. /**    
  117.  * @} end of ComplexFFT group    
  118.  */
  119.  
  120. /*    
  121. * Radix-4 FFT algorithm used is :    
  122. *    
  123. * Input real and imaginary data:    
  124. * x(n) = xa + j * ya    
  125. * x(n+N/4 ) = xb + j * yb    
  126. * x(n+N/2 ) = xc + j * yc    
  127. * x(n+3N/4) = xd + j * yd
  128. *    
  129. *    
  130. * Output real and imaginary data:    
  131. * x(4r) = xa'+ j * ya'    
  132. * x(4r+1) = xb'+ j * yb'    
  133. * x(4r+2) = xc'+ j * yc'    
  134. * x(4r+3) = xd'+ j * yd'    
  135. *    
  136. *    
  137. * Twiddle factors for radix-4 FFT:    
  138. * Wn = co1 + j * (- si1)    
  139. * W2n = co2 + j * (- si2)    
  140. * W3n = co3 + j * (- si3)    
  141. *
  142. * The real and imaginary output values for the radix-4 butterfly are    
  143. * xa' = xa + xb + xc + xd    
  144. * ya' = ya + yb + yc + yd    
  145. * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)    
  146. * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)    
  147. * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)    
  148. * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)    
  149. * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)    
  150. * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)    
  151. *    
  152. */
  153.  
  154. /**    
  155.  * @brief  Core function for the Q15 CFFT butterfly process.  
  156.  * @param[in, out] *pSrc16          points to the in-place buffer of Q15 data type.  
  157.  * @param[in]      fftLen           length of the FFT.  
  158.  * @param[in]      *pCoef16         points to twiddle coefficient buffer.  
  159.  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.  
  160.  * @return none.  
  161.  */
  162.  
  163. void arm_radix4_butterfly_q15(
  164.   q15_t * pSrc16,
  165.   uint32_t fftLen,
  166.   q15_t * pCoef16,
  167.   uint32_t twidCoefModifier)
  168. {
  169.  
  170. #ifndef ARM_MATH_CM0_FAMILY
  171.  
  172.   /* Run the below code for Cortex-M4 and Cortex-M3 */
  173.  
  174.   q31_t R, S, T, U;
  175.   q31_t C1, C2, C3, out1, out2;
  176.   uint32_t n1, n2, ic, i0, j, k;
  177.  
  178.   q15_t *ptr1;
  179.   q15_t *pSi0;
  180.   q15_t *pSi1;
  181.   q15_t *pSi2;
  182.   q15_t *pSi3;
  183.  
  184.   q31_t xaya, xbyb, xcyc, xdyd;
  185.  
  186.   /* Total process is divided into three stages */
  187.  
  188.   /* process first stage, middle stages, & last stage */
  189.  
  190.   /*  Initializations for the first stage */
  191.   n2 = fftLen;
  192.   n1 = n2;
  193.  
  194.   /* n2 = fftLen/4 */
  195.   n2 >>= 2u;
  196.  
  197.   /* Index for twiddle coefficient */
  198.   ic = 0u;
  199.  
  200.   /* Index for input read and output write */
  201.   j = n2;
  202.  
  203.   pSi0 = pSrc16;
  204.   pSi1 = pSi0 + 2 * n2;
  205.   pSi2 = pSi1 + 2 * n2;
  206.   pSi3 = pSi2 + 2 * n2;
  207.  
  208.   /* Input is in 1.15(q15) format */
  209.  
  210.   /*  start of first stage process */
  211.   do
  212.   {
  213.     /*  Butterfly implementation */
  214.  
  215.     /*  Reading i0, i0+fftLen/2 inputs */
  216.     /* Read ya (real), xa(imag) input */
  217.     T = _SIMD32_OFFSET(pSi0);
  218.     T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
  219.     T = __SHADD16(T, 0); // it turns out doing this twice is 2 cycles, the alternative takes 3 cycles
  220.     //in = ((int16_t) (T & 0xFFFF)) >> 2;       // alternative code that takes 3 cycles
  221.     //T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
  222.  
  223.     /* Read yc (real), xc(imag) input */
  224.     S = _SIMD32_OFFSET(pSi2);
  225.     S = __SHADD16(S, 0);
  226.     S = __SHADD16(S, 0);
  227.  
  228.     /* R = packed((ya + yc), (xa + xc) ) */
  229.     R = __QADD16(T, S);
  230.  
  231.     /* S = packed((ya - yc), (xa - xc) ) */
  232.     S = __QSUB16(T, S);
  233.  
  234.     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  235.     /* Read yb (real), xb(imag) input */
  236.     T = _SIMD32_OFFSET(pSi1);
  237.     T = __SHADD16(T, 0);
  238.     T = __SHADD16(T, 0);
  239.  
  240.     /* Read yd (real), xd(imag) input */
  241.     U = _SIMD32_OFFSET(pSi3);
  242.     U = __SHADD16(U, 0);
  243.     U = __SHADD16(U, 0);
  244.  
  245.     /* T = packed((yb + yd), (xb + xd) ) */
  246.     T = __QADD16(T, U);
  247.  
  248.     /*  writing the butterfly processed i0 sample */
  249.     /* xa' = xa + xb + xc + xd */
  250.     /* ya' = ya + yb + yc + yd */
  251.     _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
  252.     pSi0 += 2;
  253.  
  254.     /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
  255.     R = __QSUB16(R, T);
  256.  
  257.     /* co2 & si2 are read from SIMD Coefficient pointer */
  258.     C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
  259.  
  260. #ifndef ARM_MATH_BIG_ENDIAN
  261.  
  262.     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  263.     out1 = __SMUAD(C2, R) >> 16u;
  264.     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  265.     out2 = __SMUSDX(C2, R);
  266.  
  267. #else
  268.  
  269.     /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  270.     out1 = __SMUSDX(R, C2) >> 16u;
  271.     /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  272.     out2 = __SMUAD(C2, R);
  273.  
  274. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  275.  
  276.     /*  Reading i0+fftLen/4 */
  277.     /* T = packed(yb, xb) */
  278.     T = _SIMD32_OFFSET(pSi1);
  279.     T = __SHADD16(T, 0);
  280.     T = __SHADD16(T, 0);
  281.  
  282.     /* writing the butterfly processed i0 + fftLen/4 sample */
  283.     /* writing output(xc', yc') in little endian format */
  284.     _SIMD32_OFFSET(pSi1) =
  285.       (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  286.     pSi1 += 2;
  287.  
  288.     /*  Butterfly calculations */
  289.     /* U = packed(yd, xd) */
  290.     U = _SIMD32_OFFSET(pSi3);
  291.     U = __SHADD16(U, 0);
  292.     U = __SHADD16(U, 0);
  293.  
  294.     /* T = packed(yb-yd, xb-xd) */
  295.     T = __QSUB16(T, U);
  296.  
  297. #ifndef ARM_MATH_BIG_ENDIAN
  298.  
  299.     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  300.     R = __QASX(S, T);
  301.     /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  302.     S = __QSAX(S, T);
  303.  
  304. #else
  305.  
  306.     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  307.     R = __QSAX(S, T);
  308.     /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  309.     S = __QASX(S, T);
  310.  
  311. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  312.  
  313.     /* co1 & si1 are read from SIMD Coefficient pointer */
  314.     C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
  315.     /*  Butterfly process for the i0+fftLen/2 sample */
  316.  
  317. #ifndef ARM_MATH_BIG_ENDIAN
  318.  
  319.     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  320.     out1 = __SMUAD(C1, S) >> 16u;
  321.     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  322.     out2 = __SMUSDX(C1, S);
  323.  
  324. #else
  325.  
  326.     /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  327.     out1 = __SMUSDX(S, C1) >> 16u;
  328.     /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  329.     out2 = __SMUAD(C1, S);
  330.  
  331. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  332.  
  333.     /* writing output(xb', yb') in little endian format */
  334.     _SIMD32_OFFSET(pSi2) =
  335.       ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
  336.     pSi2 += 2;
  337.  
  338.  
  339.     /* co3 & si3 are read from SIMD Coefficient pointer */
  340.     C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
  341.     /*  Butterfly process for the i0+3fftLen/4 sample */
  342.  
  343. #ifndef ARM_MATH_BIG_ENDIAN
  344.  
  345.     /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  346.     out1 = __SMUAD(C3, R) >> 16u;
  347.     /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  348.     out2 = __SMUSDX(C3, R);
  349.  
  350. #else
  351.  
  352.     /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  353.     out1 = __SMUSDX(R, C3) >> 16u;
  354.     /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  355.     out2 = __SMUAD(C3, R);
  356.  
  357. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  358.  
  359.     /* writing output(xd', yd') in little endian format */
  360.     _SIMD32_OFFSET(pSi3) =
  361.       ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  362.     pSi3 += 2;
  363.  
  364.     /*  Twiddle coefficients index modifier */
  365.     ic = ic + twidCoefModifier;
  366.  
  367.   } while(--j);
  368.   /* data is in 4.11(q11) format */
  369.  
  370.   /* end of first stage process */
  371.  
  372.  
  373.   /* start of middle stage process */
  374.  
  375.   /*  Twiddle coefficients index modifier */
  376.   twidCoefModifier <<= 2u;
  377.  
  378.   /*  Calculation of Middle stage */
  379.   for (k = fftLen / 4u; k > 4u; k >>= 2u)
  380.   {
  381.     /*  Initializations for the middle stage */
  382.     n1 = n2;
  383.     n2 >>= 2u;
  384.     ic = 0u;
  385.  
  386.     for (j = 0u; j <= (n2 - 1u); j++)
  387.     {
  388.       /*  index calculation for the coefficients */
  389.       C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
  390.       C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
  391.       C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
  392.  
  393.       /*  Twiddle coefficients index modifier */
  394.       ic = ic + twidCoefModifier;
  395.      
  396.       pSi0 = pSrc16 + 2 * j;
  397.       pSi1 = pSi0 + 2 * n2;
  398.       pSi2 = pSi1 + 2 * n2;
  399.       pSi3 = pSi2 + 2 * n2;
  400.  
  401.       /*  Butterfly implementation */
  402.       for (i0 = j; i0 < fftLen; i0 += n1)
  403.       {
  404.         /*  Reading i0, i0+fftLen/2 inputs */
  405.         /* Read ya (real), xa(imag) input */
  406.         T = _SIMD32_OFFSET(pSi0);
  407.  
  408.         /* Read yc (real), xc(imag) input */
  409.         S = _SIMD32_OFFSET(pSi2);
  410.  
  411.         /* R = packed( (ya + yc), (xa + xc)) */
  412.         R = __QADD16(T, S);
  413.  
  414.         /* S = packed((ya - yc), (xa - xc)) */
  415.         S = __QSUB16(T, S);
  416.  
  417.         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  418.         /* Read yb (real), xb(imag) input */
  419.         T = _SIMD32_OFFSET(pSi1);
  420.  
  421.         /* Read yd (real), xd(imag) input */
  422.         U = _SIMD32_OFFSET(pSi3);
  423.  
  424.         /* T = packed( (yb + yd), (xb + xd)) */
  425.         T = __QADD16(T, U);
  426.  
  427.         /*  writing the butterfly processed i0 sample */
  428.  
  429.         /* xa' = xa + xb + xc + xd */
  430.         /* ya' = ya + yb + yc + yd */
  431.         out1 = __SHADD16(R, T);
  432.         out1 = __SHADD16(out1, 0);
  433.         _SIMD32_OFFSET(pSi0) = out1;
  434.         pSi0 += 2 * n1;
  435.  
  436.         /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
  437.         R = __SHSUB16(R, T);
  438.  
  439. #ifndef ARM_MATH_BIG_ENDIAN
  440.  
  441.         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  442.         out1 = __SMUAD(C2, R) >> 16u;
  443.  
  444.         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  445.         out2 = __SMUSDX(C2, R);
  446.  
  447. #else
  448.  
  449.         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  450.         out1 = __SMUSDX(R, C2) >> 16u;
  451.  
  452.         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  453.         out2 = __SMUAD(C2, R);
  454.  
  455. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  456.  
  457.         /*  Reading i0+3fftLen/4 */
  458.         /* Read yb (real), xb(imag) input */
  459.         T = _SIMD32_OFFSET(pSi1);
  460.  
  461.         /*  writing the butterfly processed i0 + fftLen/4 sample */
  462.         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  463.         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  464.         _SIMD32_OFFSET(pSi1) =
  465.           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  466.         pSi1 += 2 * n1;
  467.  
  468.         /*  Butterfly calculations */
  469.  
  470.         /* Read yd (real), xd(imag) input */
  471.         U = _SIMD32_OFFSET(pSi3);
  472.  
  473.         /* T = packed(yb-yd, xb-xd) */
  474.         T = __QSUB16(T, U);
  475.  
  476. #ifndef ARM_MATH_BIG_ENDIAN
  477.  
  478.         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  479.         R = __SHASX(S, T);
  480.  
  481.         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  482.         S = __SHSAX(S, T);
  483.  
  484.  
  485.         /*  Butterfly process for the i0+fftLen/2 sample */
  486.         out1 = __SMUAD(C1, S) >> 16u;
  487.         out2 = __SMUSDX(C1, S);
  488.  
  489. #else
  490.  
  491.         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  492.         R = __SHSAX(S, T);
  493.  
  494.         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  495.         S = __SHASX(S, T);
  496.  
  497.  
  498.         /*  Butterfly process for the i0+fftLen/2 sample */
  499.         out1 = __SMUSDX(S, C1) >> 16u;
  500.         out2 = __SMUAD(C1, S);
  501.  
  502. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  503.  
  504.         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  505.         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  506.         _SIMD32_OFFSET(pSi2) =
  507.           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  508.         pSi2 += 2 * n1;
  509.  
  510.         /*  Butterfly process for the i0+3fftLen/4 sample */
  511.  
  512. #ifndef ARM_MATH_BIG_ENDIAN
  513.  
  514.         out1 = __SMUAD(C3, R) >> 16u;
  515.         out2 = __SMUSDX(C3, R);
  516.  
  517. #else
  518.  
  519.         out1 = __SMUSDX(R, C3) >> 16u;
  520.         out2 = __SMUAD(C3, R);
  521.  
  522. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  523.  
  524.         /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  525.         /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  526.         _SIMD32_OFFSET(pSi3) =
  527.           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  528.         pSi3 += 2 * n1;
  529.       }
  530.     }
  531.     /*  Twiddle coefficients index modifier */
  532.     twidCoefModifier <<= 2u;
  533.   }
  534.   /* end of middle stage process */
  535.  
  536.  
  537.   /* data is in 10.6(q6) format for the 1024 point */
  538.   /* data is in 8.8(q8) format for the 256 point */
  539.   /* data is in 6.10(q10) format for the 64 point */
  540.   /* data is in 4.12(q12) format for the 16 point */
  541.  
  542.   /*  Initializations for the last stage */
  543.   j = fftLen >> 2;
  544.  
  545.   ptr1 = &pSrc16[0];
  546.  
  547.   /* start of last stage process */
  548.  
  549.   /*  Butterfly implementation */
  550.   do
  551.   {
  552.     /* Read xa (real), ya(imag) input */
  553.     xaya = *__SIMD32(ptr1)++;
  554.  
  555.     /* Read xb (real), yb(imag) input */
  556.     xbyb = *__SIMD32(ptr1)++;
  557.  
  558.     /* Read xc (real), yc(imag) input */
  559.     xcyc = *__SIMD32(ptr1)++;
  560.  
  561.     /* Read xd (real), yd(imag) input */
  562.     xdyd = *__SIMD32(ptr1)++;
  563.  
  564.     /* R = packed((ya + yc), (xa + xc)) */
  565.     R = __QADD16(xaya, xcyc);
  566.  
  567.     /* T = packed((yb + yd), (xb + xd)) */
  568.     T = __QADD16(xbyb, xdyd);
  569.  
  570.     /* pointer updation for writing */
  571.     ptr1 = ptr1 - 8u;
  572.  
  573.  
  574.     /* xa' = xa + xb + xc + xd */
  575.     /* ya' = ya + yb + yc + yd */
  576.     *__SIMD32(ptr1)++ = __SHADD16(R, T);
  577.  
  578.     /* T = packed((yb + yd), (xb + xd)) */
  579.     T = __QADD16(xbyb, xdyd);
  580.  
  581.     /* xc' = (xa-xb+xc-xd) */
  582.     /* yc' = (ya-yb+yc-yd) */
  583.     *__SIMD32(ptr1)++ = __SHSUB16(R, T);
  584.  
  585.     /* S = packed((ya - yc), (xa - xc)) */
  586.     S = __QSUB16(xaya, xcyc);
  587.  
  588.     /* Read yd (real), xd(imag) input */
  589.     /* T = packed( (yb - yd), (xb - xd))  */
  590.     U = __QSUB16(xbyb, xdyd);
  591.  
  592. #ifndef ARM_MATH_BIG_ENDIAN
  593.  
  594.     /* xb' = (xa+yb-xc-yd) */
  595.     /* yb' = (ya-xb-yc+xd) */
  596.     *__SIMD32(ptr1)++ = __SHSAX(S, U);
  597.  
  598.  
  599.     /* xd' = (xa-yb-xc+yd) */
  600.     /* yd' = (ya+xb-yc-xd) */
  601.     *__SIMD32(ptr1)++ = __SHASX(S, U);
  602.  
  603. #else
  604.  
  605.     /* xb' = (xa+yb-xc-yd) */
  606.     /* yb' = (ya-xb-yc+xd) */
  607.     *__SIMD32(ptr1)++ = __SHASX(S, U);
  608.  
  609.  
  610.     /* xd' = (xa-yb-xc+yd) */
  611.     /* yd' = (ya+xb-yc-xd) */
  612.     *__SIMD32(ptr1)++ = __SHSAX(S, U);
  613.  
  614. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  615.  
  616.   } while(--j);
  617.  
  618.   /* end of last stage process */
  619.  
  620.   /* output is in 11.5(q5) format for the 1024 point */
  621.   /* output is in 9.7(q7) format for the 256 point   */
  622.   /* output is in 7.9(q9) format for the 64 point  */
  623.   /* output is in 5.11(q11) format for the 16 point  */
  624.  
  625.  
  626. #else
  627.  
  628.   /* Run the below code for Cortex-M0 */
  629.  
  630.   q15_t R0, R1, S0, S1, T0, T1, U0, U1;
  631.   q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
  632.   uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
  633.  
  634.   /* Total process is divided into three stages */
  635.  
  636.   /* process first stage, middle stages, & last stage */
  637.  
  638.   /*  Initializations for the first stage */
  639.   n2 = fftLen;
  640.   n1 = n2;
  641.  
  642.   /* n2 = fftLen/4 */
  643.   n2 >>= 2u;
  644.  
  645.   /* Index for twiddle coefficient */
  646.   ic = 0u;
  647.  
  648.   /* Index for input read and output write */
  649.   i0 = 0u;
  650.   j = n2;
  651.  
  652.   /* Input is in 1.15(q15) format */
  653.  
  654.   /*  start of first stage process */
  655.   do
  656.   {
  657.     /*  Butterfly implementation */
  658.  
  659.     /*  index calculation for the input as, */
  660.     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  661.     i1 = i0 + n2;
  662.     i2 = i1 + n2;
  663.     i3 = i2 + n2;
  664.  
  665.     /*  Reading i0, i0+fftLen/2 inputs */
  666.  
  667.     /* input is down scale by 4 to avoid overflow */
  668.     /* Read ya (real), xa(imag) input */
  669.     T0 = pSrc16[i0 * 2u] >> 2u;
  670.     T1 = pSrc16[(i0 * 2u) + 1u] >> 2u;
  671.  
  672.     /* input is down scale by 4 to avoid overflow */
  673.     /* Read yc (real), xc(imag) input */
  674.     S0 = pSrc16[i2 * 2u] >> 2u;
  675.     S1 = pSrc16[(i2 * 2u) + 1u] >> 2u;
  676.  
  677.     /* R0 = (ya + yc) */
  678.     R0 = __SSAT(T0 + S0, 16u);
  679.     /* R1 = (xa + xc) */
  680.     R1 = __SSAT(T1 + S1, 16u);
  681.  
  682.     /* S0 = (ya - yc) */
  683.     S0 = __SSAT(T0 - S0, 16);
  684.     /* S1 = (xa - xc) */
  685.     S1 = __SSAT(T1 - S1, 16);
  686.  
  687.     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  688.     /* input is down scale by 4 to avoid overflow */
  689.     /* Read yb (real), xb(imag) input */
  690.     T0 = pSrc16[i1 * 2u] >> 2u;
  691.     T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
  692.  
  693.     /* input is down scale by 4 to avoid overflow */
  694.     /* Read yd (real), xd(imag) input */
  695.     U0 = pSrc16[i3 * 2u] >> 2u;
  696.     U1 = pSrc16[(i3 * 2u) + 1] >> 2u;
  697.  
  698.     /* T0 = (yb + yd) */
  699.     T0 = __SSAT(T0 + U0, 16u);
  700.     /* T1 = (xb + xd) */
  701.     T1 = __SSAT(T1 + U1, 16u);
  702.  
  703.     /*  writing the butterfly processed i0 sample */
  704.     /* ya' = ya + yb + yc + yd */
  705.     /* xa' = xa + xb + xc + xd */
  706.     pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
  707.     pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
  708.  
  709.     /* R0 = (ya + yc) - (yb + yd) */
  710.     /* R1 = (xa + xc) - (xb + xd) */
  711.     R0 = __SSAT(R0 - T0, 16u);
  712.     R1 = __SSAT(R1 - T1, 16u);
  713.  
  714.     /* co2 & si2 are read from Coefficient pointer */
  715.     Co2 = pCoef16[2u * ic * 2u];
  716.     Si2 = pCoef16[(2u * ic * 2u) + 1];
  717.  
  718.     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  719.     out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u);
  720.     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  721.     out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u);
  722.  
  723.     /*  Reading i0+fftLen/4 */
  724.     /* input is down scale by 4 to avoid overflow */
  725.     /* T0 = yb, T1 =  xb */
  726.     T0 = pSrc16[i1 * 2u] >> 2;
  727.     T1 = pSrc16[(i1 * 2u) + 1] >> 2;
  728.  
  729.     /* writing the butterfly processed i0 + fftLen/4 sample */
  730.     /* writing output(xc', yc') in little endian format */
  731.     pSrc16[i1 * 2u] = out1;
  732.     pSrc16[(i1 * 2u) + 1] = out2;
  733.  
  734.     /*  Butterfly calculations */
  735.     /* input is down scale by 4 to avoid overflow */
  736.     /* U0 = yd, U1 = xd */
  737.     U0 = pSrc16[i3 * 2u] >> 2;
  738.     U1 = pSrc16[(i3 * 2u) + 1] >> 2;
  739.     /* T0 = yb-yd */
  740.     T0 = __SSAT(T0 - U0, 16);
  741.     /* T1 = xb-xd */
  742.     T1 = __SSAT(T1 - U1, 16);
  743.  
  744.     /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
  745.     R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
  746.     R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
  747.  
  748.     /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
  749.     S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16u);
  750.     S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16u);
  751.  
  752.     /* co1 & si1 are read from Coefficient pointer */
  753.     Co1 = pCoef16[ic * 2u];
  754.     Si1 = pCoef16[(ic * 2u) + 1];
  755.     /*  Butterfly process for the i0+fftLen/2 sample */
  756.     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  757.     out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
  758.     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  759.     out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
  760.  
  761.     /* writing output(xb', yb') in little endian format */
  762.     pSrc16[i2 * 2u] = out1;
  763.     pSrc16[(i2 * 2u) + 1] = out2;
  764.  
  765.     /* Co3 & si3 are read from Coefficient pointer */
  766.     Co3 = pCoef16[3u * (ic * 2u)];
  767.     Si3 = pCoef16[(3u * (ic * 2u)) + 1];
  768.     /*  Butterfly process for the i0+3fftLen/4 sample */
  769.     /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
  770.     out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u);
  771.     /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
  772.     out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u);
  773.     /* writing output(xd', yd') in little endian format */
  774.     pSrc16[i3 * 2u] = out1;
  775.     pSrc16[(i3 * 2u) + 1] = out2;
  776.  
  777.     /*  Twiddle coefficients index modifier */
  778.     ic = ic + twidCoefModifier;
  779.  
  780.     /*  Updating input index */
  781.     i0 = i0 + 1u;
  782.  
  783.   } while(--j);
  784.   /* data is in 4.11(q11) format */
  785.  
  786.   /* end of first stage process */
  787.  
  788.  
  789.   /* start of middle stage process */
  790.  
  791.   /*  Twiddle coefficients index modifier */
  792.   twidCoefModifier <<= 2u;
  793.  
  794.   /*  Calculation of Middle stage */
  795.   for (k = fftLen / 4u; k > 4u; k >>= 2u)
  796.   {
  797.     /*  Initializations for the middle stage */
  798.     n1 = n2;
  799.     n2 >>= 2u;
  800.     ic = 0u;
  801.  
  802.     for (j = 0u; j <= (n2 - 1u); j++)
  803.     {
  804.       /*  index calculation for the coefficients */
  805.       Co1 = pCoef16[ic * 2u];
  806.       Si1 = pCoef16[(ic * 2u) + 1u];
  807.       Co2 = pCoef16[2u * (ic * 2u)];
  808.       Si2 = pCoef16[(2u * (ic * 2u)) + 1u];
  809.       Co3 = pCoef16[3u * (ic * 2u)];
  810.       Si3 = pCoef16[(3u * (ic * 2u)) + 1u];
  811.  
  812.       /*  Twiddle coefficients index modifier */
  813.       ic = ic + twidCoefModifier;
  814.  
  815.       /*  Butterfly implementation */
  816.       for (i0 = j; i0 < fftLen; i0 += n1)
  817.       {
  818.         /*  index calculation for the input as, */
  819.         /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  820.         i1 = i0 + n2;
  821.         i2 = i1 + n2;
  822.         i3 = i2 + n2;
  823.  
  824.         /*  Reading i0, i0+fftLen/2 inputs */
  825.         /* Read ya (real), xa(imag) input */
  826.         T0 = pSrc16[i0 * 2u];
  827.         T1 = pSrc16[(i0 * 2u) + 1u];
  828.  
  829.         /* Read yc (real), xc(imag) input */
  830.         S0 = pSrc16[i2 * 2u];
  831.         S1 = pSrc16[(i2 * 2u) + 1u];
  832.  
  833.         /* R0 = (ya + yc), R1 = (xa + xc) */
  834.         R0 = __SSAT(T0 + S0, 16);
  835.         R1 = __SSAT(T1 + S1, 16);
  836.  
  837.         /* S0 = (ya - yc), S1 =(xa - xc) */
  838.         S0 = __SSAT(T0 - S0, 16);
  839.         S1 = __SSAT(T1 - S1, 16);
  840.  
  841.         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  842.         /* Read yb (real), xb(imag) input */
  843.         T0 = pSrc16[i1 * 2u];
  844.         T1 = pSrc16[(i1 * 2u) + 1u];
  845.  
  846.         /* Read yd (real), xd(imag) input */
  847.         U0 = pSrc16[i3 * 2u];
  848.         U1 = pSrc16[(i3 * 2u) + 1u];
  849.  
  850.  
  851.         /* T0 = (yb + yd), T1 = (xb + xd) */
  852.         T0 = __SSAT(T0 + U0, 16);
  853.         T1 = __SSAT(T1 + U1, 16);
  854.  
  855.         /*  writing the butterfly processed i0 sample */
  856.  
  857.         /* xa' = xa + xb + xc + xd */
  858.         /* ya' = ya + yb + yc + yd */
  859.         out1 = ((R0 >> 1u) + (T0 >> 1u)) >> 1u;
  860.         out2 = ((R1 >> 1u) + (T1 >> 1u)) >> 1u;
  861.  
  862.         pSrc16[i0 * 2u] = out1;
  863.         pSrc16[(2u * i0) + 1u] = out2;
  864.  
  865.         /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
  866.         R0 = (R0 >> 1u) - (T0 >> 1u);
  867.         R1 = (R1 >> 1u) - (T1 >> 1u);
  868.  
  869.         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  870.         out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u);
  871.  
  872.         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  873.         out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u);
  874.  
  875.         /*  Reading i0+3fftLen/4 */
  876.         /* Read yb (real), xb(imag) input */
  877.         T0 = pSrc16[i1 * 2u];
  878.         T1 = pSrc16[(i1 * 2u) + 1u];
  879.  
  880.         /*  writing the butterfly processed i0 + fftLen/4 sample */
  881.         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  882.         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  883.         pSrc16[i1 * 2u] = out1;
  884.         pSrc16[(i1 * 2u) + 1u] = out2;
  885.  
  886.         /*  Butterfly calculations */
  887.  
  888.         /* Read yd (real), xd(imag) input */
  889.         U0 = pSrc16[i3 * 2u];
  890.         U1 = pSrc16[(i3 * 2u) + 1u];
  891.  
  892.         /* T0 = yb-yd, T1 = xb-xd */
  893.         T0 = __SSAT(T0 - U0, 16);
  894.         T1 = __SSAT(T1 - U1, 16);
  895.  
  896.         /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
  897.         R0 = (S0 >> 1u) - (T1 >> 1u);
  898.         R1 = (S1 >> 1u) + (T0 >> 1u);
  899.  
  900.         /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
  901.         S0 = (S0 >> 1u) + (T1 >> 1u);
  902.         S1 = (S1 >> 1u) - (T0 >> 1u);
  903.  
  904.         /*  Butterfly process for the i0+fftLen/2 sample */
  905.         out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16u);
  906.  
  907.         out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16u);
  908.  
  909.         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  910.         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  911.         pSrc16[i2 * 2u] = out1;
  912.         pSrc16[(i2 * 2u) + 1u] = out2;
  913.  
  914.         /*  Butterfly process for the i0+3fftLen/4 sample */
  915.         out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u);
  916.  
  917.         out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u);
  918.         /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
  919.         /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
  920.         pSrc16[i3 * 2u] = out1;
  921.         pSrc16[(i3 * 2u) + 1u] = out2;
  922.       }
  923.     }
  924.     /*  Twiddle coefficients index modifier */
  925.     twidCoefModifier <<= 2u;
  926.   }
  927.   /* end of middle stage process */
  928.  
  929.  
  930.   /* data is in 10.6(q6) format for the 1024 point */
  931.   /* data is in 8.8(q8) format for the 256 point */
  932.   /* data is in 6.10(q10) format for the 64 point */
  933.   /* data is in 4.12(q12) format for the 16 point */
  934.  
  935.   /*  Initializations for the last stage */
  936.   n1 = n2;
  937.   n2 >>= 2u;
  938.  
  939.   /* start of last stage process */
  940.  
  941.   /*  Butterfly implementation */
  942.   for (i0 = 0u; i0 <= (fftLen - n1); i0 += n1)
  943.   {
  944.     /*  index calculation for the input as, */
  945.     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  946.     i1 = i0 + n2;
  947.     i2 = i1 + n2;
  948.     i3 = i2 + n2;
  949.  
  950.     /*  Reading i0, i0+fftLen/2 inputs */
  951.     /* Read ya (real), xa(imag) input */
  952.     T0 = pSrc16[i0 * 2u];
  953.     T1 = pSrc16[(i0 * 2u) + 1u];
  954.  
  955.     /* Read yc (real), xc(imag) input */
  956.     S0 = pSrc16[i2 * 2u];
  957.     S1 = pSrc16[(i2 * 2u) + 1u];
  958.  
  959.     /* R0 = (ya + yc), R1 = (xa + xc) */
  960.     R0 = __SSAT(T0 + S0, 16u);
  961.     R1 = __SSAT(T1 + S1, 16u);
  962.  
  963.     /* S0 = (ya - yc), S1 = (xa - xc) */
  964.     S0 = __SSAT(T0 - S0, 16u);
  965.     S1 = __SSAT(T1 - S1, 16u);
  966.  
  967.     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  968.     /* Read yb (real), xb(imag) input */
  969.     T0 = pSrc16[i1 * 2u];
  970.     T1 = pSrc16[(i1 * 2u) + 1u];
  971.     /* Read yd (real), xd(imag) input */
  972.     U0 = pSrc16[i3 * 2u];
  973.     U1 = pSrc16[(i3 * 2u) + 1u];
  974.  
  975.     /* T0 = (yb + yd), T1 = (xb + xd)) */
  976.     T0 = __SSAT(T0 + U0, 16u);
  977.     T1 = __SSAT(T1 + U1, 16u);
  978.  
  979.     /*  writing the butterfly processed i0 sample */
  980.     /* xa' = xa + xb + xc + xd */
  981.     /* ya' = ya + yb + yc + yd */
  982.     pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
  983.     pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
  984.  
  985.     /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
  986.     R0 = (R0 >> 1u) - (T0 >> 1u);
  987.     R1 = (R1 >> 1u) - (T1 >> 1u);
  988.     /* Read yb (real), xb(imag) input */
  989.     T0 = pSrc16[i1 * 2u];
  990.     T1 = pSrc16[(i1 * 2u) + 1u];
  991.  
  992.     /*  writing the butterfly processed i0 + fftLen/4 sample */
  993.     /* xc' = (xa-xb+xc-xd) */
  994.     /* yc' = (ya-yb+yc-yd) */
  995.     pSrc16[i1 * 2u] = R0;
  996.     pSrc16[(i1 * 2u) + 1u] = R1;
  997.  
  998.     /* Read yd (real), xd(imag) input */
  999.     U0 = pSrc16[i3 * 2u];
  1000.     U1 = pSrc16[(i3 * 2u) + 1u];
  1001.     /* T0 = (yb - yd), T1 = (xb - xd)  */
  1002.     T0 = __SSAT(T0 - U0, 16u);
  1003.     T1 = __SSAT(T1 - U1, 16u);
  1004.  
  1005.     /*  writing the butterfly processed i0 + fftLen/2 sample */
  1006.     /* xb' = (xa+yb-xc-yd) */
  1007.     /* yb' = (ya-xb-yc+xd) */
  1008.     pSrc16[i2 * 2u] = (S0 >> 1u) + (T1 >> 1u);
  1009.     pSrc16[(i2 * 2u) + 1u] = (S1 >> 1u) - (T0 >> 1u);
  1010.  
  1011.     /*  writing the butterfly processed i0 + 3fftLen/4 sample */
  1012.     /* xd' = (xa-yb-xc+yd) */
  1013.     /* yd' = (ya+xb-yc-xd) */
  1014.     pSrc16[i3 * 2u] = (S0 >> 1u) - (T1 >> 1u);
  1015.     pSrc16[(i3 * 2u) + 1u] = (S1 >> 1u) + (T0 >> 1u);
  1016.  
  1017.   }
  1018.  
  1019.   /* end of last stage process */
  1020.  
  1021.   /* output is in 11.5(q5) format for the 1024 point */
  1022.   /* output is in 9.7(q7) format for the 256 point   */
  1023.   /* output is in 7.9(q9) format for the 64 point  */
  1024.   /* output is in 5.11(q11) format for the 16 point  */
  1025.  
  1026. #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  1027.  
  1028. }
  1029.  
  1030.  
  1031. /**    
  1032.  * @brief  Core function for the Q15 CIFFT butterfly process.  
  1033.  * @param[in, out] *pSrc16          points to the in-place buffer of Q15 data type.  
  1034.  * @param[in]      fftLen           length of the FFT.  
  1035.  * @param[in]      *pCoef16         points to twiddle coefficient buffer.  
  1036.  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.  
  1037.  * @return none.  
  1038.  */
  1039.  
  1040. /*    
  1041. * Radix-4 IFFT algorithm used is :    
  1042. *    
  1043. * CIFFT uses same twiddle coefficients as CFFT function    
  1044. *  x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]    
  1045. *    
  1046. *    
  1047. * IFFT is implemented with following changes in equations from FFT    
  1048. *    
  1049. * Input real and imaginary data:    
  1050. * x(n) = xa + j * ya    
  1051. * x(n+N/4 ) = xb + j * yb    
  1052. * x(n+N/2 ) = xc + j * yc    
  1053. * x(n+3N/4) = xd + j * yd    
  1054. *    
  1055. *    
  1056. * Output real and imaginary data:    
  1057. * x(4r) = xa'+ j * ya'    
  1058. * x(4r+1) = xb'+ j * yb'    
  1059. * x(4r+2) = xc'+ j * yc'    
  1060. * x(4r+3) = xd'+ j * yd'    
  1061. *    
  1062. *    
  1063. * Twiddle factors for radix-4 IFFT:    
  1064. * Wn = co1 + j * (si1)    
  1065. * W2n = co2 + j * (si2)    
  1066. * W3n = co3 + j * (si3)    
  1067.    
  1068. * The real and imaginary output values for the radix-4 butterfly are    
  1069. * xa' = xa + xb + xc + xd    
  1070. * ya' = ya + yb + yc + yd    
  1071. * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)    
  1072. * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)    
  1073. * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)    
  1074. * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)    
  1075. * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)    
  1076. * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)    
  1077. *    
  1078. */
  1079.  
  1080. void arm_radix4_butterfly_inverse_q15(
  1081.   q15_t * pSrc16,
  1082.   uint32_t fftLen,
  1083.   q15_t * pCoef16,
  1084.   uint32_t twidCoefModifier)
  1085. {
  1086.  
  1087. #ifndef ARM_MATH_CM0_FAMILY
  1088.  
  1089.   /* Run the below code for Cortex-M4 and Cortex-M3 */
  1090.  
  1091.   q31_t R, S, T, U;
  1092.   q31_t C1, C2, C3, out1, out2;
  1093.   uint32_t n1, n2, ic, i0, j, k;
  1094.  
  1095.   q15_t *ptr1;
  1096.   q15_t *pSi0;
  1097.   q15_t *pSi1;
  1098.   q15_t *pSi2;
  1099.   q15_t *pSi3;
  1100.  
  1101.   q31_t xaya, xbyb, xcyc, xdyd;
  1102.  
  1103.   /* Total process is divided into three stages */
  1104.  
  1105.   /* process first stage, middle stages, & last stage */
  1106.  
  1107.   /*  Initializations for the first stage */
  1108.   n2 = fftLen;
  1109.   n1 = n2;
  1110.  
  1111.   /* n2 = fftLen/4 */
  1112.   n2 >>= 2u;
  1113.  
  1114.   /* Index for twiddle coefficient */
  1115.   ic = 0u;
  1116.  
  1117.   /* Index for input read and output write */
  1118.   j = n2;
  1119.  
  1120.   pSi0 = pSrc16;
  1121.   pSi1 = pSi0 + 2 * n2;
  1122.   pSi2 = pSi1 + 2 * n2;
  1123.   pSi3 = pSi2 + 2 * n2;
  1124.  
  1125.   /* Input is in 1.15(q15) format */
  1126.  
  1127.   /*  start of first stage process */
  1128.   do
  1129.   {
  1130.     /*  Butterfly implementation */
  1131.  
  1132.     /*  Reading i0, i0+fftLen/2 inputs */
  1133.     /* Read ya (real), xa(imag) input */
  1134.     T = _SIMD32_OFFSET(pSi0);
  1135.     T = __SHADD16(T, 0);
  1136.     T = __SHADD16(T, 0);
  1137.  
  1138.     /* Read yc (real), xc(imag) input */
  1139.     S = _SIMD32_OFFSET(pSi2);
  1140.     S = __SHADD16(S, 0);
  1141.     S = __SHADD16(S, 0);
  1142.  
  1143.     /* R = packed((ya + yc), (xa + xc) ) */
  1144.     R = __QADD16(T, S);
  1145.  
  1146.     /* S = packed((ya - yc), (xa - xc) ) */
  1147.     S = __QSUB16(T, S);
  1148.  
  1149.     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1150.     /* Read yb (real), xb(imag) input */
  1151.     T = _SIMD32_OFFSET(pSi1);
  1152.     T = __SHADD16(T, 0);
  1153.     T = __SHADD16(T, 0);
  1154.  
  1155.     /* Read yd (real), xd(imag) input */
  1156.     U = _SIMD32_OFFSET(pSi3);
  1157.     U = __SHADD16(U, 0);
  1158.     U = __SHADD16(U, 0);
  1159.  
  1160.     /* T = packed((yb + yd), (xb + xd) ) */
  1161.     T = __QADD16(T, U);
  1162.  
  1163.     /*  writing the butterfly processed i0 sample */
  1164.     /* xa' = xa + xb + xc + xd */
  1165.     /* ya' = ya + yb + yc + yd */
  1166.     _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
  1167.     pSi0 += 2;
  1168.  
  1169.     /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
  1170.     R = __QSUB16(R, T);
  1171.  
  1172.     /* co2 & si2 are read from SIMD Coefficient pointer */
  1173.     C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
  1174.  
  1175. #ifndef ARM_MATH_BIG_ENDIAN
  1176.  
  1177.     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  1178.     out1 = __SMUSD(C2, R) >> 16u;
  1179.     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  1180.     out2 = __SMUADX(C2, R);
  1181.  
  1182. #else
  1183.  
  1184.     /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  1185.     out1 = __SMUADX(C2, R) >> 16u;
  1186.     /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  1187.     out2 = __SMUSD(__QSUB16(0, C2), R);
  1188.  
  1189. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1190.  
  1191.     /*  Reading i0+fftLen/4 */
  1192.     /* T = packed(yb, xb) */
  1193.     T = _SIMD32_OFFSET(pSi1);
  1194.     T = __SHADD16(T, 0);
  1195.     T = __SHADD16(T, 0);
  1196.  
  1197.     /* writing the butterfly processed i0 + fftLen/4 sample */
  1198.     /* writing output(xc', yc') in little endian format */
  1199.     _SIMD32_OFFSET(pSi1) =
  1200.       (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  1201.     pSi1 += 2;
  1202.  
  1203.     /*  Butterfly calculations */
  1204.     /* U = packed(yd, xd) */
  1205.     U = _SIMD32_OFFSET(pSi3);
  1206.     U = __SHADD16(U, 0);
  1207.     U = __SHADD16(U, 0);
  1208.  
  1209.     /* T = packed(yb-yd, xb-xd) */
  1210.     T = __QSUB16(T, U);
  1211.  
  1212. #ifndef ARM_MATH_BIG_ENDIAN
  1213.  
  1214.     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  1215.     R = __QSAX(S, T);
  1216.     /* S = packed((ya-yc) + (xb- xd),  (xa-xc) - (yb-yd)) */
  1217.     S = __QASX(S, T);
  1218.  
  1219. #else
  1220.  
  1221.     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  1222.     R = __QASX(S, T);
  1223.     /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  1224.     S = __QSAX(S, T);
  1225.  
  1226. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1227.  
  1228.     /* co1 & si1 are read from SIMD Coefficient pointer */
  1229.     C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
  1230.     /*  Butterfly process for the i0+fftLen/2 sample */
  1231.  
  1232. #ifndef ARM_MATH_BIG_ENDIAN
  1233.  
  1234.     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  1235.     out1 = __SMUSD(C1, S) >> 16u;
  1236.     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  1237.     out2 = __SMUADX(C1, S);
  1238.  
  1239. #else
  1240.  
  1241.     /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  1242.     out1 = __SMUADX(C1, S) >> 16u;
  1243.     /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  1244.     out2 = __SMUSD(__QSUB16(0, C1), S);
  1245.  
  1246. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1247.  
  1248.     /* writing output(xb', yb') in little endian format */
  1249.     _SIMD32_OFFSET(pSi2) =
  1250.       ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
  1251.     pSi2 += 2;
  1252.  
  1253.  
  1254.     /* co3 & si3 are read from SIMD Coefficient pointer */
  1255.     C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
  1256.     /*  Butterfly process for the i0+3fftLen/4 sample */
  1257.  
  1258. #ifndef ARM_MATH_BIG_ENDIAN
  1259.  
  1260.     /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  1261.     out1 = __SMUSD(C3, R) >> 16u;
  1262.     /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  1263.     out2 = __SMUADX(C3, R);
  1264.  
  1265. #else
  1266.  
  1267.     /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  1268.     out1 = __SMUADX(C3, R) >> 16u;
  1269.     /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  1270.     out2 = __SMUSD(__QSUB16(0, C3), R);
  1271.  
  1272. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1273.  
  1274.     /* writing output(xd', yd') in little endian format */
  1275.     _SIMD32_OFFSET(pSi3) =
  1276.       ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  1277.     pSi3 += 2;
  1278.  
  1279.     /*  Twiddle coefficients index modifier */
  1280.     ic = ic + twidCoefModifier;
  1281.  
  1282.   } while(--j);
  1283.   /* data is in 4.11(q11) format */
  1284.  
  1285.   /* end of first stage process */
  1286.  
  1287.  
  1288.   /* start of middle stage process */
  1289.  
  1290.   /*  Twiddle coefficients index modifier */
  1291.   twidCoefModifier <<= 2u;
  1292.  
  1293.   /*  Calculation of Middle stage */
  1294.   for (k = fftLen / 4u; k > 4u; k >>= 2u)
  1295.   {
  1296.     /*  Initializations for the middle stage */
  1297.     n1 = n2;
  1298.     n2 >>= 2u;
  1299.     ic = 0u;
  1300.  
  1301.     for (j = 0u; j <= (n2 - 1u); j++)
  1302.     {
  1303.       /*  index calculation for the coefficients */
  1304.       C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
  1305.       C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
  1306.       C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
  1307.  
  1308.       /*  Twiddle coefficients index modifier */
  1309.       ic = ic + twidCoefModifier;
  1310.      
  1311.       pSi0 = pSrc16 + 2 * j;
  1312.       pSi1 = pSi0 + 2 * n2;
  1313.       pSi2 = pSi1 + 2 * n2;
  1314.       pSi3 = pSi2 + 2 * n2;
  1315.  
  1316.       /*  Butterfly implementation */
  1317.       for (i0 = j; i0 < fftLen; i0 += n1)
  1318.       {
  1319.         /*  Reading i0, i0+fftLen/2 inputs */
  1320.         /* Read ya (real), xa(imag) input */
  1321.         T = _SIMD32_OFFSET(pSi0);
  1322.  
  1323.         /* Read yc (real), xc(imag) input */
  1324.         S = _SIMD32_OFFSET(pSi2);
  1325.  
  1326.         /* R = packed( (ya + yc), (xa + xc)) */
  1327.         R = __QADD16(T, S);
  1328.  
  1329.         /* S = packed((ya - yc), (xa - xc)) */
  1330.         S = __QSUB16(T, S);
  1331.  
  1332.         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1333.         /* Read yb (real), xb(imag) input */
  1334.         T = _SIMD32_OFFSET(pSi1);
  1335.  
  1336.         /* Read yd (real), xd(imag) input */
  1337.         U = _SIMD32_OFFSET(pSi3);
  1338.  
  1339.         /* T = packed( (yb + yd), (xb + xd)) */
  1340.         T = __QADD16(T, U);
  1341.  
  1342.         /*  writing the butterfly processed i0 sample */
  1343.  
  1344.         /* xa' = xa + xb + xc + xd */
  1345.         /* ya' = ya + yb + yc + yd */
  1346.         out1 = __SHADD16(R, T);
  1347.         out1 = __SHADD16(out1, 0);
  1348.         _SIMD32_OFFSET(pSi0) = out1;
  1349.         pSi0 += 2 * n1;
  1350.  
  1351.         /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
  1352.         R = __SHSUB16(R, T);
  1353.  
  1354. #ifndef ARM_MATH_BIG_ENDIAN
  1355.  
  1356.         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  1357.         out1 = __SMUSD(C2, R) >> 16u;
  1358.  
  1359.         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  1360.         out2 = __SMUADX(C2, R);
  1361.  
  1362. #else
  1363.  
  1364.         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  1365.         out1 = __SMUADX(R, C2) >> 16u;
  1366.  
  1367.         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  1368.         out2 = __SMUSD(__QSUB16(0, C2), R);
  1369.  
  1370. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1371.  
  1372.         /*  Reading i0+3fftLen/4 */
  1373.         /* Read yb (real), xb(imag) input */
  1374.         T = _SIMD32_OFFSET(pSi1);
  1375.  
  1376.         /*  writing the butterfly processed i0 + fftLen/4 sample */
  1377.         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  1378.         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  1379.         _SIMD32_OFFSET(pSi1) =
  1380.           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  1381.         pSi1 += 2 * n1;
  1382.  
  1383.         /*  Butterfly calculations */
  1384.  
  1385.         /* Read yd (real), xd(imag) input */
  1386.         U = _SIMD32_OFFSET(pSi3);
  1387.  
  1388.         /* T = packed(yb-yd, xb-xd) */
  1389.         T = __QSUB16(T, U);
  1390.  
  1391. #ifndef ARM_MATH_BIG_ENDIAN
  1392.  
  1393.         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  1394.         R = __SHSAX(S, T);
  1395.  
  1396.         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  1397.         S = __SHASX(S, T);
  1398.  
  1399.  
  1400.         /*  Butterfly process for the i0+fftLen/2 sample */
  1401.         out1 = __SMUSD(C1, S) >> 16u;
  1402.         out2 = __SMUADX(C1, S);
  1403.  
  1404. #else
  1405.  
  1406.         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  1407.         R = __SHASX(S, T);
  1408.  
  1409.         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  1410.         S = __SHSAX(S, T);
  1411.  
  1412.  
  1413.         /*  Butterfly process for the i0+fftLen/2 sample */
  1414.         out1 = __SMUADX(S, C1) >> 16u;
  1415.         out2 = __SMUSD(__QSUB16(0, C1), S);
  1416.  
  1417. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1418.  
  1419.         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  1420.         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  1421.         _SIMD32_OFFSET(pSi2) =
  1422.           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  1423.         pSi2 += 2 * n1;
  1424.  
  1425.         /*  Butterfly process for the i0+3fftLen/4 sample */
  1426.  
  1427. #ifndef ARM_MATH_BIG_ENDIAN
  1428.  
  1429.         out1 = __SMUSD(C3, R) >> 16u;
  1430.         out2 = __SMUADX(C3, R);
  1431.  
  1432. #else
  1433.  
  1434.         out1 = __SMUADX(C3, R) >> 16u;
  1435.         out2 = __SMUSD(__QSUB16(0, C3), R);
  1436.  
  1437. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1438.  
  1439.         /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  1440.         /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  1441.         _SIMD32_OFFSET(pSi3) =
  1442.           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  1443.         pSi3 += 2 * n1;
  1444.       }
  1445.     }
  1446.     /*  Twiddle coefficients index modifier */
  1447.     twidCoefModifier <<= 2u;
  1448.   }
  1449.   /* end of middle stage process */
  1450.  
  1451.   /* data is in 10.6(q6) format for the 1024 point */
  1452.   /* data is in 8.8(q8) format for the 256 point */
  1453.   /* data is in 6.10(q10) format for the 64 point */
  1454.   /* data is in 4.12(q12) format for the 16 point */
  1455.  
  1456.   /*  Initializations for the last stage */
  1457.   j = fftLen >> 2;
  1458.  
  1459.   ptr1 = &pSrc16[0];
  1460.  
  1461.   /* start of last stage process */
  1462.  
  1463.   /*  Butterfly implementation */
  1464.   do
  1465.   {
  1466.     /* Read xa (real), ya(imag) input */
  1467.     xaya = *__SIMD32(ptr1)++;
  1468.  
  1469.     /* Read xb (real), yb(imag) input */
  1470.     xbyb = *__SIMD32(ptr1)++;
  1471.  
  1472.     /* Read xc (real), yc(imag) input */
  1473.     xcyc = *__SIMD32(ptr1)++;
  1474.  
  1475.     /* Read xd (real), yd(imag) input */
  1476.     xdyd = *__SIMD32(ptr1)++;
  1477.  
  1478.     /* R = packed((ya + yc), (xa + xc)) */
  1479.     R = __QADD16(xaya, xcyc);
  1480.  
  1481.     /* T = packed((yb + yd), (xb + xd)) */
  1482.     T = __QADD16(xbyb, xdyd);
  1483.  
  1484.     /* pointer updation for writing */
  1485.     ptr1 = ptr1 - 8u;
  1486.  
  1487.  
  1488.     /* xa' = xa + xb + xc + xd */
  1489.     /* ya' = ya + yb + yc + yd */
  1490.     *__SIMD32(ptr1)++ = __SHADD16(R, T);
  1491.  
  1492.     /* T = packed((yb + yd), (xb + xd)) */
  1493.     T = __QADD16(xbyb, xdyd);
  1494.  
  1495.     /* xc' = (xa-xb+xc-xd) */
  1496.     /* yc' = (ya-yb+yc-yd) */
  1497.     *__SIMD32(ptr1)++ = __SHSUB16(R, T);
  1498.  
  1499.     /* S = packed((ya - yc), (xa - xc)) */
  1500.     S = __QSUB16(xaya, xcyc);
  1501.  
  1502.     /* Read yd (real), xd(imag) input */
  1503.     /* T = packed( (yb - yd), (xb - xd))  */
  1504.     U = __QSUB16(xbyb, xdyd);
  1505.  
  1506. #ifndef ARM_MATH_BIG_ENDIAN
  1507.  
  1508.     /* xb' = (xa+yb-xc-yd) */
  1509.     /* yb' = (ya-xb-yc+xd) */
  1510.     *__SIMD32(ptr1)++ = __SHASX(S, U);
  1511.  
  1512.  
  1513.     /* xd' = (xa-yb-xc+yd) */
  1514.     /* yd' = (ya+xb-yc-xd) */
  1515.     *__SIMD32(ptr1)++ = __SHSAX(S, U);
  1516.  
  1517. #else
  1518.  
  1519.     /* xb' = (xa+yb-xc-yd) */
  1520.     /* yb' = (ya-xb-yc+xd) */
  1521.     *__SIMD32(ptr1)++ = __SHSAX(S, U);
  1522.  
  1523.  
  1524.     /* xd' = (xa-yb-xc+yd) */
  1525.     /* yd' = (ya+xb-yc-xd) */
  1526.     *__SIMD32(ptr1)++ = __SHASX(S, U);
  1527.  
  1528.  
  1529. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1530.  
  1531.   } while(--j);
  1532.  
  1533.   /* end of last stage  process */
  1534.  
  1535.   /* output is in 11.5(q5) format for the 1024 point */
  1536.   /* output is in 9.7(q7) format for the 256 point   */
  1537.   /* output is in 7.9(q9) format for the 64 point  */
  1538.   /* output is in 5.11(q11) format for the 16 point  */
  1539.  
  1540.  
  1541. #else
  1542.  
  1543.   /* Run the below code for Cortex-M0 */
  1544.  
  1545.   q15_t R0, R1, S0, S1, T0, T1, U0, U1;
  1546.   q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
  1547.   uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
  1548.  
  1549.   /* Total process is divided into three stages */
  1550.  
  1551.   /* process first stage, middle stages, & last stage */
  1552.  
  1553.   /*  Initializations for the first stage */
  1554.   n2 = fftLen;
  1555.   n1 = n2;
  1556.  
  1557.   /* n2 = fftLen/4 */
  1558.   n2 >>= 2u;
  1559.  
  1560.   /* Index for twiddle coefficient */
  1561.   ic = 0u;
  1562.  
  1563.   /* Index for input read and output write */
  1564.   i0 = 0u;
  1565.  
  1566.   j = n2;
  1567.  
  1568.   /* Input is in 1.15(q15) format */
  1569.  
  1570.   /*  Start of first stage process */
  1571.   do
  1572.   {
  1573.     /*  Butterfly implementation */
  1574.  
  1575.     /*  index calculation for the input as, */
  1576.     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  1577.     i1 = i0 + n2;
  1578.     i2 = i1 + n2;
  1579.     i3 = i2 + n2;
  1580.  
  1581.     /*  Reading i0, i0+fftLen/2 inputs */
  1582.     /* input is down scale by 4 to avoid overflow */
  1583.     /* Read ya (real), xa(imag) input */
  1584.     T0 = pSrc16[i0 * 2u] >> 2u;
  1585.     T1 = pSrc16[(i0 * 2u) + 1u] >> 2u;
  1586.     /* input is down scale by 4 to avoid overflow */
  1587.     /* Read yc (real), xc(imag) input */
  1588.     S0 = pSrc16[i2 * 2u] >> 2u;
  1589.     S1 = pSrc16[(i2 * 2u) + 1u] >> 2u;
  1590.  
  1591.     /* R0 = (ya + yc), R1 = (xa + xc) */
  1592.     R0 = __SSAT(T0 + S0, 16u);
  1593.     R1 = __SSAT(T1 + S1, 16u);
  1594.     /* S0 = (ya - yc), S1 = (xa - xc) */
  1595.     S0 = __SSAT(T0 - S0, 16u);
  1596.     S1 = __SSAT(T1 - S1, 16u);
  1597.  
  1598.     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1599.     /* input is down scale by 4 to avoid overflow */
  1600.     /* Read yb (real), xb(imag) input */
  1601.     T0 = pSrc16[i1 * 2u] >> 2u;
  1602.     T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
  1603.     /* Read yd (real), xd(imag) input */
  1604.     /* input is down scale by 4 to avoid overflow */
  1605.     U0 = pSrc16[i3 * 2u] >> 2u;
  1606.     U1 = pSrc16[(i3 * 2u) + 1u] >> 2u;
  1607.  
  1608.     /* T0 = (yb + yd), T1 = (xb + xd) */
  1609.     T0 = __SSAT(T0 + U0, 16u);
  1610.     T1 = __SSAT(T1 + U1, 16u);
  1611.  
  1612.     /*  writing the butterfly processed i0 sample */
  1613.     /* xa' = xa + xb + xc + xd */
  1614.     /* ya' = ya + yb + yc + yd */
  1615.     pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
  1616.     pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
  1617.  
  1618.     /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc)- (xb + xd) */
  1619.     R0 = __SSAT(R0 - T0, 16u);
  1620.     R1 = __SSAT(R1 - T1, 16u);
  1621.     /* co2 & si2 are read from Coefficient pointer */
  1622.     Co2 = pCoef16[2u * ic * 2u];
  1623.     Si2 = pCoef16[(2u * ic * 2u) + 1u];
  1624.     /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
  1625.     out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16u);
  1626.     /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
  1627.     out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16u);
  1628.  
  1629.     /*  Reading i0+fftLen/4 */
  1630.     /* input is down scale by 4 to avoid overflow */
  1631.     /* T0 = yb, T1 = xb */
  1632.     T0 = pSrc16[i1 * 2u] >> 2u;
  1633.     T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
  1634.  
  1635.     /* writing the butterfly processed i0 + fftLen/4 sample */
  1636.     /* writing output(xc', yc') in little endian format */
  1637.     pSrc16[i1 * 2u] = out1;
  1638.     pSrc16[(i1 * 2u) + 1u] = out2;
  1639.  
  1640.     /*  Butterfly calculations */
  1641.     /* input is down scale by 4 to avoid overflow */
  1642.     /* U0 = yd, U1 = xd) */
  1643.     U0 = pSrc16[i3 * 2u] >> 2u;
  1644.     U1 = pSrc16[(i3 * 2u) + 1u] >> 2u;
  1645.  
  1646.     /* T0 = yb-yd, T1 = xb-xd) */
  1647.     T0 = __SSAT(T0 - U0, 16u);
  1648.     T1 = __SSAT(T1 - U1, 16u);
  1649.     /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
  1650.     R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
  1651.     R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
  1652.     /* S = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
  1653.     S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
  1654.     S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
  1655.  
  1656.     /* co1 & si1 are read from Coefficient pointer */
  1657.     Co1 = pCoef16[ic * 2u];
  1658.     Si1 = pCoef16[(ic * 2u) + 1u];
  1659.     /*  Butterfly process for the i0+fftLen/2 sample */
  1660.     /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
  1661.     out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u);
  1662.     /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
  1663.     out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u);
  1664.     /* writing output(xb', yb') in little endian format */
  1665.     pSrc16[i2 * 2u] = out1;
  1666.     pSrc16[(i2 * 2u) + 1u] = out2;
  1667.  
  1668.     /* Co3 & si3 are read from Coefficient pointer */
  1669.     Co3 = pCoef16[3u * ic * 2u];
  1670.     Si3 = pCoef16[(3u * ic * 2u) + 1u];
  1671.     /*  Butterfly process for the i0+3fftLen/4 sample */
  1672.     /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
  1673.     out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u);
  1674.     /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
  1675.     out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u);
  1676.     /* writing output(xd', yd') in little endian format */
  1677.     pSrc16[i3 * 2u] = out1;
  1678.     pSrc16[(i3 * 2u) + 1u] = out2;
  1679.  
  1680.     /*  Twiddle coefficients index modifier */
  1681.     ic = ic + twidCoefModifier;
  1682.  
  1683.     /*  Updating input index */
  1684.     i0 = i0 + 1u;
  1685.  
  1686.   } while(--j);
  1687.  
  1688.   /*  End of first stage process */
  1689.  
  1690.   /* data is in 4.11(q11) format */
  1691.  
  1692.  
  1693.   /*  Start of Middle stage process */
  1694.  
  1695.   /*  Twiddle coefficients index modifier */
  1696.   twidCoefModifier <<= 2u;
  1697.  
  1698.   /*  Calculation of Middle stage */
  1699.   for (k = fftLen / 4u; k > 4u; k >>= 2u)
  1700.   {
  1701.     /*  Initializations for the middle stage */
  1702.     n1 = n2;
  1703.     n2 >>= 2u;
  1704.     ic = 0u;
  1705.  
  1706.     for (j = 0u; j <= (n2 - 1u); j++)
  1707.     {
  1708.       /*  index calculation for the coefficients */
  1709.       Co1 = pCoef16[ic * 2u];
  1710.       Si1 = pCoef16[(ic * 2u) + 1u];
  1711.       Co2 = pCoef16[2u * ic * 2u];
  1712.       Si2 = pCoef16[2u * ic * 2u + 1u];
  1713.       Co3 = pCoef16[3u * ic * 2u];
  1714.       Si3 = pCoef16[(3u * ic * 2u) + 1u];
  1715.  
  1716.       /*  Twiddle coefficients index modifier */
  1717.       ic = ic + twidCoefModifier;
  1718.  
  1719.       /*  Butterfly implementation */
  1720.       for (i0 = j; i0 < fftLen; i0 += n1)
  1721.       {
  1722.         /*  index calculation for the input as, */
  1723.         /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  1724.         i1 = i0 + n2;
  1725.         i2 = i1 + n2;
  1726.         i3 = i2 + n2;
  1727.  
  1728.         /*  Reading i0, i0+fftLen/2 inputs */
  1729.         /* Read ya (real), xa(imag) input */
  1730.         T0 = pSrc16[i0 * 2u];
  1731.         T1 = pSrc16[(i0 * 2u) + 1u];
  1732.  
  1733.         /* Read yc (real), xc(imag) input */
  1734.         S0 = pSrc16[i2 * 2u];
  1735.         S1 = pSrc16[(i2 * 2u) + 1u];
  1736.  
  1737.  
  1738.         /* R0 = (ya + yc), R1 = (xa + xc) */
  1739.         R0 = __SSAT(T0 + S0, 16u);
  1740.         R1 = __SSAT(T1 + S1, 16u);
  1741.         /* S0 = (ya - yc), S1 = (xa - xc) */
  1742.         S0 = __SSAT(T0 - S0, 16u);
  1743.         S1 = __SSAT(T1 - S1, 16u);
  1744.  
  1745.         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1746.         /* Read yb (real), xb(imag) input */
  1747.         T0 = pSrc16[i1 * 2u];
  1748.         T1 = pSrc16[(i1 * 2u) + 1u];
  1749.  
  1750.         /* Read yd (real), xd(imag) input */
  1751.         U0 = pSrc16[i3 * 2u];
  1752.         U1 = pSrc16[(i3 * 2u) + 1u];
  1753.  
  1754.         /* T0 = (yb + yd), T1 = (xb + xd) */
  1755.         T0 = __SSAT(T0 + U0, 16u);
  1756.         T1 = __SSAT(T1 + U1, 16u);
  1757.  
  1758.         /*  writing the butterfly processed i0 sample */
  1759.         /* xa' = xa + xb + xc + xd */
  1760.         /* ya' = ya + yb + yc + yd */
  1761.         pSrc16[i0 * 2u] = ((R0 >> 1u) + (T0 >> 1u)) >> 1u;
  1762.         pSrc16[(i0 * 2u) + 1u] = ((R1 >> 1u) + (T1 >> 1u)) >> 1u;
  1763.  
  1764.         /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
  1765.         R0 = (R0 >> 1u) - (T0 >> 1u);
  1766.         R1 = (R1 >> 1u) - (T1 >> 1u);
  1767.  
  1768.         /* out1 = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  1769.         out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
  1770.         /* out2 = (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  1771.         out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
  1772.  
  1773.         /*  Reading i0+fftLen/4 */
  1774.         /* Read yb (real), xb(imag) input */
  1775.         T0 = pSrc16[i1 * 2u];
  1776.         T1 = pSrc16[(i1 * 2u) + 1u];
  1777.  
  1778.         /*  writing the butterfly processed i0 + fftLen/4 sample */
  1779.         /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
  1780.         /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
  1781.         pSrc16[i1 * 2u] = out1;
  1782.         pSrc16[(i1 * 2u) + 1u] = out2;
  1783.  
  1784.         /*  Butterfly calculations */
  1785.         /* Read yd (real), xd(imag) input */
  1786.         U0 = pSrc16[i3 * 2u];
  1787.         U1 = pSrc16[(i3 * 2u) + 1u];
  1788.  
  1789.         /* T0 = yb-yd, T1 = xb-xd) */
  1790.         T0 = __SSAT(T0 - U0, 16u);
  1791.         T1 = __SSAT(T1 - U1, 16u);
  1792.  
  1793.         /* R0 = (ya-yc) + (xb-xd), R1 = (xa-xc) - (yb-yd) */
  1794.         R0 = (S0 >> 1u) + (T1 >> 1u);
  1795.         R1 = (S1 >> 1u) - (T0 >> 1u);
  1796.  
  1797.         /* S0 = (ya-yc) - (xb-xd), S1 = (xa-xc) + (yb-yd) */
  1798.         S0 = (S0 >> 1u) - (T1 >> 1u);
  1799.         S1 = (S1 >> 1u) + (T0 >> 1u);
  1800.  
  1801.         /*  Butterfly process for the i0+fftLen/2 sample */
  1802.         out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u);
  1803.         out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u);
  1804.         /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
  1805.         /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
  1806.         pSrc16[i2 * 2u] = out1;
  1807.         pSrc16[(i2 * 2u) + 1u] = out2;
  1808.  
  1809.         /*  Butterfly process for the i0+3fftLen/4 sample */
  1810.         out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u);
  1811.  
  1812.         out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u);
  1813.         /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
  1814.         /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
  1815.         pSrc16[i3 * 2u] = out1;
  1816.         pSrc16[(i3 * 2u) + 1u] = out2;
  1817.  
  1818.  
  1819.       }
  1820.     }
  1821.     /*  Twiddle coefficients index modifier */
  1822.     twidCoefModifier <<= 2u;
  1823.   }
  1824.   /*  End of Middle stages process */
  1825.  
  1826.  
  1827.   /* data is in 10.6(q6) format for the 1024 point */
  1828.   /* data is in 8.8(q8) format for the 256 point   */
  1829.   /* data is in 6.10(q10) format for the 64 point  */
  1830.   /* data is in 4.12(q12) format for the 16 point  */
  1831.  
  1832.   /* start of last stage process */
  1833.  
  1834.  
  1835.   /*  Initializations for the last stage */
  1836.   n1 = n2;
  1837.   n2 >>= 2u;
  1838.  
  1839.   /*  Butterfly implementation */
  1840.   for (i0 = 0u; i0 <= (fftLen - n1); i0 += n1)
  1841.   {
  1842.     /*  index calculation for the input as, */
  1843.     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  1844.     i1 = i0 + n2;
  1845.     i2 = i1 + n2;
  1846.     i3 = i2 + n2;
  1847.  
  1848.     /*  Reading i0, i0+fftLen/2 inputs */
  1849.     /* Read ya (real), xa(imag) input */
  1850.     T0 = pSrc16[i0 * 2u];
  1851.     T1 = pSrc16[(i0 * 2u) + 1u];
  1852.     /* Read yc (real), xc(imag) input */
  1853.     S0 = pSrc16[i2 * 2u];
  1854.     S1 = pSrc16[(i2 * 2u) + 1u];
  1855.  
  1856.     /* R0 = (ya + yc), R1 = (xa + xc) */
  1857.     R0 = __SSAT(T0 + S0, 16u);
  1858.     R1 = __SSAT(T1 + S1, 16u);
  1859.     /* S0 = (ya - yc), S1 = (xa - xc) */
  1860.     S0 = __SSAT(T0 - S0, 16u);
  1861.     S1 = __SSAT(T1 - S1, 16u);
  1862.  
  1863.     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1864.     /* Read yb (real), xb(imag) input */
  1865.     T0 = pSrc16[i1 * 2u];
  1866.     T1 = pSrc16[(i1 * 2u) + 1u];
  1867.     /* Read yd (real), xd(imag) input */
  1868.     U0 = pSrc16[i3 * 2u];
  1869.     U1 = pSrc16[(i3 * 2u) + 1u];
  1870.  
  1871.     /* T0 = (yb + yd), T1 = (xb + xd) */
  1872.     T0 = __SSAT(T0 + U0, 16u);
  1873.     T1 = __SSAT(T1 + U1, 16u);
  1874.  
  1875.     /*  writing the butterfly processed i0 sample */
  1876.     /* xa' = xa + xb + xc + xd */
  1877.     /* ya' = ya + yb + yc + yd */
  1878.     pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
  1879.     pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
  1880.  
  1881.     /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
  1882.     R0 = (R0 >> 1u) - (T0 >> 1u);
  1883.     R1 = (R1 >> 1u) - (T1 >> 1u);
  1884.  
  1885.     /* Read yb (real), xb(imag) input */
  1886.     T0 = pSrc16[i1 * 2u];
  1887.     T1 = pSrc16[(i1 * 2u) + 1u];
  1888.  
  1889.     /*  writing the butterfly processed i0 + fftLen/4 sample */
  1890.     /* xc' = (xa-xb+xc-xd) */
  1891.     /* yc' = (ya-yb+yc-yd) */
  1892.     pSrc16[i1 * 2u] = R0;
  1893.     pSrc16[(i1 * 2u) + 1u] = R1;
  1894.  
  1895.     /* Read yd (real), xd(imag) input */
  1896.     U0 = pSrc16[i3 * 2u];
  1897.     U1 = pSrc16[(i3 * 2u) + 1u];
  1898.     /* T0 = (yb - yd), T1 = (xb - xd) */
  1899.     T0 = __SSAT(T0 - U0, 16u);
  1900.     T1 = __SSAT(T1 - U1, 16u);
  1901.  
  1902.     /*  writing the butterfly processed i0 + fftLen/2 sample */
  1903.     /* xb' = (xa-yb-xc+yd) */
  1904.     /* yb' = (ya+xb-yc-xd) */
  1905.     pSrc16[i2 * 2u] = (S0 >> 1u) - (T1 >> 1u);
  1906.     pSrc16[(i2 * 2u) + 1u] = (S1 >> 1u) + (T0 >> 1u);
  1907.  
  1908.  
  1909.     /*  writing the butterfly processed i0 + 3fftLen/4 sample */
  1910.     /* xd' = (xa+yb-xc-yd) */
  1911.     /* yd' = (ya-xb-yc+xd) */
  1912.     pSrc16[i3 * 2u] = (S0 >> 1u) + (T1 >> 1u);
  1913.     pSrc16[(i3 * 2u) + 1u] = (S1 >> 1u) - (T0 >> 1u);
  1914.   }
  1915.   /* end of last stage  process */
  1916.  
  1917.   /* output is in 11.5(q5) format for the 1024 point */
  1918.   /* output is in 9.7(q7) format for the 256 point   */
  1919.   /* output is in 7.9(q9) format for the 64 point  */
  1920.   /* output is in 5.11(q11) format for the 16 point  */
  1921.  
  1922. #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  1923.  
  1924. }
  1925.