Subversion Repositories DashDisplay

Rev

Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------    
  2. * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
  3. *    
  4. * $Date:        19. March 2015
  5. * $Revision:    V.1.4.5  
  6. *    
  7. * Project:          CMSIS DSP Library    
  8. * Title:            arm_cfft_radix4_q31.c    
  9. *    
  10. * Description:  This file has function definition of Radix-4 FFT & IFFT function and    
  11. *                               In-place bit reversal using bit reversal table    
  12. *    
  13. * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  14. *  
  15. * Redistribution and use in source and binary forms, with or without
  16. * modification, are permitted provided that the following conditions
  17. * are met:
  18. *   - Redistributions of source code must retain the above copyright
  19. *     notice, this list of conditions and the following disclaimer.
  20. *   - Redistributions in binary form must reproduce the above copyright
  21. *     notice, this list of conditions and the following disclaimer in
  22. *     the documentation and/or other materials provided with the
  23. *     distribution.
  24. *   - Neither the name of ARM LIMITED nor the names of its contributors
  25. *     may be used to endorse or promote products derived from this
  26. *     software without specific prior written permission.
  27. *
  28. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  29. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  30. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  31. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  32. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  33. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  34. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  35. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  36. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  37. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  38. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  39. * POSSIBILITY OF SUCH DAMAGE.    
  40. * -------------------------------------------------------------------- */
  41.  
  42. #include "arm_math.h"
  43.  
  44. void arm_radix4_butterfly_inverse_q31(
  45. q31_t * pSrc,
  46. uint32_t fftLen,
  47. q31_t * pCoef,
  48. uint32_t twidCoefModifier);
  49.  
  50. void arm_radix4_butterfly_q31(
  51. q31_t * pSrc,
  52. uint32_t fftLen,
  53. q31_t * pCoef,
  54. uint32_t twidCoefModifier);
  55.  
  56. void arm_bitreversal_q31(
  57. q31_t * pSrc,
  58. uint32_t fftLen,
  59. uint16_t bitRevFactor,
  60. uint16_t * pBitRevTab);
  61.  
  62. /**    
  63.  * @ingroup groupTransforms    
  64.  */
  65.  
  66. /**    
  67.  * @addtogroup ComplexFFT    
  68.  * @{    
  69.  */
  70.  
  71. /**    
  72.  * @details    
  73.  * @brief Processing function for the Q31 CFFT/CIFFT.    
  74.  * @deprecated Do not use this function.  It has been superseded by \ref arm_cfft_q31 and will be removed
  75.  * @param[in]      *S    points to an instance of the Q31 CFFT/CIFFT structure.  
  76.  * @param[in, out] *pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place.  
  77.  * @return none.    
  78.  *    
  79.  * \par Input and output formats:    
  80.  * \par    
  81.  * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.  
  82.  * Hence the output format is different for different FFT sizes.    
  83.  * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:  
  84.  * \par  
  85.  * \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"    
  86.  * \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"    
  87.  *    
  88.  */
  89.  
  90. void arm_cfft_radix4_q31(
  91.   const arm_cfft_radix4_instance_q31 * S,
  92.   q31_t * pSrc)
  93. {
  94.   if(S->ifftFlag == 1u)
  95.   {
  96.     /* Complex IFFT radix-4 */
  97.     arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle,
  98.                                      S->twidCoefModifier);
  99.   }
  100.   else
  101.   {
  102.     /* Complex FFT radix-4 */
  103.     arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle,
  104.                              S->twidCoefModifier);
  105.   }
  106.  
  107.  
  108.   if(S->bitReverseFlag == 1u)
  109.   {
  110.     /*  Bit Reversal */
  111.     arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
  112.   }
  113.  
  114. }
  115.  
  116. /**    
  117.  * @} end of ComplexFFT group    
  118.  */
  119.  
  120. /*    
  121. * Radix-4 FFT algorithm used is :    
  122. *    
  123. * Input real and imaginary data:    
  124. * x(n) = xa + j * ya    
  125. * x(n+N/4 ) = xb + j * yb    
  126. * x(n+N/2 ) = xc + j * yc    
  127. * x(n+3N 4) = xd + j * yd    
  128. *    
  129. *    
  130. * Output real and imaginary data:    
  131. * x(4r) = xa'+ j * ya'    
  132. * x(4r+1) = xb'+ j * yb'    
  133. * x(4r+2) = xc'+ j * yc'    
  134. * x(4r+3) = xd'+ j * yd'    
  135. *    
  136. *    
  137. * Twiddle factors for radix-4 FFT:    
  138. * Wn = co1 + j * (- si1)    
  139. * W2n = co2 + j * (- si2)    
  140. * W3n = co3 + j * (- si3)    
  141. *    
  142. *  Butterfly implementation:    
  143. * xa' = xa + xb + xc + xd    
  144. * ya' = ya + yb + yc + yd    
  145. * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)    
  146. * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)    
  147. * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)    
  148. * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)    
  149. * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)    
  150. * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)    
  151. *    
  152. */
  153.  
  154. /**    
  155.  * @brief  Core function for the Q31 CFFT butterfly process.  
  156.  * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.  
  157.  * @param[in]      fftLen           length of the FFT.  
  158.  * @param[in]      *pCoef           points to twiddle coefficient buffer.  
  159.  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.  
  160.  * @return none.  
  161.  */
  162.  
  163. void arm_radix4_butterfly_q31(
  164.   q31_t * pSrc,
  165.   uint32_t fftLen,
  166.   q31_t * pCoef,
  167.   uint32_t twidCoefModifier)
  168. {
  169. #if defined(ARM_MATH_CM7)
  170.   uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
  171.   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
  172.  
  173.   q31_t xa, xb, xc, xd;
  174.   q31_t ya, yb, yc, yd;
  175.   q31_t xa_out, xb_out, xc_out, xd_out;
  176.   q31_t ya_out, yb_out, yc_out, yd_out;
  177.  
  178.   q31_t *ptr1;
  179.   q63_t xaya, xbyb, xcyc, xdyd;
  180.   /* Total process is divided into three stages */
  181.  
  182.   /* process first stage, middle stages, & last stage */
  183.  
  184.  
  185.   /* start of first stage process */
  186.  
  187.   /*  Initializations for the first stage */
  188.   n2 = fftLen;
  189.   n1 = n2;
  190.   /* n2 = fftLen/4 */
  191.   n2 >>= 2u;
  192.   i0 = 0u;
  193.   ia1 = 0u;
  194.  
  195.   j = n2;
  196.  
  197.   /*  Calculation of first stage */
  198.   do
  199.   {
  200.     /*  index calculation for the input as, */
  201.     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
  202.     i1 = i0 + n2;
  203.     i2 = i1 + n2;
  204.     i3 = i2 + n2;
  205.  
  206.     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
  207.  
  208.     /*  Butterfly implementation */
  209.     /* xa + xc */
  210.     r1 = (pSrc[(2u * i0)] >> 4u) + (pSrc[(2u * i2)] >> 4u);
  211.     /* xa - xc */
  212.     r2 = (pSrc[2u * i0] >> 4u) - (pSrc[2u * i2] >> 4u);
  213.  
  214.     /* xb + xd */
  215.     t1 = (pSrc[2u * i1] >> 4u) + (pSrc[2u * i3] >> 4u);
  216.  
  217.     /* ya + yc */
  218.     s1 = (pSrc[(2u * i0) + 1u] >> 4u) + (pSrc[(2u * i2) + 1u] >> 4u);
  219.     /* ya - yc */
  220.     s2 = (pSrc[(2u * i0) + 1u] >> 4u) - (pSrc[(2u * i2) + 1u] >> 4u);
  221.  
  222.     /* xa' = xa + xb + xc + xd */
  223.     pSrc[2u * i0] = (r1 + t1);
  224.     /* (xa + xc) - (xb + xd) */
  225.     r1 = r1 - t1;
  226.     /* yb + yd */
  227.     t2 = (pSrc[(2u * i1) + 1u] >> 4u) + (pSrc[(2u * i3) + 1u] >> 4u);
  228.  
  229.     /* ya' = ya + yb + yc + yd */
  230.     pSrc[(2u * i0) + 1u] = (s1 + t2);
  231.  
  232.     /* (ya + yc) - (yb + yd) */
  233.     s1 = s1 - t2;
  234.  
  235.     /* yb - yd */
  236.     t1 = (pSrc[(2u * i1) + 1u] >> 4u) - (pSrc[(2u * i3) + 1u] >> 4u);
  237.     /* xb - xd */
  238.     t2 = (pSrc[2u * i1] >> 4u) - (pSrc[2u * i3] >> 4u);
  239.  
  240.     /*  index calculation for the coefficients */
  241.     ia2 = 2u * ia1;
  242.     co2 = pCoef[ia2 * 2u];
  243.     si2 = pCoef[(ia2 * 2u) + 1u];
  244.  
  245.     /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
  246.     pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
  247.                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
  248.  
  249.     /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
  250.     pSrc[(2u * i1) + 1u] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
  251.                             ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
  252.  
  253.     /* (xa - xc) + (yb - yd) */
  254.     r1 = r2 + t1;
  255.     /* (xa - xc) - (yb - yd) */
  256.     r2 = r2 - t1;
  257.  
  258.     /* (ya - yc) - (xb - xd) */
  259.     s1 = s2 - t2;
  260.     /* (ya - yc) + (xb - xd) */
  261.     s2 = s2 + t2;
  262.  
  263.     co1 = pCoef[ia1 * 2u];
  264.     si1 = pCoef[(ia1 * 2u) + 1u];
  265.  
  266.     /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
  267.     pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
  268.                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
  269.  
  270.     /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
  271.     pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
  272.                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
  273.  
  274.     /*  index calculation for the coefficients */
  275.     ia3 = 3u * ia1;
  276.     co3 = pCoef[ia3 * 2u];
  277.     si3 = pCoef[(ia3 * 2u) + 1u];
  278.  
  279.     /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
  280.     pSrc[2u * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
  281.                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
  282.  
  283.     /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
  284.     pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
  285.                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
  286.  
  287.     /*  Twiddle coefficients index modifier */
  288.     ia1 = ia1 + twidCoefModifier;
  289.  
  290.     /*  Updating input index */
  291.     i0 = i0 + 1u;
  292.  
  293.   } while(--j);
  294.  
  295.   /* end of first stage process */
  296.  
  297.   /* data is in 5.27(q27) format */
  298.  
  299.  
  300.   /* start of Middle stages process */
  301.  
  302.  
  303.   /* each stage in middle stages provides two down scaling of the input */
  304.  
  305.   twidCoefModifier <<= 2u;
  306.  
  307.  
  308.   for (k = fftLen / 4u; k > 4u; k >>= 2u)
  309.   {
  310.     /*  Initializations for the first stage */
  311.     n1 = n2;
  312.     n2 >>= 2u;
  313.     ia1 = 0u;
  314.  
  315.     /*  Calculation of first stage */
  316.     for (j = 0u; j <= (n2 - 1u); j++)
  317.     {
  318.       /*  index calculation for the coefficients */
  319.       ia2 = ia1 + ia1;
  320.       ia3 = ia2 + ia1;
  321.       co1 = pCoef[ia1 * 2u];
  322.       si1 = pCoef[(ia1 * 2u) + 1u];
  323.       co2 = pCoef[ia2 * 2u];
  324.       si2 = pCoef[(ia2 * 2u) + 1u];
  325.       co3 = pCoef[ia3 * 2u];
  326.       si3 = pCoef[(ia3 * 2u) + 1u];
  327.       /*  Twiddle coefficients index modifier */
  328.       ia1 = ia1 + twidCoefModifier;
  329.  
  330.       for (i0 = j; i0 < fftLen; i0 += n1)
  331.       {
  332.         /*  index calculation for the input as, */
  333.         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
  334.         i1 = i0 + n2;
  335.         i2 = i1 + n2;
  336.         i3 = i2 + n2;
  337.  
  338.         /*  Butterfly implementation */
  339.         /* xa + xc */
  340.         r1 = pSrc[2u * i0] + pSrc[2u * i2];
  341.         /* xa - xc */
  342.         r2 = pSrc[2u * i0] - pSrc[2u * i2];
  343.  
  344.         /* ya + yc */
  345.         s1 = pSrc[(2u * i0) + 1u] + pSrc[(2u * i2) + 1u];
  346.         /* ya - yc */
  347.         s2 = pSrc[(2u * i0) + 1u] - pSrc[(2u * i2) + 1u];
  348.  
  349.         /* xb + xd */
  350.         t1 = pSrc[2u * i1] + pSrc[2u * i3];
  351.  
  352.         /* xa' = xa + xb + xc + xd */
  353.         pSrc[2u * i0] = (r1 + t1) >> 2u;
  354.         /* xa + xc -(xb + xd) */
  355.         r1 = r1 - t1;
  356.  
  357.         /* yb + yd */
  358.         t2 = pSrc[(2u * i1) + 1u] + pSrc[(2u * i3) + 1u];
  359.         /* ya' = ya + yb + yc + yd */
  360.         pSrc[(2u * i0) + 1u] = (s1 + t2) >> 2u;
  361.  
  362.         /* (ya + yc) - (yb + yd) */
  363.         s1 = s1 - t2;
  364.  
  365.         /* (yb - yd) */
  366.         t1 = pSrc[(2u * i1) + 1u] - pSrc[(2u * i3) + 1u];
  367.         /* (xb - xd) */
  368.         t2 = pSrc[2u * i1] - pSrc[2u * i3];
  369.  
  370.         /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
  371.         pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
  372.                          ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1u;
  373.  
  374.         /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
  375.         pSrc[(2u * i1) + 1u] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
  376.                                 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1u;
  377.  
  378.         /* (xa - xc) + (yb - yd) */
  379.         r1 = r2 + t1;
  380.         /* (xa - xc) - (yb - yd) */
  381.         r2 = r2 - t1;
  382.  
  383.         /* (ya - yc) -  (xb - xd) */
  384.         s1 = s2 - t2;
  385.         /* (ya - yc) +  (xb - xd) */
  386.         s2 = s2 + t2;
  387.  
  388.         /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
  389.         pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
  390.                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
  391.  
  392.         /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
  393.         pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
  394.                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
  395.  
  396.         /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
  397.         pSrc[2u * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
  398.                          ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
  399.  
  400.         /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
  401.         pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
  402.                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
  403.       }
  404.     }
  405.     twidCoefModifier <<= 2u;
  406.   }
  407. #else
  408.   uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
  409.   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
  410.  
  411.   q31_t xa, xb, xc, xd;
  412.   q31_t ya, yb, yc, yd;
  413.   q31_t xa_out, xb_out, xc_out, xd_out;
  414.   q31_t ya_out, yb_out, yc_out, yd_out;
  415.  
  416.   q31_t *ptr1;
  417.   q31_t *pSi0;
  418.   q31_t *pSi1;
  419.   q31_t *pSi2;
  420.   q31_t *pSi3;
  421.   q63_t xaya, xbyb, xcyc, xdyd;
  422.   /* Total process is divided into three stages */
  423.  
  424.   /* process first stage, middle stages, & last stage */
  425.  
  426.  
  427.   /* start of first stage process */
  428.  
  429.   /*  Initializations for the first stage */
  430.   n2 = fftLen;
  431.   n1 = n2;
  432.   /* n2 = fftLen/4 */
  433.   n2 >>= 2u;
  434.  
  435.   ia1 = 0u;
  436.  
  437.   j = n2;
  438.  
  439.   pSi0 = pSrc;
  440.   pSi1 = pSi0 + 2 * n2;
  441.   pSi2 = pSi1 + 2 * n2;
  442.   pSi3 = pSi2 + 2 * n2;
  443.  
  444.   /*  Calculation of first stage */
  445.   do
  446.   {
  447.     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
  448.  
  449.     /*  Butterfly implementation */
  450.     /* xa + xc */
  451.     r1 = (pSi0[0] >> 4u) + (pSi2[0] >> 4u);
  452.     /* xa - xc */
  453.     r2 = (pSi0[0] >> 4u) - (pSi2[0] >> 4u);
  454.  
  455.     /* xb + xd */
  456.     t1 = (pSi1[0] >> 4u) + (pSi3[0] >> 4u);
  457.  
  458.     /* ya + yc */
  459.     s1 = (pSi0[1] >> 4u) + (pSi2[1] >> 4u);
  460.     /* ya - yc */
  461.     s2 = (pSi0[1] >> 4u) - (pSi2[1] >> 4u);
  462.  
  463.     /* xa' = xa + xb + xc + xd */
  464.     *pSi0++ = (r1 + t1);
  465.     /* (xa + xc) - (xb + xd) */
  466.     r1 = r1 - t1;
  467.     /* yb + yd */
  468.     t2 = (pSi1[1] >> 4u) + (pSi3[1] >> 4u);
  469.  
  470.     /* ya' = ya + yb + yc + yd */
  471.     *pSi0++ = (s1 + t2);
  472.  
  473.     /* (ya + yc) - (yb + yd) */
  474.     s1 = s1 - t2;
  475.  
  476.     /* yb - yd */
  477.     t1 = (pSi1[1] >> 4u) - (pSi3[1] >> 4u);
  478.     /* xb - xd */
  479.     t2 = (pSi1[0] >> 4u) - (pSi3[0] >> 4u);
  480.  
  481.     /*  index calculation for the coefficients */
  482.     ia2 = 2u * ia1;
  483.     co2 = pCoef[ia2 * 2u];
  484.     si2 = pCoef[(ia2 * 2u) + 1u];
  485.  
  486.     /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
  487.     *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
  488.                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
  489.  
  490.     /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
  491.     *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
  492.                             ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
  493.  
  494.     /* (xa - xc) + (yb - yd) */
  495.     r1 = r2 + t1;
  496.     /* (xa - xc) - (yb - yd) */
  497.     r2 = r2 - t1;
  498.  
  499.     /* (ya - yc) - (xb - xd) */
  500.     s1 = s2 - t2;
  501.     /* (ya - yc) + (xb - xd) */
  502.     s2 = s2 + t2;
  503.  
  504.     co1 = pCoef[ia1 * 2u];
  505.     si1 = pCoef[(ia1 * 2u) + 1u];
  506.  
  507.     /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
  508.     *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
  509.                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
  510.  
  511.     /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
  512.     *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
  513.                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
  514.  
  515.     /*  index calculation for the coefficients */
  516.     ia3 = 3u * ia1;
  517.     co3 = pCoef[ia3 * 2u];
  518.     si3 = pCoef[(ia3 * 2u) + 1u];
  519.  
  520.     /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
  521.     *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
  522.                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
  523.  
  524.     /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
  525.     *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
  526.                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
  527.  
  528.     /*  Twiddle coefficients index modifier */
  529.     ia1 = ia1 + twidCoefModifier;
  530.  
  531.   } while(--j);
  532.  
  533.   /* end of first stage process */
  534.  
  535.   /* data is in 5.27(q27) format */
  536.  
  537.  
  538.   /* start of Middle stages process */
  539.  
  540.  
  541.   /* each stage in middle stages provides two down scaling of the input */
  542.  
  543.   twidCoefModifier <<= 2u;
  544.  
  545.  
  546.   for (k = fftLen / 4u; k > 4u; k >>= 2u)
  547.   {
  548.     /*  Initializations for the first stage */
  549.     n1 = n2;
  550.     n2 >>= 2u;
  551.     ia1 = 0u;
  552.  
  553.     /*  Calculation of first stage */
  554.     for (j = 0u; j <= (n2 - 1u); j++)
  555.     {
  556.       /*  index calculation for the coefficients */
  557.       ia2 = ia1 + ia1;
  558.       ia3 = ia2 + ia1;
  559.       co1 = pCoef[ia1 * 2u];
  560.       si1 = pCoef[(ia1 * 2u) + 1u];
  561.       co2 = pCoef[ia2 * 2u];
  562.       si2 = pCoef[(ia2 * 2u) + 1u];
  563.       co3 = pCoef[ia3 * 2u];
  564.       si3 = pCoef[(ia3 * 2u) + 1u];
  565.       /*  Twiddle coefficients index modifier */
  566.       ia1 = ia1 + twidCoefModifier;
  567.      
  568.       pSi0 = pSrc + 2 * j;
  569.       pSi1 = pSi0 + 2 * n2;
  570.       pSi2 = pSi1 + 2 * n2;
  571.       pSi3 = pSi2 + 2 * n2;
  572.  
  573.       for (i0 = j; i0 < fftLen; i0 += n1)
  574.       {
  575.         /*  Butterfly implementation */
  576.         /* xa + xc */
  577.         r1 = pSi0[0] + pSi2[0];
  578.  
  579.         /* xa - xc */
  580.         r2 = pSi0[0] - pSi2[0];
  581.  
  582.  
  583.         /* ya + yc */
  584.         s1 = pSi0[1] + pSi2[1];
  585.  
  586.         /* ya - yc */
  587.         s2 = pSi0[1] - pSi2[1];
  588.  
  589.  
  590.         /* xb + xd */
  591.         t1 = pSi1[0] + pSi3[0];
  592.  
  593.  
  594.         /* xa' = xa + xb + xc + xd */
  595.         pSi0[0] = (r1 + t1) >> 2u;
  596.         /* xa + xc -(xb + xd) */
  597.         r1 = r1 - t1;
  598.  
  599.         /* yb + yd */
  600.         t2 = pSi1[1] + pSi3[1];
  601.  
  602.         /* ya' = ya + yb + yc + yd */
  603.         pSi0[1] = (s1 + t2) >> 2u;
  604.         pSi0 += 2 * n1;
  605.  
  606.         /* (ya + yc) - (yb + yd) */
  607.         s1 = s1 - t2;
  608.  
  609.         /* (yb - yd) */
  610.         t1 = pSi1[1] - pSi3[1];
  611.  
  612.         /* (xb - xd) */
  613.         t2 = pSi1[0] - pSi3[0];
  614.  
  615.  
  616.         /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
  617.         pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
  618.                          ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1u;
  619.  
  620.         /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
  621.         pSi1[1] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
  622.                                 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1u;
  623.         pSi1 += 2 * n1;
  624.  
  625.         /* (xa - xc) + (yb - yd) */
  626.         r1 = r2 + t1;
  627.         /* (xa - xc) - (yb - yd) */
  628.         r2 = r2 - t1;
  629.  
  630.         /* (ya - yc) -  (xb - xd) */
  631.         s1 = s2 - t2;
  632.         /* (ya - yc) +  (xb - xd) */
  633.         s2 = s2 + t2;
  634.  
  635.         /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
  636.         pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
  637.                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
  638.  
  639.         /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
  640.         pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
  641.                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
  642.         pSi2 += 2 * n1;
  643.  
  644.         /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
  645.         pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
  646.                          ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
  647.  
  648.         /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
  649.         pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
  650.                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
  651.         pSi3 += 2 * n1;
  652.       }
  653.     }
  654.     twidCoefModifier <<= 2u;
  655.   }
  656. #endif
  657.  
  658.   /* End of Middle stages process */
  659.  
  660.   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
  661.   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
  662.   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
  663.   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
  664.  
  665.  
  666.   /* start of Last stage process */
  667.   /*  Initializations for the last stage */
  668.   j = fftLen >> 2;
  669.   ptr1 = &pSrc[0];
  670.  
  671.   /*  Calculations of last stage */
  672.   do
  673.   {
  674.  
  675. #ifndef ARM_MATH_BIG_ENDIAN
  676.  
  677.     /* Read xa (real), ya(imag) input */
  678.     xaya = *__SIMD64(ptr1)++;
  679.     xa = (q31_t) xaya;
  680.     ya = (q31_t) (xaya >> 32);
  681.  
  682.     /* Read xb (real), yb(imag) input */
  683.     xbyb = *__SIMD64(ptr1)++;
  684.     xb = (q31_t) xbyb;
  685.     yb = (q31_t) (xbyb >> 32);
  686.  
  687.     /* Read xc (real), yc(imag) input */
  688.     xcyc = *__SIMD64(ptr1)++;
  689.     xc = (q31_t) xcyc;
  690.     yc = (q31_t) (xcyc >> 32);
  691.  
  692.     /* Read xc (real), yc(imag) input */
  693.     xdyd = *__SIMD64(ptr1)++;
  694.     xd = (q31_t) xdyd;
  695.     yd = (q31_t) (xdyd >> 32);
  696.  
  697. #else
  698.  
  699.     /* Read xa (real), ya(imag) input */
  700.     xaya = *__SIMD64(ptr1)++;
  701.     ya = (q31_t) xaya;
  702.     xa = (q31_t) (xaya >> 32);
  703.  
  704.     /* Read xb (real), yb(imag) input */
  705.     xbyb = *__SIMD64(ptr1)++;
  706.     yb = (q31_t) xbyb;
  707.     xb = (q31_t) (xbyb >> 32);
  708.  
  709.     /* Read xc (real), yc(imag) input */
  710.     xcyc = *__SIMD64(ptr1)++;
  711.     yc = (q31_t) xcyc;
  712.     xc = (q31_t) (xcyc >> 32);
  713.  
  714.     /* Read xc (real), yc(imag) input */
  715.     xdyd = *__SIMD64(ptr1)++;
  716.     yd = (q31_t) xdyd;
  717.     xd = (q31_t) (xdyd >> 32);
  718.  
  719.  
  720. #endif
  721.  
  722.     /* xa' = xa + xb + xc + xd */
  723.     xa_out = xa + xb + xc + xd;
  724.  
  725.     /* ya' = ya + yb + yc + yd */
  726.     ya_out = ya + yb + yc + yd;
  727.  
  728.     /* pointer updation for writing */
  729.     ptr1 = ptr1 - 8u;
  730.  
  731.     /* writing xa' and ya' */
  732.     *ptr1++ = xa_out;
  733.     *ptr1++ = ya_out;
  734.  
  735.     xc_out = (xa - xb + xc - xd);
  736.     yc_out = (ya - yb + yc - yd);
  737.  
  738.     /* writing xc' and yc' */
  739.     *ptr1++ = xc_out;
  740.     *ptr1++ = yc_out;
  741.  
  742.     xb_out = (xa + yb - xc - yd);
  743.     yb_out = (ya - xb - yc + xd);
  744.  
  745.     /* writing xb' and yb' */
  746.     *ptr1++ = xb_out;
  747.     *ptr1++ = yb_out;
  748.  
  749.     xd_out = (xa - yb - xc + yd);
  750.     yd_out = (ya + xb - yc - xd);
  751.  
  752.     /* writing xd' and yd' */
  753.     *ptr1++ = xd_out;
  754.     *ptr1++ = yd_out;
  755.  
  756.  
  757.   } while(--j);
  758.  
  759.   /* output is in 11.21(q21) format for the 1024 point */
  760.   /* output is in 9.23(q23) format for the 256 point */
  761.   /* output is in 7.25(q25) format for the 64 point */
  762.   /* output is in 5.27(q27) format for the 16 point */
  763.  
  764.   /* End of last stage process */
  765.  
  766. }
  767.  
  768.  
  769. /**    
  770.  * @brief  Core function for the Q31 CIFFT butterfly process.  
  771.  * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.  
  772.  * @param[in]      fftLen           length of the FFT.  
  773.  * @param[in]      *pCoef           points to twiddle coefficient buffer.  
  774.  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.  
  775.  * @return none.  
  776.  */
  777.  
  778.  
  779. /*    
  780. * Radix-4 IFFT algorithm used is :    
  781. *    
  782. * CIFFT uses same twiddle coefficients as CFFT Function    
  783. *  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]    
  784. *    
  785. *    
  786. * IFFT is implemented with following changes in equations from FFT    
  787. *    
  788. * Input real and imaginary data:    
  789. * x(n) = xa + j * ya    
  790. * x(n+N/4 ) = xb + j * yb    
  791. * x(n+N/2 ) = xc + j * yc    
  792. * x(n+3N 4) = xd + j * yd    
  793. *    
  794. *    
  795. * Output real and imaginary data:    
  796. * x(4r) = xa'+ j * ya'    
  797. * x(4r+1) = xb'+ j * yb'    
  798. * x(4r+2) = xc'+ j * yc'    
  799. * x(4r+3) = xd'+ j * yd'    
  800. *    
  801. *    
  802. * Twiddle factors for radix-4 IFFT:    
  803. * Wn = co1 + j * (si1)    
  804. * W2n = co2 + j * (si2)    
  805. * W3n = co3 + j * (si3)    
  806.    
  807. * The real and imaginary output values for the radix-4 butterfly are    
  808. * xa' = xa + xb + xc + xd    
  809. * ya' = ya + yb + yc + yd    
  810. * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)    
  811. * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)    
  812. * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)    
  813. * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)    
  814. * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)    
  815. * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)    
  816. *    
  817. */
  818.  
  819. void arm_radix4_butterfly_inverse_q31(
  820.   q31_t * pSrc,
  821.   uint32_t fftLen,
  822.   q31_t * pCoef,
  823.   uint32_t twidCoefModifier)
  824. {
  825. #if defined(ARM_MATH_CM7)
  826.   uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
  827.   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
  828.   q31_t xa, xb, xc, xd;
  829.   q31_t ya, yb, yc, yd;
  830.   q31_t xa_out, xb_out, xc_out, xd_out;
  831.   q31_t ya_out, yb_out, yc_out, yd_out;
  832.  
  833.   q31_t *ptr1;
  834.   q63_t xaya, xbyb, xcyc, xdyd;
  835.  
  836.   /* input is be 1.31(q31) format for all FFT sizes */
  837.   /* Total process is divided into three stages */
  838.   /* process first stage, middle stages, & last stage */
  839.  
  840.   /* Start of first stage process */
  841.  
  842.   /* Initializations for the first stage */
  843.   n2 = fftLen;
  844.   n1 = n2;
  845.   /* n2 = fftLen/4 */
  846.   n2 >>= 2u;
  847.   i0 = 0u;
  848.   ia1 = 0u;
  849.  
  850.   j = n2;
  851.  
  852.   do
  853.   {
  854.  
  855.     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
  856.  
  857.     /*  index calculation for the input as, */
  858.     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
  859.     i1 = i0 + n2;
  860.     i2 = i1 + n2;
  861.     i3 = i2 + n2;
  862.  
  863.     /*  Butterfly implementation */
  864.     /* xa + xc */
  865.     r1 = (pSrc[2u * i0] >> 4u) + (pSrc[2u * i2] >> 4u);
  866.     /* xa - xc */
  867.     r2 = (pSrc[2u * i0] >> 4u) - (pSrc[2u * i2] >> 4u);
  868.  
  869.     /* xb + xd */
  870.     t1 = (pSrc[2u * i1] >> 4u) + (pSrc[2u * i3] >> 4u);
  871.  
  872.     /* ya + yc */
  873.     s1 = (pSrc[(2u * i0) + 1u] >> 4u) + (pSrc[(2u * i2) + 1u] >> 4u);
  874.     /* ya - yc */
  875.     s2 = (pSrc[(2u * i0) + 1u] >> 4u) - (pSrc[(2u * i2) + 1u] >> 4u);
  876.  
  877.     /* xa' = xa + xb + xc + xd */
  878.     pSrc[2u * i0] = (r1 + t1);
  879.     /* (xa + xc) - (xb + xd) */
  880.     r1 = r1 - t1;
  881.     /* yb + yd */
  882.     t2 = (pSrc[(2u * i1) + 1u] >> 4u) + (pSrc[(2u * i3) + 1u] >> 4u);
  883.     /* ya' = ya + yb + yc + yd */
  884.     pSrc[(2u * i0) + 1u] = (s1 + t2);
  885.  
  886.     /* (ya + yc) - (yb + yd) */
  887.     s1 = s1 - t2;
  888.  
  889.     /* yb - yd */
  890.     t1 = (pSrc[(2u * i1) + 1u] >> 4u) - (pSrc[(2u * i3) + 1u] >> 4u);
  891.     /* xb - xd */
  892.     t2 = (pSrc[2u * i1] >> 4u) - (pSrc[2u * i3] >> 4u);
  893.  
  894.     /*  index calculation for the coefficients */
  895.     ia2 = 2u * ia1;
  896.     co2 = pCoef[ia2 * 2u];
  897.     si2 = pCoef[(ia2 * 2u) + 1u];
  898.  
  899.     /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
  900.     pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
  901.                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
  902.  
  903.     /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
  904.     pSrc[2u * i1 + 1u] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
  905.                           ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
  906.  
  907.     /* (xa - xc) - (yb - yd) */
  908.     r1 = r2 - t1;
  909.     /* (xa - xc) + (yb - yd) */
  910.     r2 = r2 + t1;
  911.  
  912.     /* (ya - yc) + (xb - xd) */
  913.     s1 = s2 + t2;
  914.     /* (ya - yc) - (xb - xd) */
  915.     s2 = s2 - t2;
  916.  
  917.     co1 = pCoef[ia1 * 2u];
  918.     si1 = pCoef[(ia1 * 2u) + 1u];
  919.  
  920.     /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
  921.     pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
  922.                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
  923.  
  924.     /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
  925.     pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
  926.                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
  927.  
  928.     /*  index calculation for the coefficients */
  929.     ia3 = 3u * ia1;
  930.     co3 = pCoef[ia3 * 2u];
  931.     si3 = pCoef[(ia3 * 2u) + 1u];
  932.  
  933.     /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
  934.     pSrc[2u * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
  935.                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
  936.  
  937.     /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
  938.     pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
  939.                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
  940.  
  941.     /*  Twiddle coefficients index modifier */
  942.     ia1 = ia1 + twidCoefModifier;
  943.  
  944.     /*  Updating input index */
  945.     i0 = i0 + 1u;
  946.  
  947.   } while(--j);
  948.  
  949.   /* data is in 5.27(q27) format */
  950.   /* each stage provides two down scaling of the input */
  951.  
  952.  
  953.   /* Start of Middle stages process */
  954.  
  955.   twidCoefModifier <<= 2u;
  956.  
  957.   /*  Calculation of second stage to excluding last stage */
  958.   for (k = fftLen / 4u; k > 4u; k >>= 2u)
  959.   {
  960.     /*  Initializations for the first stage */
  961.     n1 = n2;
  962.     n2 >>= 2u;
  963.     ia1 = 0u;
  964.  
  965.     for (j = 0; j <= (n2 - 1u); j++)
  966.     {
  967.       /*  index calculation for the coefficients */
  968.       ia2 = ia1 + ia1;
  969.       ia3 = ia2 + ia1;
  970.       co1 = pCoef[ia1 * 2u];
  971.       si1 = pCoef[(ia1 * 2u) + 1u];
  972.       co2 = pCoef[ia2 * 2u];
  973.       si2 = pCoef[(ia2 * 2u) + 1u];
  974.       co3 = pCoef[ia3 * 2u];
  975.       si3 = pCoef[(ia3 * 2u) + 1u];
  976.       /*  Twiddle coefficients index modifier */
  977.       ia1 = ia1 + twidCoefModifier;
  978.  
  979.       for (i0 = j; i0 < fftLen; i0 += n1)
  980.       {
  981.         /*  index calculation for the input as, */
  982.         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
  983.         i1 = i0 + n2;
  984.         i2 = i1 + n2;
  985.         i3 = i2 + n2;
  986.  
  987.         /*  Butterfly implementation */
  988.         /* xa + xc */
  989.         r1 = pSrc[2u * i0] + pSrc[2u * i2];
  990.         /* xa - xc */
  991.         r2 = pSrc[2u * i0] - pSrc[2u * i2];
  992.  
  993.         /* ya + yc */
  994.         s1 = pSrc[(2u * i0) + 1u] + pSrc[(2u * i2) + 1u];
  995.         /* ya - yc */
  996.         s2 = pSrc[(2u * i0) + 1u] - pSrc[(2u * i2) + 1u];
  997.  
  998.         /* xb + xd */
  999.         t1 = pSrc[2u * i1] + pSrc[2u * i3];
  1000.  
  1001.         /* xa' = xa + xb + xc + xd */
  1002.         pSrc[2u * i0] = (r1 + t1) >> 2u;
  1003.         /* xa + xc -(xb + xd) */
  1004.         r1 = r1 - t1;
  1005.         /* yb + yd */
  1006.         t2 = pSrc[(2u * i1) + 1u] + pSrc[(2u * i3) + 1u];
  1007.         /* ya' = ya + yb + yc + yd */
  1008.         pSrc[(2u * i0) + 1u] = (s1 + t2) >> 2u;
  1009.  
  1010.         /* (ya + yc) - (yb + yd) */
  1011.         s1 = s1 - t2;
  1012.  
  1013.         /* (yb - yd) */
  1014.         t1 = pSrc[(2u * i1) + 1u] - pSrc[(2u * i3) + 1u];
  1015.         /* (xb - xd) */
  1016.         t2 = pSrc[2u * i1] - pSrc[2u * i3];
  1017.  
  1018.         /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
  1019.         pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32u)) -
  1020.                          ((int32_t) (((q63_t) s1 * si2) >> 32u))) >> 1u;
  1021.  
  1022.         /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
  1023.         pSrc[(2u * i1) + 1u] =
  1024.           (((int32_t) (((q63_t) s1 * co2) >> 32u)) +
  1025.            ((int32_t) (((q63_t) r1 * si2) >> 32u))) >> 1u;
  1026.  
  1027.         /* (xa - xc) - (yb - yd) */
  1028.         r1 = r2 - t1;
  1029.         /* (xa - xc) + (yb - yd) */
  1030.         r2 = r2 + t1;
  1031.  
  1032.         /* (ya - yc) +  (xb - xd) */
  1033.         s1 = s2 + t2;
  1034.         /* (ya - yc) -  (xb - xd) */
  1035.         s2 = s2 - t2;
  1036.  
  1037.         /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
  1038.         pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
  1039.                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
  1040.  
  1041.         /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
  1042.         pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
  1043.                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
  1044.  
  1045.         /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
  1046.         pSrc[(2u * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
  1047.                            ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
  1048.  
  1049.         /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
  1050.         pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
  1051.                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
  1052.       }
  1053.     }
  1054.     twidCoefModifier <<= 2u;
  1055.   }
  1056. #else
  1057.   uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
  1058.   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
  1059.   q31_t xa, xb, xc, xd;
  1060.   q31_t ya, yb, yc, yd;
  1061.   q31_t xa_out, xb_out, xc_out, xd_out;
  1062.   q31_t ya_out, yb_out, yc_out, yd_out;
  1063.  
  1064.   q31_t *ptr1;
  1065.   q31_t *pSi0;
  1066.   q31_t *pSi1;
  1067.   q31_t *pSi2;
  1068.   q31_t *pSi3;
  1069.   q63_t xaya, xbyb, xcyc, xdyd;
  1070.  
  1071.   /* input is be 1.31(q31) format for all FFT sizes */
  1072.   /* Total process is divided into three stages */
  1073.   /* process first stage, middle stages, & last stage */
  1074.  
  1075.   /* Start of first stage process */
  1076.  
  1077.   /* Initializations for the first stage */
  1078.   n2 = fftLen;
  1079.   n1 = n2;
  1080.   /* n2 = fftLen/4 */
  1081.   n2 >>= 2u;
  1082.  
  1083.   ia1 = 0u;
  1084.  
  1085.   j = n2;
  1086.  
  1087.   pSi0 = pSrc;
  1088.   pSi1 = pSi0 + 2 * n2;
  1089.   pSi2 = pSi1 + 2 * n2;
  1090.   pSi3 = pSi2 + 2 * n2;
  1091.  
  1092.   do
  1093.   {
  1094.     /*  Butterfly implementation */
  1095.     /* xa + xc */
  1096.     r1 = (pSi0[0] >> 4u) + (pSi2[0] >> 4u);
  1097.     /* xa - xc */
  1098.     r2 = (pSi0[0] >> 4u) - (pSi2[0] >> 4u);
  1099.  
  1100.     /* xb + xd */
  1101.     t1 = (pSi1[0] >> 4u) + (pSi3[0] >> 4u);
  1102.  
  1103.     /* ya + yc */
  1104.     s1 = (pSi0[1] >> 4u) + (pSi2[1] >> 4u);
  1105.     /* ya - yc */
  1106.     s2 = (pSi0[1] >> 4u) - (pSi2[1] >> 4u);
  1107.  
  1108.     /* xa' = xa + xb + xc + xd */
  1109.     *pSi0++ = (r1 + t1);
  1110.     /* (xa + xc) - (xb + xd) */
  1111.     r1 = r1 - t1;
  1112.     /* yb + yd */
  1113.     t2 = (pSi1[1] >> 4u) + (pSi3[1] >> 4u);
  1114.     /* ya' = ya + yb + yc + yd */
  1115.     *pSi0++ = (s1 + t2);
  1116.  
  1117.     /* (ya + yc) - (yb + yd) */
  1118.     s1 = s1 - t2;
  1119.  
  1120.     /* yb - yd */
  1121.     t1 = (pSi1[1] >> 4u) - (pSi3[1] >> 4u);
  1122.     /* xb - xd */
  1123.     t2 = (pSi1[0] >> 4u) - (pSi3[0] >> 4u);
  1124.  
  1125.     /*  index calculation for the coefficients */
  1126.     ia2 = 2u * ia1;
  1127.     co2 = pCoef[ia2 * 2u];
  1128.     si2 = pCoef[(ia2 * 2u) + 1u];
  1129.  
  1130.     /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
  1131.     *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
  1132.                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
  1133.  
  1134.     /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
  1135.     *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
  1136.                           ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
  1137.  
  1138.     /* (xa - xc) - (yb - yd) */
  1139.     r1 = r2 - t1;
  1140.     /* (xa - xc) + (yb - yd) */
  1141.     r2 = r2 + t1;
  1142.  
  1143.     /* (ya - yc) + (xb - xd) */
  1144.     s1 = s2 + t2;
  1145.     /* (ya - yc) - (xb - xd) */
  1146.     s2 = s2 - t2;
  1147.  
  1148.     co1 = pCoef[ia1 * 2u];
  1149.     si1 = pCoef[(ia1 * 2u) + 1u];
  1150.  
  1151.     /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
  1152.     *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
  1153.                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
  1154.  
  1155.     /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
  1156.     *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
  1157.                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
  1158.  
  1159.     /*  index calculation for the coefficients */
  1160.     ia3 = 3u * ia1;
  1161.     co3 = pCoef[ia3 * 2u];
  1162.     si3 = pCoef[(ia3 * 2u) + 1u];
  1163.  
  1164.     /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
  1165.     *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
  1166.                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
  1167.  
  1168.     /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
  1169.     *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
  1170.                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
  1171.  
  1172.     /*  Twiddle coefficients index modifier */
  1173.     ia1 = ia1 + twidCoefModifier;
  1174.  
  1175.   } while(--j);
  1176.  
  1177.   /* data is in 5.27(q27) format */
  1178.   /* each stage provides two down scaling of the input */
  1179.  
  1180.  
  1181.   /* Start of Middle stages process */
  1182.  
  1183.   twidCoefModifier <<= 2u;
  1184.  
  1185.   /*  Calculation of second stage to excluding last stage */
  1186.   for (k = fftLen / 4u; k > 4u; k >>= 2u)
  1187.   {
  1188.     /*  Initializations for the first stage */
  1189.     n1 = n2;
  1190.     n2 >>= 2u;
  1191.     ia1 = 0u;
  1192.  
  1193.     for (j = 0; j <= (n2 - 1u); j++)
  1194.     {
  1195.       /*  index calculation for the coefficients */
  1196.       ia2 = ia1 + ia1;
  1197.       ia3 = ia2 + ia1;
  1198.       co1 = pCoef[ia1 * 2u];
  1199.       si1 = pCoef[(ia1 * 2u) + 1u];
  1200.       co2 = pCoef[ia2 * 2u];
  1201.       si2 = pCoef[(ia2 * 2u) + 1u];
  1202.       co3 = pCoef[ia3 * 2u];
  1203.       si3 = pCoef[(ia3 * 2u) + 1u];
  1204.       /*  Twiddle coefficients index modifier */
  1205.       ia1 = ia1 + twidCoefModifier;
  1206.      
  1207.       pSi0 = pSrc + 2 * j;
  1208.       pSi1 = pSi0 + 2 * n2;
  1209.       pSi2 = pSi1 + 2 * n2;
  1210.       pSi3 = pSi2 + 2 * n2;
  1211.  
  1212.       for (i0 = j; i0 < fftLen; i0 += n1)
  1213.       {
  1214.         /*  Butterfly implementation */
  1215.         /* xa + xc */
  1216.         r1 = pSi0[0] + pSi2[0];
  1217.  
  1218.         /* xa - xc */
  1219.         r2 = pSi0[0] - pSi2[0];
  1220.  
  1221.  
  1222.         /* ya + yc */
  1223.         s1 = pSi0[1] + pSi2[1];
  1224.  
  1225.         /* ya - yc */
  1226.         s2 = pSi0[1] - pSi2[1];
  1227.  
  1228.  
  1229.         /* xb + xd */
  1230.         t1 = pSi1[0] + pSi3[0];
  1231.  
  1232.  
  1233.         /* xa' = xa + xb + xc + xd */
  1234.         pSi0[0] = (r1 + t1) >> 2u;
  1235.         /* xa + xc -(xb + xd) */
  1236.         r1 = r1 - t1;
  1237.         /* yb + yd */
  1238.         t2 = pSi1[1] + pSi3[1];
  1239.  
  1240.         /* ya' = ya + yb + yc + yd */
  1241.         pSi0[1] = (s1 + t2) >> 2u;
  1242.         pSi0 += 2 * n1;
  1243.  
  1244.         /* (ya + yc) - (yb + yd) */
  1245.         s1 = s1 - t2;
  1246.  
  1247.         /* (yb - yd) */
  1248.         t1 = pSi1[1] - pSi3[1];
  1249.  
  1250.         /* (xb - xd) */
  1251.         t2 = pSi1[0] - pSi3[0];
  1252.  
  1253.  
  1254.         /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
  1255.         pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32u)) -
  1256.                          ((int32_t) (((q63_t) s1 * si2) >> 32u))) >> 1u;
  1257.  
  1258.         /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
  1259.         pSi1[1] =
  1260.  
  1261.           (((int32_t) (((q63_t) s1 * co2) >> 32u)) +
  1262.            ((int32_t) (((q63_t) r1 * si2) >> 32u))) >> 1u;
  1263.         pSi1 += 2 * n1;
  1264.  
  1265.         /* (xa - xc) - (yb - yd) */
  1266.         r1 = r2 - t1;
  1267.         /* (xa - xc) + (yb - yd) */
  1268.         r2 = r2 + t1;
  1269.  
  1270.         /* (ya - yc) +  (xb - xd) */
  1271.         s1 = s2 + t2;
  1272.         /* (ya - yc) -  (xb - xd) */
  1273.         s2 = s2 - t2;
  1274.  
  1275.         /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
  1276.         pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
  1277.                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
  1278.  
  1279.         /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
  1280.         pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
  1281.                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
  1282.         pSi2 += 2 * n1;
  1283.  
  1284.         /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
  1285.         pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
  1286.                            ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
  1287.  
  1288.         /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
  1289.         pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
  1290.                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
  1291.         pSi3 += 2 * n1;
  1292.       }
  1293.     }
  1294.     twidCoefModifier <<= 2u;
  1295.   }
  1296. #endif
  1297.  
  1298.   /* End of Middle stages process */
  1299.  
  1300.   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
  1301.   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
  1302.   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
  1303.   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
  1304.  
  1305.  
  1306.   /* Start of last stage process */
  1307.  
  1308.  
  1309.   /*  Initializations for the last stage */
  1310.   j = fftLen >> 2;
  1311.   ptr1 = &pSrc[0];
  1312.  
  1313.   /*  Calculations of last stage */
  1314.   do
  1315.   {
  1316. #ifndef ARM_MATH_BIG_ENDIAN
  1317.     /* Read xa (real), ya(imag) input */
  1318.     xaya = *__SIMD64(ptr1)++;
  1319.     xa = (q31_t) xaya;
  1320.     ya = (q31_t) (xaya >> 32);
  1321.  
  1322.     /* Read xb (real), yb(imag) input */
  1323.     xbyb = *__SIMD64(ptr1)++;
  1324.     xb = (q31_t) xbyb;
  1325.     yb = (q31_t) (xbyb >> 32);
  1326.  
  1327.     /* Read xc (real), yc(imag) input */
  1328.     xcyc = *__SIMD64(ptr1)++;
  1329.     xc = (q31_t) xcyc;
  1330.     yc = (q31_t) (xcyc >> 32);
  1331.  
  1332.     /* Read xc (real), yc(imag) input */
  1333.     xdyd = *__SIMD64(ptr1)++;
  1334.     xd = (q31_t) xdyd;
  1335.     yd = (q31_t) (xdyd >> 32);
  1336.  
  1337. #else
  1338.  
  1339.     /* Read xa (real), ya(imag) input */
  1340.     xaya = *__SIMD64(ptr1)++;
  1341.     ya = (q31_t) xaya;
  1342.     xa = (q31_t) (xaya >> 32);
  1343.  
  1344.     /* Read xb (real), yb(imag) input */
  1345.     xbyb = *__SIMD64(ptr1)++;
  1346.     yb = (q31_t) xbyb;
  1347.     xb = (q31_t) (xbyb >> 32);
  1348.  
  1349.     /* Read xc (real), yc(imag) input */
  1350.     xcyc = *__SIMD64(ptr1)++;
  1351.     yc = (q31_t) xcyc;
  1352.     xc = (q31_t) (xcyc >> 32);
  1353.  
  1354.     /* Read xc (real), yc(imag) input */
  1355.     xdyd = *__SIMD64(ptr1)++;
  1356.     yd = (q31_t) xdyd;
  1357.     xd = (q31_t) (xdyd >> 32);
  1358.  
  1359.  
  1360. #endif
  1361.  
  1362.     /* xa' = xa + xb + xc + xd */
  1363.     xa_out = xa + xb + xc + xd;
  1364.  
  1365.     /* ya' = ya + yb + yc + yd */
  1366.     ya_out = ya + yb + yc + yd;
  1367.  
  1368.     /* pointer updation for writing */
  1369.     ptr1 = ptr1 - 8u;
  1370.  
  1371.     /* writing xa' and ya' */
  1372.     *ptr1++ = xa_out;
  1373.     *ptr1++ = ya_out;
  1374.  
  1375.     xc_out = (xa - xb + xc - xd);
  1376.     yc_out = (ya - yb + yc - yd);
  1377.  
  1378.     /* writing xc' and yc' */
  1379.     *ptr1++ = xc_out;
  1380.     *ptr1++ = yc_out;
  1381.  
  1382.     xb_out = (xa - yb - xc + yd);
  1383.     yb_out = (ya + xb - yc - xd);
  1384.  
  1385.     /* writing xb' and yb' */
  1386.     *ptr1++ = xb_out;
  1387.     *ptr1++ = yb_out;
  1388.  
  1389.     xd_out = (xa + yb - xc - yd);
  1390.     yd_out = (ya - xb - yc + xd);
  1391.  
  1392.     /* writing xd' and yd' */
  1393.     *ptr1++ = xd_out;
  1394.     *ptr1++ = yd_out;
  1395.  
  1396.   } while(--j);
  1397.  
  1398.   /* output is in 11.21(q21) format for the 1024 point */
  1399.   /* output is in 9.23(q23) format for the 256 point */
  1400.   /* output is in 7.25(q25) format for the 64 point */
  1401.   /* output is in 5.27(q27) format for the 16 point */
  1402.  
  1403.   /* End of last stage process */
  1404. }
  1405.