Subversion Repositories dashGPS

Rev

Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------
  2.  * Project:      CMSIS DSP Library
  3.  * Title:        arm_cfft_radix4_q31.c
  4.  * Description:  This file has function definition of Radix-4 FFT & IFFT function and
  5.  *               In-place bit reversal using bit reversal table
  6.  *
  7.  * $Date:        27. January 2017
  8.  * $Revision:    V.1.5.1
  9.  *
  10.  * Target Processor: Cortex-M cores
  11.  * -------------------------------------------------------------------- */
  12. /*
  13.  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  14.  *
  15.  * SPDX-License-Identifier: Apache-2.0
  16.  *
  17.  * Licensed under the Apache License, Version 2.0 (the License); you may
  18.  * not use this file except in compliance with the License.
  19.  * You may obtain a copy of the License at
  20.  *
  21.  * www.apache.org/licenses/LICENSE-2.0
  22.  *
  23.  * Unless required by applicable law or agreed to in writing, software
  24.  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  25.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  26.  * See the License for the specific language governing permissions and
  27.  * limitations under the License.
  28.  */
  29.  
  30. #include "arm_math.h"
  31.  
  32. void arm_radix4_butterfly_inverse_q31(
  33. q31_t * pSrc,
  34. uint32_t fftLen,
  35. q31_t * pCoef,
  36. uint32_t twidCoefModifier);
  37.  
  38. void arm_radix4_butterfly_q31(
  39. q31_t * pSrc,
  40. uint32_t fftLen,
  41. q31_t * pCoef,
  42. uint32_t twidCoefModifier);
  43.  
  44. void arm_bitreversal_q31(
  45. q31_t * pSrc,
  46. uint32_t fftLen,
  47. uint16_t bitRevFactor,
  48. uint16_t * pBitRevTab);
  49.  
  50. /**
  51.  * @ingroup groupTransforms
  52.  */
  53.  
  54. /**
  55.  * @addtogroup ComplexFFT
  56.  * @{
  57.  */
  58.  
  59. /**
  60.  * @details
  61.  * @brief Processing function for the Q31 CFFT/CIFFT.
  62.  * @deprecated Do not use this function.  It has been superseded by \ref arm_cfft_q31 and will be removed
  63.  * @param[in]      *S    points to an instance of the Q31 CFFT/CIFFT structure.
  64.  * @param[in, out] *pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place.
  65.  * @return none.
  66.  *
  67.  * \par Input and output formats:
  68.  * \par
  69.  * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
  70.  * Hence the output format is different for different FFT sizes.
  71.  * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
  72.  * \par
  73.  * \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"
  74.  * \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"
  75.  *
  76.  */
  77.  
  78. void arm_cfft_radix4_q31(
  79.   const arm_cfft_radix4_instance_q31 * S,
  80.   q31_t * pSrc)
  81. {
  82.   if (S->ifftFlag == 1U)
  83.   {
  84.     /* Complex IFFT radix-4 */
  85.     arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  86.   }
  87.   else
  88.   {
  89.     /* Complex FFT radix-4 */
  90.     arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  91.   }
  92.  
  93.   if (S->bitReverseFlag == 1U)
  94.   {
  95.     /*  Bit Reversal */
  96.     arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
  97.   }
  98.  
  99. }
  100.  
  101. /**
  102.  * @} end of ComplexFFT group
  103.  */
  104.  
  105. /*
  106. * Radix-4 FFT algorithm used is :
  107. *
  108. * Input real and imaginary data:
  109. * x(n) = xa + j * ya
  110. * x(n+N/4 ) = xb + j * yb
  111. * x(n+N/2 ) = xc + j * yc
  112. * x(n+3N 4) = xd + j * yd
  113. *
  114. *
  115. * Output real and imaginary data:
  116. * x(4r) = xa'+ j * ya'
  117. * x(4r+1) = xb'+ j * yb'
  118. * x(4r+2) = xc'+ j * yc'
  119. * x(4r+3) = xd'+ j * yd'
  120. *
  121. *
  122. * Twiddle factors for radix-4 FFT:
  123. * Wn = co1 + j * (- si1)
  124. * W2n = co2 + j * (- si2)
  125. * W3n = co3 + j * (- si3)
  126. *
  127. *  Butterfly implementation:
  128. * xa' = xa + xb + xc + xd
  129. * ya' = ya + yb + yc + yd
  130. * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
  131. * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
  132. * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
  133. * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
  134. * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
  135. * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
  136. *
  137. */
  138.  
  139. /**
  140.  * @brief  Core function for the Q31 CFFT butterfly process.
  141.  * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.
  142.  * @param[in]      fftLen           length of the FFT.
  143.  * @param[in]      *pCoef           points to twiddle coefficient buffer.
  144.  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
  145.  * @return none.
  146.  */
  147.  
  148. void arm_radix4_butterfly_q31(
  149.   q31_t * pSrc,
  150.   uint32_t fftLen,
  151.   q31_t * pCoef,
  152.   uint32_t twidCoefModifier)
  153. {
  154. #if defined(ARM_MATH_CM7)
  155.   uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
  156.   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
  157.  
  158.   q31_t xa, xb, xc, xd;
  159.   q31_t ya, yb, yc, yd;
  160.   q31_t xa_out, xb_out, xc_out, xd_out;
  161.   q31_t ya_out, yb_out, yc_out, yd_out;
  162.  
  163.   q31_t *ptr1;
  164.   q63_t xaya, xbyb, xcyc, xdyd;
  165.   /* Total process is divided into three stages */
  166.  
  167.   /* process first stage, middle stages, & last stage */
  168.  
  169.  
  170.   /* start of first stage process */
  171.  
  172.   /*  Initializations for the first stage */
  173.   n2 = fftLen;
  174.   n1 = n2;
  175.   /* n2 = fftLen/4 */
  176.   n2 >>= 2U;
  177.   i0 = 0U;
  178.   ia1 = 0U;
  179.  
  180.   j = n2;
  181.  
  182.   /*  Calculation of first stage */
  183.   do
  184.   {
  185.     /*  index calculation for the input as, */
  186.     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
  187.     i1 = i0 + n2;
  188.     i2 = i1 + n2;
  189.     i3 = i2 + n2;
  190.  
  191.     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
  192.  
  193.     /*  Butterfly implementation */
  194.     /* xa + xc */
  195.     r1 = (pSrc[(2U * i0)] >> 4U) + (pSrc[(2U * i2)] >> 4U);
  196.     /* xa - xc */
  197.     r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
  198.  
  199.     /* xb + xd */
  200.     t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
  201.  
  202.     /* ya + yc */
  203.     s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
  204.     /* ya - yc */
  205.     s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
  206.  
  207.     /* xa' = xa + xb + xc + xd */
  208.     pSrc[2U * i0] = (r1 + t1);
  209.     /* (xa + xc) - (xb + xd) */
  210.     r1 = r1 - t1;
  211.     /* yb + yd */
  212.     t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
  213.  
  214.     /* ya' = ya + yb + yc + yd */
  215.     pSrc[(2U * i0) + 1U] = (s1 + t2);
  216.  
  217.     /* (ya + yc) - (yb + yd) */
  218.     s1 = s1 - t2;
  219.  
  220.     /* yb - yd */
  221.     t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
  222.     /* xb - xd */
  223.     t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
  224.  
  225.     /*  index calculation for the coefficients */
  226.     ia2 = 2U * ia1;
  227.     co2 = pCoef[ia2 * 2U];
  228.     si2 = pCoef[(ia2 * 2U) + 1U];
  229.  
  230.     /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
  231.     pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
  232.                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
  233.  
  234.     /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
  235.     pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
  236.                             ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
  237.  
  238.     /* (xa - xc) + (yb - yd) */
  239.     r1 = r2 + t1;
  240.     /* (xa - xc) - (yb - yd) */
  241.     r2 = r2 - t1;
  242.  
  243.     /* (ya - yc) - (xb - xd) */
  244.     s1 = s2 - t2;
  245.     /* (ya - yc) + (xb - xd) */
  246.     s2 = s2 + t2;
  247.  
  248.     co1 = pCoef[ia1 * 2U];
  249.     si1 = pCoef[(ia1 * 2U) + 1U];
  250.  
  251.     /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
  252.     pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
  253.                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
  254.  
  255.     /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
  256.     pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
  257.                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
  258.  
  259.     /*  index calculation for the coefficients */
  260.     ia3 = 3U * ia1;
  261.     co3 = pCoef[ia3 * 2U];
  262.     si3 = pCoef[(ia3 * 2U) + 1U];
  263.  
  264.     /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
  265.     pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
  266.                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
  267.  
  268.     /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
  269.     pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
  270.                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
  271.  
  272.     /*  Twiddle coefficients index modifier */
  273.     ia1 = ia1 + twidCoefModifier;
  274.  
  275.     /*  Updating input index */
  276.     i0 = i0 + 1U;
  277.  
  278.   } while (--j);
  279.  
  280.   /* end of first stage process */
  281.  
  282.   /* data is in 5.27(q27) format */
  283.  
  284.  
  285.   /* start of Middle stages process */
  286.  
  287.  
  288.   /* each stage in middle stages provides two down scaling of the input */
  289.  
  290.   twidCoefModifier <<= 2U;
  291.  
  292.  
  293.   for (k = fftLen / 4U; k > 4U; k >>= 2U)
  294.   {
  295.     /*  Initializations for the first stage */
  296.     n1 = n2;
  297.     n2 >>= 2U;
  298.     ia1 = 0U;
  299.  
  300.     /*  Calculation of first stage */
  301.     for (j = 0U; j <= (n2 - 1U); j++)
  302.     {
  303.       /*  index calculation for the coefficients */
  304.       ia2 = ia1 + ia1;
  305.       ia3 = ia2 + ia1;
  306.       co1 = pCoef[ia1 * 2U];
  307.       si1 = pCoef[(ia1 * 2U) + 1U];
  308.       co2 = pCoef[ia2 * 2U];
  309.       si2 = pCoef[(ia2 * 2U) + 1U];
  310.       co3 = pCoef[ia3 * 2U];
  311.       si3 = pCoef[(ia3 * 2U) + 1U];
  312.       /*  Twiddle coefficients index modifier */
  313.       ia1 = ia1 + twidCoefModifier;
  314.  
  315.       for (i0 = j; i0 < fftLen; i0 += n1)
  316.       {
  317.         /*  index calculation for the input as, */
  318.         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
  319.         i1 = i0 + n2;
  320.         i2 = i1 + n2;
  321.         i3 = i2 + n2;
  322.  
  323.         /*  Butterfly implementation */
  324.         /* xa + xc */
  325.         r1 = pSrc[2U * i0] + pSrc[2U * i2];
  326.         /* xa - xc */
  327.         r2 = pSrc[2U * i0] - pSrc[2U * i2];
  328.  
  329.         /* ya + yc */
  330.         s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
  331.         /* ya - yc */
  332.         s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
  333.  
  334.         /* xb + xd */
  335.         t1 = pSrc[2U * i1] + pSrc[2U * i3];
  336.  
  337.         /* xa' = xa + xb + xc + xd */
  338.         pSrc[2U * i0] = (r1 + t1) >> 2U;
  339.         /* xa + xc -(xb + xd) */
  340.         r1 = r1 - t1;
  341.  
  342.         /* yb + yd */
  343.         t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
  344.         /* ya' = ya + yb + yc + yd */
  345.         pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
  346.  
  347.         /* (ya + yc) - (yb + yd) */
  348.         s1 = s1 - t2;
  349.  
  350.         /* (yb - yd) */
  351.         t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
  352.         /* (xb - xd) */
  353.         t2 = pSrc[2U * i1] - pSrc[2U * i3];
  354.  
  355.         /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
  356.         pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
  357.                          ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
  358.  
  359.         /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
  360.         pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
  361.                                 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
  362.  
  363.         /* (xa - xc) + (yb - yd) */
  364.         r1 = r2 + t1;
  365.         /* (xa - xc) - (yb - yd) */
  366.         r2 = r2 - t1;
  367.  
  368.         /* (ya - yc) -  (xb - xd) */
  369.         s1 = s2 - t2;
  370.         /* (ya - yc) +  (xb - xd) */
  371.         s2 = s2 + t2;
  372.  
  373.         /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
  374.         pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
  375.                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
  376.  
  377.         /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
  378.         pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
  379.                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
  380.  
  381.         /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
  382.         pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
  383.                          ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
  384.  
  385.         /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
  386.         pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
  387.                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
  388.       }
  389.     }
  390.     twidCoefModifier <<= 2U;
  391.   }
  392. #else
  393.   uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
  394.   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
  395.  
  396.   q31_t xa, xb, xc, xd;
  397.   q31_t ya, yb, yc, yd;
  398.   q31_t xa_out, xb_out, xc_out, xd_out;
  399.   q31_t ya_out, yb_out, yc_out, yd_out;
  400.  
  401.   q31_t *ptr1;
  402.   q31_t *pSi0;
  403.   q31_t *pSi1;
  404.   q31_t *pSi2;
  405.   q31_t *pSi3;
  406.   q63_t xaya, xbyb, xcyc, xdyd;
  407.   /* Total process is divided into three stages */
  408.  
  409.   /* process first stage, middle stages, & last stage */
  410.  
  411.  
  412.   /* start of first stage process */
  413.  
  414.   /*  Initializations for the first stage */
  415.   n2 = fftLen;
  416.   n1 = n2;
  417.   /* n2 = fftLen/4 */
  418.   n2 >>= 2U;
  419.  
  420.   ia1 = 0U;
  421.  
  422.   j = n2;
  423.  
  424.   pSi0 = pSrc;
  425.   pSi1 = pSi0 + 2 * n2;
  426.   pSi2 = pSi1 + 2 * n2;
  427.   pSi3 = pSi2 + 2 * n2;
  428.  
  429.   /*  Calculation of first stage */
  430.   do
  431.   {
  432.     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
  433.  
  434.     /*  Butterfly implementation */
  435.     /* xa + xc */
  436.     r1 = (pSi0[0] >> 4U) + (pSi2[0] >> 4U);
  437.     /* xa - xc */
  438.     r2 = (pSi0[0] >> 4U) - (pSi2[0] >> 4U);
  439.  
  440.     /* xb + xd */
  441.     t1 = (pSi1[0] >> 4U) + (pSi3[0] >> 4U);
  442.  
  443.     /* ya + yc */
  444.     s1 = (pSi0[1] >> 4U) + (pSi2[1] >> 4U);
  445.     /* ya - yc */
  446.     s2 = (pSi0[1] >> 4U) - (pSi2[1] >> 4U);
  447.  
  448.     /* xa' = xa + xb + xc + xd */
  449.     *pSi0++ = (r1 + t1);
  450.     /* (xa + xc) - (xb + xd) */
  451.     r1 = r1 - t1;
  452.     /* yb + yd */
  453.     t2 = (pSi1[1] >> 4U) + (pSi3[1] >> 4U);
  454.  
  455.     /* ya' = ya + yb + yc + yd */
  456.     *pSi0++ = (s1 + t2);
  457.  
  458.     /* (ya + yc) - (yb + yd) */
  459.     s1 = s1 - t2;
  460.  
  461.     /* yb - yd */
  462.     t1 = (pSi1[1] >> 4U) - (pSi3[1] >> 4U);
  463.     /* xb - xd */
  464.     t2 = (pSi1[0] >> 4U) - (pSi3[0] >> 4U);
  465.  
  466.     /*  index calculation for the coefficients */
  467.     ia2 = 2U * ia1;
  468.     co2 = pCoef[ia2 * 2U];
  469.     si2 = pCoef[(ia2 * 2U) + 1U];
  470.  
  471.     /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
  472.     *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
  473.                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
  474.  
  475.     /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
  476.     *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
  477.                             ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
  478.  
  479.     /* (xa - xc) + (yb - yd) */
  480.     r1 = r2 + t1;
  481.     /* (xa - xc) - (yb - yd) */
  482.     r2 = r2 - t1;
  483.  
  484.     /* (ya - yc) - (xb - xd) */
  485.     s1 = s2 - t2;
  486.     /* (ya - yc) + (xb - xd) */
  487.     s2 = s2 + t2;
  488.  
  489.     co1 = pCoef[ia1 * 2U];
  490.     si1 = pCoef[(ia1 * 2U) + 1U];
  491.  
  492.     /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
  493.     *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
  494.                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
  495.  
  496.     /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
  497.     *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
  498.                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
  499.  
  500.     /*  index calculation for the coefficients */
  501.     ia3 = 3U * ia1;
  502.     co3 = pCoef[ia3 * 2U];
  503.     si3 = pCoef[(ia3 * 2U) + 1U];
  504.  
  505.     /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
  506.     *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
  507.                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
  508.  
  509.     /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
  510.     *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
  511.                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
  512.  
  513.     /*  Twiddle coefficients index modifier */
  514.     ia1 = ia1 + twidCoefModifier;
  515.  
  516.   } while (--j);
  517.  
  518.   /* end of first stage process */
  519.  
  520.   /* data is in 5.27(q27) format */
  521.  
  522.  
  523.   /* start of Middle stages process */
  524.  
  525.  
  526.   /* each stage in middle stages provides two down scaling of the input */
  527.  
  528.   twidCoefModifier <<= 2U;
  529.  
  530.  
  531.   for (k = fftLen / 4U; k > 4U; k >>= 2U)
  532.   {
  533.     /*  Initializations for the first stage */
  534.     n1 = n2;
  535.     n2 >>= 2U;
  536.     ia1 = 0U;
  537.  
  538.     /*  Calculation of first stage */
  539.     for (j = 0U; j <= (n2 - 1U); j++)
  540.     {
  541.       /*  index calculation for the coefficients */
  542.       ia2 = ia1 + ia1;
  543.       ia3 = ia2 + ia1;
  544.       co1 = pCoef[ia1 * 2U];
  545.       si1 = pCoef[(ia1 * 2U) + 1U];
  546.       co2 = pCoef[ia2 * 2U];
  547.       si2 = pCoef[(ia2 * 2U) + 1U];
  548.       co3 = pCoef[ia3 * 2U];
  549.       si3 = pCoef[(ia3 * 2U) + 1U];
  550.       /*  Twiddle coefficients index modifier */
  551.       ia1 = ia1 + twidCoefModifier;
  552.  
  553.       pSi0 = pSrc + 2 * j;
  554.       pSi1 = pSi0 + 2 * n2;
  555.       pSi2 = pSi1 + 2 * n2;
  556.       pSi3 = pSi2 + 2 * n2;
  557.  
  558.       for (i0 = j; i0 < fftLen; i0 += n1)
  559.       {
  560.         /*  Butterfly implementation */
  561.         /* xa + xc */
  562.         r1 = pSi0[0] + pSi2[0];
  563.  
  564.         /* xa - xc */
  565.         r2 = pSi0[0] - pSi2[0];
  566.  
  567.  
  568.         /* ya + yc */
  569.         s1 = pSi0[1] + pSi2[1];
  570.  
  571.         /* ya - yc */
  572.         s2 = pSi0[1] - pSi2[1];
  573.  
  574.  
  575.         /* xb + xd */
  576.         t1 = pSi1[0] + pSi3[0];
  577.  
  578.  
  579.         /* xa' = xa + xb + xc + xd */
  580.         pSi0[0] = (r1 + t1) >> 2U;
  581.         /* xa + xc -(xb + xd) */
  582.         r1 = r1 - t1;
  583.  
  584.         /* yb + yd */
  585.         t2 = pSi1[1] + pSi3[1];
  586.  
  587.         /* ya' = ya + yb + yc + yd */
  588.         pSi0[1] = (s1 + t2) >> 2U;
  589.         pSi0 += 2 * n1;
  590.  
  591.         /* (ya + yc) - (yb + yd) */
  592.         s1 = s1 - t2;
  593.  
  594.         /* (yb - yd) */
  595.         t1 = pSi1[1] - pSi3[1];
  596.  
  597.         /* (xb - xd) */
  598.         t2 = pSi1[0] - pSi3[0];
  599.  
  600.  
  601.         /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
  602.         pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
  603.                          ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
  604.  
  605.         /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
  606.         pSi1[1] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
  607.                                 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
  608.         pSi1 += 2 * n1;
  609.  
  610.         /* (xa - xc) + (yb - yd) */
  611.         r1 = r2 + t1;
  612.         /* (xa - xc) - (yb - yd) */
  613.         r2 = r2 - t1;
  614.  
  615.         /* (ya - yc) -  (xb - xd) */
  616.         s1 = s2 - t2;
  617.         /* (ya - yc) +  (xb - xd) */
  618.         s2 = s2 + t2;
  619.  
  620.         /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
  621.         pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
  622.                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
  623.  
  624.         /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
  625.         pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
  626.                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
  627.         pSi2 += 2 * n1;
  628.  
  629.         /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
  630.         pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
  631.                          ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
  632.  
  633.         /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
  634.         pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
  635.                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
  636.         pSi3 += 2 * n1;
  637.       }
  638.     }
  639.     twidCoefModifier <<= 2U;
  640.   }
  641. #endif
  642.  
  643.   /* End of Middle stages process */
  644.  
  645.   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
  646.   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
  647.   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
  648.   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
  649.  
  650.  
  651.   /* start of Last stage process */
  652.   /*  Initializations for the last stage */
  653.   j = fftLen >> 2;
  654.   ptr1 = &pSrc[0];
  655.  
  656.   /*  Calculations of last stage */
  657.   do
  658.   {
  659.  
  660. #ifndef ARM_MATH_BIG_ENDIAN
  661.  
  662.     /* Read xa (real), ya(imag) input */
  663.     xaya = *__SIMD64(ptr1)++;
  664.     xa = (q31_t) xaya;
  665.     ya = (q31_t) (xaya >> 32);
  666.  
  667.     /* Read xb (real), yb(imag) input */
  668.     xbyb = *__SIMD64(ptr1)++;
  669.     xb = (q31_t) xbyb;
  670.     yb = (q31_t) (xbyb >> 32);
  671.  
  672.     /* Read xc (real), yc(imag) input */
  673.     xcyc = *__SIMD64(ptr1)++;
  674.     xc = (q31_t) xcyc;
  675.     yc = (q31_t) (xcyc >> 32);
  676.  
  677.     /* Read xc (real), yc(imag) input */
  678.     xdyd = *__SIMD64(ptr1)++;
  679.     xd = (q31_t) xdyd;
  680.     yd = (q31_t) (xdyd >> 32);
  681.  
  682. #else
  683.  
  684.     /* Read xa (real), ya(imag) input */
  685.     xaya = *__SIMD64(ptr1)++;
  686.     ya = (q31_t) xaya;
  687.     xa = (q31_t) (xaya >> 32);
  688.  
  689.     /* Read xb (real), yb(imag) input */
  690.     xbyb = *__SIMD64(ptr1)++;
  691.     yb = (q31_t) xbyb;
  692.     xb = (q31_t) (xbyb >> 32);
  693.  
  694.     /* Read xc (real), yc(imag) input */
  695.     xcyc = *__SIMD64(ptr1)++;
  696.     yc = (q31_t) xcyc;
  697.     xc = (q31_t) (xcyc >> 32);
  698.  
  699.     /* Read xc (real), yc(imag) input */
  700.     xdyd = *__SIMD64(ptr1)++;
  701.     yd = (q31_t) xdyd;
  702.     xd = (q31_t) (xdyd >> 32);
  703.  
  704.  
  705. #endif
  706.  
  707.     /* xa' = xa + xb + xc + xd */
  708.     xa_out = xa + xb + xc + xd;
  709.  
  710.     /* ya' = ya + yb + yc + yd */
  711.     ya_out = ya + yb + yc + yd;
  712.  
  713.     /* pointer updation for writing */
  714.     ptr1 = ptr1 - 8U;
  715.  
  716.     /* writing xa' and ya' */
  717.     *ptr1++ = xa_out;
  718.     *ptr1++ = ya_out;
  719.  
  720.     xc_out = (xa - xb + xc - xd);
  721.     yc_out = (ya - yb + yc - yd);
  722.  
  723.     /* writing xc' and yc' */
  724.     *ptr1++ = xc_out;
  725.     *ptr1++ = yc_out;
  726.  
  727.     xb_out = (xa + yb - xc - yd);
  728.     yb_out = (ya - xb - yc + xd);
  729.  
  730.     /* writing xb' and yb' */
  731.     *ptr1++ = xb_out;
  732.     *ptr1++ = yb_out;
  733.  
  734.     xd_out = (xa - yb - xc + yd);
  735.     yd_out = (ya + xb - yc - xd);
  736.  
  737.     /* writing xd' and yd' */
  738.     *ptr1++ = xd_out;
  739.     *ptr1++ = yd_out;
  740.  
  741.  
  742.   } while (--j);
  743.  
  744.   /* output is in 11.21(q21) format for the 1024 point */
  745.   /* output is in 9.23(q23) format for the 256 point */
  746.   /* output is in 7.25(q25) format for the 64 point */
  747.   /* output is in 5.27(q27) format for the 16 point */
  748.  
  749.   /* End of last stage process */
  750.  
  751. }
  752.  
  753.  
  754. /**
  755.  * @brief  Core function for the Q31 CIFFT butterfly process.
  756.  * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.
  757.  * @param[in]      fftLen           length of the FFT.
  758.  * @param[in]      *pCoef           points to twiddle coefficient buffer.
  759.  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
  760.  * @return none.
  761.  */
  762.  
  763.  
  764. /*
  765. * Radix-4 IFFT algorithm used is :
  766. *
  767. * CIFFT uses same twiddle coefficients as CFFT Function
  768. *  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
  769. *
  770. *
  771. * IFFT is implemented with following changes in equations from FFT
  772. *
  773. * Input real and imaginary data:
  774. * x(n) = xa + j * ya
  775. * x(n+N/4 ) = xb + j * yb
  776. * x(n+N/2 ) = xc + j * yc
  777. * x(n+3N 4) = xd + j * yd
  778. *
  779. *
  780. * Output real and imaginary data:
  781. * x(4r) = xa'+ j * ya'
  782. * x(4r+1) = xb'+ j * yb'
  783. * x(4r+2) = xc'+ j * yc'
  784. * x(4r+3) = xd'+ j * yd'
  785. *
  786. *
  787. * Twiddle factors for radix-4 IFFT:
  788. * Wn = co1 + j * (si1)
  789. * W2n = co2 + j * (si2)
  790. * W3n = co3 + j * (si3)
  791.  
  792. * The real and imaginary output values for the radix-4 butterfly are
  793. * xa' = xa + xb + xc + xd
  794. * ya' = ya + yb + yc + yd
  795. * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
  796. * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
  797. * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
  798. * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
  799. * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
  800. * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
  801. *
  802. */
  803.  
  804. void arm_radix4_butterfly_inverse_q31(
  805.   q31_t * pSrc,
  806.   uint32_t fftLen,
  807.   q31_t * pCoef,
  808.   uint32_t twidCoefModifier)
  809. {
  810. #if defined(ARM_MATH_CM7)
  811.   uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
  812.   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
  813.   q31_t xa, xb, xc, xd;
  814.   q31_t ya, yb, yc, yd;
  815.   q31_t xa_out, xb_out, xc_out, xd_out;
  816.   q31_t ya_out, yb_out, yc_out, yd_out;
  817.  
  818.   q31_t *ptr1;
  819.   q63_t xaya, xbyb, xcyc, xdyd;
  820.  
  821.   /* input is be 1.31(q31) format for all FFT sizes */
  822.   /* Total process is divided into three stages */
  823.   /* process first stage, middle stages, & last stage */
  824.  
  825.   /* Start of first stage process */
  826.  
  827.   /* Initializations for the first stage */
  828.   n2 = fftLen;
  829.   n1 = n2;
  830.   /* n2 = fftLen/4 */
  831.   n2 >>= 2U;
  832.   i0 = 0U;
  833.   ia1 = 0U;
  834.  
  835.   j = n2;
  836.  
  837.   do
  838.   {
  839.  
  840.     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
  841.  
  842.     /*  index calculation for the input as, */
  843.     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
  844.     i1 = i0 + n2;
  845.     i2 = i1 + n2;
  846.     i3 = i2 + n2;
  847.  
  848.     /*  Butterfly implementation */
  849.     /* xa + xc */
  850.     r1 = (pSrc[2U * i0] >> 4U) + (pSrc[2U * i2] >> 4U);
  851.     /* xa - xc */
  852.     r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
  853.  
  854.     /* xb + xd */
  855.     t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
  856.  
  857.     /* ya + yc */
  858.     s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
  859.     /* ya - yc */
  860.     s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
  861.  
  862.     /* xa' = xa + xb + xc + xd */
  863.     pSrc[2U * i0] = (r1 + t1);
  864.     /* (xa + xc) - (xb + xd) */
  865.     r1 = r1 - t1;
  866.     /* yb + yd */
  867.     t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
  868.     /* ya' = ya + yb + yc + yd */
  869.     pSrc[(2U * i0) + 1U] = (s1 + t2);
  870.  
  871.     /* (ya + yc) - (yb + yd) */
  872.     s1 = s1 - t2;
  873.  
  874.     /* yb - yd */
  875.     t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
  876.     /* xb - xd */
  877.     t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
  878.  
  879.     /*  index calculation for the coefficients */
  880.     ia2 = 2U * ia1;
  881.     co2 = pCoef[ia2 * 2U];
  882.     si2 = pCoef[(ia2 * 2U) + 1U];
  883.  
  884.     /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
  885.     pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
  886.                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
  887.  
  888.     /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
  889.     pSrc[2U * i1 + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
  890.                           ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
  891.  
  892.     /* (xa - xc) - (yb - yd) */
  893.     r1 = r2 - t1;
  894.     /* (xa - xc) + (yb - yd) */
  895.     r2 = r2 + t1;
  896.  
  897.     /* (ya - yc) + (xb - xd) */
  898.     s1 = s2 + t2;
  899.     /* (ya - yc) - (xb - xd) */
  900.     s2 = s2 - t2;
  901.  
  902.     co1 = pCoef[ia1 * 2U];
  903.     si1 = pCoef[(ia1 * 2U) + 1U];
  904.  
  905.     /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
  906.     pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
  907.                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
  908.  
  909.     /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
  910.     pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
  911.                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
  912.  
  913.     /*  index calculation for the coefficients */
  914.     ia3 = 3U * ia1;
  915.     co3 = pCoef[ia3 * 2U];
  916.     si3 = pCoef[(ia3 * 2U) + 1U];
  917.  
  918.     /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
  919.     pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
  920.                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
  921.  
  922.     /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
  923.     pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
  924.                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
  925.  
  926.     /*  Twiddle coefficients index modifier */
  927.     ia1 = ia1 + twidCoefModifier;
  928.  
  929.     /*  Updating input index */
  930.     i0 = i0 + 1U;
  931.  
  932.   } while (--j);
  933.  
  934.   /* data is in 5.27(q27) format */
  935.   /* each stage provides two down scaling of the input */
  936.  
  937.  
  938.   /* Start of Middle stages process */
  939.  
  940.   twidCoefModifier <<= 2U;
  941.  
  942.   /*  Calculation of second stage to excluding last stage */
  943.   for (k = fftLen / 4U; k > 4U; k >>= 2U)
  944.   {
  945.     /*  Initializations for the first stage */
  946.     n1 = n2;
  947.     n2 >>= 2U;
  948.     ia1 = 0U;
  949.  
  950.     for (j = 0; j <= (n2 - 1U); j++)
  951.     {
  952.       /*  index calculation for the coefficients */
  953.       ia2 = ia1 + ia1;
  954.       ia3 = ia2 + ia1;
  955.       co1 = pCoef[ia1 * 2U];
  956.       si1 = pCoef[(ia1 * 2U) + 1U];
  957.       co2 = pCoef[ia2 * 2U];
  958.       si2 = pCoef[(ia2 * 2U) + 1U];
  959.       co3 = pCoef[ia3 * 2U];
  960.       si3 = pCoef[(ia3 * 2U) + 1U];
  961.       /*  Twiddle coefficients index modifier */
  962.       ia1 = ia1 + twidCoefModifier;
  963.  
  964.       for (i0 = j; i0 < fftLen; i0 += n1)
  965.       {
  966.         /*  index calculation for the input as, */
  967.         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
  968.         i1 = i0 + n2;
  969.         i2 = i1 + n2;
  970.         i3 = i2 + n2;
  971.  
  972.         /*  Butterfly implementation */
  973.         /* xa + xc */
  974.         r1 = pSrc[2U * i0] + pSrc[2U * i2];
  975.         /* xa - xc */
  976.         r2 = pSrc[2U * i0] - pSrc[2U * i2];
  977.  
  978.         /* ya + yc */
  979.         s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
  980.         /* ya - yc */
  981.         s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
  982.  
  983.         /* xb + xd */
  984.         t1 = pSrc[2U * i1] + pSrc[2U * i3];
  985.  
  986.         /* xa' = xa + xb + xc + xd */
  987.         pSrc[2U * i0] = (r1 + t1) >> 2U;
  988.         /* xa + xc -(xb + xd) */
  989.         r1 = r1 - t1;
  990.         /* yb + yd */
  991.         t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
  992.         /* ya' = ya + yb + yc + yd */
  993.         pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
  994.  
  995.         /* (ya + yc) - (yb + yd) */
  996.         s1 = s1 - t2;
  997.  
  998.         /* (yb - yd) */
  999.         t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
  1000.         /* (xb - xd) */
  1001.         t2 = pSrc[2U * i1] - pSrc[2U * i3];
  1002.  
  1003.         /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
  1004.         pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
  1005.                          ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
  1006.  
  1007.         /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
  1008.         pSrc[(2U * i1) + 1U] =
  1009.           (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
  1010.            ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
  1011.  
  1012.         /* (xa - xc) - (yb - yd) */
  1013.         r1 = r2 - t1;
  1014.         /* (xa - xc) + (yb - yd) */
  1015.         r2 = r2 + t1;
  1016.  
  1017.         /* (ya - yc) +  (xb - xd) */
  1018.         s1 = s2 + t2;
  1019.         /* (ya - yc) -  (xb - xd) */
  1020.         s2 = s2 - t2;
  1021.  
  1022.         /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
  1023.         pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
  1024.                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
  1025.  
  1026.         /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
  1027.         pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
  1028.                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
  1029.  
  1030.         /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
  1031.         pSrc[(2U * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
  1032.                            ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
  1033.  
  1034.         /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
  1035.         pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
  1036.                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
  1037.       }
  1038.     }
  1039.     twidCoefModifier <<= 2U;
  1040.   }
  1041. #else
  1042.   uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
  1043.   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
  1044.   q31_t xa, xb, xc, xd;
  1045.   q31_t ya, yb, yc, yd;
  1046.   q31_t xa_out, xb_out, xc_out, xd_out;
  1047.   q31_t ya_out, yb_out, yc_out, yd_out;
  1048.  
  1049.   q31_t *ptr1;
  1050.   q31_t *pSi0;
  1051.   q31_t *pSi1;
  1052.   q31_t *pSi2;
  1053.   q31_t *pSi3;
  1054.   q63_t xaya, xbyb, xcyc, xdyd;
  1055.  
  1056.   /* input is be 1.31(q31) format for all FFT sizes */
  1057.   /* Total process is divided into three stages */
  1058.   /* process first stage, middle stages, & last stage */
  1059.  
  1060.   /* Start of first stage process */
  1061.  
  1062.   /* Initializations for the first stage */
  1063.   n2 = fftLen;
  1064.   n1 = n2;
  1065.   /* n2 = fftLen/4 */
  1066.   n2 >>= 2U;
  1067.  
  1068.   ia1 = 0U;
  1069.  
  1070.   j = n2;
  1071.  
  1072.   pSi0 = pSrc;
  1073.   pSi1 = pSi0 + 2 * n2;
  1074.   pSi2 = pSi1 + 2 * n2;
  1075.   pSi3 = pSi2 + 2 * n2;
  1076.  
  1077.   do
  1078.   {
  1079.     /*  Butterfly implementation */
  1080.     /* xa + xc */
  1081.     r1 = (pSi0[0] >> 4U) + (pSi2[0] >> 4U);
  1082.     /* xa - xc */
  1083.     r2 = (pSi0[0] >> 4U) - (pSi2[0] >> 4U);
  1084.  
  1085.     /* xb + xd */
  1086.     t1 = (pSi1[0] >> 4U) + (pSi3[0] >> 4U);
  1087.  
  1088.     /* ya + yc */
  1089.     s1 = (pSi0[1] >> 4U) + (pSi2[1] >> 4U);
  1090.     /* ya - yc */
  1091.     s2 = (pSi0[1] >> 4U) - (pSi2[1] >> 4U);
  1092.  
  1093.     /* xa' = xa + xb + xc + xd */
  1094.     *pSi0++ = (r1 + t1);
  1095.     /* (xa + xc) - (xb + xd) */
  1096.     r1 = r1 - t1;
  1097.     /* yb + yd */
  1098.     t2 = (pSi1[1] >> 4U) + (pSi3[1] >> 4U);
  1099.     /* ya' = ya + yb + yc + yd */
  1100.     *pSi0++ = (s1 + t2);
  1101.  
  1102.     /* (ya + yc) - (yb + yd) */
  1103.     s1 = s1 - t2;
  1104.  
  1105.     /* yb - yd */
  1106.     t1 = (pSi1[1] >> 4U) - (pSi3[1] >> 4U);
  1107.     /* xb - xd */
  1108.     t2 = (pSi1[0] >> 4U) - (pSi3[0] >> 4U);
  1109.  
  1110.     /*  index calculation for the coefficients */
  1111.     ia2 = 2U * ia1;
  1112.     co2 = pCoef[ia2 * 2U];
  1113.     si2 = pCoef[(ia2 * 2U) + 1U];
  1114.  
  1115.     /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
  1116.     *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
  1117.                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
  1118.  
  1119.     /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
  1120.     *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
  1121.                           ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
  1122.  
  1123.     /* (xa - xc) - (yb - yd) */
  1124.     r1 = r2 - t1;
  1125.     /* (xa - xc) + (yb - yd) */
  1126.     r2 = r2 + t1;
  1127.  
  1128.     /* (ya - yc) + (xb - xd) */
  1129.     s1 = s2 + t2;
  1130.     /* (ya - yc) - (xb - xd) */
  1131.     s2 = s2 - t2;
  1132.  
  1133.     co1 = pCoef[ia1 * 2U];
  1134.     si1 = pCoef[(ia1 * 2U) + 1U];
  1135.  
  1136.     /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
  1137.     *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
  1138.                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
  1139.  
  1140.     /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
  1141.     *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
  1142.                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
  1143.  
  1144.     /*  index calculation for the coefficients */
  1145.     ia3 = 3U * ia1;
  1146.     co3 = pCoef[ia3 * 2U];
  1147.     si3 = pCoef[(ia3 * 2U) + 1U];
  1148.  
  1149.     /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
  1150.     *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
  1151.                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
  1152.  
  1153.     /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
  1154.     *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
  1155.                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
  1156.  
  1157.     /*  Twiddle coefficients index modifier */
  1158.     ia1 = ia1 + twidCoefModifier;
  1159.  
  1160.   } while (--j);
  1161.  
  1162.   /* data is in 5.27(q27) format */
  1163.   /* each stage provides two down scaling of the input */
  1164.  
  1165.  
  1166.   /* Start of Middle stages process */
  1167.  
  1168.   twidCoefModifier <<= 2U;
  1169.  
  1170.   /*  Calculation of second stage to excluding last stage */
  1171.   for (k = fftLen / 4U; k > 4U; k >>= 2U)
  1172.   {
  1173.     /*  Initializations for the first stage */
  1174.     n1 = n2;
  1175.     n2 >>= 2U;
  1176.     ia1 = 0U;
  1177.  
  1178.     for (j = 0; j <= (n2 - 1U); j++)
  1179.     {
  1180.       /*  index calculation for the coefficients */
  1181.       ia2 = ia1 + ia1;
  1182.       ia3 = ia2 + ia1;
  1183.       co1 = pCoef[ia1 * 2U];
  1184.       si1 = pCoef[(ia1 * 2U) + 1U];
  1185.       co2 = pCoef[ia2 * 2U];
  1186.       si2 = pCoef[(ia2 * 2U) + 1U];
  1187.       co3 = pCoef[ia3 * 2U];
  1188.       si3 = pCoef[(ia3 * 2U) + 1U];
  1189.       /*  Twiddle coefficients index modifier */
  1190.       ia1 = ia1 + twidCoefModifier;
  1191.  
  1192.       pSi0 = pSrc + 2 * j;
  1193.       pSi1 = pSi0 + 2 * n2;
  1194.       pSi2 = pSi1 + 2 * n2;
  1195.       pSi3 = pSi2 + 2 * n2;
  1196.  
  1197.       for (i0 = j; i0 < fftLen; i0 += n1)
  1198.       {
  1199.         /*  Butterfly implementation */
  1200.         /* xa + xc */
  1201.         r1 = pSi0[0] + pSi2[0];
  1202.  
  1203.         /* xa - xc */
  1204.         r2 = pSi0[0] - pSi2[0];
  1205.  
  1206.  
  1207.         /* ya + yc */
  1208.         s1 = pSi0[1] + pSi2[1];
  1209.  
  1210.         /* ya - yc */
  1211.         s2 = pSi0[1] - pSi2[1];
  1212.  
  1213.  
  1214.         /* xb + xd */
  1215.         t1 = pSi1[0] + pSi3[0];
  1216.  
  1217.  
  1218.         /* xa' = xa + xb + xc + xd */
  1219.         pSi0[0] = (r1 + t1) >> 2U;
  1220.         /* xa + xc -(xb + xd) */
  1221.         r1 = r1 - t1;
  1222.         /* yb + yd */
  1223.         t2 = pSi1[1] + pSi3[1];
  1224.  
  1225.         /* ya' = ya + yb + yc + yd */
  1226.         pSi0[1] = (s1 + t2) >> 2U;
  1227.         pSi0 += 2 * n1;
  1228.  
  1229.         /* (ya + yc) - (yb + yd) */
  1230.         s1 = s1 - t2;
  1231.  
  1232.         /* (yb - yd) */
  1233.         t1 = pSi1[1] - pSi3[1];
  1234.  
  1235.         /* (xb - xd) */
  1236.         t2 = pSi1[0] - pSi3[0];
  1237.  
  1238.  
  1239.         /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
  1240.         pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
  1241.                          ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
  1242.  
  1243.         /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
  1244.         pSi1[1] =
  1245.  
  1246.           (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
  1247.            ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
  1248.         pSi1 += 2 * n1;
  1249.  
  1250.         /* (xa - xc) - (yb - yd) */
  1251.         r1 = r2 - t1;
  1252.         /* (xa - xc) + (yb - yd) */
  1253.         r2 = r2 + t1;
  1254.  
  1255.         /* (ya - yc) +  (xb - xd) */
  1256.         s1 = s2 + t2;
  1257.         /* (ya - yc) -  (xb - xd) */
  1258.         s2 = s2 - t2;
  1259.  
  1260.         /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
  1261.         pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
  1262.                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
  1263.  
  1264.         /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
  1265.         pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
  1266.                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
  1267.         pSi2 += 2 * n1;
  1268.  
  1269.         /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
  1270.         pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
  1271.                            ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
  1272.  
  1273.         /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
  1274.         pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
  1275.                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
  1276.         pSi3 += 2 * n1;
  1277.       }
  1278.     }
  1279.     twidCoefModifier <<= 2U;
  1280.   }
  1281. #endif
  1282.  
  1283.   /* End of Middle stages process */
  1284.  
  1285.   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
  1286.   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
  1287.   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
  1288.   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
  1289.  
  1290.  
  1291.   /* Start of last stage process */
  1292.  
  1293.  
  1294.   /*  Initializations for the last stage */
  1295.   j = fftLen >> 2;
  1296.   ptr1 = &pSrc[0];
  1297.  
  1298.   /*  Calculations of last stage */
  1299.   do
  1300.   {
  1301. #ifndef ARM_MATH_BIG_ENDIAN
  1302.     /* Read xa (real), ya(imag) input */
  1303.     xaya = *__SIMD64(ptr1)++;
  1304.     xa = (q31_t) xaya;
  1305.     ya = (q31_t) (xaya >> 32);
  1306.  
  1307.     /* Read xb (real), yb(imag) input */
  1308.     xbyb = *__SIMD64(ptr1)++;
  1309.     xb = (q31_t) xbyb;
  1310.     yb = (q31_t) (xbyb >> 32);
  1311.  
  1312.     /* Read xc (real), yc(imag) input */
  1313.     xcyc = *__SIMD64(ptr1)++;
  1314.     xc = (q31_t) xcyc;
  1315.     yc = (q31_t) (xcyc >> 32);
  1316.  
  1317.     /* Read xc (real), yc(imag) input */
  1318.     xdyd = *__SIMD64(ptr1)++;
  1319.     xd = (q31_t) xdyd;
  1320.     yd = (q31_t) (xdyd >> 32);
  1321.  
  1322. #else
  1323.  
  1324.     /* Read xa (real), ya(imag) input */
  1325.     xaya = *__SIMD64(ptr1)++;
  1326.     ya = (q31_t) xaya;
  1327.     xa = (q31_t) (xaya >> 32);
  1328.  
  1329.     /* Read xb (real), yb(imag) input */
  1330.     xbyb = *__SIMD64(ptr1)++;
  1331.     yb = (q31_t) xbyb;
  1332.     xb = (q31_t) (xbyb >> 32);
  1333.  
  1334.     /* Read xc (real), yc(imag) input */
  1335.     xcyc = *__SIMD64(ptr1)++;
  1336.     yc = (q31_t) xcyc;
  1337.     xc = (q31_t) (xcyc >> 32);
  1338.  
  1339.     /* Read xc (real), yc(imag) input */
  1340.     xdyd = *__SIMD64(ptr1)++;
  1341.     yd = (q31_t) xdyd;
  1342.     xd = (q31_t) (xdyd >> 32);
  1343.  
  1344.  
  1345. #endif
  1346.  
  1347.     /* xa' = xa + xb + xc + xd */
  1348.     xa_out = xa + xb + xc + xd;
  1349.  
  1350.     /* ya' = ya + yb + yc + yd */
  1351.     ya_out = ya + yb + yc + yd;
  1352.  
  1353.     /* pointer updation for writing */
  1354.     ptr1 = ptr1 - 8U;
  1355.  
  1356.     /* writing xa' and ya' */
  1357.     *ptr1++ = xa_out;
  1358.     *ptr1++ = ya_out;
  1359.  
  1360.     xc_out = (xa - xb + xc - xd);
  1361.     yc_out = (ya - yb + yc - yd);
  1362.  
  1363.     /* writing xc' and yc' */
  1364.     *ptr1++ = xc_out;
  1365.     *ptr1++ = yc_out;
  1366.  
  1367.     xb_out = (xa - yb - xc + yd);
  1368.     yb_out = (ya + xb - yc - xd);
  1369.  
  1370.     /* writing xb' and yb' */
  1371.     *ptr1++ = xb_out;
  1372.     *ptr1++ = yb_out;
  1373.  
  1374.     xd_out = (xa + yb - xc - yd);
  1375.     yd_out = (ya - xb - yc + xd);
  1376.  
  1377.     /* writing xd' and yd' */
  1378.     *ptr1++ = xd_out;
  1379.     *ptr1++ = yd_out;
  1380.  
  1381.   } while (--j);
  1382.  
  1383.   /* output is in 11.21(q21) format for the 1024 point */
  1384.   /* output is in 9.23(q23) format for the 256 point */
  1385.   /* output is in 7.25(q25) format for the 64 point */
  1386.   /* output is in 5.27(q27) format for the 16 point */
  1387.  
  1388.   /* End of last stage process */
  1389. }
  1390.