Subversion Repositories dashGPS

Rev

Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------
  2.  * Project:      CMSIS DSP Library
  3.  * Title:        arm_cfft_radix4_q15.c
  4.  * Description:  This file has function definition of Radix-4 FFT & IFFT function and
  5.  *               In-place bit reversal using bit reversal table
  6.  *
  7.  * $Date:        27. January 2017
  8.  * $Revision:    V.1.5.1
  9.  *
  10.  * Target Processor: Cortex-M cores
  11.  * -------------------------------------------------------------------- */
  12. /*
  13.  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  14.  *
  15.  * SPDX-License-Identifier: Apache-2.0
  16.  *
  17.  * Licensed under the Apache License, Version 2.0 (the License); you may
  18.  * not use this file except in compliance with the License.
  19.  * You may obtain a copy of the License at
  20.  *
  21.  * www.apache.org/licenses/LICENSE-2.0
  22.  *
  23.  * Unless required by applicable law or agreed to in writing, software
  24.  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  25.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  26.  * See the License for the specific language governing permissions and
  27.  * limitations under the License.
  28.  */
  29.  
  30. #include "arm_math.h"
  31.  
  32.  
  33. void arm_radix4_butterfly_q15(
  34.   q15_t * pSrc16,
  35.   uint32_t fftLen,
  36.   q15_t * pCoef16,
  37.   uint32_t twidCoefModifier);
  38.  
  39. void arm_radix4_butterfly_inverse_q15(
  40.   q15_t * pSrc16,
  41.   uint32_t fftLen,
  42.   q15_t * pCoef16,
  43.   uint32_t twidCoefModifier);
  44.  
  45. void arm_bitreversal_q15(
  46.   q15_t * pSrc,
  47.   uint32_t fftLen,
  48.   uint16_t bitRevFactor,
  49.   uint16_t * pBitRevTab);
  50.  
  51. /**
  52.  * @ingroup groupTransforms
  53.  */
  54.  
  55. /**
  56.  * @addtogroup ComplexFFT
  57.  * @{
  58.  */
  59.  
  60.  
  61. /**
  62.  * @details
  63.  * @brief Processing function for the Q15 CFFT/CIFFT.
  64.  * @deprecated Do not use this function.  It has been superseded by \ref arm_cfft_q15 and will be removed
  65.  * @param[in]      *S    points to an instance of the Q15 CFFT/CIFFT structure.
  66.  * @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place.
  67.  * @return none.
  68.  *
  69.  * \par Input and output formats:
  70.  * \par
  71.  * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
  72.  * Hence the output format is different for different FFT sizes.
  73.  * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
  74.  * \par
  75.  * \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
  76.  * \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
  77.  */
  78.  
  79. void arm_cfft_radix4_q15(
  80.   const arm_cfft_radix4_instance_q15 * S,
  81.   q15_t * pSrc)
  82. {
  83.   if (S->ifftFlag == 1U)
  84.   {
  85.     /*  Complex IFFT radix-4  */
  86.     arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  87.   }
  88.   else
  89.   {
  90.     /*  Complex FFT radix-4  */
  91.     arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  92.   }
  93.  
  94.   if (S->bitReverseFlag == 1U)
  95.   {
  96.     /*  Bit Reversal */
  97.     arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
  98.   }
  99.  
  100. }
  101.  
  102. /**
  103.  * @} end of ComplexFFT group
  104.  */
  105.  
  106. /*
  107. * Radix-4 FFT algorithm used is :
  108. *
  109. * Input real and imaginary data:
  110. * x(n) = xa + j * ya
  111. * x(n+N/4 ) = xb + j * yb
  112. * x(n+N/2 ) = xc + j * yc
  113. * x(n+3N/4) = xd + j * yd
  114. *
  115. *
  116. * Output real and imaginary data:
  117. * x(4r) = xa'+ j * ya'
  118. * x(4r+1) = xb'+ j * yb'
  119. * x(4r+2) = xc'+ j * yc'
  120. * x(4r+3) = xd'+ j * yd'
  121. *
  122. *
  123. * Twiddle factors for radix-4 FFT:
  124. * Wn = co1 + j * (- si1)
  125. * W2n = co2 + j * (- si2)
  126. * W3n = co3 + j * (- si3)
  127.  
  128. * The real and imaginary output values for the radix-4 butterfly are
  129. * xa' = xa + xb + xc + xd
  130. * ya' = ya + yb + yc + yd
  131. * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
  132. * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
  133. * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
  134. * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
  135. * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
  136. * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
  137. *
  138. */
  139.  
  140. /**
  141.  * @brief  Core function for the Q15 CFFT butterfly process.
  142.  * @param[in, out] *pSrc16          points to the in-place buffer of Q15 data type.
  143.  * @param[in]      fftLen           length of the FFT.
  144.  * @param[in]      *pCoef16         points to twiddle coefficient buffer.
  145.  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
  146.  * @return none.
  147.  */
  148.  
  149. void arm_radix4_butterfly_q15(
  150.   q15_t * pSrc16,
  151.   uint32_t fftLen,
  152.   q15_t * pCoef16,
  153.   uint32_t twidCoefModifier)
  154. {
  155.  
  156. #if defined (ARM_MATH_DSP)
  157.  
  158.   /* Run the below code for Cortex-M4 and Cortex-M3 */
  159.  
  160.   q31_t R, S, T, U;
  161.   q31_t C1, C2, C3, out1, out2;
  162.   uint32_t n1, n2, ic, i0, j, k;
  163.  
  164.   q15_t *ptr1;
  165.   q15_t *pSi0;
  166.   q15_t *pSi1;
  167.   q15_t *pSi2;
  168.   q15_t *pSi3;
  169.  
  170.   q31_t xaya, xbyb, xcyc, xdyd;
  171.  
  172.   /* Total process is divided into three stages */
  173.  
  174.   /* process first stage, middle stages, & last stage */
  175.  
  176.   /*  Initializations for the first stage */
  177.   n2 = fftLen;
  178.   n1 = n2;
  179.  
  180.   /* n2 = fftLen/4 */
  181.   n2 >>= 2U;
  182.  
  183.   /* Index for twiddle coefficient */
  184.   ic = 0U;
  185.  
  186.   /* Index for input read and output write */
  187.   j = n2;
  188.  
  189.   pSi0 = pSrc16;
  190.   pSi1 = pSi0 + 2 * n2;
  191.   pSi2 = pSi1 + 2 * n2;
  192.   pSi3 = pSi2 + 2 * n2;
  193.  
  194.   /* Input is in 1.15(q15) format */
  195.  
  196.   /*  start of first stage process */
  197.   do
  198.   {
  199.     /*  Butterfly implementation */
  200.  
  201.     /*  Reading i0, i0+fftLen/2 inputs */
  202.     /* Read ya (real), xa(imag) input */
  203.     T = _SIMD32_OFFSET(pSi0);
  204.     T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
  205.     T = __SHADD16(T, 0); // it turns out doing this twice is 2 cycles, the alternative takes 3 cycles
  206.     //in = ((int16_t) (T & 0xFFFF)) >> 2;       // alternative code that takes 3 cycles
  207.     //T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
  208.  
  209.     /* Read yc (real), xc(imag) input */
  210.     S = _SIMD32_OFFSET(pSi2);
  211.     S = __SHADD16(S, 0);
  212.     S = __SHADD16(S, 0);
  213.  
  214.     /* R = packed((ya + yc), (xa + xc) ) */
  215.     R = __QADD16(T, S);
  216.  
  217.     /* S = packed((ya - yc), (xa - xc) ) */
  218.     S = __QSUB16(T, S);
  219.  
  220.     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  221.     /* Read yb (real), xb(imag) input */
  222.     T = _SIMD32_OFFSET(pSi1);
  223.     T = __SHADD16(T, 0);
  224.     T = __SHADD16(T, 0);
  225.  
  226.     /* Read yd (real), xd(imag) input */
  227.     U = _SIMD32_OFFSET(pSi3);
  228.     U = __SHADD16(U, 0);
  229.     U = __SHADD16(U, 0);
  230.  
  231.     /* T = packed((yb + yd), (xb + xd) ) */
  232.     T = __QADD16(T, U);
  233.  
  234.     /*  writing the butterfly processed i0 sample */
  235.     /* xa' = xa + xb + xc + xd */
  236.     /* ya' = ya + yb + yc + yd */
  237.     _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
  238.     pSi0 += 2;
  239.  
  240.     /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
  241.     R = __QSUB16(R, T);
  242.  
  243.     /* co2 & si2 are read from SIMD Coefficient pointer */
  244.     C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
  245.  
  246. #ifndef ARM_MATH_BIG_ENDIAN
  247.  
  248.     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  249.     out1 = __SMUAD(C2, R) >> 16U;
  250.     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  251.     out2 = __SMUSDX(C2, R);
  252.  
  253. #else
  254.  
  255.     /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  256.     out1 = __SMUSDX(R, C2) >> 16U;
  257.     /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  258.     out2 = __SMUAD(C2, R);
  259.  
  260. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  261.  
  262.     /*  Reading i0+fftLen/4 */
  263.     /* T = packed(yb, xb) */
  264.     T = _SIMD32_OFFSET(pSi1);
  265.     T = __SHADD16(T, 0);
  266.     T = __SHADD16(T, 0);
  267.  
  268.     /* writing the butterfly processed i0 + fftLen/4 sample */
  269.     /* writing output(xc', yc') in little endian format */
  270.     _SIMD32_OFFSET(pSi1) =
  271.       (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  272.     pSi1 += 2;
  273.  
  274.     /*  Butterfly calculations */
  275.     /* U = packed(yd, xd) */
  276.     U = _SIMD32_OFFSET(pSi3);
  277.     U = __SHADD16(U, 0);
  278.     U = __SHADD16(U, 0);
  279.  
  280.     /* T = packed(yb-yd, xb-xd) */
  281.     T = __QSUB16(T, U);
  282.  
  283. #ifndef ARM_MATH_BIG_ENDIAN
  284.  
  285.     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  286.     R = __QASX(S, T);
  287.     /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  288.     S = __QSAX(S, T);
  289.  
  290. #else
  291.  
  292.     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  293.     R = __QSAX(S, T);
  294.     /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  295.     S = __QASX(S, T);
  296.  
  297. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  298.  
  299.     /* co1 & si1 are read from SIMD Coefficient pointer */
  300.     C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
  301.     /*  Butterfly process for the i0+fftLen/2 sample */
  302.  
  303. #ifndef ARM_MATH_BIG_ENDIAN
  304.  
  305.     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  306.     out1 = __SMUAD(C1, S) >> 16U;
  307.     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  308.     out2 = __SMUSDX(C1, S);
  309.  
  310. #else
  311.  
  312.     /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  313.     out1 = __SMUSDX(S, C1) >> 16U;
  314.     /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  315.     out2 = __SMUAD(C1, S);
  316.  
  317. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  318.  
  319.     /* writing output(xb', yb') in little endian format */
  320.     _SIMD32_OFFSET(pSi2) =
  321.       ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
  322.     pSi2 += 2;
  323.  
  324.  
  325.     /* co3 & si3 are read from SIMD Coefficient pointer */
  326.     C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
  327.     /*  Butterfly process for the i0+3fftLen/4 sample */
  328.  
  329. #ifndef ARM_MATH_BIG_ENDIAN
  330.  
  331.     /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  332.     out1 = __SMUAD(C3, R) >> 16U;
  333.     /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  334.     out2 = __SMUSDX(C3, R);
  335.  
  336. #else
  337.  
  338.     /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  339.     out1 = __SMUSDX(R, C3) >> 16U;
  340.     /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  341.     out2 = __SMUAD(C3, R);
  342.  
  343. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  344.  
  345.     /* writing output(xd', yd') in little endian format */
  346.     _SIMD32_OFFSET(pSi3) =
  347.       ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  348.     pSi3 += 2;
  349.  
  350.     /*  Twiddle coefficients index modifier */
  351.     ic = ic + twidCoefModifier;
  352.  
  353.   } while (--j);
  354.   /* data is in 4.11(q11) format */
  355.  
  356.   /* end of first stage process */
  357.  
  358.  
  359.   /* start of middle stage process */
  360.  
  361.   /*  Twiddle coefficients index modifier */
  362.   twidCoefModifier <<= 2U;
  363.  
  364.   /*  Calculation of Middle stage */
  365.   for (k = fftLen / 4U; k > 4U; k >>= 2U)
  366.   {
  367.     /*  Initializations for the middle stage */
  368.     n1 = n2;
  369.     n2 >>= 2U;
  370.     ic = 0U;
  371.  
  372.     for (j = 0U; j <= (n2 - 1U); j++)
  373.     {
  374.       /*  index calculation for the coefficients */
  375.       C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
  376.       C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
  377.       C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
  378.  
  379.       /*  Twiddle coefficients index modifier */
  380.       ic = ic + twidCoefModifier;
  381.  
  382.       pSi0 = pSrc16 + 2 * j;
  383.       pSi1 = pSi0 + 2 * n2;
  384.       pSi2 = pSi1 + 2 * n2;
  385.       pSi3 = pSi2 + 2 * n2;
  386.  
  387.       /*  Butterfly implementation */
  388.       for (i0 = j; i0 < fftLen; i0 += n1)
  389.       {
  390.         /*  Reading i0, i0+fftLen/2 inputs */
  391.         /* Read ya (real), xa(imag) input */
  392.         T = _SIMD32_OFFSET(pSi0);
  393.  
  394.         /* Read yc (real), xc(imag) input */
  395.         S = _SIMD32_OFFSET(pSi2);
  396.  
  397.         /* R = packed( (ya + yc), (xa + xc)) */
  398.         R = __QADD16(T, S);
  399.  
  400.         /* S = packed((ya - yc), (xa - xc)) */
  401.         S = __QSUB16(T, S);
  402.  
  403.         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  404.         /* Read yb (real), xb(imag) input */
  405.         T = _SIMD32_OFFSET(pSi1);
  406.  
  407.         /* Read yd (real), xd(imag) input */
  408.         U = _SIMD32_OFFSET(pSi3);
  409.  
  410.         /* T = packed( (yb + yd), (xb + xd)) */
  411.         T = __QADD16(T, U);
  412.  
  413.         /*  writing the butterfly processed i0 sample */
  414.  
  415.         /* xa' = xa + xb + xc + xd */
  416.         /* ya' = ya + yb + yc + yd */
  417.         out1 = __SHADD16(R, T);
  418.         out1 = __SHADD16(out1, 0);
  419.         _SIMD32_OFFSET(pSi0) = out1;
  420.         pSi0 += 2 * n1;
  421.  
  422.         /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
  423.         R = __SHSUB16(R, T);
  424.  
  425. #ifndef ARM_MATH_BIG_ENDIAN
  426.  
  427.         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  428.         out1 = __SMUAD(C2, R) >> 16U;
  429.  
  430.         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  431.         out2 = __SMUSDX(C2, R);
  432.  
  433. #else
  434.  
  435.         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  436.         out1 = __SMUSDX(R, C2) >> 16U;
  437.  
  438.         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  439.         out2 = __SMUAD(C2, R);
  440.  
  441. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  442.  
  443.         /*  Reading i0+3fftLen/4 */
  444.         /* Read yb (real), xb(imag) input */
  445.         T = _SIMD32_OFFSET(pSi1);
  446.  
  447.         /*  writing the butterfly processed i0 + fftLen/4 sample */
  448.         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  449.         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  450.         _SIMD32_OFFSET(pSi1) =
  451.           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  452.         pSi1 += 2 * n1;
  453.  
  454.         /*  Butterfly calculations */
  455.  
  456.         /* Read yd (real), xd(imag) input */
  457.         U = _SIMD32_OFFSET(pSi3);
  458.  
  459.         /* T = packed(yb-yd, xb-xd) */
  460.         T = __QSUB16(T, U);
  461.  
  462. #ifndef ARM_MATH_BIG_ENDIAN
  463.  
  464.         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  465.         R = __SHASX(S, T);
  466.  
  467.         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  468.         S = __SHSAX(S, T);
  469.  
  470.  
  471.         /*  Butterfly process for the i0+fftLen/2 sample */
  472.         out1 = __SMUAD(C1, S) >> 16U;
  473.         out2 = __SMUSDX(C1, S);
  474.  
  475. #else
  476.  
  477.         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  478.         R = __SHSAX(S, T);
  479.  
  480.         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  481.         S = __SHASX(S, T);
  482.  
  483.  
  484.         /*  Butterfly process for the i0+fftLen/2 sample */
  485.         out1 = __SMUSDX(S, C1) >> 16U;
  486.         out2 = __SMUAD(C1, S);
  487.  
  488. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  489.  
  490.         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  491.         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  492.         _SIMD32_OFFSET(pSi2) =
  493.           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  494.         pSi2 += 2 * n1;
  495.  
  496.         /*  Butterfly process for the i0+3fftLen/4 sample */
  497.  
  498. #ifndef ARM_MATH_BIG_ENDIAN
  499.  
  500.         out1 = __SMUAD(C3, R) >> 16U;
  501.         out2 = __SMUSDX(C3, R);
  502.  
  503. #else
  504.  
  505.         out1 = __SMUSDX(R, C3) >> 16U;
  506.         out2 = __SMUAD(C3, R);
  507.  
  508. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  509.  
  510.         /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  511.         /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  512.         _SIMD32_OFFSET(pSi3) =
  513.           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  514.         pSi3 += 2 * n1;
  515.       }
  516.     }
  517.     /*  Twiddle coefficients index modifier */
  518.     twidCoefModifier <<= 2U;
  519.   }
  520.   /* end of middle stage process */
  521.  
  522.  
  523.   /* data is in 10.6(q6) format for the 1024 point */
  524.   /* data is in 8.8(q8) format for the 256 point */
  525.   /* data is in 6.10(q10) format for the 64 point */
  526.   /* data is in 4.12(q12) format for the 16 point */
  527.  
  528.   /*  Initializations for the last stage */
  529.   j = fftLen >> 2;
  530.  
  531.   ptr1 = &pSrc16[0];
  532.  
  533.   /* start of last stage process */
  534.  
  535.   /*  Butterfly implementation */
  536.   do
  537.   {
  538.     /* Read xa (real), ya(imag) input */
  539.     xaya = *__SIMD32(ptr1)++;
  540.  
  541.     /* Read xb (real), yb(imag) input */
  542.     xbyb = *__SIMD32(ptr1)++;
  543.  
  544.     /* Read xc (real), yc(imag) input */
  545.     xcyc = *__SIMD32(ptr1)++;
  546.  
  547.     /* Read xd (real), yd(imag) input */
  548.     xdyd = *__SIMD32(ptr1)++;
  549.  
  550.     /* R = packed((ya + yc), (xa + xc)) */
  551.     R = __QADD16(xaya, xcyc);
  552.  
  553.     /* T = packed((yb + yd), (xb + xd)) */
  554.     T = __QADD16(xbyb, xdyd);
  555.  
  556.     /* pointer updation for writing */
  557.     ptr1 = ptr1 - 8U;
  558.  
  559.  
  560.     /* xa' = xa + xb + xc + xd */
  561.     /* ya' = ya + yb + yc + yd */
  562.     *__SIMD32(ptr1)++ = __SHADD16(R, T);
  563.  
  564.     /* T = packed((yb + yd), (xb + xd)) */
  565.     T = __QADD16(xbyb, xdyd);
  566.  
  567.     /* xc' = (xa-xb+xc-xd) */
  568.     /* yc' = (ya-yb+yc-yd) */
  569.     *__SIMD32(ptr1)++ = __SHSUB16(R, T);
  570.  
  571.     /* S = packed((ya - yc), (xa - xc)) */
  572.     S = __QSUB16(xaya, xcyc);
  573.  
  574.     /* Read yd (real), xd(imag) input */
  575.     /* T = packed( (yb - yd), (xb - xd))  */
  576.     U = __QSUB16(xbyb, xdyd);
  577.  
  578. #ifndef ARM_MATH_BIG_ENDIAN
  579.  
  580.     /* xb' = (xa+yb-xc-yd) */
  581.     /* yb' = (ya-xb-yc+xd) */
  582.     *__SIMD32(ptr1)++ = __SHSAX(S, U);
  583.  
  584.  
  585.     /* xd' = (xa-yb-xc+yd) */
  586.     /* yd' = (ya+xb-yc-xd) */
  587.     *__SIMD32(ptr1)++ = __SHASX(S, U);
  588.  
  589. #else
  590.  
  591.     /* xb' = (xa+yb-xc-yd) */
  592.     /* yb' = (ya-xb-yc+xd) */
  593.     *__SIMD32(ptr1)++ = __SHASX(S, U);
  594.  
  595.  
  596.     /* xd' = (xa-yb-xc+yd) */
  597.     /* yd' = (ya+xb-yc-xd) */
  598.     *__SIMD32(ptr1)++ = __SHSAX(S, U);
  599.  
  600. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  601.  
  602.   } while (--j);
  603.  
  604.   /* end of last stage process */
  605.  
  606.   /* output is in 11.5(q5) format for the 1024 point */
  607.   /* output is in 9.7(q7) format for the 256 point   */
  608.   /* output is in 7.9(q9) format for the 64 point  */
  609.   /* output is in 5.11(q11) format for the 16 point  */
  610.  
  611.  
  612. #else
  613.  
  614.   /* Run the below code for Cortex-M0 */
  615.  
  616.   q15_t R0, R1, S0, S1, T0, T1, U0, U1;
  617.   q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
  618.   uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
  619.  
  620.   /* Total process is divided into three stages */
  621.  
  622.   /* process first stage, middle stages, & last stage */
  623.  
  624.   /*  Initializations for the first stage */
  625.   n2 = fftLen;
  626.   n1 = n2;
  627.  
  628.   /* n2 = fftLen/4 */
  629.   n2 >>= 2U;
  630.  
  631.   /* Index for twiddle coefficient */
  632.   ic = 0U;
  633.  
  634.   /* Index for input read and output write */
  635.   i0 = 0U;
  636.   j = n2;
  637.  
  638.   /* Input is in 1.15(q15) format */
  639.  
  640.   /*  start of first stage process */
  641.   do
  642.   {
  643.     /*  Butterfly implementation */
  644.  
  645.     /*  index calculation for the input as, */
  646.     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  647.     i1 = i0 + n2;
  648.     i2 = i1 + n2;
  649.     i3 = i2 + n2;
  650.  
  651.     /*  Reading i0, i0+fftLen/2 inputs */
  652.  
  653.     /* input is down scale by 4 to avoid overflow */
  654.     /* Read ya (real), xa(imag) input */
  655.     T0 = pSrc16[i0 * 2U] >> 2U;
  656.     T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
  657.  
  658.     /* input is down scale by 4 to avoid overflow */
  659.     /* Read yc (real), xc(imag) input */
  660.     S0 = pSrc16[i2 * 2U] >> 2U;
  661.     S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
  662.  
  663.     /* R0 = (ya + yc) */
  664.     R0 = __SSAT(T0 + S0, 16U);
  665.     /* R1 = (xa + xc) */
  666.     R1 = __SSAT(T1 + S1, 16U);
  667.  
  668.     /* S0 = (ya - yc) */
  669.     S0 = __SSAT(T0 - S0, 16);
  670.     /* S1 = (xa - xc) */
  671.     S1 = __SSAT(T1 - S1, 16);
  672.  
  673.     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  674.     /* input is down scale by 4 to avoid overflow */
  675.     /* Read yb (real), xb(imag) input */
  676.     T0 = pSrc16[i1 * 2U] >> 2U;
  677.     T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
  678.  
  679.     /* input is down scale by 4 to avoid overflow */
  680.     /* Read yd (real), xd(imag) input */
  681.     U0 = pSrc16[i3 * 2U] >> 2U;
  682.     U1 = pSrc16[(i3 * 2U) + 1] >> 2U;
  683.  
  684.     /* T0 = (yb + yd) */
  685.     T0 = __SSAT(T0 + U0, 16U);
  686.     /* T1 = (xb + xd) */
  687.     T1 = __SSAT(T1 + U1, 16U);
  688.  
  689.     /*  writing the butterfly processed i0 sample */
  690.     /* ya' = ya + yb + yc + yd */
  691.     /* xa' = xa + xb + xc + xd */
  692.     pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
  693.     pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
  694.  
  695.     /* R0 = (ya + yc) - (yb + yd) */
  696.     /* R1 = (xa + xc) - (xb + xd) */
  697.     R0 = __SSAT(R0 - T0, 16U);
  698.     R1 = __SSAT(R1 - T1, 16U);
  699.  
  700.     /* co2 & si2 are read from Coefficient pointer */
  701.     Co2 = pCoef16[2U * ic * 2U];
  702.     Si2 = pCoef16[(2U * ic * 2U) + 1];
  703.  
  704.     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  705.     out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
  706.     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  707.     out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
  708.  
  709.     /*  Reading i0+fftLen/4 */
  710.     /* input is down scale by 4 to avoid overflow */
  711.     /* T0 = yb, T1 =  xb */
  712.     T0 = pSrc16[i1 * 2U] >> 2;
  713.     T1 = pSrc16[(i1 * 2U) + 1] >> 2;
  714.  
  715.     /* writing the butterfly processed i0 + fftLen/4 sample */
  716.     /* writing output(xc', yc') in little endian format */
  717.     pSrc16[i1 * 2U] = out1;
  718.     pSrc16[(i1 * 2U) + 1] = out2;
  719.  
  720.     /*  Butterfly calculations */
  721.     /* input is down scale by 4 to avoid overflow */
  722.     /* U0 = yd, U1 = xd */
  723.     U0 = pSrc16[i3 * 2U] >> 2;
  724.     U1 = pSrc16[(i3 * 2U) + 1] >> 2;
  725.     /* T0 = yb-yd */
  726.     T0 = __SSAT(T0 - U0, 16);
  727.     /* T1 = xb-xd */
  728.     T1 = __SSAT(T1 - U1, 16);
  729.  
  730.     /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
  731.     R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
  732.     R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
  733.  
  734.     /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
  735.     S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
  736.     S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);
  737.  
  738.     /* co1 & si1 are read from Coefficient pointer */
  739.     Co1 = pCoef16[ic * 2U];
  740.     Si1 = pCoef16[(ic * 2U) + 1];
  741.     /*  Butterfly process for the i0+fftLen/2 sample */
  742.     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  743.     out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
  744.     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  745.     out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
  746.  
  747.     /* writing output(xb', yb') in little endian format */
  748.     pSrc16[i2 * 2U] = out1;
  749.     pSrc16[(i2 * 2U) + 1] = out2;
  750.  
  751.     /* Co3 & si3 are read from Coefficient pointer */
  752.     Co3 = pCoef16[3U * (ic * 2U)];
  753.     Si3 = pCoef16[(3U * (ic * 2U)) + 1];
  754.     /*  Butterfly process for the i0+3fftLen/4 sample */
  755.     /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
  756.     out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
  757.     /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
  758.     out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
  759.     /* writing output(xd', yd') in little endian format */
  760.     pSrc16[i3 * 2U] = out1;
  761.     pSrc16[(i3 * 2U) + 1] = out2;
  762.  
  763.     /*  Twiddle coefficients index modifier */
  764.     ic = ic + twidCoefModifier;
  765.  
  766.     /*  Updating input index */
  767.     i0 = i0 + 1U;
  768.  
  769.   } while (--j);
  770.   /* data is in 4.11(q11) format */
  771.  
  772.   /* end of first stage process */
  773.  
  774.  
  775.   /* start of middle stage process */
  776.  
  777.   /*  Twiddle coefficients index modifier */
  778.   twidCoefModifier <<= 2U;
  779.  
  780.   /*  Calculation of Middle stage */
  781.   for (k = fftLen / 4U; k > 4U; k >>= 2U)
  782.   {
  783.     /*  Initializations for the middle stage */
  784.     n1 = n2;
  785.     n2 >>= 2U;
  786.     ic = 0U;
  787.  
  788.     for (j = 0U; j <= (n2 - 1U); j++)
  789.     {
  790.       /*  index calculation for the coefficients */
  791.       Co1 = pCoef16[ic * 2U];
  792.       Si1 = pCoef16[(ic * 2U) + 1U];
  793.       Co2 = pCoef16[2U * (ic * 2U)];
  794.       Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
  795.       Co3 = pCoef16[3U * (ic * 2U)];
  796.       Si3 = pCoef16[(3U * (ic * 2U)) + 1U];
  797.  
  798.       /*  Twiddle coefficients index modifier */
  799.       ic = ic + twidCoefModifier;
  800.  
  801.       /*  Butterfly implementation */
  802.       for (i0 = j; i0 < fftLen; i0 += n1)
  803.       {
  804.         /*  index calculation for the input as, */
  805.         /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  806.         i1 = i0 + n2;
  807.         i2 = i1 + n2;
  808.         i3 = i2 + n2;
  809.  
  810.         /*  Reading i0, i0+fftLen/2 inputs */
  811.         /* Read ya (real), xa(imag) input */
  812.         T0 = pSrc16[i0 * 2U];
  813.         T1 = pSrc16[(i0 * 2U) + 1U];
  814.  
  815.         /* Read yc (real), xc(imag) input */
  816.         S0 = pSrc16[i2 * 2U];
  817.         S1 = pSrc16[(i2 * 2U) + 1U];
  818.  
  819.         /* R0 = (ya + yc), R1 = (xa + xc) */
  820.         R0 = __SSAT(T0 + S0, 16);
  821.         R1 = __SSAT(T1 + S1, 16);
  822.  
  823.         /* S0 = (ya - yc), S1 =(xa - xc) */
  824.         S0 = __SSAT(T0 - S0, 16);
  825.         S1 = __SSAT(T1 - S1, 16);
  826.  
  827.         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  828.         /* Read yb (real), xb(imag) input */
  829.         T0 = pSrc16[i1 * 2U];
  830.         T1 = pSrc16[(i1 * 2U) + 1U];
  831.  
  832.         /* Read yd (real), xd(imag) input */
  833.         U0 = pSrc16[i3 * 2U];
  834.         U1 = pSrc16[(i3 * 2U) + 1U];
  835.  
  836.  
  837.         /* T0 = (yb + yd), T1 = (xb + xd) */
  838.         T0 = __SSAT(T0 + U0, 16);
  839.         T1 = __SSAT(T1 + U1, 16);
  840.  
  841.         /*  writing the butterfly processed i0 sample */
  842.  
  843.         /* xa' = xa + xb + xc + xd */
  844.         /* ya' = ya + yb + yc + yd */
  845.         out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
  846.         out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
  847.  
  848.         pSrc16[i0 * 2U] = out1;
  849.         pSrc16[(2U * i0) + 1U] = out2;
  850.  
  851.         /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
  852.         R0 = (R0 >> 1U) - (T0 >> 1U);
  853.         R1 = (R1 >> 1U) - (T1 >> 1U);
  854.  
  855.         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  856.         out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
  857.  
  858.         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  859.         out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
  860.  
  861.         /*  Reading i0+3fftLen/4 */
  862.         /* Read yb (real), xb(imag) input */
  863.         T0 = pSrc16[i1 * 2U];
  864.         T1 = pSrc16[(i1 * 2U) + 1U];
  865.  
  866.         /*  writing the butterfly processed i0 + fftLen/4 sample */
  867.         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  868.         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  869.         pSrc16[i1 * 2U] = out1;
  870.         pSrc16[(i1 * 2U) + 1U] = out2;
  871.  
  872.         /*  Butterfly calculations */
  873.  
  874.         /* Read yd (real), xd(imag) input */
  875.         U0 = pSrc16[i3 * 2U];
  876.         U1 = pSrc16[(i3 * 2U) + 1U];
  877.  
  878.         /* T0 = yb-yd, T1 = xb-xd */
  879.         T0 = __SSAT(T0 - U0, 16);
  880.         T1 = __SSAT(T1 - U1, 16);
  881.  
  882.         /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
  883.         R0 = (S0 >> 1U) - (T1 >> 1U);
  884.         R1 = (S1 >> 1U) + (T0 >> 1U);
  885.  
  886.         /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
  887.         S0 = (S0 >> 1U) + (T1 >> 1U);
  888.         S1 = (S1 >> 1U) - (T0 >> 1U);
  889.  
  890.         /*  Butterfly process for the i0+fftLen/2 sample */
  891.         out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);
  892.  
  893.         out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);
  894.  
  895.         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  896.         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  897.         pSrc16[i2 * 2U] = out1;
  898.         pSrc16[(i2 * 2U) + 1U] = out2;
  899.  
  900.         /*  Butterfly process for the i0+3fftLen/4 sample */
  901.         out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
  902.  
  903.         out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
  904.         /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
  905.         /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
  906.         pSrc16[i3 * 2U] = out1;
  907.         pSrc16[(i3 * 2U) + 1U] = out2;
  908.       }
  909.     }
  910.     /*  Twiddle coefficients index modifier */
  911.     twidCoefModifier <<= 2U;
  912.   }
  913.   /* end of middle stage process */
  914.  
  915.  
  916.   /* data is in 10.6(q6) format for the 1024 point */
  917.   /* data is in 8.8(q8) format for the 256 point */
  918.   /* data is in 6.10(q10) format for the 64 point */
  919.   /* data is in 4.12(q12) format for the 16 point */
  920.  
  921.   /*  Initializations for the last stage */
  922.   n1 = n2;
  923.   n2 >>= 2U;
  924.  
  925.   /* start of last stage process */
  926.  
  927.   /*  Butterfly implementation */
  928.   for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
  929.   {
  930.     /*  index calculation for the input as, */
  931.     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  932.     i1 = i0 + n2;
  933.     i2 = i1 + n2;
  934.     i3 = i2 + n2;
  935.  
  936.     /*  Reading i0, i0+fftLen/2 inputs */
  937.     /* Read ya (real), xa(imag) input */
  938.     T0 = pSrc16[i0 * 2U];
  939.     T1 = pSrc16[(i0 * 2U) + 1U];
  940.  
  941.     /* Read yc (real), xc(imag) input */
  942.     S0 = pSrc16[i2 * 2U];
  943.     S1 = pSrc16[(i2 * 2U) + 1U];
  944.  
  945.     /* R0 = (ya + yc), R1 = (xa + xc) */
  946.     R0 = __SSAT(T0 + S0, 16U);
  947.     R1 = __SSAT(T1 + S1, 16U);
  948.  
  949.     /* S0 = (ya - yc), S1 = (xa - xc) */
  950.     S0 = __SSAT(T0 - S0, 16U);
  951.     S1 = __SSAT(T1 - S1, 16U);
  952.  
  953.     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  954.     /* Read yb (real), xb(imag) input */
  955.     T0 = pSrc16[i1 * 2U];
  956.     T1 = pSrc16[(i1 * 2U) + 1U];
  957.     /* Read yd (real), xd(imag) input */
  958.     U0 = pSrc16[i3 * 2U];
  959.     U1 = pSrc16[(i3 * 2U) + 1U];
  960.  
  961.     /* T0 = (yb + yd), T1 = (xb + xd)) */
  962.     T0 = __SSAT(T0 + U0, 16U);
  963.     T1 = __SSAT(T1 + U1, 16U);
  964.  
  965.     /*  writing the butterfly processed i0 sample */
  966.     /* xa' = xa + xb + xc + xd */
  967.     /* ya' = ya + yb + yc + yd */
  968.     pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
  969.     pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
  970.  
  971.     /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
  972.     R0 = (R0 >> 1U) - (T0 >> 1U);
  973.     R1 = (R1 >> 1U) - (T1 >> 1U);
  974.     /* Read yb (real), xb(imag) input */
  975.     T0 = pSrc16[i1 * 2U];
  976.     T1 = pSrc16[(i1 * 2U) + 1U];
  977.  
  978.     /*  writing the butterfly processed i0 + fftLen/4 sample */
  979.     /* xc' = (xa-xb+xc-xd) */
  980.     /* yc' = (ya-yb+yc-yd) */
  981.     pSrc16[i1 * 2U] = R0;
  982.     pSrc16[(i1 * 2U) + 1U] = R1;
  983.  
  984.     /* Read yd (real), xd(imag) input */
  985.     U0 = pSrc16[i3 * 2U];
  986.     U1 = pSrc16[(i3 * 2U) + 1U];
  987.     /* T0 = (yb - yd), T1 = (xb - xd)  */
  988.     T0 = __SSAT(T0 - U0, 16U);
  989.     T1 = __SSAT(T1 - U1, 16U);
  990.  
  991.     /*  writing the butterfly processed i0 + fftLen/2 sample */
  992.     /* xb' = (xa+yb-xc-yd) */
  993.     /* yb' = (ya-xb-yc+xd) */
  994.     pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
  995.     pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
  996.  
  997.     /*  writing the butterfly processed i0 + 3fftLen/4 sample */
  998.     /* xd' = (xa-yb-xc+yd) */
  999.     /* yd' = (ya+xb-yc-xd) */
  1000.     pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
  1001.     pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
  1002.  
  1003.   }
  1004.  
  1005.   /* end of last stage process */
  1006.  
  1007.   /* output is in 11.5(q5) format for the 1024 point */
  1008.   /* output is in 9.7(q7) format for the 256 point   */
  1009.   /* output is in 7.9(q9) format for the 64 point  */
  1010.   /* output is in 5.11(q11) format for the 16 point  */
  1011.  
  1012. #endif /* #if defined (ARM_MATH_DSP) */
  1013.  
  1014. }
  1015.  
  1016.  
  1017. /**
  1018.  * @brief  Core function for the Q15 CIFFT butterfly process.
  1019.  * @param[in, out] *pSrc16          points to the in-place buffer of Q15 data type.
  1020.  * @param[in]      fftLen           length of the FFT.
  1021.  * @param[in]      *pCoef16         points to twiddle coefficient buffer.
  1022.  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
  1023.  * @return none.
  1024.  */
  1025.  
  1026. /*
  1027. * Radix-4 IFFT algorithm used is :
  1028. *
  1029. * CIFFT uses same twiddle coefficients as CFFT function
  1030. *  x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]
  1031. *
  1032. *
  1033. * IFFT is implemented with following changes in equations from FFT
  1034. *
  1035. * Input real and imaginary data:
  1036. * x(n) = xa + j * ya
  1037. * x(n+N/4 ) = xb + j * yb
  1038. * x(n+N/2 ) = xc + j * yc
  1039. * x(n+3N/4) = xd + j * yd
  1040. *
  1041. *
  1042. * Output real and imaginary data:
  1043. * x(4r) = xa'+ j * ya'
  1044. * x(4r+1) = xb'+ j * yb'
  1045. * x(4r+2) = xc'+ j * yc'
  1046. * x(4r+3) = xd'+ j * yd'
  1047. *
  1048. *
  1049. * Twiddle factors for radix-4 IFFT:
  1050. * Wn = co1 + j * (si1)
  1051. * W2n = co2 + j * (si2)
  1052. * W3n = co3 + j * (si3)
  1053. *
  1054. * The real and imaginary output values for the radix-4 butterfly are
  1055. * xa' = xa + xb + xc + xd
  1056. * ya' = ya + yb + yc + yd
  1057. * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
  1058. * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
  1059. * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
  1060. * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
  1061. * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
  1062. * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
  1063. *
  1064. */
  1065.  
  1066. void arm_radix4_butterfly_inverse_q15(
  1067.   q15_t * pSrc16,
  1068.   uint32_t fftLen,
  1069.   q15_t * pCoef16,
  1070.   uint32_t twidCoefModifier)
  1071. {
  1072.  
  1073. #if defined (ARM_MATH_DSP)
  1074.  
  1075.   /* Run the below code for Cortex-M4 and Cortex-M3 */
  1076.  
  1077.   q31_t R, S, T, U;
  1078.   q31_t C1, C2, C3, out1, out2;
  1079.   uint32_t n1, n2, ic, i0, j, k;
  1080.  
  1081.   q15_t *ptr1;
  1082.   q15_t *pSi0;
  1083.   q15_t *pSi1;
  1084.   q15_t *pSi2;
  1085.   q15_t *pSi3;
  1086.  
  1087.   q31_t xaya, xbyb, xcyc, xdyd;
  1088.  
  1089.   /* Total process is divided into three stages */
  1090.  
  1091.   /* process first stage, middle stages, & last stage */
  1092.  
  1093.   /*  Initializations for the first stage */
  1094.   n2 = fftLen;
  1095.   n1 = n2;
  1096.  
  1097.   /* n2 = fftLen/4 */
  1098.   n2 >>= 2U;
  1099.  
  1100.   /* Index for twiddle coefficient */
  1101.   ic = 0U;
  1102.  
  1103.   /* Index for input read and output write */
  1104.   j = n2;
  1105.  
  1106.   pSi0 = pSrc16;
  1107.   pSi1 = pSi0 + 2 * n2;
  1108.   pSi2 = pSi1 + 2 * n2;
  1109.   pSi3 = pSi2 + 2 * n2;
  1110.  
  1111.   /* Input is in 1.15(q15) format */
  1112.  
  1113.   /*  start of first stage process */
  1114.   do
  1115.   {
  1116.     /*  Butterfly implementation */
  1117.  
  1118.     /*  Reading i0, i0+fftLen/2 inputs */
  1119.     /* Read ya (real), xa(imag) input */
  1120.     T = _SIMD32_OFFSET(pSi0);
  1121.     T = __SHADD16(T, 0);
  1122.     T = __SHADD16(T, 0);
  1123.  
  1124.     /* Read yc (real), xc(imag) input */
  1125.     S = _SIMD32_OFFSET(pSi2);
  1126.     S = __SHADD16(S, 0);
  1127.     S = __SHADD16(S, 0);
  1128.  
  1129.     /* R = packed((ya + yc), (xa + xc) ) */
  1130.     R = __QADD16(T, S);
  1131.  
  1132.     /* S = packed((ya - yc), (xa - xc) ) */
  1133.     S = __QSUB16(T, S);
  1134.  
  1135.     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1136.     /* Read yb (real), xb(imag) input */
  1137.     T = _SIMD32_OFFSET(pSi1);
  1138.     T = __SHADD16(T, 0);
  1139.     T = __SHADD16(T, 0);
  1140.  
  1141.     /* Read yd (real), xd(imag) input */
  1142.     U = _SIMD32_OFFSET(pSi3);
  1143.     U = __SHADD16(U, 0);
  1144.     U = __SHADD16(U, 0);
  1145.  
  1146.     /* T = packed((yb + yd), (xb + xd) ) */
  1147.     T = __QADD16(T, U);
  1148.  
  1149.     /*  writing the butterfly processed i0 sample */
  1150.     /* xa' = xa + xb + xc + xd */
  1151.     /* ya' = ya + yb + yc + yd */
  1152.     _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
  1153.     pSi0 += 2;
  1154.  
  1155.     /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
  1156.     R = __QSUB16(R, T);
  1157.  
  1158.     /* co2 & si2 are read from SIMD Coefficient pointer */
  1159.     C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
  1160.  
  1161. #ifndef ARM_MATH_BIG_ENDIAN
  1162.  
  1163.     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  1164.     out1 = __SMUSD(C2, R) >> 16U;
  1165.     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  1166.     out2 = __SMUADX(C2, R);
  1167.  
  1168. #else
  1169.  
  1170.     /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  1171.     out1 = __SMUADX(C2, R) >> 16U;
  1172.     /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  1173.     out2 = __SMUSD(__QSUB16(0, C2), R);
  1174.  
  1175. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1176.  
  1177.     /*  Reading i0+fftLen/4 */
  1178.     /* T = packed(yb, xb) */
  1179.     T = _SIMD32_OFFSET(pSi1);
  1180.     T = __SHADD16(T, 0);
  1181.     T = __SHADD16(T, 0);
  1182.  
  1183.     /* writing the butterfly processed i0 + fftLen/4 sample */
  1184.     /* writing output(xc', yc') in little endian format */
  1185.     _SIMD32_OFFSET(pSi1) =
  1186.       (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  1187.     pSi1 += 2;
  1188.  
  1189.     /*  Butterfly calculations */
  1190.     /* U = packed(yd, xd) */
  1191.     U = _SIMD32_OFFSET(pSi3);
  1192.     U = __SHADD16(U, 0);
  1193.     U = __SHADD16(U, 0);
  1194.  
  1195.     /* T = packed(yb-yd, xb-xd) */
  1196.     T = __QSUB16(T, U);
  1197.  
  1198. #ifndef ARM_MATH_BIG_ENDIAN
  1199.  
  1200.     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  1201.     R = __QSAX(S, T);
  1202.     /* S = packed((ya-yc) + (xb- xd),  (xa-xc) - (yb-yd)) */
  1203.     S = __QASX(S, T);
  1204.  
  1205. #else
  1206.  
  1207.     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  1208.     R = __QASX(S, T);
  1209.     /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  1210.     S = __QSAX(S, T);
  1211.  
  1212. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1213.  
  1214.     /* co1 & si1 are read from SIMD Coefficient pointer */
  1215.     C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
  1216.     /*  Butterfly process for the i0+fftLen/2 sample */
  1217.  
  1218. #ifndef ARM_MATH_BIG_ENDIAN
  1219.  
  1220.     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  1221.     out1 = __SMUSD(C1, S) >> 16U;
  1222.     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  1223.     out2 = __SMUADX(C1, S);
  1224.  
  1225. #else
  1226.  
  1227.     /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  1228.     out1 = __SMUADX(C1, S) >> 16U;
  1229.     /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  1230.     out2 = __SMUSD(__QSUB16(0, C1), S);
  1231.  
  1232. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1233.  
  1234.     /* writing output(xb', yb') in little endian format */
  1235.     _SIMD32_OFFSET(pSi2) =
  1236.       ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
  1237.     pSi2 += 2;
  1238.  
  1239.  
  1240.     /* co3 & si3 are read from SIMD Coefficient pointer */
  1241.     C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
  1242.     /*  Butterfly process for the i0+3fftLen/4 sample */
  1243.  
  1244. #ifndef ARM_MATH_BIG_ENDIAN
  1245.  
  1246.     /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  1247.     out1 = __SMUSD(C3, R) >> 16U;
  1248.     /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  1249.     out2 = __SMUADX(C3, R);
  1250.  
  1251. #else
  1252.  
  1253.     /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  1254.     out1 = __SMUADX(C3, R) >> 16U;
  1255.     /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  1256.     out2 = __SMUSD(__QSUB16(0, C3), R);
  1257.  
  1258. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1259.  
  1260.     /* writing output(xd', yd') in little endian format */
  1261.     _SIMD32_OFFSET(pSi3) =
  1262.       ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  1263.     pSi3 += 2;
  1264.  
  1265.     /*  Twiddle coefficients index modifier */
  1266.     ic = ic + twidCoefModifier;
  1267.  
  1268.   } while (--j);
  1269.   /* data is in 4.11(q11) format */
  1270.  
  1271.   /* end of first stage process */
  1272.  
  1273.  
  1274.   /* start of middle stage process */
  1275.  
  1276.   /*  Twiddle coefficients index modifier */
  1277.   twidCoefModifier <<= 2U;
  1278.  
  1279.   /*  Calculation of Middle stage */
  1280.   for (k = fftLen / 4U; k > 4U; k >>= 2U)
  1281.   {
  1282.     /*  Initializations for the middle stage */
  1283.     n1 = n2;
  1284.     n2 >>= 2U;
  1285.     ic = 0U;
  1286.  
  1287.     for (j = 0U; j <= (n2 - 1U); j++)
  1288.     {
  1289.       /*  index calculation for the coefficients */
  1290.       C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
  1291.       C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
  1292.       C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
  1293.  
  1294.       /*  Twiddle coefficients index modifier */
  1295.       ic = ic + twidCoefModifier;
  1296.  
  1297.       pSi0 = pSrc16 + 2 * j;
  1298.       pSi1 = pSi0 + 2 * n2;
  1299.       pSi2 = pSi1 + 2 * n2;
  1300.       pSi3 = pSi2 + 2 * n2;
  1301.  
  1302.       /*  Butterfly implementation */
  1303.       for (i0 = j; i0 < fftLen; i0 += n1)
  1304.       {
  1305.         /*  Reading i0, i0+fftLen/2 inputs */
  1306.         /* Read ya (real), xa(imag) input */
  1307.         T = _SIMD32_OFFSET(pSi0);
  1308.  
  1309.         /* Read yc (real), xc(imag) input */
  1310.         S = _SIMD32_OFFSET(pSi2);
  1311.  
  1312.         /* R = packed( (ya + yc), (xa + xc)) */
  1313.         R = __QADD16(T, S);
  1314.  
  1315.         /* S = packed((ya - yc), (xa - xc)) */
  1316.         S = __QSUB16(T, S);
  1317.  
  1318.         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1319.         /* Read yb (real), xb(imag) input */
  1320.         T = _SIMD32_OFFSET(pSi1);
  1321.  
  1322.         /* Read yd (real), xd(imag) input */
  1323.         U = _SIMD32_OFFSET(pSi3);
  1324.  
  1325.         /* T = packed( (yb + yd), (xb + xd)) */
  1326.         T = __QADD16(T, U);
  1327.  
  1328.         /*  writing the butterfly processed i0 sample */
  1329.  
  1330.         /* xa' = xa + xb + xc + xd */
  1331.         /* ya' = ya + yb + yc + yd */
  1332.         out1 = __SHADD16(R, T);
  1333.         out1 = __SHADD16(out1, 0);
  1334.         _SIMD32_OFFSET(pSi0) = out1;
  1335.         pSi0 += 2 * n1;
  1336.  
  1337.         /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
  1338.         R = __SHSUB16(R, T);
  1339.  
  1340. #ifndef ARM_MATH_BIG_ENDIAN
  1341.  
  1342.         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  1343.         out1 = __SMUSD(C2, R) >> 16U;
  1344.  
  1345.         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  1346.         out2 = __SMUADX(C2, R);
  1347.  
  1348. #else
  1349.  
  1350.         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  1351.         out1 = __SMUADX(R, C2) >> 16U;
  1352.  
  1353.         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  1354.         out2 = __SMUSD(__QSUB16(0, C2), R);
  1355.  
  1356. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1357.  
  1358.         /*  Reading i0+3fftLen/4 */
  1359.         /* Read yb (real), xb(imag) input */
  1360.         T = _SIMD32_OFFSET(pSi1);
  1361.  
  1362.         /*  writing the butterfly processed i0 + fftLen/4 sample */
  1363.         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  1364.         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  1365.         _SIMD32_OFFSET(pSi1) =
  1366.           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  1367.         pSi1 += 2 * n1;
  1368.  
  1369.         /*  Butterfly calculations */
  1370.  
  1371.         /* Read yd (real), xd(imag) input */
  1372.         U = _SIMD32_OFFSET(pSi3);
  1373.  
  1374.         /* T = packed(yb-yd, xb-xd) */
  1375.         T = __QSUB16(T, U);
  1376.  
  1377. #ifndef ARM_MATH_BIG_ENDIAN
  1378.  
  1379.         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  1380.         R = __SHSAX(S, T);
  1381.  
  1382.         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  1383.         S = __SHASX(S, T);
  1384.  
  1385.  
  1386.         /*  Butterfly process for the i0+fftLen/2 sample */
  1387.         out1 = __SMUSD(C1, S) >> 16U;
  1388.         out2 = __SMUADX(C1, S);
  1389.  
  1390. #else
  1391.  
  1392.         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  1393.         R = __SHASX(S, T);
  1394.  
  1395.         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
  1396.         S = __SHSAX(S, T);
  1397.  
  1398.  
  1399.         /*  Butterfly process for the i0+fftLen/2 sample */
  1400.         out1 = __SMUADX(S, C1) >> 16U;
  1401.         out2 = __SMUSD(__QSUB16(0, C1), S);
  1402.  
  1403. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1404.  
  1405.         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  1406.         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  1407.         _SIMD32_OFFSET(pSi2) =
  1408.           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  1409.         pSi2 += 2 * n1;
  1410.  
  1411.         /*  Butterfly process for the i0+3fftLen/4 sample */
  1412.  
  1413. #ifndef ARM_MATH_BIG_ENDIAN
  1414.  
  1415.         out1 = __SMUSD(C3, R) >> 16U;
  1416.         out2 = __SMUADX(C3, R);
  1417.  
  1418. #else
  1419.  
  1420.         out1 = __SMUADX(C3, R) >> 16U;
  1421.         out2 = __SMUSD(__QSUB16(0, C3), R);
  1422.  
  1423. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1424.  
  1425.         /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  1426.         /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  1427.         _SIMD32_OFFSET(pSi3) =
  1428.           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  1429.         pSi3 += 2 * n1;
  1430.       }
  1431.     }
  1432.     /*  Twiddle coefficients index modifier */
  1433.     twidCoefModifier <<= 2U;
  1434.   }
  1435.   /* end of middle stage process */
  1436.  
  1437.   /* data is in 10.6(q6) format for the 1024 point */
  1438.   /* data is in 8.8(q8) format for the 256 point */
  1439.   /* data is in 6.10(q10) format for the 64 point */
  1440.   /* data is in 4.12(q12) format for the 16 point */
  1441.  
  1442.   /*  Initializations for the last stage */
  1443.   j = fftLen >> 2;
  1444.  
  1445.   ptr1 = &pSrc16[0];
  1446.  
  1447.   /* start of last stage process */
  1448.  
  1449.   /*  Butterfly implementation */
  1450.   do
  1451.   {
  1452.     /* Read xa (real), ya(imag) input */
  1453.     xaya = *__SIMD32(ptr1)++;
  1454.  
  1455.     /* Read xb (real), yb(imag) input */
  1456.     xbyb = *__SIMD32(ptr1)++;
  1457.  
  1458.     /* Read xc (real), yc(imag) input */
  1459.     xcyc = *__SIMD32(ptr1)++;
  1460.  
  1461.     /* Read xd (real), yd(imag) input */
  1462.     xdyd = *__SIMD32(ptr1)++;
  1463.  
  1464.     /* R = packed((ya + yc), (xa + xc)) */
  1465.     R = __QADD16(xaya, xcyc);
  1466.  
  1467.     /* T = packed((yb + yd), (xb + xd)) */
  1468.     T = __QADD16(xbyb, xdyd);
  1469.  
  1470.     /* pointer updation for writing */
  1471.     ptr1 = ptr1 - 8U;
  1472.  
  1473.  
  1474.     /* xa' = xa + xb + xc + xd */
  1475.     /* ya' = ya + yb + yc + yd */
  1476.     *__SIMD32(ptr1)++ = __SHADD16(R, T);
  1477.  
  1478.     /* T = packed((yb + yd), (xb + xd)) */
  1479.     T = __QADD16(xbyb, xdyd);
  1480.  
  1481.     /* xc' = (xa-xb+xc-xd) */
  1482.     /* yc' = (ya-yb+yc-yd) */
  1483.     *__SIMD32(ptr1)++ = __SHSUB16(R, T);
  1484.  
  1485.     /* S = packed((ya - yc), (xa - xc)) */
  1486.     S = __QSUB16(xaya, xcyc);
  1487.  
  1488.     /* Read yd (real), xd(imag) input */
  1489.     /* T = packed( (yb - yd), (xb - xd))  */
  1490.     U = __QSUB16(xbyb, xdyd);
  1491.  
  1492. #ifndef ARM_MATH_BIG_ENDIAN
  1493.  
  1494.     /* xb' = (xa+yb-xc-yd) */
  1495.     /* yb' = (ya-xb-yc+xd) */
  1496.     *__SIMD32(ptr1)++ = __SHASX(S, U);
  1497.  
  1498.  
  1499.     /* xd' = (xa-yb-xc+yd) */
  1500.     /* yd' = (ya+xb-yc-xd) */
  1501.     *__SIMD32(ptr1)++ = __SHSAX(S, U);
  1502.  
  1503. #else
  1504.  
  1505.     /* xb' = (xa+yb-xc-yd) */
  1506.     /* yb' = (ya-xb-yc+xd) */
  1507.     *__SIMD32(ptr1)++ = __SHSAX(S, U);
  1508.  
  1509.  
  1510.     /* xd' = (xa-yb-xc+yd) */
  1511.     /* yd' = (ya+xb-yc-xd) */
  1512.     *__SIMD32(ptr1)++ = __SHASX(S, U);
  1513.  
  1514.  
  1515. #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
  1516.  
  1517.   } while (--j);
  1518.  
  1519.   /* end of last stage  process */
  1520.  
  1521.   /* output is in 11.5(q5) format for the 1024 point */
  1522.   /* output is in 9.7(q7) format for the 256 point   */
  1523.   /* output is in 7.9(q9) format for the 64 point  */
  1524.   /* output is in 5.11(q11) format for the 16 point  */
  1525.  
  1526.  
  1527. #else
  1528.  
  1529.   /* Run the below code for Cortex-M0 */
  1530.  
  1531.   q15_t R0, R1, S0, S1, T0, T1, U0, U1;
  1532.   q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
  1533.   uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
  1534.  
  1535.   /* Total process is divided into three stages */
  1536.  
  1537.   /* process first stage, middle stages, & last stage */
  1538.  
  1539.   /*  Initializations for the first stage */
  1540.   n2 = fftLen;
  1541.   n1 = n2;
  1542.  
  1543.   /* n2 = fftLen/4 */
  1544.   n2 >>= 2U;
  1545.  
  1546.   /* Index for twiddle coefficient */
  1547.   ic = 0U;
  1548.  
  1549.   /* Index for input read and output write */
  1550.   i0 = 0U;
  1551.  
  1552.   j = n2;
  1553.  
  1554.   /* Input is in 1.15(q15) format */
  1555.  
  1556.   /*  Start of first stage process */
  1557.   do
  1558.   {
  1559.     /*  Butterfly implementation */
  1560.  
  1561.     /*  index calculation for the input as, */
  1562.     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  1563.     i1 = i0 + n2;
  1564.     i2 = i1 + n2;
  1565.     i3 = i2 + n2;
  1566.  
  1567.     /*  Reading i0, i0+fftLen/2 inputs */
  1568.     /* input is down scale by 4 to avoid overflow */
  1569.     /* Read ya (real), xa(imag) input */
  1570.     T0 = pSrc16[i0 * 2U] >> 2U;
  1571.     T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
  1572.     /* input is down scale by 4 to avoid overflow */
  1573.     /* Read yc (real), xc(imag) input */
  1574.     S0 = pSrc16[i2 * 2U] >> 2U;
  1575.     S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
  1576.  
  1577.     /* R0 = (ya + yc), R1 = (xa + xc) */
  1578.     R0 = __SSAT(T0 + S0, 16U);
  1579.     R1 = __SSAT(T1 + S1, 16U);
  1580.     /* S0 = (ya - yc), S1 = (xa - xc) */
  1581.     S0 = __SSAT(T0 - S0, 16U);
  1582.     S1 = __SSAT(T1 - S1, 16U);
  1583.  
  1584.     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1585.     /* input is down scale by 4 to avoid overflow */
  1586.     /* Read yb (real), xb(imag) input */
  1587.     T0 = pSrc16[i1 * 2U] >> 2U;
  1588.     T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
  1589.     /* Read yd (real), xd(imag) input */
  1590.     /* input is down scale by 4 to avoid overflow */
  1591.     U0 = pSrc16[i3 * 2U] >> 2U;
  1592.     U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
  1593.  
  1594.     /* T0 = (yb + yd), T1 = (xb + xd) */
  1595.     T0 = __SSAT(T0 + U0, 16U);
  1596.     T1 = __SSAT(T1 + U1, 16U);
  1597.  
  1598.     /*  writing the butterfly processed i0 sample */
  1599.     /* xa' = xa + xb + xc + xd */
  1600.     /* ya' = ya + yb + yc + yd */
  1601.     pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
  1602.     pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
  1603.  
  1604.     /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc)- (xb + xd) */
  1605.     R0 = __SSAT(R0 - T0, 16U);
  1606.     R1 = __SSAT(R1 - T1, 16U);
  1607.     /* co2 & si2 are read from Coefficient pointer */
  1608.     Co2 = pCoef16[2U * ic * 2U];
  1609.     Si2 = pCoef16[(2U * ic * 2U) + 1U];
  1610.     /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
  1611.     out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
  1612.     /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
  1613.     out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);
  1614.  
  1615.     /*  Reading i0+fftLen/4 */
  1616.     /* input is down scale by 4 to avoid overflow */
  1617.     /* T0 = yb, T1 = xb */
  1618.     T0 = pSrc16[i1 * 2U] >> 2U;
  1619.     T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
  1620.  
  1621.     /* writing the butterfly processed i0 + fftLen/4 sample */
  1622.     /* writing output(xc', yc') in little endian format */
  1623.     pSrc16[i1 * 2U] = out1;
  1624.     pSrc16[(i1 * 2U) + 1U] = out2;
  1625.  
  1626.     /*  Butterfly calculations */
  1627.     /* input is down scale by 4 to avoid overflow */
  1628.     /* U0 = yd, U1 = xd) */
  1629.     U0 = pSrc16[i3 * 2U] >> 2U;
  1630.     U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
  1631.  
  1632.     /* T0 = yb-yd, T1 = xb-xd) */
  1633.     T0 = __SSAT(T0 - U0, 16U);
  1634.     T1 = __SSAT(T1 - U1, 16U);
  1635.     /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
  1636.     R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
  1637.     R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
  1638.     /* S = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
  1639.     S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
  1640.     S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
  1641.  
  1642.     /* co1 & si1 are read from Coefficient pointer */
  1643.     Co1 = pCoef16[ic * 2U];
  1644.     Si1 = pCoef16[(ic * 2U) + 1U];
  1645.     /*  Butterfly process for the i0+fftLen/2 sample */
  1646.     /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
  1647.     out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
  1648.     /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
  1649.     out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
  1650.     /* writing output(xb', yb') in little endian format */
  1651.     pSrc16[i2 * 2U] = out1;
  1652.     pSrc16[(i2 * 2U) + 1U] = out2;
  1653.  
  1654.     /* Co3 & si3 are read from Coefficient pointer */
  1655.     Co3 = pCoef16[3U * ic * 2U];
  1656.     Si3 = pCoef16[(3U * ic * 2U) + 1U];
  1657.     /*  Butterfly process for the i0+3fftLen/4 sample */
  1658.     /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
  1659.     out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
  1660.     /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
  1661.     out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
  1662.     /* writing output(xd', yd') in little endian format */
  1663.     pSrc16[i3 * 2U] = out1;
  1664.     pSrc16[(i3 * 2U) + 1U] = out2;
  1665.  
  1666.     /*  Twiddle coefficients index modifier */
  1667.     ic = ic + twidCoefModifier;
  1668.  
  1669.     /*  Updating input index */
  1670.     i0 = i0 + 1U;
  1671.  
  1672.   } while (--j);
  1673.  
  1674.   /*  End of first stage process */
  1675.  
  1676.   /* data is in 4.11(q11) format */
  1677.  
  1678.  
  1679.   /*  Start of Middle stage process */
  1680.  
  1681.   /*  Twiddle coefficients index modifier */
  1682.   twidCoefModifier <<= 2U;
  1683.  
  1684.   /*  Calculation of Middle stage */
  1685.   for (k = fftLen / 4U; k > 4U; k >>= 2U)
  1686.   {
  1687.     /*  Initializations for the middle stage */
  1688.     n1 = n2;
  1689.     n2 >>= 2U;
  1690.     ic = 0U;
  1691.  
  1692.     for (j = 0U; j <= (n2 - 1U); j++)
  1693.     {
  1694.       /*  index calculation for the coefficients */
  1695.       Co1 = pCoef16[ic * 2U];
  1696.       Si1 = pCoef16[(ic * 2U) + 1U];
  1697.       Co2 = pCoef16[2U * ic * 2U];
  1698.       Si2 = pCoef16[2U * ic * 2U + 1U];
  1699.       Co3 = pCoef16[3U * ic * 2U];
  1700.       Si3 = pCoef16[(3U * ic * 2U) + 1U];
  1701.  
  1702.       /*  Twiddle coefficients index modifier */
  1703.       ic = ic + twidCoefModifier;
  1704.  
  1705.       /*  Butterfly implementation */
  1706.       for (i0 = j; i0 < fftLen; i0 += n1)
  1707.       {
  1708.         /*  index calculation for the input as, */
  1709.         /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  1710.         i1 = i0 + n2;
  1711.         i2 = i1 + n2;
  1712.         i3 = i2 + n2;
  1713.  
  1714.         /*  Reading i0, i0+fftLen/2 inputs */
  1715.         /* Read ya (real), xa(imag) input */
  1716.         T0 = pSrc16[i0 * 2U];
  1717.         T1 = pSrc16[(i0 * 2U) + 1U];
  1718.  
  1719.         /* Read yc (real), xc(imag) input */
  1720.         S0 = pSrc16[i2 * 2U];
  1721.         S1 = pSrc16[(i2 * 2U) + 1U];
  1722.  
  1723.  
  1724.         /* R0 = (ya + yc), R1 = (xa + xc) */
  1725.         R0 = __SSAT(T0 + S0, 16U);
  1726.         R1 = __SSAT(T1 + S1, 16U);
  1727.         /* S0 = (ya - yc), S1 = (xa - xc) */
  1728.         S0 = __SSAT(T0 - S0, 16U);
  1729.         S1 = __SSAT(T1 - S1, 16U);
  1730.  
  1731.         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1732.         /* Read yb (real), xb(imag) input */
  1733.         T0 = pSrc16[i1 * 2U];
  1734.         T1 = pSrc16[(i1 * 2U) + 1U];
  1735.  
  1736.         /* Read yd (real), xd(imag) input */
  1737.         U0 = pSrc16[i3 * 2U];
  1738.         U1 = pSrc16[(i3 * 2U) + 1U];
  1739.  
  1740.         /* T0 = (yb + yd), T1 = (xb + xd) */
  1741.         T0 = __SSAT(T0 + U0, 16U);
  1742.         T1 = __SSAT(T1 + U1, 16U);
  1743.  
  1744.         /*  writing the butterfly processed i0 sample */
  1745.         /* xa' = xa + xb + xc + xd */
  1746.         /* ya' = ya + yb + yc + yd */
  1747.         pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
  1748.         pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
  1749.  
  1750.         /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
  1751.         R0 = (R0 >> 1U) - (T0 >> 1U);
  1752.         R1 = (R1 >> 1U) - (T1 >> 1U);
  1753.  
  1754.         /* (ya-yb+yc-yd)* (si2) - (xa-xb+xc-xd)* co2 */
  1755.         out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
  1756.         /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
  1757.         out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
  1758.  
  1759.         /*  Reading i0+3fftLen/4 */
  1760.         /* Read yb (real), xb(imag) input */
  1761.         T0 = pSrc16[i1 * 2U];
  1762.         T1 = pSrc16[(i1 * 2U) + 1U];
  1763.  
  1764.         /*  writing the butterfly processed i0 + fftLen/4 sample */
  1765.         /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
  1766.         /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
  1767.         pSrc16[i1 * 2U] = out1;
  1768.         pSrc16[(i1 * 2U) + 1U] = out2;
  1769.  
  1770.         /*  Butterfly calculations */
  1771.         /* Read yd (real), xd(imag) input */
  1772.         U0 = pSrc16[i3 * 2U];
  1773.         U1 = pSrc16[(i3 * 2U) + 1U];
  1774.  
  1775.         /* T0 = yb-yd, T1 = xb-xd) */
  1776.         T0 = __SSAT(T0 - U0, 16U);
  1777.         T1 = __SSAT(T1 - U1, 16U);
  1778.  
  1779.         /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
  1780.         R0 = (S0 >> 1U) + (T1 >> 1U);
  1781.         R1 = (S1 >> 1U) - (T0 >> 1U);
  1782.  
  1783.         /* S0 = (ya-yc) - (xb-xd), S1 = (xa-xc) + (yb-yd) */
  1784.         S0 = (S0 >> 1U) - (T1 >> 1U);
  1785.         S1 = (S1 >> 1U) + (T0 >> 1U);
  1786.  
  1787.         /*  Butterfly process for the i0+fftLen/2 sample */
  1788.         out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
  1789.         out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
  1790.         /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
  1791.         /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
  1792.         pSrc16[i2 * 2U] = out1;
  1793.         pSrc16[(i2 * 2U) + 1U] = out2;
  1794.  
  1795.         /*  Butterfly process for the i0+3fftLen/4 sample */
  1796.         out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
  1797.  
  1798.         out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
  1799.         /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
  1800.         /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
  1801.         pSrc16[i3 * 2U] = out1;
  1802.         pSrc16[(i3 * 2U) + 1U] = out2;
  1803.  
  1804.  
  1805.       }
  1806.     }
  1807.     /*  Twiddle coefficients index modifier */
  1808.     twidCoefModifier <<= 2U;
  1809.   }
  1810.   /*  End of Middle stages process */
  1811.  
  1812.  
  1813.   /* data is in 10.6(q6) format for the 1024 point */
  1814.   /* data is in 8.8(q8) format for the 256 point   */
  1815.   /* data is in 6.10(q10) format for the 64 point  */
  1816.   /* data is in 4.12(q12) format for the 16 point  */
  1817.  
  1818.   /* start of last stage process */
  1819.  
  1820.  
  1821.   /*  Initializations for the last stage */
  1822.   n1 = n2;
  1823.   n2 >>= 2U;
  1824.  
  1825.   /*  Butterfly implementation */
  1826.   for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
  1827.   {
  1828.     /*  index calculation for the input as, */
  1829.     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  1830.     i1 = i0 + n2;
  1831.     i2 = i1 + n2;
  1832.     i3 = i2 + n2;
  1833.  
  1834.     /*  Reading i0, i0+fftLen/2 inputs */
  1835.     /* Read ya (real), xa(imag) input */
  1836.     T0 = pSrc16[i0 * 2U];
  1837.     T1 = pSrc16[(i0 * 2U) + 1U];
  1838.     /* Read yc (real), xc(imag) input */
  1839.     S0 = pSrc16[i2 * 2U];
  1840.     S1 = pSrc16[(i2 * 2U) + 1U];
  1841.  
  1842.     /* R0 = (ya + yc), R1 = (xa + xc) */
  1843.     R0 = __SSAT(T0 + S0, 16U);
  1844.     R1 = __SSAT(T1 + S1, 16U);
  1845.     /* S0 = (ya - yc), S1 = (xa - xc) */
  1846.     S0 = __SSAT(T0 - S0, 16U);
  1847.     S1 = __SSAT(T1 - S1, 16U);
  1848.  
  1849.     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1850.     /* Read yb (real), xb(imag) input */
  1851.     T0 = pSrc16[i1 * 2U];
  1852.     T1 = pSrc16[(i1 * 2U) + 1U];
  1853.     /* Read yd (real), xd(imag) input */
  1854.     U0 = pSrc16[i3 * 2U];
  1855.     U1 = pSrc16[(i3 * 2U) + 1U];
  1856.  
  1857.     /* T0 = (yb + yd), T1 = (xb + xd) */
  1858.     T0 = __SSAT(T0 + U0, 16U);
  1859.     T1 = __SSAT(T1 + U1, 16U);
  1860.  
  1861.     /*  writing the butterfly processed i0 sample */
  1862.     /* xa' = xa + xb + xc + xd */
  1863.     /* ya' = ya + yb + yc + yd */
  1864.     pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
  1865.     pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
  1866.  
  1867.     /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
  1868.     R0 = (R0 >> 1U) - (T0 >> 1U);
  1869.     R1 = (R1 >> 1U) - (T1 >> 1U);
  1870.  
  1871.     /* Read yb (real), xb(imag) input */
  1872.     T0 = pSrc16[i1 * 2U];
  1873.     T1 = pSrc16[(i1 * 2U) + 1U];
  1874.  
  1875.     /*  writing the butterfly processed i0 + fftLen/4 sample */
  1876.     /* xc' = (xa-xb+xc-xd) */
  1877.     /* yc' = (ya-yb+yc-yd) */
  1878.     pSrc16[i1 * 2U] = R0;
  1879.     pSrc16[(i1 * 2U) + 1U] = R1;
  1880.  
  1881.     /* Read yd (real), xd(imag) input */
  1882.     U0 = pSrc16[i3 * 2U];
  1883.     U1 = pSrc16[(i3 * 2U) + 1U];
  1884.     /* T0 = (yb - yd), T1 = (xb - xd) */
  1885.     T0 = __SSAT(T0 - U0, 16U);
  1886.     T1 = __SSAT(T1 - U1, 16U);
  1887.  
  1888.     /*  writing the butterfly processed i0 + fftLen/2 sample */
  1889.     /* xb' = (xa-yb-xc+yd) */
  1890.     /* yb' = (ya+xb-yc-xd) */
  1891.     pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
  1892.     pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
  1893.  
  1894.  
  1895.     /*  writing the butterfly processed i0 + 3fftLen/4 sample */
  1896.     /* xd' = (xa+yb-xc-yd) */
  1897.     /* yd' = (ya-xb-yc+xd) */
  1898.     pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
  1899.     pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
  1900.   }
  1901.   /* end of last stage  process */
  1902.  
  1903.   /* output is in 11.5(q5) format for the 1024 point */
  1904.   /* output is in 9.7(q7) format for the 256 point   */
  1905.   /* output is in 7.9(q9) format for the 64 point  */
  1906.   /* output is in 5.11(q11) format for the 16 point  */
  1907.  
  1908. #endif /* #if defined (ARM_MATH_DSP) */
  1909.  
  1910. }
  1911.