Subversion Repositories LedShow

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 mjames 1
/* ----------------------------------------------------------------------    
2
* Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
3
*    
4
* $Date:        19. March 2015
5
* $Revision:    V.1.4.5  
6
*    
7
* Project:          CMSIS DSP Library    
8
* Title:            arm_cfft_radix4_q31.c    
9
*    
10
* Description:  This file has function definition of Radix-4 FFT & IFFT function and    
11
*                               In-place bit reversal using bit reversal table    
12
*    
13
* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14
*  
15
* Redistribution and use in source and binary forms, with or without
16
* modification, are permitted provided that the following conditions
17
* are met:
18
*   - Redistributions of source code must retain the above copyright
19
*     notice, this list of conditions and the following disclaimer.
20
*   - Redistributions in binary form must reproduce the above copyright
21
*     notice, this list of conditions and the following disclaimer in
22
*     the documentation and/or other materials provided with the
23
*     distribution.
24
*   - Neither the name of ARM LIMITED nor the names of its contributors
25
*     may be used to endorse or promote products derived from this
26
*     software without specific prior written permission.
27
*
28
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
29
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
31
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
32
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
33
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
34
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
35
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
36
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
38
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39
* POSSIBILITY OF SUCH DAMAGE.    
40
* -------------------------------------------------------------------- */
41
 
42
#include "arm_math.h"
43
 
44
void arm_radix4_butterfly_inverse_q31(
45
q31_t * pSrc,
46
uint32_t fftLen,
47
q31_t * pCoef,
48
uint32_t twidCoefModifier);
49
 
50
void arm_radix4_butterfly_q31(
51
q31_t * pSrc,
52
uint32_t fftLen,
53
q31_t * pCoef,
54
uint32_t twidCoefModifier);
55
 
56
void arm_bitreversal_q31(
57
q31_t * pSrc,
58
uint32_t fftLen,
59
uint16_t bitRevFactor,
60
uint16_t * pBitRevTab);
61
 
62
/**    
63
 * @ingroup groupTransforms    
64
 */
65
 
66
/**    
67
 * @addtogroup ComplexFFT    
68
 * @{    
69
 */
70
 
71
/**    
72
 * @details    
73
 * @brief Processing function for the Q31 CFFT/CIFFT.    
74
 * @deprecated Do not use this function.  It has been superseded by \ref arm_cfft_q31 and will be removed
75
 * @param[in]      *S    points to an instance of the Q31 CFFT/CIFFT structure.  
76
 * @param[in, out] *pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place.  
77
 * @return none.    
78
 *    
79
 * \par Input and output formats:    
80
 * \par    
81
 * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.  
82
 * Hence the output format is different for different FFT sizes.    
83
 * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:  
84
 * \par  
85
 * \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"    
86
 * \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"    
87
 *    
88
 */
89
 
90
void arm_cfft_radix4_q31(
91
  const arm_cfft_radix4_instance_q31 * S,
92
  q31_t * pSrc)
93
{
94
  if(S->ifftFlag == 1u)
95
  {
96
    /* Complex IFFT radix-4 */
97
    arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle,
98
                                     S->twidCoefModifier);
99
  }
100
  else
101
  {
102
    /* Complex FFT radix-4 */
103
    arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle,
104
                             S->twidCoefModifier);
105
  }
106
 
107
 
108
  if(S->bitReverseFlag == 1u)
109
  {
110
    /*  Bit Reversal */
111
    arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
112
  }
113
 
114
}
115
 
116
/**    
117
 * @} end of ComplexFFT group    
118
 */
119
 
120
/*    
121
* Radix-4 FFT algorithm used is :    
122
*    
123
* Input real and imaginary data:    
124
* x(n) = xa + j * ya    
125
* x(n+N/4 ) = xb + j * yb    
126
* x(n+N/2 ) = xc + j * yc    
127
* x(n+3N 4) = xd + j * yd    
128
*    
129
*    
130
* Output real and imaginary data:    
131
* x(4r) = xa'+ j * ya'    
132
* x(4r+1) = xb'+ j * yb'    
133
* x(4r+2) = xc'+ j * yc'    
134
* x(4r+3) = xd'+ j * yd'    
135
*    
136
*    
137
* Twiddle factors for radix-4 FFT:    
138
* Wn = co1 + j * (- si1)    
139
* W2n = co2 + j * (- si2)    
140
* W3n = co3 + j * (- si3)    
141
*    
142
*  Butterfly implementation:    
143
* xa' = xa + xb + xc + xd    
144
* ya' = ya + yb + yc + yd    
145
* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)    
146
* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)    
147
* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)    
148
* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)    
149
* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)    
150
* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)    
151
*    
152
*/
153
 
154
/**    
155
 * @brief  Core function for the Q31 CFFT butterfly process.  
156
 * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.  
157
 * @param[in]      fftLen           length of the FFT.  
158
 * @param[in]      *pCoef           points to twiddle coefficient buffer.  
159
 * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.  
160
 * @return none.  
161
 */
162
 
163
void arm_radix4_butterfly_q31(
164
  q31_t * pSrc,
165
  uint32_t fftLen,
166
  q31_t * pCoef,
167
  uint32_t twidCoefModifier)
168
{
169
#if defined(ARM_MATH_CM7)
170
  uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
171
  q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
172
 
173
  q31_t xa, xb, xc, xd;
174
  q31_t ya, yb, yc, yd;
175
  q31_t xa_out, xb_out, xc_out, xd_out;
176
  q31_t ya_out, yb_out, yc_out, yd_out;
177
 
178
  q31_t *ptr1;
179
  q63_t xaya, xbyb, xcyc, xdyd;
180
  /* Total process is divided into three stages */
181
 
182
  /* process first stage, middle stages, & last stage */
183
 
184
 
185
  /* start of first stage process */
186
 
187
  /*  Initializations for the first stage */
188
  n2 = fftLen;
189
  n1 = n2;
190
  /* n2 = fftLen/4 */
191
  n2 >>= 2u;
192
  i0 = 0u;
193
  ia1 = 0u;
194
 
195
  j = n2;
196
 
197
  /*  Calculation of first stage */
198
  do
199
  {
200
    /*  index calculation for the input as, */
201
    /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
202
    i1 = i0 + n2;
203
    i2 = i1 + n2;
204
    i3 = i2 + n2;
205
 
206
    /* input is in 1.31(q31) format and provide 4 guard bits for the input */
207
 
208
    /*  Butterfly implementation */
209
    /* xa + xc */
210
    r1 = (pSrc[(2u * i0)] >> 4u) + (pSrc[(2u * i2)] >> 4u);
211
    /* xa - xc */
212
    r2 = (pSrc[2u * i0] >> 4u) - (pSrc[2u * i2] >> 4u);
213
 
214
    /* xb + xd */
215
    t1 = (pSrc[2u * i1] >> 4u) + (pSrc[2u * i3] >> 4u);
216
 
217
    /* ya + yc */
218
    s1 = (pSrc[(2u * i0) + 1u] >> 4u) + (pSrc[(2u * i2) + 1u] >> 4u);
219
    /* ya - yc */
220
    s2 = (pSrc[(2u * i0) + 1u] >> 4u) - (pSrc[(2u * i2) + 1u] >> 4u);
221
 
222
    /* xa' = xa + xb + xc + xd */
223
    pSrc[2u * i0] = (r1 + t1);
224
    /* (xa + xc) - (xb + xd) */
225
    r1 = r1 - t1;
226
    /* yb + yd */
227
    t2 = (pSrc[(2u * i1) + 1u] >> 4u) + (pSrc[(2u * i3) + 1u] >> 4u);
228
 
229
    /* ya' = ya + yb + yc + yd */
230
    pSrc[(2u * i0) + 1u] = (s1 + t2);
231
 
232
    /* (ya + yc) - (yb + yd) */
233
    s1 = s1 - t2;
234
 
235
    /* yb - yd */
236
    t1 = (pSrc[(2u * i1) + 1u] >> 4u) - (pSrc[(2u * i3) + 1u] >> 4u);
237
    /* xb - xd */
238
    t2 = (pSrc[2u * i1] >> 4u) - (pSrc[2u * i3] >> 4u);
239
 
240
    /*  index calculation for the coefficients */
241
    ia2 = 2u * ia1;
242
    co2 = pCoef[ia2 * 2u];
243
    si2 = pCoef[(ia2 * 2u) + 1u];
244
 
245
    /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
246
    pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
247
                     ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
248
 
249
    /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
250
    pSrc[(2u * i1) + 1u] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
251
                            ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
252
 
253
    /* (xa - xc) + (yb - yd) */
254
    r1 = r2 + t1;
255
    /* (xa - xc) - (yb - yd) */
256
    r2 = r2 - t1;
257
 
258
    /* (ya - yc) - (xb - xd) */
259
    s1 = s2 - t2;
260
    /* (ya - yc) + (xb - xd) */
261
    s2 = s2 + t2;
262
 
263
    co1 = pCoef[ia1 * 2u];
264
    si1 = pCoef[(ia1 * 2u) + 1u];
265
 
266
    /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
267
    pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
268
                     ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
269
 
270
    /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
271
    pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
272
                            ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
273
 
274
    /*  index calculation for the coefficients */
275
    ia3 = 3u * ia1;
276
    co3 = pCoef[ia3 * 2u];
277
    si3 = pCoef[(ia3 * 2u) + 1u];
278
 
279
    /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
280
    pSrc[2u * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
281
                     ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
282
 
283
    /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
284
    pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
285
                            ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
286
 
287
    /*  Twiddle coefficients index modifier */
288
    ia1 = ia1 + twidCoefModifier;
289
 
290
    /*  Updating input index */
291
    i0 = i0 + 1u;
292
 
293
  } while(--j);
294
 
295
  /* end of first stage process */
296
 
297
  /* data is in 5.27(q27) format */
298
 
299
 
300
  /* start of Middle stages process */
301
 
302
 
303
  /* each stage in middle stages provides two down scaling of the input */
304
 
305
  twidCoefModifier <<= 2u;
306
 
307
 
308
  for (k = fftLen / 4u; k > 4u; k >>= 2u)
309
  {
310
    /*  Initializations for the first stage */
311
    n1 = n2;
312
    n2 >>= 2u;
313
    ia1 = 0u;
314
 
315
    /*  Calculation of first stage */
316
    for (j = 0u; j <= (n2 - 1u); j++)
317
    {
318
      /*  index calculation for the coefficients */
319
      ia2 = ia1 + ia1;
320
      ia3 = ia2 + ia1;
321
      co1 = pCoef[ia1 * 2u];
322
      si1 = pCoef[(ia1 * 2u) + 1u];
323
      co2 = pCoef[ia2 * 2u];
324
      si2 = pCoef[(ia2 * 2u) + 1u];
325
      co3 = pCoef[ia3 * 2u];
326
      si3 = pCoef[(ia3 * 2u) + 1u];
327
      /*  Twiddle coefficients index modifier */
328
      ia1 = ia1 + twidCoefModifier;
329
 
330
      for (i0 = j; i0 < fftLen; i0 += n1)
331
      {
332
        /*  index calculation for the input as, */
333
        /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
334
        i1 = i0 + n2;
335
        i2 = i1 + n2;
336
        i3 = i2 + n2;
337
 
338
        /*  Butterfly implementation */
339
        /* xa + xc */
340
        r1 = pSrc[2u * i0] + pSrc[2u * i2];
341
        /* xa - xc */
342
        r2 = pSrc[2u * i0] - pSrc[2u * i2];
343
 
344
        /* ya + yc */
345
        s1 = pSrc[(2u * i0) + 1u] + pSrc[(2u * i2) + 1u];
346
        /* ya - yc */
347
        s2 = pSrc[(2u * i0) + 1u] - pSrc[(2u * i2) + 1u];
348
 
349
        /* xb + xd */
350
        t1 = pSrc[2u * i1] + pSrc[2u * i3];
351
 
352
        /* xa' = xa + xb + xc + xd */
353
        pSrc[2u * i0] = (r1 + t1) >> 2u;
354
        /* xa + xc -(xb + xd) */
355
        r1 = r1 - t1;
356
 
357
        /* yb + yd */
358
        t2 = pSrc[(2u * i1) + 1u] + pSrc[(2u * i3) + 1u];
359
        /* ya' = ya + yb + yc + yd */
360
        pSrc[(2u * i0) + 1u] = (s1 + t2) >> 2u;
361
 
362
        /* (ya + yc) - (yb + yd) */
363
        s1 = s1 - t2;
364
 
365
        /* (yb - yd) */
366
        t1 = pSrc[(2u * i1) + 1u] - pSrc[(2u * i3) + 1u];
367
        /* (xb - xd) */
368
        t2 = pSrc[2u * i1] - pSrc[2u * i3];
369
 
370
        /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
371
        pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
372
                         ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1u;
373
 
374
        /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
375
        pSrc[(2u * i1) + 1u] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
376
                                ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1u;
377
 
378
        /* (xa - xc) + (yb - yd) */
379
        r1 = r2 + t1;
380
        /* (xa - xc) - (yb - yd) */
381
        r2 = r2 - t1;
382
 
383
        /* (ya - yc) -  (xb - xd) */
384
        s1 = s2 - t2;
385
        /* (ya - yc) +  (xb - xd) */
386
        s2 = s2 + t2;
387
 
388
        /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
389
        pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
390
                         ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
391
 
392
        /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
393
        pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
394
                                ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
395
 
396
        /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
397
        pSrc[2u * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
398
                         ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
399
 
400
        /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
401
        pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
402
                                ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
403
      }
404
    }
405
    twidCoefModifier <<= 2u;
406
  }
407
#else
408
  uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
409
  q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
410
 
411
  q31_t xa, xb, xc, xd;
412
  q31_t ya, yb, yc, yd;
413
  q31_t xa_out, xb_out, xc_out, xd_out;
414
  q31_t ya_out, yb_out, yc_out, yd_out;
415
 
416
  q31_t *ptr1;
417
  q31_t *pSi0;
418
  q31_t *pSi1;
419
  q31_t *pSi2;
420
  q31_t *pSi3;
421
  q63_t xaya, xbyb, xcyc, xdyd;
422
  /* Total process is divided into three stages */
423
 
424
  /* process first stage, middle stages, & last stage */
425
 
426
 
427
  /* start of first stage process */
428
 
429
  /*  Initializations for the first stage */
430
  n2 = fftLen;
431
  n1 = n2;
432
  /* n2 = fftLen/4 */
433
  n2 >>= 2u;
434
 
435
  ia1 = 0u;
436
 
437
  j = n2;
438
 
439
  pSi0 = pSrc;
440
  pSi1 = pSi0 + 2 * n2;
441
  pSi2 = pSi1 + 2 * n2;
442
  pSi3 = pSi2 + 2 * n2;
443
 
444
  /*  Calculation of first stage */
445
  do
446
  {
447
    /* input is in 1.31(q31) format and provide 4 guard bits for the input */
448
 
449
    /*  Butterfly implementation */
450
    /* xa + xc */
451
    r1 = (pSi0[0] >> 4u) + (pSi2[0] >> 4u);
452
    /* xa - xc */
453
    r2 = (pSi0[0] >> 4u) - (pSi2[0] >> 4u);
454
 
455
    /* xb + xd */
456
    t1 = (pSi1[0] >> 4u) + (pSi3[0] >> 4u);
457
 
458
    /* ya + yc */
459
    s1 = (pSi0[1] >> 4u) + (pSi2[1] >> 4u);
460
    /* ya - yc */
461
    s2 = (pSi0[1] >> 4u) - (pSi2[1] >> 4u);
462
 
463
    /* xa' = xa + xb + xc + xd */
464
    *pSi0++ = (r1 + t1);
465
    /* (xa + xc) - (xb + xd) */
466
    r1 = r1 - t1;
467
    /* yb + yd */
468
    t2 = (pSi1[1] >> 4u) + (pSi3[1] >> 4u);
469
 
470
    /* ya' = ya + yb + yc + yd */
471
    *pSi0++ = (s1 + t2);
472
 
473
    /* (ya + yc) - (yb + yd) */
474
    s1 = s1 - t2;
475
 
476
    /* yb - yd */
477
    t1 = (pSi1[1] >> 4u) - (pSi3[1] >> 4u);
478
    /* xb - xd */
479
    t2 = (pSi1[0] >> 4u) - (pSi3[0] >> 4u);
480
 
481
    /*  index calculation for the coefficients */
482
    ia2 = 2u * ia1;
483
    co2 = pCoef[ia2 * 2u];
484
    si2 = pCoef[(ia2 * 2u) + 1u];
485
 
486
    /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
487
    *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
488
                     ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
489
 
490
    /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
491
    *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
492
                            ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
493
 
494
    /* (xa - xc) + (yb - yd) */
495
    r1 = r2 + t1;
496
    /* (xa - xc) - (yb - yd) */
497
    r2 = r2 - t1;
498
 
499
    /* (ya - yc) - (xb - xd) */
500
    s1 = s2 - t2;
501
    /* (ya - yc) + (xb - xd) */
502
    s2 = s2 + t2;
503
 
504
    co1 = pCoef[ia1 * 2u];
505
    si1 = pCoef[(ia1 * 2u) + 1u];
506
 
507
    /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
508
    *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
509
                     ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
510
 
511
    /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
512
    *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
513
                            ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
514
 
515
    /*  index calculation for the coefficients */
516
    ia3 = 3u * ia1;
517
    co3 = pCoef[ia3 * 2u];
518
    si3 = pCoef[(ia3 * 2u) + 1u];
519
 
520
    /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
521
    *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
522
                     ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
523
 
524
    /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
525
    *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
526
                            ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
527
 
528
    /*  Twiddle coefficients index modifier */
529
    ia1 = ia1 + twidCoefModifier;
530
 
531
  } while(--j);
532
 
533
  /* end of first stage process */
534
 
535
  /* data is in 5.27(q27) format */
536
 
537
 
538
  /* start of Middle stages process */
539
 
540
 
541
  /* each stage in middle stages provides two down scaling of the input */
542
 
543
  twidCoefModifier <<= 2u;
544
 
545
 
546
  for (k = fftLen / 4u; k > 4u; k >>= 2u)
547
  {
548
    /*  Initializations for the first stage */
549
    n1 = n2;
550
    n2 >>= 2u;
551
    ia1 = 0u;
552
 
553
    /*  Calculation of first stage */
554
    for (j = 0u; j <= (n2 - 1u); j++)
555
    {
556
      /*  index calculation for the coefficients */
557
      ia2 = ia1 + ia1;
558
      ia3 = ia2 + ia1;
559
      co1 = pCoef[ia1 * 2u];
560
      si1 = pCoef[(ia1 * 2u) + 1u];
561
      co2 = pCoef[ia2 * 2u];
562
      si2 = pCoef[(ia2 * 2u) + 1u];
563
      co3 = pCoef[ia3 * 2u];
564
      si3 = pCoef[(ia3 * 2u) + 1u];
565
      /*  Twiddle coefficients index modifier */
566
      ia1 = ia1 + twidCoefModifier;
567
 
568
      pSi0 = pSrc + 2 * j;
569
      pSi1 = pSi0 + 2 * n2;
570
      pSi2 = pSi1 + 2 * n2;
571
      pSi3 = pSi2 + 2 * n2;
572
 
573
      for (i0 = j; i0 < fftLen; i0 += n1)
574
      {
575
        /*  Butterfly implementation */
576
        /* xa + xc */
577
        r1 = pSi0[0] + pSi2[0];
578
 
579
        /* xa - xc */
580
        r2 = pSi0[0] - pSi2[0];
581
 
582
 
583
        /* ya + yc */
584
        s1 = pSi0[1] + pSi2[1];
585
 
586
        /* ya - yc */
587
        s2 = pSi0[1] - pSi2[1];
588
 
589
 
590
        /* xb + xd */
591
        t1 = pSi1[0] + pSi3[0];
592
 
593
 
594
        /* xa' = xa + xb + xc + xd */
595
        pSi0[0] = (r1 + t1) >> 2u;
596
        /* xa + xc -(xb + xd) */
597
        r1 = r1 - t1;
598
 
599
        /* yb + yd */
600
        t2 = pSi1[1] + pSi3[1];
601
 
602
        /* ya' = ya + yb + yc + yd */
603
        pSi0[1] = (s1 + t2) >> 2u;
604
        pSi0 += 2 * n1;
605
 
606
        /* (ya + yc) - (yb + yd) */
607
        s1 = s1 - t2;
608
 
609
        /* (yb - yd) */
610
        t1 = pSi1[1] - pSi3[1];
611
 
612
        /* (xb - xd) */
613
        t2 = pSi1[0] - pSi3[0];
614
 
615
 
616
        /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
617
        pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
618
                         ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1u;
619
 
620
        /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
621
        pSi1[1] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
622
                                ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1u;
623
        pSi1 += 2 * n1;
624
 
625
        /* (xa - xc) + (yb - yd) */
626
        r1 = r2 + t1;
627
        /* (xa - xc) - (yb - yd) */
628
        r2 = r2 - t1;
629
 
630
        /* (ya - yc) -  (xb - xd) */
631
        s1 = s2 - t2;
632
        /* (ya - yc) +  (xb - xd) */
633
        s2 = s2 + t2;
634
 
635
        /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
636
        pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
637
                         ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
638
 
639
        /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
640
        pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
641
                                ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
642
        pSi2 += 2 * n1;
643
 
644
        /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
645
        pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
646
                         ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
647
 
648
        /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
649
        pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
650
                                ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
651
        pSi3 += 2 * n1;
652
      }
653
    }
654
    twidCoefModifier <<= 2u;
655
  }
656
#endif
657
 
658
  /* End of Middle stages process */
659
 
660
  /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
661
  /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
662
  /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
663
  /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
664
 
665
 
666
  /* start of Last stage process */
667
  /*  Initializations for the last stage */
668
  j = fftLen >> 2;
669
  ptr1 = &pSrc[0];
670
 
671
  /*  Calculations of last stage */
672
  do
673
  {
674
 
675
#ifndef ARM_MATH_BIG_ENDIAN
676
 
677
    /* Read xa (real), ya(imag) input */
678
    xaya = *__SIMD64(ptr1)++;
679
    xa = (q31_t) xaya;
680
    ya = (q31_t) (xaya >> 32);
681
 
682
    /* Read xb (real), yb(imag) input */
683
    xbyb = *__SIMD64(ptr1)++;
684
    xb = (q31_t) xbyb;
685
    yb = (q31_t) (xbyb >> 32);
686
 
687
    /* Read xc (real), yc(imag) input */
688
    xcyc = *__SIMD64(ptr1)++;
689
    xc = (q31_t) xcyc;
690
    yc = (q31_t) (xcyc >> 32);
691
 
692
    /* Read xc (real), yc(imag) input */
693
    xdyd = *__SIMD64(ptr1)++;
694
    xd = (q31_t) xdyd;
695
    yd = (q31_t) (xdyd >> 32);
696
 
697
#else
698
 
699
    /* Read xa (real), ya(imag) input */
700
    xaya = *__SIMD64(ptr1)++;
701
    ya = (q31_t) xaya;
702
    xa = (q31_t) (xaya >> 32);
703
 
704
    /* Read xb (real), yb(imag) input */
705
    xbyb = *__SIMD64(ptr1)++;
706
    yb = (q31_t) xbyb;
707
    xb = (q31_t) (xbyb >> 32);
708
 
709
    /* Read xc (real), yc(imag) input */
710
    xcyc = *__SIMD64(ptr1)++;
711
    yc = (q31_t) xcyc;
712
    xc = (q31_t) (xcyc >> 32);
713
 
714
    /* Read xc (real), yc(imag) input */
715
    xdyd = *__SIMD64(ptr1)++;
716
    yd = (q31_t) xdyd;
717
    xd = (q31_t) (xdyd >> 32);
718
 
719
 
720
#endif
721
 
722
    /* xa' = xa + xb + xc + xd */
723
    xa_out = xa + xb + xc + xd;
724
 
725
    /* ya' = ya + yb + yc + yd */
726
    ya_out = ya + yb + yc + yd;
727
 
728
    /* pointer updation for writing */
729
    ptr1 = ptr1 - 8u;
730
 
731
    /* writing xa' and ya' */
732
    *ptr1++ = xa_out;
733
    *ptr1++ = ya_out;
734
 
735
    xc_out = (xa - xb + xc - xd);
736
    yc_out = (ya - yb + yc - yd);
737
 
738
    /* writing xc' and yc' */
739
    *ptr1++ = xc_out;
740
    *ptr1++ = yc_out;
741
 
742
    xb_out = (xa + yb - xc - yd);
743
    yb_out = (ya - xb - yc + xd);
744
 
745
    /* writing xb' and yb' */
746
    *ptr1++ = xb_out;
747
    *ptr1++ = yb_out;
748
 
749
    xd_out = (xa - yb - xc + yd);
750
    yd_out = (ya + xb - yc - xd);
751
 
752
    /* writing xd' and yd' */
753
    *ptr1++ = xd_out;
754
    *ptr1++ = yd_out;
755
 
756
 
757
  } while(--j);
758
 
759
  /* output is in 11.21(q21) format for the 1024 point */
760
  /* output is in 9.23(q23) format for the 256 point */
761
  /* output is in 7.25(q25) format for the 64 point */
762
  /* output is in 5.27(q27) format for the 16 point */
763
 
764
  /* End of last stage process */
765
 
766
}
767
 
768
 
769
/**    
770
 * @brief  Core function for the Q31 CIFFT butterfly process.  
771
 * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.  
772
 * @param[in]      fftLen           length of the FFT.  
773
 * @param[in]      *pCoef           points to twiddle coefficient buffer.  
774
 * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.  
775
 * @return none.  
776
 */
777
 
778
 
779
/*    
780
* Radix-4 IFFT algorithm used is :    
781
*    
782
* CIFFT uses same twiddle coefficients as CFFT Function    
783
*  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]    
784
*    
785
*    
786
* IFFT is implemented with following changes in equations from FFT    
787
*    
788
* Input real and imaginary data:    
789
* x(n) = xa + j * ya    
790
* x(n+N/4 ) = xb + j * yb    
791
* x(n+N/2 ) = xc + j * yc    
792
* x(n+3N 4) = xd + j * yd    
793
*    
794
*    
795
* Output real and imaginary data:    
796
* x(4r) = xa'+ j * ya'    
797
* x(4r+1) = xb'+ j * yb'    
798
* x(4r+2) = xc'+ j * yc'    
799
* x(4r+3) = xd'+ j * yd'    
800
*    
801
*    
802
* Twiddle factors for radix-4 IFFT:    
803
* Wn = co1 + j * (si1)    
804
* W2n = co2 + j * (si2)    
805
* W3n = co3 + j * (si3)    
806
 
807
* The real and imaginary output values for the radix-4 butterfly are    
808
* xa' = xa + xb + xc + xd    
809
* ya' = ya + yb + yc + yd    
810
* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)    
811
* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)    
812
* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)    
813
* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)    
814
* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)    
815
* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)    
816
*    
817
*/
818
 
819
void arm_radix4_butterfly_inverse_q31(
820
  q31_t * pSrc,
821
  uint32_t fftLen,
822
  q31_t * pCoef,
823
  uint32_t twidCoefModifier)
824
{
825
#if defined(ARM_MATH_CM7)
826
  uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
827
  q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
828
  q31_t xa, xb, xc, xd;
829
  q31_t ya, yb, yc, yd;
830
  q31_t xa_out, xb_out, xc_out, xd_out;
831
  q31_t ya_out, yb_out, yc_out, yd_out;
832
 
833
  q31_t *ptr1;
834
  q63_t xaya, xbyb, xcyc, xdyd;
835
 
836
  /* input is be 1.31(q31) format for all FFT sizes */
837
  /* Total process is divided into three stages */
838
  /* process first stage, middle stages, & last stage */
839
 
840
  /* Start of first stage process */
841
 
842
  /* Initializations for the first stage */
843
  n2 = fftLen;
844
  n1 = n2;
845
  /* n2 = fftLen/4 */
846
  n2 >>= 2u;
847
  i0 = 0u;
848
  ia1 = 0u;
849
 
850
  j = n2;
851
 
852
  do
853
  {
854
 
855
    /* input is in 1.31(q31) format and provide 4 guard bits for the input */
856
 
857
    /*  index calculation for the input as, */
858
    /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
859
    i1 = i0 + n2;
860
    i2 = i1 + n2;
861
    i3 = i2 + n2;
862
 
863
    /*  Butterfly implementation */
864
    /* xa + xc */
865
    r1 = (pSrc[2u * i0] >> 4u) + (pSrc[2u * i2] >> 4u);
866
    /* xa - xc */
867
    r2 = (pSrc[2u * i0] >> 4u) - (pSrc[2u * i2] >> 4u);
868
 
869
    /* xb + xd */
870
    t1 = (pSrc[2u * i1] >> 4u) + (pSrc[2u * i3] >> 4u);
871
 
872
    /* ya + yc */
873
    s1 = (pSrc[(2u * i0) + 1u] >> 4u) + (pSrc[(2u * i2) + 1u] >> 4u);
874
    /* ya - yc */
875
    s2 = (pSrc[(2u * i0) + 1u] >> 4u) - (pSrc[(2u * i2) + 1u] >> 4u);
876
 
877
    /* xa' = xa + xb + xc + xd */
878
    pSrc[2u * i0] = (r1 + t1);
879
    /* (xa + xc) - (xb + xd) */
880
    r1 = r1 - t1;
881
    /* yb + yd */
882
    t2 = (pSrc[(2u * i1) + 1u] >> 4u) + (pSrc[(2u * i3) + 1u] >> 4u);
883
    /* ya' = ya + yb + yc + yd */
884
    pSrc[(2u * i0) + 1u] = (s1 + t2);
885
 
886
    /* (ya + yc) - (yb + yd) */
887
    s1 = s1 - t2;
888
 
889
    /* yb - yd */
890
    t1 = (pSrc[(2u * i1) + 1u] >> 4u) - (pSrc[(2u * i3) + 1u] >> 4u);
891
    /* xb - xd */
892
    t2 = (pSrc[2u * i1] >> 4u) - (pSrc[2u * i3] >> 4u);
893
 
894
    /*  index calculation for the coefficients */
895
    ia2 = 2u * ia1;
896
    co2 = pCoef[ia2 * 2u];
897
    si2 = pCoef[(ia2 * 2u) + 1u];
898
 
899
    /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
900
    pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
901
                     ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
902
 
903
    /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
904
    pSrc[2u * i1 + 1u] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
905
                          ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
906
 
907
    /* (xa - xc) - (yb - yd) */
908
    r1 = r2 - t1;
909
    /* (xa - xc) + (yb - yd) */
910
    r2 = r2 + t1;
911
 
912
    /* (ya - yc) + (xb - xd) */
913
    s1 = s2 + t2;
914
    /* (ya - yc) - (xb - xd) */
915
    s2 = s2 - t2;
916
 
917
    co1 = pCoef[ia1 * 2u];
918
    si1 = pCoef[(ia1 * 2u) + 1u];
919
 
920
    /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
921
    pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
922
                     ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
923
 
924
    /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
925
    pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
926
                            ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
927
 
928
    /*  index calculation for the coefficients */
929
    ia3 = 3u * ia1;
930
    co3 = pCoef[ia3 * 2u];
931
    si3 = pCoef[(ia3 * 2u) + 1u];
932
 
933
    /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
934
    pSrc[2u * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
935
                     ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
936
 
937
    /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
938
    pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
939
                            ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
940
 
941
    /*  Twiddle coefficients index modifier */
942
    ia1 = ia1 + twidCoefModifier;
943
 
944
    /*  Updating input index */
945
    i0 = i0 + 1u;
946
 
947
  } while(--j);
948
 
949
  /* data is in 5.27(q27) format */
950
  /* each stage provides two down scaling of the input */
951
 
952
 
953
  /* Start of Middle stages process */
954
 
955
  twidCoefModifier <<= 2u;
956
 
957
  /*  Calculation of second stage to excluding last stage */
958
  for (k = fftLen / 4u; k > 4u; k >>= 2u)
959
  {
960
    /*  Initializations for the first stage */
961
    n1 = n2;
962
    n2 >>= 2u;
963
    ia1 = 0u;
964
 
965
    for (j = 0; j <= (n2 - 1u); j++)
966
    {
967
      /*  index calculation for the coefficients */
968
      ia2 = ia1 + ia1;
969
      ia3 = ia2 + ia1;
970
      co1 = pCoef[ia1 * 2u];
971
      si1 = pCoef[(ia1 * 2u) + 1u];
972
      co2 = pCoef[ia2 * 2u];
973
      si2 = pCoef[(ia2 * 2u) + 1u];
974
      co3 = pCoef[ia3 * 2u];
975
      si3 = pCoef[(ia3 * 2u) + 1u];
976
      /*  Twiddle coefficients index modifier */
977
      ia1 = ia1 + twidCoefModifier;
978
 
979
      for (i0 = j; i0 < fftLen; i0 += n1)
980
      {
981
        /*  index calculation for the input as, */
982
        /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
983
        i1 = i0 + n2;
984
        i2 = i1 + n2;
985
        i3 = i2 + n2;
986
 
987
        /*  Butterfly implementation */
988
        /* xa + xc */
989
        r1 = pSrc[2u * i0] + pSrc[2u * i2];
990
        /* xa - xc */
991
        r2 = pSrc[2u * i0] - pSrc[2u * i2];
992
 
993
        /* ya + yc */
994
        s1 = pSrc[(2u * i0) + 1u] + pSrc[(2u * i2) + 1u];
995
        /* ya - yc */
996
        s2 = pSrc[(2u * i0) + 1u] - pSrc[(2u * i2) + 1u];
997
 
998
        /* xb + xd */
999
        t1 = pSrc[2u * i1] + pSrc[2u * i3];
1000
 
1001
        /* xa' = xa + xb + xc + xd */
1002
        pSrc[2u * i0] = (r1 + t1) >> 2u;
1003
        /* xa + xc -(xb + xd) */
1004
        r1 = r1 - t1;
1005
        /* yb + yd */
1006
        t2 = pSrc[(2u * i1) + 1u] + pSrc[(2u * i3) + 1u];
1007
        /* ya' = ya + yb + yc + yd */
1008
        pSrc[(2u * i0) + 1u] = (s1 + t2) >> 2u;
1009
 
1010
        /* (ya + yc) - (yb + yd) */
1011
        s1 = s1 - t2;
1012
 
1013
        /* (yb - yd) */
1014
        t1 = pSrc[(2u * i1) + 1u] - pSrc[(2u * i3) + 1u];
1015
        /* (xb - xd) */
1016
        t2 = pSrc[2u * i1] - pSrc[2u * i3];
1017
 
1018
        /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
1019
        pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32u)) -
1020
                         ((int32_t) (((q63_t) s1 * si2) >> 32u))) >> 1u;
1021
 
1022
        /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
1023
        pSrc[(2u * i1) + 1u] =
1024
          (((int32_t) (((q63_t) s1 * co2) >> 32u)) +
1025
           ((int32_t) (((q63_t) r1 * si2) >> 32u))) >> 1u;
1026
 
1027
        /* (xa - xc) - (yb - yd) */
1028
        r1 = r2 - t1;
1029
        /* (xa - xc) + (yb - yd) */
1030
        r2 = r2 + t1;
1031
 
1032
        /* (ya - yc) +  (xb - xd) */
1033
        s1 = s2 + t2;
1034
        /* (ya - yc) -  (xb - xd) */
1035
        s2 = s2 - t2;
1036
 
1037
        /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
1038
        pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
1039
                         ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
1040
 
1041
        /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
1042
        pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
1043
                                ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
1044
 
1045
        /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
1046
        pSrc[(2u * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
1047
                           ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
1048
 
1049
        /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
1050
        pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
1051
                                ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
1052
      }
1053
    }
1054
    twidCoefModifier <<= 2u;
1055
  }
1056
#else
1057
  uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
1058
  q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
1059
  q31_t xa, xb, xc, xd;
1060
  q31_t ya, yb, yc, yd;
1061
  q31_t xa_out, xb_out, xc_out, xd_out;
1062
  q31_t ya_out, yb_out, yc_out, yd_out;
1063
 
1064
  q31_t *ptr1;
1065
  q31_t *pSi0;
1066
  q31_t *pSi1;
1067
  q31_t *pSi2;
1068
  q31_t *pSi3;
1069
  q63_t xaya, xbyb, xcyc, xdyd;
1070
 
1071
  /* input is be 1.31(q31) format for all FFT sizes */
1072
  /* Total process is divided into three stages */
1073
  /* process first stage, middle stages, & last stage */
1074
 
1075
  /* Start of first stage process */
1076
 
1077
  /* Initializations for the first stage */
1078
  n2 = fftLen;
1079
  n1 = n2;
1080
  /* n2 = fftLen/4 */
1081
  n2 >>= 2u;
1082
 
1083
  ia1 = 0u;
1084
 
1085
  j = n2;
1086
 
1087
  pSi0 = pSrc;
1088
  pSi1 = pSi0 + 2 * n2;
1089
  pSi2 = pSi1 + 2 * n2;
1090
  pSi3 = pSi2 + 2 * n2;
1091
 
1092
  do
1093
  {
1094
    /*  Butterfly implementation */
1095
    /* xa + xc */
1096
    r1 = (pSi0[0] >> 4u) + (pSi2[0] >> 4u);
1097
    /* xa - xc */
1098
    r2 = (pSi0[0] >> 4u) - (pSi2[0] >> 4u);
1099
 
1100
    /* xb + xd */
1101
    t1 = (pSi1[0] >> 4u) + (pSi3[0] >> 4u);
1102
 
1103
    /* ya + yc */
1104
    s1 = (pSi0[1] >> 4u) + (pSi2[1] >> 4u);
1105
    /* ya - yc */
1106
    s2 = (pSi0[1] >> 4u) - (pSi2[1] >> 4u);
1107
 
1108
    /* xa' = xa + xb + xc + xd */
1109
    *pSi0++ = (r1 + t1);
1110
    /* (xa + xc) - (xb + xd) */
1111
    r1 = r1 - t1;
1112
    /* yb + yd */
1113
    t2 = (pSi1[1] >> 4u) + (pSi3[1] >> 4u);
1114
    /* ya' = ya + yb + yc + yd */
1115
    *pSi0++ = (s1 + t2);
1116
 
1117
    /* (ya + yc) - (yb + yd) */
1118
    s1 = s1 - t2;
1119
 
1120
    /* yb - yd */
1121
    t1 = (pSi1[1] >> 4u) - (pSi3[1] >> 4u);
1122
    /* xb - xd */
1123
    t2 = (pSi1[0] >> 4u) - (pSi3[0] >> 4u);
1124
 
1125
    /*  index calculation for the coefficients */
1126
    ia2 = 2u * ia1;
1127
    co2 = pCoef[ia2 * 2u];
1128
    si2 = pCoef[(ia2 * 2u) + 1u];
1129
 
1130
    /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
1131
    *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
1132
                     ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
1133
 
1134
    /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
1135
    *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
1136
                          ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
1137
 
1138
    /* (xa - xc) - (yb - yd) */
1139
    r1 = r2 - t1;
1140
    /* (xa - xc) + (yb - yd) */
1141
    r2 = r2 + t1;
1142
 
1143
    /* (ya - yc) + (xb - xd) */
1144
    s1 = s2 + t2;
1145
    /* (ya - yc) - (xb - xd) */
1146
    s2 = s2 - t2;
1147
 
1148
    co1 = pCoef[ia1 * 2u];
1149
    si1 = pCoef[(ia1 * 2u) + 1u];
1150
 
1151
    /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
1152
    *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
1153
                     ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
1154
 
1155
    /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
1156
    *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
1157
                            ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
1158
 
1159
    /*  index calculation for the coefficients */
1160
    ia3 = 3u * ia1;
1161
    co3 = pCoef[ia3 * 2u];
1162
    si3 = pCoef[(ia3 * 2u) + 1u];
1163
 
1164
    /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
1165
    *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
1166
                     ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
1167
 
1168
    /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
1169
    *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
1170
                            ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
1171
 
1172
    /*  Twiddle coefficients index modifier */
1173
    ia1 = ia1 + twidCoefModifier;
1174
 
1175
  } while(--j);
1176
 
1177
  /* data is in 5.27(q27) format */
1178
  /* each stage provides two down scaling of the input */
1179
 
1180
 
1181
  /* Start of Middle stages process */
1182
 
1183
  twidCoefModifier <<= 2u;
1184
 
1185
  /*  Calculation of second stage to excluding last stage */
1186
  for (k = fftLen / 4u; k > 4u; k >>= 2u)
1187
  {
1188
    /*  Initializations for the first stage */
1189
    n1 = n2;
1190
    n2 >>= 2u;
1191
    ia1 = 0u;
1192
 
1193
    for (j = 0; j <= (n2 - 1u); j++)
1194
    {
1195
      /*  index calculation for the coefficients */
1196
      ia2 = ia1 + ia1;
1197
      ia3 = ia2 + ia1;
1198
      co1 = pCoef[ia1 * 2u];
1199
      si1 = pCoef[(ia1 * 2u) + 1u];
1200
      co2 = pCoef[ia2 * 2u];
1201
      si2 = pCoef[(ia2 * 2u) + 1u];
1202
      co3 = pCoef[ia3 * 2u];
1203
      si3 = pCoef[(ia3 * 2u) + 1u];
1204
      /*  Twiddle coefficients index modifier */
1205
      ia1 = ia1 + twidCoefModifier;
1206
 
1207
      pSi0 = pSrc + 2 * j;
1208
      pSi1 = pSi0 + 2 * n2;
1209
      pSi2 = pSi1 + 2 * n2;
1210
      pSi3 = pSi2 + 2 * n2;
1211
 
1212
      for (i0 = j; i0 < fftLen; i0 += n1)
1213
      {
1214
        /*  Butterfly implementation */
1215
        /* xa + xc */
1216
        r1 = pSi0[0] + pSi2[0];
1217
 
1218
        /* xa - xc */
1219
        r2 = pSi0[0] - pSi2[0];
1220
 
1221
 
1222
        /* ya + yc */
1223
        s1 = pSi0[1] + pSi2[1];
1224
 
1225
        /* ya - yc */
1226
        s2 = pSi0[1] - pSi2[1];
1227
 
1228
 
1229
        /* xb + xd */
1230
        t1 = pSi1[0] + pSi3[0];
1231
 
1232
 
1233
        /* xa' = xa + xb + xc + xd */
1234
        pSi0[0] = (r1 + t1) >> 2u;
1235
        /* xa + xc -(xb + xd) */
1236
        r1 = r1 - t1;
1237
        /* yb + yd */
1238
        t2 = pSi1[1] + pSi3[1];
1239
 
1240
        /* ya' = ya + yb + yc + yd */
1241
        pSi0[1] = (s1 + t2) >> 2u;
1242
        pSi0 += 2 * n1;
1243
 
1244
        /* (ya + yc) - (yb + yd) */
1245
        s1 = s1 - t2;
1246
 
1247
        /* (yb - yd) */
1248
        t1 = pSi1[1] - pSi3[1];
1249
 
1250
        /* (xb - xd) */
1251
        t2 = pSi1[0] - pSi3[0];
1252
 
1253
 
1254
        /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
1255
        pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32u)) -
1256
                         ((int32_t) (((q63_t) s1 * si2) >> 32u))) >> 1u;
1257
 
1258
        /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
1259
        pSi1[1] =
1260
 
1261
          (((int32_t) (((q63_t) s1 * co2) >> 32u)) +
1262
           ((int32_t) (((q63_t) r1 * si2) >> 32u))) >> 1u;
1263
        pSi1 += 2 * n1;
1264
 
1265
        /* (xa - xc) - (yb - yd) */
1266
        r1 = r2 - t1;
1267
        /* (xa - xc) + (yb - yd) */
1268
        r2 = r2 + t1;
1269
 
1270
        /* (ya - yc) +  (xb - xd) */
1271
        s1 = s2 + t2;
1272
        /* (ya - yc) -  (xb - xd) */
1273
        s2 = s2 - t2;
1274
 
1275
        /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
1276
        pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
1277
                         ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
1278
 
1279
        /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
1280
        pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
1281
                                ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
1282
        pSi2 += 2 * n1;
1283
 
1284
        /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
1285
        pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
1286
                           ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
1287
 
1288
        /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
1289
        pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
1290
                                ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
1291
        pSi3 += 2 * n1;
1292
      }
1293
    }
1294
    twidCoefModifier <<= 2u;
1295
  }
1296
#endif
1297
 
1298
  /* End of Middle stages process */
1299
 
1300
  /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
1301
  /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
1302
  /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
1303
  /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
1304
 
1305
 
1306
  /* Start of last stage process */
1307
 
1308
 
1309
  /*  Initializations for the last stage */
1310
  j = fftLen >> 2;
1311
  ptr1 = &pSrc[0];
1312
 
1313
  /*  Calculations of last stage */
1314
  do
1315
  {
1316
#ifndef ARM_MATH_BIG_ENDIAN
1317
    /* Read xa (real), ya(imag) input */
1318
    xaya = *__SIMD64(ptr1)++;
1319
    xa = (q31_t) xaya;
1320
    ya = (q31_t) (xaya >> 32);
1321
 
1322
    /* Read xb (real), yb(imag) input */
1323
    xbyb = *__SIMD64(ptr1)++;
1324
    xb = (q31_t) xbyb;
1325
    yb = (q31_t) (xbyb >> 32);
1326
 
1327
    /* Read xc (real), yc(imag) input */
1328
    xcyc = *__SIMD64(ptr1)++;
1329
    xc = (q31_t) xcyc;
1330
    yc = (q31_t) (xcyc >> 32);
1331
 
1332
    /* Read xc (real), yc(imag) input */
1333
    xdyd = *__SIMD64(ptr1)++;
1334
    xd = (q31_t) xdyd;
1335
    yd = (q31_t) (xdyd >> 32);
1336
 
1337
#else
1338
 
1339
    /* Read xa (real), ya(imag) input */
1340
    xaya = *__SIMD64(ptr1)++;
1341
    ya = (q31_t) xaya;
1342
    xa = (q31_t) (xaya >> 32);
1343
 
1344
    /* Read xb (real), yb(imag) input */
1345
    xbyb = *__SIMD64(ptr1)++;
1346
    yb = (q31_t) xbyb;
1347
    xb = (q31_t) (xbyb >> 32);
1348
 
1349
    /* Read xc (real), yc(imag) input */
1350
    xcyc = *__SIMD64(ptr1)++;
1351
    yc = (q31_t) xcyc;
1352
    xc = (q31_t) (xcyc >> 32);
1353
 
1354
    /* Read xc (real), yc(imag) input */
1355
    xdyd = *__SIMD64(ptr1)++;
1356
    yd = (q31_t) xdyd;
1357
    xd = (q31_t) (xdyd >> 32);
1358
 
1359
 
1360
#endif
1361
 
1362
    /* xa' = xa + xb + xc + xd */
1363
    xa_out = xa + xb + xc + xd;
1364
 
1365
    /* ya' = ya + yb + yc + yd */
1366
    ya_out = ya + yb + yc + yd;
1367
 
1368
    /* pointer updation for writing */
1369
    ptr1 = ptr1 - 8u;
1370
 
1371
    /* writing xa' and ya' */
1372
    *ptr1++ = xa_out;
1373
    *ptr1++ = ya_out;
1374
 
1375
    xc_out = (xa - xb + xc - xd);
1376
    yc_out = (ya - yb + yc - yd);
1377
 
1378
    /* writing xc' and yc' */
1379
    *ptr1++ = xc_out;
1380
    *ptr1++ = yc_out;
1381
 
1382
    xb_out = (xa - yb - xc + yd);
1383
    yb_out = (ya + xb - yc - xd);
1384
 
1385
    /* writing xb' and yb' */
1386
    *ptr1++ = xb_out;
1387
    *ptr1++ = yb_out;
1388
 
1389
    xd_out = (xa + yb - xc - yd);
1390
    yd_out = (ya - xb - yc + xd);
1391
 
1392
    /* writing xd' and yd' */
1393
    *ptr1++ = xd_out;
1394
    *ptr1++ = yd_out;
1395
 
1396
  } while(--j);
1397
 
1398
  /* output is in 11.21(q21) format for the 1024 point */
1399
  /* output is in 9.23(q23) format for the 256 point */
1400
  /* output is in 7.25(q25) format for the 64 point */
1401
  /* output is in 5.27(q27) format for the 16 point */
1402
 
1403
  /* End of last stage process */
1404
}