Subversion Repositories AFRtranscoder

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 mjames 1
/* ----------------------------------------------------------------------
2
 * Project:      CMSIS DSP Library
3
 * Title:        arm_cfft_radix4_q31.c
4
 * Description:  This file has function definition of Radix-4 FFT & IFFT function and
5
 *               In-place bit reversal using bit reversal table
6
 *
7
 * $Date:        27. January 2017
8
 * $Revision:    V.1.5.1
9
 *
10
 * Target Processor: Cortex-M cores
11
 * -------------------------------------------------------------------- */
12
/*
13
 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
14
 *
15
 * SPDX-License-Identifier: Apache-2.0
16
 *
17
 * Licensed under the Apache License, Version 2.0 (the License); you may
18
 * not use this file except in compliance with the License.
19
 * You may obtain a copy of the License at
20
 *
21
 * www.apache.org/licenses/LICENSE-2.0
22
 *
23
 * Unless required by applicable law or agreed to in writing, software
24
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26
 * See the License for the specific language governing permissions and
27
 * limitations under the License.
28
 */
29
 
30
#include "arm_math.h"
31
 
32
void arm_radix4_butterfly_inverse_q31(
33
q31_t * pSrc,
34
uint32_t fftLen,
35
q31_t * pCoef,
36
uint32_t twidCoefModifier);
37
 
38
void arm_radix4_butterfly_q31(
39
q31_t * pSrc,
40
uint32_t fftLen,
41
q31_t * pCoef,
42
uint32_t twidCoefModifier);
43
 
44
void arm_bitreversal_q31(
45
q31_t * pSrc,
46
uint32_t fftLen,
47
uint16_t bitRevFactor,
48
uint16_t * pBitRevTab);
49
 
50
/**
51
 * @ingroup groupTransforms
52
 */
53
 
54
/**
55
 * @addtogroup ComplexFFT
56
 * @{
57
 */
58
 
59
/**
60
 * @details
61
 * @brief Processing function for the Q31 CFFT/CIFFT.
62
 * @deprecated Do not use this function.  It has been superseded by \ref arm_cfft_q31 and will be removed
63
 * @param[in]      *S    points to an instance of the Q31 CFFT/CIFFT structure.
64
 * @param[in, out] *pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place.
65
 * @return none.
66
 *
67
 * \par Input and output formats:
68
 * \par
69
 * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
70
 * Hence the output format is different for different FFT sizes.
71
 * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
72
 * \par
73
 * \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"
74
 * \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"
75
 *
76
 */
77
 
78
void arm_cfft_radix4_q31(
79
  const arm_cfft_radix4_instance_q31 * S,
80
  q31_t * pSrc)
81
{
82
  if (S->ifftFlag == 1U)
83
  {
84
    /* Complex IFFT radix-4 */
85
    arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
86
  }
87
  else
88
  {
89
    /* Complex FFT radix-4 */
90
    arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
91
  }
92
 
93
  if (S->bitReverseFlag == 1U)
94
  {
95
    /*  Bit Reversal */
96
    arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
97
  }
98
 
99
}
100
 
101
/**
102
 * @} end of ComplexFFT group
103
 */
104
 
105
/*
106
* Radix-4 FFT algorithm used is :
107
*
108
* Input real and imaginary data:
109
* x(n) = xa + j * ya
110
* x(n+N/4 ) = xb + j * yb
111
* x(n+N/2 ) = xc + j * yc
112
* x(n+3N 4) = xd + j * yd
113
*
114
*
115
* Output real and imaginary data:
116
* x(4r) = xa'+ j * ya'
117
* x(4r+1) = xb'+ j * yb'
118
* x(4r+2) = xc'+ j * yc'
119
* x(4r+3) = xd'+ j * yd'
120
*
121
*
122
* Twiddle factors for radix-4 FFT:
123
* Wn = co1 + j * (- si1)
124
* W2n = co2 + j * (- si2)
125
* W3n = co3 + j * (- si3)
126
*
127
*  Butterfly implementation:
128
* xa' = xa + xb + xc + xd
129
* ya' = ya + yb + yc + yd
130
* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
131
* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
132
* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
133
* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
134
* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
135
* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
136
*
137
*/
138
 
139
/**
140
 * @brief  Core function for the Q31 CFFT butterfly process.
141
 * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.
142
 * @param[in]      fftLen           length of the FFT.
143
 * @param[in]      *pCoef           points to twiddle coefficient buffer.
144
 * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
145
 * @return none.
146
 */
147
 
148
void arm_radix4_butterfly_q31(
149
  q31_t * pSrc,
150
  uint32_t fftLen,
151
  q31_t * pCoef,
152
  uint32_t twidCoefModifier)
153
{
154
#if defined(ARM_MATH_CM7)
155
  uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
156
  q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
157
 
158
  q31_t xa, xb, xc, xd;
159
  q31_t ya, yb, yc, yd;
160
  q31_t xa_out, xb_out, xc_out, xd_out;
161
  q31_t ya_out, yb_out, yc_out, yd_out;
162
 
163
  q31_t *ptr1;
164
  q63_t xaya, xbyb, xcyc, xdyd;
165
  /* Total process is divided into three stages */
166
 
167
  /* process first stage, middle stages, & last stage */
168
 
169
 
170
  /* start of first stage process */
171
 
172
  /*  Initializations for the first stage */
173
  n2 = fftLen;
174
  n1 = n2;
175
  /* n2 = fftLen/4 */
176
  n2 >>= 2U;
177
  i0 = 0U;
178
  ia1 = 0U;
179
 
180
  j = n2;
181
 
182
  /*  Calculation of first stage */
183
  do
184
  {
185
    /*  index calculation for the input as, */
186
    /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
187
    i1 = i0 + n2;
188
    i2 = i1 + n2;
189
    i3 = i2 + n2;
190
 
191
    /* input is in 1.31(q31) format and provide 4 guard bits for the input */
192
 
193
    /*  Butterfly implementation */
194
    /* xa + xc */
195
    r1 = (pSrc[(2U * i0)] >> 4U) + (pSrc[(2U * i2)] >> 4U);
196
    /* xa - xc */
197
    r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
198
 
199
    /* xb + xd */
200
    t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
201
 
202
    /* ya + yc */
203
    s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
204
    /* ya - yc */
205
    s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
206
 
207
    /* xa' = xa + xb + xc + xd */
208
    pSrc[2U * i0] = (r1 + t1);
209
    /* (xa + xc) - (xb + xd) */
210
    r1 = r1 - t1;
211
    /* yb + yd */
212
    t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
213
 
214
    /* ya' = ya + yb + yc + yd */
215
    pSrc[(2U * i0) + 1U] = (s1 + t2);
216
 
217
    /* (ya + yc) - (yb + yd) */
218
    s1 = s1 - t2;
219
 
220
    /* yb - yd */
221
    t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
222
    /* xb - xd */
223
    t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
224
 
225
    /*  index calculation for the coefficients */
226
    ia2 = 2U * ia1;
227
    co2 = pCoef[ia2 * 2U];
228
    si2 = pCoef[(ia2 * 2U) + 1U];
229
 
230
    /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
231
    pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
232
                     ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
233
 
234
    /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
235
    pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
236
                            ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
237
 
238
    /* (xa - xc) + (yb - yd) */
239
    r1 = r2 + t1;
240
    /* (xa - xc) - (yb - yd) */
241
    r2 = r2 - t1;
242
 
243
    /* (ya - yc) - (xb - xd) */
244
    s1 = s2 - t2;
245
    /* (ya - yc) + (xb - xd) */
246
    s2 = s2 + t2;
247
 
248
    co1 = pCoef[ia1 * 2U];
249
    si1 = pCoef[(ia1 * 2U) + 1U];
250
 
251
    /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
252
    pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
253
                     ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
254
 
255
    /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
256
    pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
257
                            ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
258
 
259
    /*  index calculation for the coefficients */
260
    ia3 = 3U * ia1;
261
    co3 = pCoef[ia3 * 2U];
262
    si3 = pCoef[(ia3 * 2U) + 1U];
263
 
264
    /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
265
    pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
266
                     ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
267
 
268
    /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
269
    pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
270
                            ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
271
 
272
    /*  Twiddle coefficients index modifier */
273
    ia1 = ia1 + twidCoefModifier;
274
 
275
    /*  Updating input index */
276
    i0 = i0 + 1U;
277
 
278
  } while (--j);
279
 
280
  /* end of first stage process */
281
 
282
  /* data is in 5.27(q27) format */
283
 
284
 
285
  /* start of Middle stages process */
286
 
287
 
288
  /* each stage in middle stages provides two down scaling of the input */
289
 
290
  twidCoefModifier <<= 2U;
291
 
292
 
293
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
294
  {
295
    /*  Initializations for the first stage */
296
    n1 = n2;
297
    n2 >>= 2U;
298
    ia1 = 0U;
299
 
300
    /*  Calculation of first stage */
301
    for (j = 0U; j <= (n2 - 1U); j++)
302
    {
303
      /*  index calculation for the coefficients */
304
      ia2 = ia1 + ia1;
305
      ia3 = ia2 + ia1;
306
      co1 = pCoef[ia1 * 2U];
307
      si1 = pCoef[(ia1 * 2U) + 1U];
308
      co2 = pCoef[ia2 * 2U];
309
      si2 = pCoef[(ia2 * 2U) + 1U];
310
      co3 = pCoef[ia3 * 2U];
311
      si3 = pCoef[(ia3 * 2U) + 1U];
312
      /*  Twiddle coefficients index modifier */
313
      ia1 = ia1 + twidCoefModifier;
314
 
315
      for (i0 = j; i0 < fftLen; i0 += n1)
316
      {
317
        /*  index calculation for the input as, */
318
        /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
319
        i1 = i0 + n2;
320
        i2 = i1 + n2;
321
        i3 = i2 + n2;
322
 
323
        /*  Butterfly implementation */
324
        /* xa + xc */
325
        r1 = pSrc[2U * i0] + pSrc[2U * i2];
326
        /* xa - xc */
327
        r2 = pSrc[2U * i0] - pSrc[2U * i2];
328
 
329
        /* ya + yc */
330
        s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
331
        /* ya - yc */
332
        s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
333
 
334
        /* xb + xd */
335
        t1 = pSrc[2U * i1] + pSrc[2U * i3];
336
 
337
        /* xa' = xa + xb + xc + xd */
338
        pSrc[2U * i0] = (r1 + t1) >> 2U;
339
        /* xa + xc -(xb + xd) */
340
        r1 = r1 - t1;
341
 
342
        /* yb + yd */
343
        t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
344
        /* ya' = ya + yb + yc + yd */
345
        pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
346
 
347
        /* (ya + yc) - (yb + yd) */
348
        s1 = s1 - t2;
349
 
350
        /* (yb - yd) */
351
        t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
352
        /* (xb - xd) */
353
        t2 = pSrc[2U * i1] - pSrc[2U * i3];
354
 
355
        /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
356
        pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
357
                         ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
358
 
359
        /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
360
        pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
361
                                ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
362
 
363
        /* (xa - xc) + (yb - yd) */
364
        r1 = r2 + t1;
365
        /* (xa - xc) - (yb - yd) */
366
        r2 = r2 - t1;
367
 
368
        /* (ya - yc) -  (xb - xd) */
369
        s1 = s2 - t2;
370
        /* (ya - yc) +  (xb - xd) */
371
        s2 = s2 + t2;
372
 
373
        /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
374
        pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
375
                         ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
376
 
377
        /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
378
        pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
379
                                ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
380
 
381
        /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
382
        pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
383
                         ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
384
 
385
        /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
386
        pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
387
                                ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
388
      }
389
    }
390
    twidCoefModifier <<= 2U;
391
  }
392
#else
393
  uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
394
  q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
395
 
396
  q31_t xa, xb, xc, xd;
397
  q31_t ya, yb, yc, yd;
398
  q31_t xa_out, xb_out, xc_out, xd_out;
399
  q31_t ya_out, yb_out, yc_out, yd_out;
400
 
401
  q31_t *ptr1;
402
  q31_t *pSi0;
403
  q31_t *pSi1;
404
  q31_t *pSi2;
405
  q31_t *pSi3;
406
  q63_t xaya, xbyb, xcyc, xdyd;
407
  /* Total process is divided into three stages */
408
 
409
  /* process first stage, middle stages, & last stage */
410
 
411
 
412
  /* start of first stage process */
413
 
414
  /*  Initializations for the first stage */
415
  n2 = fftLen;
416
  n1 = n2;
417
  /* n2 = fftLen/4 */
418
  n2 >>= 2U;
419
 
420
  ia1 = 0U;
421
 
422
  j = n2;
423
 
424
  pSi0 = pSrc;
425
  pSi1 = pSi0 + 2 * n2;
426
  pSi2 = pSi1 + 2 * n2;
427
  pSi3 = pSi2 + 2 * n2;
428
 
429
  /*  Calculation of first stage */
430
  do
431
  {
432
    /* input is in 1.31(q31) format and provide 4 guard bits for the input */
433
 
434
    /*  Butterfly implementation */
435
    /* xa + xc */
436
    r1 = (pSi0[0] >> 4U) + (pSi2[0] >> 4U);
437
    /* xa - xc */
438
    r2 = (pSi0[0] >> 4U) - (pSi2[0] >> 4U);
439
 
440
    /* xb + xd */
441
    t1 = (pSi1[0] >> 4U) + (pSi3[0] >> 4U);
442
 
443
    /* ya + yc */
444
    s1 = (pSi0[1] >> 4U) + (pSi2[1] >> 4U);
445
    /* ya - yc */
446
    s2 = (pSi0[1] >> 4U) - (pSi2[1] >> 4U);
447
 
448
    /* xa' = xa + xb + xc + xd */
449
    *pSi0++ = (r1 + t1);
450
    /* (xa + xc) - (xb + xd) */
451
    r1 = r1 - t1;
452
    /* yb + yd */
453
    t2 = (pSi1[1] >> 4U) + (pSi3[1] >> 4U);
454
 
455
    /* ya' = ya + yb + yc + yd */
456
    *pSi0++ = (s1 + t2);
457
 
458
    /* (ya + yc) - (yb + yd) */
459
    s1 = s1 - t2;
460
 
461
    /* yb - yd */
462
    t1 = (pSi1[1] >> 4U) - (pSi3[1] >> 4U);
463
    /* xb - xd */
464
    t2 = (pSi1[0] >> 4U) - (pSi3[0] >> 4U);
465
 
466
    /*  index calculation for the coefficients */
467
    ia2 = 2U * ia1;
468
    co2 = pCoef[ia2 * 2U];
469
    si2 = pCoef[(ia2 * 2U) + 1U];
470
 
471
    /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
472
    *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
473
                     ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
474
 
475
    /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
476
    *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
477
                            ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
478
 
479
    /* (xa - xc) + (yb - yd) */
480
    r1 = r2 + t1;
481
    /* (xa - xc) - (yb - yd) */
482
    r2 = r2 - t1;
483
 
484
    /* (ya - yc) - (xb - xd) */
485
    s1 = s2 - t2;
486
    /* (ya - yc) + (xb - xd) */
487
    s2 = s2 + t2;
488
 
489
    co1 = pCoef[ia1 * 2U];
490
    si1 = pCoef[(ia1 * 2U) + 1U];
491
 
492
    /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
493
    *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
494
                     ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
495
 
496
    /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
497
    *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
498
                            ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
499
 
500
    /*  index calculation for the coefficients */
501
    ia3 = 3U * ia1;
502
    co3 = pCoef[ia3 * 2U];
503
    si3 = pCoef[(ia3 * 2U) + 1U];
504
 
505
    /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
506
    *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
507
                     ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
508
 
509
    /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
510
    *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
511
                            ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
512
 
513
    /*  Twiddle coefficients index modifier */
514
    ia1 = ia1 + twidCoefModifier;
515
 
516
  } while (--j);
517
 
518
  /* end of first stage process */
519
 
520
  /* data is in 5.27(q27) format */
521
 
522
 
523
  /* start of Middle stages process */
524
 
525
 
526
  /* each stage in middle stages provides two down scaling of the input */
527
 
528
  twidCoefModifier <<= 2U;
529
 
530
 
531
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
532
  {
533
    /*  Initializations for the first stage */
534
    n1 = n2;
535
    n2 >>= 2U;
536
    ia1 = 0U;
537
 
538
    /*  Calculation of first stage */
539
    for (j = 0U; j <= (n2 - 1U); j++)
540
    {
541
      /*  index calculation for the coefficients */
542
      ia2 = ia1 + ia1;
543
      ia3 = ia2 + ia1;
544
      co1 = pCoef[ia1 * 2U];
545
      si1 = pCoef[(ia1 * 2U) + 1U];
546
      co2 = pCoef[ia2 * 2U];
547
      si2 = pCoef[(ia2 * 2U) + 1U];
548
      co3 = pCoef[ia3 * 2U];
549
      si3 = pCoef[(ia3 * 2U) + 1U];
550
      /*  Twiddle coefficients index modifier */
551
      ia1 = ia1 + twidCoefModifier;
552
 
553
      pSi0 = pSrc + 2 * j;
554
      pSi1 = pSi0 + 2 * n2;
555
      pSi2 = pSi1 + 2 * n2;
556
      pSi3 = pSi2 + 2 * n2;
557
 
558
      for (i0 = j; i0 < fftLen; i0 += n1)
559
      {
560
        /*  Butterfly implementation */
561
        /* xa + xc */
562
        r1 = pSi0[0] + pSi2[0];
563
 
564
        /* xa - xc */
565
        r2 = pSi0[0] - pSi2[0];
566
 
567
 
568
        /* ya + yc */
569
        s1 = pSi0[1] + pSi2[1];
570
 
571
        /* ya - yc */
572
        s2 = pSi0[1] - pSi2[1];
573
 
574
 
575
        /* xb + xd */
576
        t1 = pSi1[0] + pSi3[0];
577
 
578
 
579
        /* xa' = xa + xb + xc + xd */
580
        pSi0[0] = (r1 + t1) >> 2U;
581
        /* xa + xc -(xb + xd) */
582
        r1 = r1 - t1;
583
 
584
        /* yb + yd */
585
        t2 = pSi1[1] + pSi3[1];
586
 
587
        /* ya' = ya + yb + yc + yd */
588
        pSi0[1] = (s1 + t2) >> 2U;
589
        pSi0 += 2 * n1;
590
 
591
        /* (ya + yc) - (yb + yd) */
592
        s1 = s1 - t2;
593
 
594
        /* (yb - yd) */
595
        t1 = pSi1[1] - pSi3[1];
596
 
597
        /* (xb - xd) */
598
        t2 = pSi1[0] - pSi3[0];
599
 
600
 
601
        /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
602
        pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
603
                         ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
604
 
605
        /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
606
        pSi1[1] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
607
                                ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
608
        pSi1 += 2 * n1;
609
 
610
        /* (xa - xc) + (yb - yd) */
611
        r1 = r2 + t1;
612
        /* (xa - xc) - (yb - yd) */
613
        r2 = r2 - t1;
614
 
615
        /* (ya - yc) -  (xb - xd) */
616
        s1 = s2 - t2;
617
        /* (ya - yc) +  (xb - xd) */
618
        s2 = s2 + t2;
619
 
620
        /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
621
        pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
622
                         ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
623
 
624
        /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
625
        pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
626
                                ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
627
        pSi2 += 2 * n1;
628
 
629
        /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
630
        pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
631
                         ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
632
 
633
        /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
634
        pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
635
                                ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
636
        pSi3 += 2 * n1;
637
      }
638
    }
639
    twidCoefModifier <<= 2U;
640
  }
641
#endif
642
 
643
  /* End of Middle stages process */
644
 
645
  /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
646
  /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
647
  /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
648
  /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
649
 
650
 
651
  /* start of Last stage process */
652
  /*  Initializations for the last stage */
653
  j = fftLen >> 2;
654
  ptr1 = &pSrc[0];
655
 
656
  /*  Calculations of last stage */
657
  do
658
  {
659
 
660
#ifndef ARM_MATH_BIG_ENDIAN
661
 
662
    /* Read xa (real), ya(imag) input */
663
    xaya = *__SIMD64(ptr1)++;
664
    xa = (q31_t) xaya;
665
    ya = (q31_t) (xaya >> 32);
666
 
667
    /* Read xb (real), yb(imag) input */
668
    xbyb = *__SIMD64(ptr1)++;
669
    xb = (q31_t) xbyb;
670
    yb = (q31_t) (xbyb >> 32);
671
 
672
    /* Read xc (real), yc(imag) input */
673
    xcyc = *__SIMD64(ptr1)++;
674
    xc = (q31_t) xcyc;
675
    yc = (q31_t) (xcyc >> 32);
676
 
677
    /* Read xc (real), yc(imag) input */
678
    xdyd = *__SIMD64(ptr1)++;
679
    xd = (q31_t) xdyd;
680
    yd = (q31_t) (xdyd >> 32);
681
 
682
#else
683
 
684
    /* Read xa (real), ya(imag) input */
685
    xaya = *__SIMD64(ptr1)++;
686
    ya = (q31_t) xaya;
687
    xa = (q31_t) (xaya >> 32);
688
 
689
    /* Read xb (real), yb(imag) input */
690
    xbyb = *__SIMD64(ptr1)++;
691
    yb = (q31_t) xbyb;
692
    xb = (q31_t) (xbyb >> 32);
693
 
694
    /* Read xc (real), yc(imag) input */
695
    xcyc = *__SIMD64(ptr1)++;
696
    yc = (q31_t) xcyc;
697
    xc = (q31_t) (xcyc >> 32);
698
 
699
    /* Read xc (real), yc(imag) input */
700
    xdyd = *__SIMD64(ptr1)++;
701
    yd = (q31_t) xdyd;
702
    xd = (q31_t) (xdyd >> 32);
703
 
704
 
705
#endif
706
 
707
    /* xa' = xa + xb + xc + xd */
708
    xa_out = xa + xb + xc + xd;
709
 
710
    /* ya' = ya + yb + yc + yd */
711
    ya_out = ya + yb + yc + yd;
712
 
713
    /* pointer updation for writing */
714
    ptr1 = ptr1 - 8U;
715
 
716
    /* writing xa' and ya' */
717
    *ptr1++ = xa_out;
718
    *ptr1++ = ya_out;
719
 
720
    xc_out = (xa - xb + xc - xd);
721
    yc_out = (ya - yb + yc - yd);
722
 
723
    /* writing xc' and yc' */
724
    *ptr1++ = xc_out;
725
    *ptr1++ = yc_out;
726
 
727
    xb_out = (xa + yb - xc - yd);
728
    yb_out = (ya - xb - yc + xd);
729
 
730
    /* writing xb' and yb' */
731
    *ptr1++ = xb_out;
732
    *ptr1++ = yb_out;
733
 
734
    xd_out = (xa - yb - xc + yd);
735
    yd_out = (ya + xb - yc - xd);
736
 
737
    /* writing xd' and yd' */
738
    *ptr1++ = xd_out;
739
    *ptr1++ = yd_out;
740
 
741
 
742
  } while (--j);
743
 
744
  /* output is in 11.21(q21) format for the 1024 point */
745
  /* output is in 9.23(q23) format for the 256 point */
746
  /* output is in 7.25(q25) format for the 64 point */
747
  /* output is in 5.27(q27) format for the 16 point */
748
 
749
  /* End of last stage process */
750
 
751
}
752
 
753
 
754
/**
755
 * @brief  Core function for the Q31 CIFFT butterfly process.
756
 * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.
757
 * @param[in]      fftLen           length of the FFT.
758
 * @param[in]      *pCoef           points to twiddle coefficient buffer.
759
 * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
760
 * @return none.
761
 */
762
 
763
 
764
/*
765
* Radix-4 IFFT algorithm used is :
766
*
767
* CIFFT uses same twiddle coefficients as CFFT Function
768
*  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
769
*
770
*
771
* IFFT is implemented with following changes in equations from FFT
772
*
773
* Input real and imaginary data:
774
* x(n) = xa + j * ya
775
* x(n+N/4 ) = xb + j * yb
776
* x(n+N/2 ) = xc + j * yc
777
* x(n+3N 4) = xd + j * yd
778
*
779
*
780
* Output real and imaginary data:
781
* x(4r) = xa'+ j * ya'
782
* x(4r+1) = xb'+ j * yb'
783
* x(4r+2) = xc'+ j * yc'
784
* x(4r+3) = xd'+ j * yd'
785
*
786
*
787
* Twiddle factors for radix-4 IFFT:
788
* Wn = co1 + j * (si1)
789
* W2n = co2 + j * (si2)
790
* W3n = co3 + j * (si3)
791
 
792
* The real and imaginary output values for the radix-4 butterfly are
793
* xa' = xa + xb + xc + xd
794
* ya' = ya + yb + yc + yd
795
* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
796
* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
797
* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
798
* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
799
* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
800
* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
801
*
802
*/
803
 
804
void arm_radix4_butterfly_inverse_q31(
805
  q31_t * pSrc,
806
  uint32_t fftLen,
807
  q31_t * pCoef,
808
  uint32_t twidCoefModifier)
809
{
810
#if defined(ARM_MATH_CM7)
811
  uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
812
  q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
813
  q31_t xa, xb, xc, xd;
814
  q31_t ya, yb, yc, yd;
815
  q31_t xa_out, xb_out, xc_out, xd_out;
816
  q31_t ya_out, yb_out, yc_out, yd_out;
817
 
818
  q31_t *ptr1;
819
  q63_t xaya, xbyb, xcyc, xdyd;
820
 
821
  /* input is be 1.31(q31) format for all FFT sizes */
822
  /* Total process is divided into three stages */
823
  /* process first stage, middle stages, & last stage */
824
 
825
  /* Start of first stage process */
826
 
827
  /* Initializations for the first stage */
828
  n2 = fftLen;
829
  n1 = n2;
830
  /* n2 = fftLen/4 */
831
  n2 >>= 2U;
832
  i0 = 0U;
833
  ia1 = 0U;
834
 
835
  j = n2;
836
 
837
  do
838
  {
839
 
840
    /* input is in 1.31(q31) format and provide 4 guard bits for the input */
841
 
842
    /*  index calculation for the input as, */
843
    /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
844
    i1 = i0 + n2;
845
    i2 = i1 + n2;
846
    i3 = i2 + n2;
847
 
848
    /*  Butterfly implementation */
849
    /* xa + xc */
850
    r1 = (pSrc[2U * i0] >> 4U) + (pSrc[2U * i2] >> 4U);
851
    /* xa - xc */
852
    r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
853
 
854
    /* xb + xd */
855
    t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
856
 
857
    /* ya + yc */
858
    s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
859
    /* ya - yc */
860
    s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
861
 
862
    /* xa' = xa + xb + xc + xd */
863
    pSrc[2U * i0] = (r1 + t1);
864
    /* (xa + xc) - (xb + xd) */
865
    r1 = r1 - t1;
866
    /* yb + yd */
867
    t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
868
    /* ya' = ya + yb + yc + yd */
869
    pSrc[(2U * i0) + 1U] = (s1 + t2);
870
 
871
    /* (ya + yc) - (yb + yd) */
872
    s1 = s1 - t2;
873
 
874
    /* yb - yd */
875
    t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
876
    /* xb - xd */
877
    t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
878
 
879
    /*  index calculation for the coefficients */
880
    ia2 = 2U * ia1;
881
    co2 = pCoef[ia2 * 2U];
882
    si2 = pCoef[(ia2 * 2U) + 1U];
883
 
884
    /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
885
    pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
886
                     ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
887
 
888
    /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
889
    pSrc[2U * i1 + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
890
                          ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
891
 
892
    /* (xa - xc) - (yb - yd) */
893
    r1 = r2 - t1;
894
    /* (xa - xc) + (yb - yd) */
895
    r2 = r2 + t1;
896
 
897
    /* (ya - yc) + (xb - xd) */
898
    s1 = s2 + t2;
899
    /* (ya - yc) - (xb - xd) */
900
    s2 = s2 - t2;
901
 
902
    co1 = pCoef[ia1 * 2U];
903
    si1 = pCoef[(ia1 * 2U) + 1U];
904
 
905
    /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
906
    pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
907
                     ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
908
 
909
    /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
910
    pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
911
                            ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
912
 
913
    /*  index calculation for the coefficients */
914
    ia3 = 3U * ia1;
915
    co3 = pCoef[ia3 * 2U];
916
    si3 = pCoef[(ia3 * 2U) + 1U];
917
 
918
    /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
919
    pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
920
                     ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
921
 
922
    /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
923
    pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
924
                            ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
925
 
926
    /*  Twiddle coefficients index modifier */
927
    ia1 = ia1 + twidCoefModifier;
928
 
929
    /*  Updating input index */
930
    i0 = i0 + 1U;
931
 
932
  } while (--j);
933
 
934
  /* data is in 5.27(q27) format */
935
  /* each stage provides two down scaling of the input */
936
 
937
 
938
  /* Start of Middle stages process */
939
 
940
  twidCoefModifier <<= 2U;
941
 
942
  /*  Calculation of second stage to excluding last stage */
943
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
944
  {
945
    /*  Initializations for the first stage */
946
    n1 = n2;
947
    n2 >>= 2U;
948
    ia1 = 0U;
949
 
950
    for (j = 0; j <= (n2 - 1U); j++)
951
    {
952
      /*  index calculation for the coefficients */
953
      ia2 = ia1 + ia1;
954
      ia3 = ia2 + ia1;
955
      co1 = pCoef[ia1 * 2U];
956
      si1 = pCoef[(ia1 * 2U) + 1U];
957
      co2 = pCoef[ia2 * 2U];
958
      si2 = pCoef[(ia2 * 2U) + 1U];
959
      co3 = pCoef[ia3 * 2U];
960
      si3 = pCoef[(ia3 * 2U) + 1U];
961
      /*  Twiddle coefficients index modifier */
962
      ia1 = ia1 + twidCoefModifier;
963
 
964
      for (i0 = j; i0 < fftLen; i0 += n1)
965
      {
966
        /*  index calculation for the input as, */
967
        /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
968
        i1 = i0 + n2;
969
        i2 = i1 + n2;
970
        i3 = i2 + n2;
971
 
972
        /*  Butterfly implementation */
973
        /* xa + xc */
974
        r1 = pSrc[2U * i0] + pSrc[2U * i2];
975
        /* xa - xc */
976
        r2 = pSrc[2U * i0] - pSrc[2U * i2];
977
 
978
        /* ya + yc */
979
        s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
980
        /* ya - yc */
981
        s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
982
 
983
        /* xb + xd */
984
        t1 = pSrc[2U * i1] + pSrc[2U * i3];
985
 
986
        /* xa' = xa + xb + xc + xd */
987
        pSrc[2U * i0] = (r1 + t1) >> 2U;
988
        /* xa + xc -(xb + xd) */
989
        r1 = r1 - t1;
990
        /* yb + yd */
991
        t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
992
        /* ya' = ya + yb + yc + yd */
993
        pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
994
 
995
        /* (ya + yc) - (yb + yd) */
996
        s1 = s1 - t2;
997
 
998
        /* (yb - yd) */
999
        t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
1000
        /* (xb - xd) */
1001
        t2 = pSrc[2U * i1] - pSrc[2U * i3];
1002
 
1003
        /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
1004
        pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
1005
                         ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
1006
 
1007
        /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
1008
        pSrc[(2U * i1) + 1U] =
1009
          (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
1010
           ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
1011
 
1012
        /* (xa - xc) - (yb - yd) */
1013
        r1 = r2 - t1;
1014
        /* (xa - xc) + (yb - yd) */
1015
        r2 = r2 + t1;
1016
 
1017
        /* (ya - yc) +  (xb - xd) */
1018
        s1 = s2 + t2;
1019
        /* (ya - yc) -  (xb - xd) */
1020
        s2 = s2 - t2;
1021
 
1022
        /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
1023
        pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
1024
                         ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
1025
 
1026
        /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
1027
        pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
1028
                                ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
1029
 
1030
        /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
1031
        pSrc[(2U * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
1032
                           ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
1033
 
1034
        /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
1035
        pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
1036
                                ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
1037
      }
1038
    }
1039
    twidCoefModifier <<= 2U;
1040
  }
1041
#else
1042
  uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
1043
  q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
1044
  q31_t xa, xb, xc, xd;
1045
  q31_t ya, yb, yc, yd;
1046
  q31_t xa_out, xb_out, xc_out, xd_out;
1047
  q31_t ya_out, yb_out, yc_out, yd_out;
1048
 
1049
  q31_t *ptr1;
1050
  q31_t *pSi0;
1051
  q31_t *pSi1;
1052
  q31_t *pSi2;
1053
  q31_t *pSi3;
1054
  q63_t xaya, xbyb, xcyc, xdyd;
1055
 
1056
  /* input is be 1.31(q31) format for all FFT sizes */
1057
  /* Total process is divided into three stages */
1058
  /* process first stage, middle stages, & last stage */
1059
 
1060
  /* Start of first stage process */
1061
 
1062
  /* Initializations for the first stage */
1063
  n2 = fftLen;
1064
  n1 = n2;
1065
  /* n2 = fftLen/4 */
1066
  n2 >>= 2U;
1067
 
1068
  ia1 = 0U;
1069
 
1070
  j = n2;
1071
 
1072
  pSi0 = pSrc;
1073
  pSi1 = pSi0 + 2 * n2;
1074
  pSi2 = pSi1 + 2 * n2;
1075
  pSi3 = pSi2 + 2 * n2;
1076
 
1077
  do
1078
  {
1079
    /*  Butterfly implementation */
1080
    /* xa + xc */
1081
    r1 = (pSi0[0] >> 4U) + (pSi2[0] >> 4U);
1082
    /* xa - xc */
1083
    r2 = (pSi0[0] >> 4U) - (pSi2[0] >> 4U);
1084
 
1085
    /* xb + xd */
1086
    t1 = (pSi1[0] >> 4U) + (pSi3[0] >> 4U);
1087
 
1088
    /* ya + yc */
1089
    s1 = (pSi0[1] >> 4U) + (pSi2[1] >> 4U);
1090
    /* ya - yc */
1091
    s2 = (pSi0[1] >> 4U) - (pSi2[1] >> 4U);
1092
 
1093
    /* xa' = xa + xb + xc + xd */
1094
    *pSi0++ = (r1 + t1);
1095
    /* (xa + xc) - (xb + xd) */
1096
    r1 = r1 - t1;
1097
    /* yb + yd */
1098
    t2 = (pSi1[1] >> 4U) + (pSi3[1] >> 4U);
1099
    /* ya' = ya + yb + yc + yd */
1100
    *pSi0++ = (s1 + t2);
1101
 
1102
    /* (ya + yc) - (yb + yd) */
1103
    s1 = s1 - t2;
1104
 
1105
    /* yb - yd */
1106
    t1 = (pSi1[1] >> 4U) - (pSi3[1] >> 4U);
1107
    /* xb - xd */
1108
    t2 = (pSi1[0] >> 4U) - (pSi3[0] >> 4U);
1109
 
1110
    /*  index calculation for the coefficients */
1111
    ia2 = 2U * ia1;
1112
    co2 = pCoef[ia2 * 2U];
1113
    si2 = pCoef[(ia2 * 2U) + 1U];
1114
 
1115
    /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
1116
    *pSi1++ = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
1117
                     ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
1118
 
1119
    /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
1120
    *pSi1++ = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
1121
                          ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
1122
 
1123
    /* (xa - xc) - (yb - yd) */
1124
    r1 = r2 - t1;
1125
    /* (xa - xc) + (yb - yd) */
1126
    r2 = r2 + t1;
1127
 
1128
    /* (ya - yc) + (xb - xd) */
1129
    s1 = s2 + t2;
1130
    /* (ya - yc) - (xb - xd) */
1131
    s2 = s2 - t2;
1132
 
1133
    co1 = pCoef[ia1 * 2U];
1134
    si1 = pCoef[(ia1 * 2U) + 1U];
1135
 
1136
    /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
1137
    *pSi2++ = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
1138
                     ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
1139
 
1140
    /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
1141
    *pSi2++ = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
1142
                            ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
1143
 
1144
    /*  index calculation for the coefficients */
1145
    ia3 = 3U * ia1;
1146
    co3 = pCoef[ia3 * 2U];
1147
    si3 = pCoef[(ia3 * 2U) + 1U];
1148
 
1149
    /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
1150
    *pSi3++ = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
1151
                     ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
1152
 
1153
    /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
1154
    *pSi3++ = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
1155
                            ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
1156
 
1157
    /*  Twiddle coefficients index modifier */
1158
    ia1 = ia1 + twidCoefModifier;
1159
 
1160
  } while (--j);
1161
 
1162
  /* data is in 5.27(q27) format */
1163
  /* each stage provides two down scaling of the input */
1164
 
1165
 
1166
  /* Start of Middle stages process */
1167
 
1168
  twidCoefModifier <<= 2U;
1169
 
1170
  /*  Calculation of second stage to excluding last stage */
1171
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
1172
  {
1173
    /*  Initializations for the first stage */
1174
    n1 = n2;
1175
    n2 >>= 2U;
1176
    ia1 = 0U;
1177
 
1178
    for (j = 0; j <= (n2 - 1U); j++)
1179
    {
1180
      /*  index calculation for the coefficients */
1181
      ia2 = ia1 + ia1;
1182
      ia3 = ia2 + ia1;
1183
      co1 = pCoef[ia1 * 2U];
1184
      si1 = pCoef[(ia1 * 2U) + 1U];
1185
      co2 = pCoef[ia2 * 2U];
1186
      si2 = pCoef[(ia2 * 2U) + 1U];
1187
      co3 = pCoef[ia3 * 2U];
1188
      si3 = pCoef[(ia3 * 2U) + 1U];
1189
      /*  Twiddle coefficients index modifier */
1190
      ia1 = ia1 + twidCoefModifier;
1191
 
1192
      pSi0 = pSrc + 2 * j;
1193
      pSi1 = pSi0 + 2 * n2;
1194
      pSi2 = pSi1 + 2 * n2;
1195
      pSi3 = pSi2 + 2 * n2;
1196
 
1197
      for (i0 = j; i0 < fftLen; i0 += n1)
1198
      {
1199
        /*  Butterfly implementation */
1200
        /* xa + xc */
1201
        r1 = pSi0[0] + pSi2[0];
1202
 
1203
        /* xa - xc */
1204
        r2 = pSi0[0] - pSi2[0];
1205
 
1206
 
1207
        /* ya + yc */
1208
        s1 = pSi0[1] + pSi2[1];
1209
 
1210
        /* ya - yc */
1211
        s2 = pSi0[1] - pSi2[1];
1212
 
1213
 
1214
        /* xb + xd */
1215
        t1 = pSi1[0] + pSi3[0];
1216
 
1217
 
1218
        /* xa' = xa + xb + xc + xd */
1219
        pSi0[0] = (r1 + t1) >> 2U;
1220
        /* xa + xc -(xb + xd) */
1221
        r1 = r1 - t1;
1222
        /* yb + yd */
1223
        t2 = pSi1[1] + pSi3[1];
1224
 
1225
        /* ya' = ya + yb + yc + yd */
1226
        pSi0[1] = (s1 + t2) >> 2U;
1227
        pSi0 += 2 * n1;
1228
 
1229
        /* (ya + yc) - (yb + yd) */
1230
        s1 = s1 - t2;
1231
 
1232
        /* (yb - yd) */
1233
        t1 = pSi1[1] - pSi3[1];
1234
 
1235
        /* (xb - xd) */
1236
        t2 = pSi1[0] - pSi3[0];
1237
 
1238
 
1239
        /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
1240
        pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
1241
                         ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
1242
 
1243
        /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
1244
        pSi1[1] =
1245
 
1246
          (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
1247
           ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
1248
        pSi1 += 2 * n1;
1249
 
1250
        /* (xa - xc) - (yb - yd) */
1251
        r1 = r2 - t1;
1252
        /* (xa - xc) + (yb - yd) */
1253
        r2 = r2 + t1;
1254
 
1255
        /* (ya - yc) +  (xb - xd) */
1256
        s1 = s2 + t2;
1257
        /* (ya - yc) -  (xb - xd) */
1258
        s2 = s2 - t2;
1259
 
1260
        /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
1261
        pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
1262
                         ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
1263
 
1264
        /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
1265
        pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
1266
                                ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
1267
        pSi2 += 2 * n1;
1268
 
1269
        /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
1270
        pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
1271
                           ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
1272
 
1273
        /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
1274
        pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
1275
                                ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
1276
        pSi3 += 2 * n1;
1277
      }
1278
    }
1279
    twidCoefModifier <<= 2U;
1280
  }
1281
#endif
1282
 
1283
  /* End of Middle stages process */
1284
 
1285
  /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
1286
  /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
1287
  /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
1288
  /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
1289
 
1290
 
1291
  /* Start of last stage process */
1292
 
1293
 
1294
  /*  Initializations for the last stage */
1295
  j = fftLen >> 2;
1296
  ptr1 = &pSrc[0];
1297
 
1298
  /*  Calculations of last stage */
1299
  do
1300
  {
1301
#ifndef ARM_MATH_BIG_ENDIAN
1302
    /* Read xa (real), ya(imag) input */
1303
    xaya = *__SIMD64(ptr1)++;
1304
    xa = (q31_t) xaya;
1305
    ya = (q31_t) (xaya >> 32);
1306
 
1307
    /* Read xb (real), yb(imag) input */
1308
    xbyb = *__SIMD64(ptr1)++;
1309
    xb = (q31_t) xbyb;
1310
    yb = (q31_t) (xbyb >> 32);
1311
 
1312
    /* Read xc (real), yc(imag) input */
1313
    xcyc = *__SIMD64(ptr1)++;
1314
    xc = (q31_t) xcyc;
1315
    yc = (q31_t) (xcyc >> 32);
1316
 
1317
    /* Read xc (real), yc(imag) input */
1318
    xdyd = *__SIMD64(ptr1)++;
1319
    xd = (q31_t) xdyd;
1320
    yd = (q31_t) (xdyd >> 32);
1321
 
1322
#else
1323
 
1324
    /* Read xa (real), ya(imag) input */
1325
    xaya = *__SIMD64(ptr1)++;
1326
    ya = (q31_t) xaya;
1327
    xa = (q31_t) (xaya >> 32);
1328
 
1329
    /* Read xb (real), yb(imag) input */
1330
    xbyb = *__SIMD64(ptr1)++;
1331
    yb = (q31_t) xbyb;
1332
    xb = (q31_t) (xbyb >> 32);
1333
 
1334
    /* Read xc (real), yc(imag) input */
1335
    xcyc = *__SIMD64(ptr1)++;
1336
    yc = (q31_t) xcyc;
1337
    xc = (q31_t) (xcyc >> 32);
1338
 
1339
    /* Read xc (real), yc(imag) input */
1340
    xdyd = *__SIMD64(ptr1)++;
1341
    yd = (q31_t) xdyd;
1342
    xd = (q31_t) (xdyd >> 32);
1343
 
1344
 
1345
#endif
1346
 
1347
    /* xa' = xa + xb + xc + xd */
1348
    xa_out = xa + xb + xc + xd;
1349
 
1350
    /* ya' = ya + yb + yc + yd */
1351
    ya_out = ya + yb + yc + yd;
1352
 
1353
    /* pointer updation for writing */
1354
    ptr1 = ptr1 - 8U;
1355
 
1356
    /* writing xa' and ya' */
1357
    *ptr1++ = xa_out;
1358
    *ptr1++ = ya_out;
1359
 
1360
    xc_out = (xa - xb + xc - xd);
1361
    yc_out = (ya - yb + yc - yd);
1362
 
1363
    /* writing xc' and yc' */
1364
    *ptr1++ = xc_out;
1365
    *ptr1++ = yc_out;
1366
 
1367
    xb_out = (xa - yb - xc + yd);
1368
    yb_out = (ya + xb - yc - xd);
1369
 
1370
    /* writing xb' and yb' */
1371
    *ptr1++ = xb_out;
1372
    *ptr1++ = yb_out;
1373
 
1374
    xd_out = (xa + yb - xc - yd);
1375
    yd_out = (ya - xb - yc + xd);
1376
 
1377
    /* writing xd' and yd' */
1378
    *ptr1++ = xd_out;
1379
    *ptr1++ = yd_out;
1380
 
1381
  } while (--j);
1382
 
1383
  /* output is in 11.21(q21) format for the 1024 point */
1384
  /* output is in 9.23(q23) format for the 256 point */
1385
  /* output is in 7.25(q25) format for the 64 point */
1386
  /* output is in 5.27(q27) format for the 16 point */
1387
 
1388
  /* End of last stage process */
1389
}