Subversion Repositories AFRtranscoder

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 mjames 1
/* ----------------------------------------------------------------------
2
 * Project:      CMSIS DSP Library
3
 * Title:        arm_cfft_radix4_q15.c
4
 * Description:  This file has function definition of Radix-4 FFT & IFFT function and
5
 *               In-place bit reversal using bit reversal table
6
 *
7
 * $Date:        27. January 2017
8
 * $Revision:    V.1.5.1
9
 *
10
 * Target Processor: Cortex-M cores
11
 * -------------------------------------------------------------------- */
12
/*
13
 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
14
 *
15
 * SPDX-License-Identifier: Apache-2.0
16
 *
17
 * Licensed under the Apache License, Version 2.0 (the License); you may
18
 * not use this file except in compliance with the License.
19
 * You may obtain a copy of the License at
20
 *
21
 * www.apache.org/licenses/LICENSE-2.0
22
 *
23
 * Unless required by applicable law or agreed to in writing, software
24
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26
 * See the License for the specific language governing permissions and
27
 * limitations under the License.
28
 */
29
 
30
#include "arm_math.h"
31
 
32
 
33
void arm_radix4_butterfly_q15(
34
  q15_t * pSrc16,
35
  uint32_t fftLen,
36
  q15_t * pCoef16,
37
  uint32_t twidCoefModifier);
38
 
39
void arm_radix4_butterfly_inverse_q15(
40
  q15_t * pSrc16,
41
  uint32_t fftLen,
42
  q15_t * pCoef16,
43
  uint32_t twidCoefModifier);
44
 
45
void arm_bitreversal_q15(
46
  q15_t * pSrc,
47
  uint32_t fftLen,
48
  uint16_t bitRevFactor,
49
  uint16_t * pBitRevTab);
50
 
51
/**
52
 * @ingroup groupTransforms
53
 */
54
 
55
/**
56
 * @addtogroup ComplexFFT
57
 * @{
58
 */
59
 
60
 
61
/**
62
 * @details
63
 * @brief Processing function for the Q15 CFFT/CIFFT.
64
 * @deprecated Do not use this function.  It has been superseded by \ref arm_cfft_q15 and will be removed
65
 * @param[in]      *S    points to an instance of the Q15 CFFT/CIFFT structure.
66
 * @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place.
67
 * @return none.
68
 *
69
 * \par Input and output formats:
70
 * \par
71
 * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
72
 * Hence the output format is different for different FFT sizes.
73
 * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
74
 * \par
75
 * \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
76
 * \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
77
 */
78
 
79
void arm_cfft_radix4_q15(
80
  const arm_cfft_radix4_instance_q15 * S,
81
  q15_t * pSrc)
82
{
83
  if (S->ifftFlag == 1U)
84
  {
85
    /*  Complex IFFT radix-4  */
86
    arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
87
  }
88
  else
89
  {
90
    /*  Complex FFT radix-4  */
91
    arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
92
  }
93
 
94
  if (S->bitReverseFlag == 1U)
95
  {
96
    /*  Bit Reversal */
97
    arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
98
  }
99
 
100
}
101
 
102
/**
103
 * @} end of ComplexFFT group
104
 */
105
 
106
/*
107
* Radix-4 FFT algorithm used is :
108
*
109
* Input real and imaginary data:
110
* x(n) = xa + j * ya
111
* x(n+N/4 ) = xb + j * yb
112
* x(n+N/2 ) = xc + j * yc
113
* x(n+3N/4) = xd + j * yd
114
*
115
*
116
* Output real and imaginary data:
117
* x(4r) = xa'+ j * ya'
118
* x(4r+1) = xb'+ j * yb'
119
* x(4r+2) = xc'+ j * yc'
120
* x(4r+3) = xd'+ j * yd'
121
*
122
*
123
* Twiddle factors for radix-4 FFT:
124
* Wn = co1 + j * (- si1)
125
* W2n = co2 + j * (- si2)
126
* W3n = co3 + j * (- si3)
127
 
128
* The real and imaginary output values for the radix-4 butterfly are
129
* xa' = xa + xb + xc + xd
130
* ya' = ya + yb + yc + yd
131
* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
132
* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
133
* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
134
* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
135
* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
136
* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
137
*
138
*/
139
 
140
/**
141
 * @brief  Core function for the Q15 CFFT butterfly process.
142
 * @param[in, out] *pSrc16          points to the in-place buffer of Q15 data type.
143
 * @param[in]      fftLen           length of the FFT.
144
 * @param[in]      *pCoef16         points to twiddle coefficient buffer.
145
 * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
146
 * @return none.
147
 */
148
 
149
void arm_radix4_butterfly_q15(
150
  q15_t * pSrc16,
151
  uint32_t fftLen,
152
  q15_t * pCoef16,
153
  uint32_t twidCoefModifier)
154
{
155
 
156
#if defined (ARM_MATH_DSP)
157
 
158
  /* Run the below code for Cortex-M4 and Cortex-M3 */
159
 
160
  q31_t R, S, T, U;
161
  q31_t C1, C2, C3, out1, out2;
162
  uint32_t n1, n2, ic, i0, j, k;
163
 
164
  q15_t *ptr1;
165
  q15_t *pSi0;
166
  q15_t *pSi1;
167
  q15_t *pSi2;
168
  q15_t *pSi3;
169
 
170
  q31_t xaya, xbyb, xcyc, xdyd;
171
 
172
  /* Total process is divided into three stages */
173
 
174
  /* process first stage, middle stages, & last stage */
175
 
176
  /*  Initializations for the first stage */
177
  n2 = fftLen;
178
  n1 = n2;
179
 
180
  /* n2 = fftLen/4 */
181
  n2 >>= 2U;
182
 
183
  /* Index for twiddle coefficient */
184
  ic = 0U;
185
 
186
  /* Index for input read and output write */
187
  j = n2;
188
 
189
  pSi0 = pSrc16;
190
  pSi1 = pSi0 + 2 * n2;
191
  pSi2 = pSi1 + 2 * n2;
192
  pSi3 = pSi2 + 2 * n2;
193
 
194
  /* Input is in 1.15(q15) format */
195
 
196
  /*  start of first stage process */
197
  do
198
  {
199
    /*  Butterfly implementation */
200
 
201
    /*  Reading i0, i0+fftLen/2 inputs */
202
    /* Read ya (real), xa(imag) input */
203
    T = _SIMD32_OFFSET(pSi0);
204
    T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
205
    T = __SHADD16(T, 0); // it turns out doing this twice is 2 cycles, the alternative takes 3 cycles
206
    //in = ((int16_t) (T & 0xFFFF)) >> 2;       // alternative code that takes 3 cycles
207
    //T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
208
 
209
    /* Read yc (real), xc(imag) input */
210
    S = _SIMD32_OFFSET(pSi2);
211
    S = __SHADD16(S, 0);
212
    S = __SHADD16(S, 0);
213
 
214
    /* R = packed((ya + yc), (xa + xc) ) */
215
    R = __QADD16(T, S);
216
 
217
    /* S = packed((ya - yc), (xa - xc) ) */
218
    S = __QSUB16(T, S);
219
 
220
    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
221
    /* Read yb (real), xb(imag) input */
222
    T = _SIMD32_OFFSET(pSi1);
223
    T = __SHADD16(T, 0);
224
    T = __SHADD16(T, 0);
225
 
226
    /* Read yd (real), xd(imag) input */
227
    U = _SIMD32_OFFSET(pSi3);
228
    U = __SHADD16(U, 0);
229
    U = __SHADD16(U, 0);
230
 
231
    /* T = packed((yb + yd), (xb + xd) ) */
232
    T = __QADD16(T, U);
233
 
234
    /*  writing the butterfly processed i0 sample */
235
    /* xa' = xa + xb + xc + xd */
236
    /* ya' = ya + yb + yc + yd */
237
    _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
238
    pSi0 += 2;
239
 
240
    /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
241
    R = __QSUB16(R, T);
242
 
243
    /* co2 & si2 are read from SIMD Coefficient pointer */
244
    C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
245
 
246
#ifndef ARM_MATH_BIG_ENDIAN
247
 
248
    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
249
    out1 = __SMUAD(C2, R) >> 16U;
250
    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
251
    out2 = __SMUSDX(C2, R);
252
 
253
#else
254
 
255
    /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
256
    out1 = __SMUSDX(R, C2) >> 16U;
257
    /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
258
    out2 = __SMUAD(C2, R);
259
 
260
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
261
 
262
    /*  Reading i0+fftLen/4 */
263
    /* T = packed(yb, xb) */
264
    T = _SIMD32_OFFSET(pSi1);
265
    T = __SHADD16(T, 0);
266
    T = __SHADD16(T, 0);
267
 
268
    /* writing the butterfly processed i0 + fftLen/4 sample */
269
    /* writing output(xc', yc') in little endian format */
270
    _SIMD32_OFFSET(pSi1) =
271
      (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
272
    pSi1 += 2;
273
 
274
    /*  Butterfly calculations */
275
    /* U = packed(yd, xd) */
276
    U = _SIMD32_OFFSET(pSi3);
277
    U = __SHADD16(U, 0);
278
    U = __SHADD16(U, 0);
279
 
280
    /* T = packed(yb-yd, xb-xd) */
281
    T = __QSUB16(T, U);
282
 
283
#ifndef ARM_MATH_BIG_ENDIAN
284
 
285
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
286
    R = __QASX(S, T);
287
    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
288
    S = __QSAX(S, T);
289
 
290
#else
291
 
292
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
293
    R = __QSAX(S, T);
294
    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
295
    S = __QASX(S, T);
296
 
297
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
298
 
299
    /* co1 & si1 are read from SIMD Coefficient pointer */
300
    C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
301
    /*  Butterfly process for the i0+fftLen/2 sample */
302
 
303
#ifndef ARM_MATH_BIG_ENDIAN
304
 
305
    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
306
    out1 = __SMUAD(C1, S) >> 16U;
307
    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
308
    out2 = __SMUSDX(C1, S);
309
 
310
#else
311
 
312
    /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
313
    out1 = __SMUSDX(S, C1) >> 16U;
314
    /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
315
    out2 = __SMUAD(C1, S);
316
 
317
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
318
 
319
    /* writing output(xb', yb') in little endian format */
320
    _SIMD32_OFFSET(pSi2) =
321
      ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
322
    pSi2 += 2;
323
 
324
 
325
    /* co3 & si3 are read from SIMD Coefficient pointer */
326
    C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
327
    /*  Butterfly process for the i0+3fftLen/4 sample */
328
 
329
#ifndef ARM_MATH_BIG_ENDIAN
330
 
331
    /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
332
    out1 = __SMUAD(C3, R) >> 16U;
333
    /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
334
    out2 = __SMUSDX(C3, R);
335
 
336
#else
337
 
338
    /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
339
    out1 = __SMUSDX(R, C3) >> 16U;
340
    /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
341
    out2 = __SMUAD(C3, R);
342
 
343
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
344
 
345
    /* writing output(xd', yd') in little endian format */
346
    _SIMD32_OFFSET(pSi3) =
347
      ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
348
    pSi3 += 2;
349
 
350
    /*  Twiddle coefficients index modifier */
351
    ic = ic + twidCoefModifier;
352
 
353
  } while (--j);
354
  /* data is in 4.11(q11) format */
355
 
356
  /* end of first stage process */
357
 
358
 
359
  /* start of middle stage process */
360
 
361
  /*  Twiddle coefficients index modifier */
362
  twidCoefModifier <<= 2U;
363
 
364
  /*  Calculation of Middle stage */
365
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
366
  {
367
    /*  Initializations for the middle stage */
368
    n1 = n2;
369
    n2 >>= 2U;
370
    ic = 0U;
371
 
372
    for (j = 0U; j <= (n2 - 1U); j++)
373
    {
374
      /*  index calculation for the coefficients */
375
      C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
376
      C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
377
      C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
378
 
379
      /*  Twiddle coefficients index modifier */
380
      ic = ic + twidCoefModifier;
381
 
382
      pSi0 = pSrc16 + 2 * j;
383
      pSi1 = pSi0 + 2 * n2;
384
      pSi2 = pSi1 + 2 * n2;
385
      pSi3 = pSi2 + 2 * n2;
386
 
387
      /*  Butterfly implementation */
388
      for (i0 = j; i0 < fftLen; i0 += n1)
389
      {
390
        /*  Reading i0, i0+fftLen/2 inputs */
391
        /* Read ya (real), xa(imag) input */
392
        T = _SIMD32_OFFSET(pSi0);
393
 
394
        /* Read yc (real), xc(imag) input */
395
        S = _SIMD32_OFFSET(pSi2);
396
 
397
        /* R = packed( (ya + yc), (xa + xc)) */
398
        R = __QADD16(T, S);
399
 
400
        /* S = packed((ya - yc), (xa - xc)) */
401
        S = __QSUB16(T, S);
402
 
403
        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
404
        /* Read yb (real), xb(imag) input */
405
        T = _SIMD32_OFFSET(pSi1);
406
 
407
        /* Read yd (real), xd(imag) input */
408
        U = _SIMD32_OFFSET(pSi3);
409
 
410
        /* T = packed( (yb + yd), (xb + xd)) */
411
        T = __QADD16(T, U);
412
 
413
        /*  writing the butterfly processed i0 sample */
414
 
415
        /* xa' = xa + xb + xc + xd */
416
        /* ya' = ya + yb + yc + yd */
417
        out1 = __SHADD16(R, T);
418
        out1 = __SHADD16(out1, 0);
419
        _SIMD32_OFFSET(pSi0) = out1;
420
        pSi0 += 2 * n1;
421
 
422
        /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
423
        R = __SHSUB16(R, T);
424
 
425
#ifndef ARM_MATH_BIG_ENDIAN
426
 
427
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
428
        out1 = __SMUAD(C2, R) >> 16U;
429
 
430
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
431
        out2 = __SMUSDX(C2, R);
432
 
433
#else
434
 
435
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
436
        out1 = __SMUSDX(R, C2) >> 16U;
437
 
438
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
439
        out2 = __SMUAD(C2, R);
440
 
441
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
442
 
443
        /*  Reading i0+3fftLen/4 */
444
        /* Read yb (real), xb(imag) input */
445
        T = _SIMD32_OFFSET(pSi1);
446
 
447
        /*  writing the butterfly processed i0 + fftLen/4 sample */
448
        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
449
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
450
        _SIMD32_OFFSET(pSi1) =
451
          ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
452
        pSi1 += 2 * n1;
453
 
454
        /*  Butterfly calculations */
455
 
456
        /* Read yd (real), xd(imag) input */
457
        U = _SIMD32_OFFSET(pSi3);
458
 
459
        /* T = packed(yb-yd, xb-xd) */
460
        T = __QSUB16(T, U);
461
 
462
#ifndef ARM_MATH_BIG_ENDIAN
463
 
464
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
465
        R = __SHASX(S, T);
466
 
467
        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
468
        S = __SHSAX(S, T);
469
 
470
 
471
        /*  Butterfly process for the i0+fftLen/2 sample */
472
        out1 = __SMUAD(C1, S) >> 16U;
473
        out2 = __SMUSDX(C1, S);
474
 
475
#else
476
 
477
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
478
        R = __SHSAX(S, T);
479
 
480
        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
481
        S = __SHASX(S, T);
482
 
483
 
484
        /*  Butterfly process for the i0+fftLen/2 sample */
485
        out1 = __SMUSDX(S, C1) >> 16U;
486
        out2 = __SMUAD(C1, S);
487
 
488
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
489
 
490
        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
491
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
492
        _SIMD32_OFFSET(pSi2) =
493
          ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
494
        pSi2 += 2 * n1;
495
 
496
        /*  Butterfly process for the i0+3fftLen/4 sample */
497
 
498
#ifndef ARM_MATH_BIG_ENDIAN
499
 
500
        out1 = __SMUAD(C3, R) >> 16U;
501
        out2 = __SMUSDX(C3, R);
502
 
503
#else
504
 
505
        out1 = __SMUSDX(R, C3) >> 16U;
506
        out2 = __SMUAD(C3, R);
507
 
508
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
509
 
510
        /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
511
        /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
512
        _SIMD32_OFFSET(pSi3) =
513
          ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
514
        pSi3 += 2 * n1;
515
      }
516
    }
517
    /*  Twiddle coefficients index modifier */
518
    twidCoefModifier <<= 2U;
519
  }
520
  /* end of middle stage process */
521
 
522
 
523
  /* data is in 10.6(q6) format for the 1024 point */
524
  /* data is in 8.8(q8) format for the 256 point */
525
  /* data is in 6.10(q10) format for the 64 point */
526
  /* data is in 4.12(q12) format for the 16 point */
527
 
528
  /*  Initializations for the last stage */
529
  j = fftLen >> 2;
530
 
531
  ptr1 = &pSrc16[0];
532
 
533
  /* start of last stage process */
534
 
535
  /*  Butterfly implementation */
536
  do
537
  {
538
    /* Read xa (real), ya(imag) input */
539
    xaya = *__SIMD32(ptr1)++;
540
 
541
    /* Read xb (real), yb(imag) input */
542
    xbyb = *__SIMD32(ptr1)++;
543
 
544
    /* Read xc (real), yc(imag) input */
545
    xcyc = *__SIMD32(ptr1)++;
546
 
547
    /* Read xd (real), yd(imag) input */
548
    xdyd = *__SIMD32(ptr1)++;
549
 
550
    /* R = packed((ya + yc), (xa + xc)) */
551
    R = __QADD16(xaya, xcyc);
552
 
553
    /* T = packed((yb + yd), (xb + xd)) */
554
    T = __QADD16(xbyb, xdyd);
555
 
556
    /* pointer updation for writing */
557
    ptr1 = ptr1 - 8U;
558
 
559
 
560
    /* xa' = xa + xb + xc + xd */
561
    /* ya' = ya + yb + yc + yd */
562
    *__SIMD32(ptr1)++ = __SHADD16(R, T);
563
 
564
    /* T = packed((yb + yd), (xb + xd)) */
565
    T = __QADD16(xbyb, xdyd);
566
 
567
    /* xc' = (xa-xb+xc-xd) */
568
    /* yc' = (ya-yb+yc-yd) */
569
    *__SIMD32(ptr1)++ = __SHSUB16(R, T);
570
 
571
    /* S = packed((ya - yc), (xa - xc)) */
572
    S = __QSUB16(xaya, xcyc);
573
 
574
    /* Read yd (real), xd(imag) input */
575
    /* T = packed( (yb - yd), (xb - xd))  */
576
    U = __QSUB16(xbyb, xdyd);
577
 
578
#ifndef ARM_MATH_BIG_ENDIAN
579
 
580
    /* xb' = (xa+yb-xc-yd) */
581
    /* yb' = (ya-xb-yc+xd) */
582
    *__SIMD32(ptr1)++ = __SHSAX(S, U);
583
 
584
 
585
    /* xd' = (xa-yb-xc+yd) */
586
    /* yd' = (ya+xb-yc-xd) */
587
    *__SIMD32(ptr1)++ = __SHASX(S, U);
588
 
589
#else
590
 
591
    /* xb' = (xa+yb-xc-yd) */
592
    /* yb' = (ya-xb-yc+xd) */
593
    *__SIMD32(ptr1)++ = __SHASX(S, U);
594
 
595
 
596
    /* xd' = (xa-yb-xc+yd) */
597
    /* yd' = (ya+xb-yc-xd) */
598
    *__SIMD32(ptr1)++ = __SHSAX(S, U);
599
 
600
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
601
 
602
  } while (--j);
603
 
604
  /* end of last stage process */
605
 
606
  /* output is in 11.5(q5) format for the 1024 point */
607
  /* output is in 9.7(q7) format for the 256 point   */
608
  /* output is in 7.9(q9) format for the 64 point  */
609
  /* output is in 5.11(q11) format for the 16 point  */
610
 
611
 
612
#else
613
 
614
  /* Run the below code for Cortex-M0 */
615
 
616
  q15_t R0, R1, S0, S1, T0, T1, U0, U1;
617
  q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
618
  uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
619
 
620
  /* Total process is divided into three stages */
621
 
622
  /* process first stage, middle stages, & last stage */
623
 
624
  /*  Initializations for the first stage */
625
  n2 = fftLen;
626
  n1 = n2;
627
 
628
  /* n2 = fftLen/4 */
629
  n2 >>= 2U;
630
 
631
  /* Index for twiddle coefficient */
632
  ic = 0U;
633
 
634
  /* Index for input read and output write */
635
  i0 = 0U;
636
  j = n2;
637
 
638
  /* Input is in 1.15(q15) format */
639
 
640
  /*  start of first stage process */
641
  do
642
  {
643
    /*  Butterfly implementation */
644
 
645
    /*  index calculation for the input as, */
646
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
647
    i1 = i0 + n2;
648
    i2 = i1 + n2;
649
    i3 = i2 + n2;
650
 
651
    /*  Reading i0, i0+fftLen/2 inputs */
652
 
653
    /* input is down scale by 4 to avoid overflow */
654
    /* Read ya (real), xa(imag) input */
655
    T0 = pSrc16[i0 * 2U] >> 2U;
656
    T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
657
 
658
    /* input is down scale by 4 to avoid overflow */
659
    /* Read yc (real), xc(imag) input */
660
    S0 = pSrc16[i2 * 2U] >> 2U;
661
    S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
662
 
663
    /* R0 = (ya + yc) */
664
    R0 = __SSAT(T0 + S0, 16U);
665
    /* R1 = (xa + xc) */
666
    R1 = __SSAT(T1 + S1, 16U);
667
 
668
    /* S0 = (ya - yc) */
669
    S0 = __SSAT(T0 - S0, 16);
670
    /* S1 = (xa - xc) */
671
    S1 = __SSAT(T1 - S1, 16);
672
 
673
    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
674
    /* input is down scale by 4 to avoid overflow */
675
    /* Read yb (real), xb(imag) input */
676
    T0 = pSrc16[i1 * 2U] >> 2U;
677
    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
678
 
679
    /* input is down scale by 4 to avoid overflow */
680
    /* Read yd (real), xd(imag) input */
681
    U0 = pSrc16[i3 * 2U] >> 2U;
682
    U1 = pSrc16[(i3 * 2U) + 1] >> 2U;
683
 
684
    /* T0 = (yb + yd) */
685
    T0 = __SSAT(T0 + U0, 16U);
686
    /* T1 = (xb + xd) */
687
    T1 = __SSAT(T1 + U1, 16U);
688
 
689
    /*  writing the butterfly processed i0 sample */
690
    /* ya' = ya + yb + yc + yd */
691
    /* xa' = xa + xb + xc + xd */
692
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
693
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
694
 
695
    /* R0 = (ya + yc) - (yb + yd) */
696
    /* R1 = (xa + xc) - (xb + xd) */
697
    R0 = __SSAT(R0 - T0, 16U);
698
    R1 = __SSAT(R1 - T1, 16U);
699
 
700
    /* co2 & si2 are read from Coefficient pointer */
701
    Co2 = pCoef16[2U * ic * 2U];
702
    Si2 = pCoef16[(2U * ic * 2U) + 1];
703
 
704
    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
705
    out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
706
    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
707
    out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
708
 
709
    /*  Reading i0+fftLen/4 */
710
    /* input is down scale by 4 to avoid overflow */
711
    /* T0 = yb, T1 =  xb */
712
    T0 = pSrc16[i1 * 2U] >> 2;
713
    T1 = pSrc16[(i1 * 2U) + 1] >> 2;
714
 
715
    /* writing the butterfly processed i0 + fftLen/4 sample */
716
    /* writing output(xc', yc') in little endian format */
717
    pSrc16[i1 * 2U] = out1;
718
    pSrc16[(i1 * 2U) + 1] = out2;
719
 
720
    /*  Butterfly calculations */
721
    /* input is down scale by 4 to avoid overflow */
722
    /* U0 = yd, U1 = xd */
723
    U0 = pSrc16[i3 * 2U] >> 2;
724
    U1 = pSrc16[(i3 * 2U) + 1] >> 2;
725
    /* T0 = yb-yd */
726
    T0 = __SSAT(T0 - U0, 16);
727
    /* T1 = xb-xd */
728
    T1 = __SSAT(T1 - U1, 16);
729
 
730
    /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
731
    R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
732
    R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
733
 
734
    /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
735
    S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
736
    S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);
737
 
738
    /* co1 & si1 are read from Coefficient pointer */
739
    Co1 = pCoef16[ic * 2U];
740
    Si1 = pCoef16[(ic * 2U) + 1];
741
    /*  Butterfly process for the i0+fftLen/2 sample */
742
    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
743
    out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
744
    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
745
    out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
746
 
747
    /* writing output(xb', yb') in little endian format */
748
    pSrc16[i2 * 2U] = out1;
749
    pSrc16[(i2 * 2U) + 1] = out2;
750
 
751
    /* Co3 & si3 are read from Coefficient pointer */
752
    Co3 = pCoef16[3U * (ic * 2U)];
753
    Si3 = pCoef16[(3U * (ic * 2U)) + 1];
754
    /*  Butterfly process for the i0+3fftLen/4 sample */
755
    /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
756
    out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
757
    /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
758
    out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
759
    /* writing output(xd', yd') in little endian format */
760
    pSrc16[i3 * 2U] = out1;
761
    pSrc16[(i3 * 2U) + 1] = out2;
762
 
763
    /*  Twiddle coefficients index modifier */
764
    ic = ic + twidCoefModifier;
765
 
766
    /*  Updating input index */
767
    i0 = i0 + 1U;
768
 
769
  } while (--j);
770
  /* data is in 4.11(q11) format */
771
 
772
  /* end of first stage process */
773
 
774
 
775
  /* start of middle stage process */
776
 
777
  /*  Twiddle coefficients index modifier */
778
  twidCoefModifier <<= 2U;
779
 
780
  /*  Calculation of Middle stage */
781
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
782
  {
783
    /*  Initializations for the middle stage */
784
    n1 = n2;
785
    n2 >>= 2U;
786
    ic = 0U;
787
 
788
    for (j = 0U; j <= (n2 - 1U); j++)
789
    {
790
      /*  index calculation for the coefficients */
791
      Co1 = pCoef16[ic * 2U];
792
      Si1 = pCoef16[(ic * 2U) + 1U];
793
      Co2 = pCoef16[2U * (ic * 2U)];
794
      Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
795
      Co3 = pCoef16[3U * (ic * 2U)];
796
      Si3 = pCoef16[(3U * (ic * 2U)) + 1U];
797
 
798
      /*  Twiddle coefficients index modifier */
799
      ic = ic + twidCoefModifier;
800
 
801
      /*  Butterfly implementation */
802
      for (i0 = j; i0 < fftLen; i0 += n1)
803
      {
804
        /*  index calculation for the input as, */
805
        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
806
        i1 = i0 + n2;
807
        i2 = i1 + n2;
808
        i3 = i2 + n2;
809
 
810
        /*  Reading i0, i0+fftLen/2 inputs */
811
        /* Read ya (real), xa(imag) input */
812
        T0 = pSrc16[i0 * 2U];
813
        T1 = pSrc16[(i0 * 2U) + 1U];
814
 
815
        /* Read yc (real), xc(imag) input */
816
        S0 = pSrc16[i2 * 2U];
817
        S1 = pSrc16[(i2 * 2U) + 1U];
818
 
819
        /* R0 = (ya + yc), R1 = (xa + xc) */
820
        R0 = __SSAT(T0 + S0, 16);
821
        R1 = __SSAT(T1 + S1, 16);
822
 
823
        /* S0 = (ya - yc), S1 =(xa - xc) */
824
        S0 = __SSAT(T0 - S0, 16);
825
        S1 = __SSAT(T1 - S1, 16);
826
 
827
        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
828
        /* Read yb (real), xb(imag) input */
829
        T0 = pSrc16[i1 * 2U];
830
        T1 = pSrc16[(i1 * 2U) + 1U];
831
 
832
        /* Read yd (real), xd(imag) input */
833
        U0 = pSrc16[i3 * 2U];
834
        U1 = pSrc16[(i3 * 2U) + 1U];
835
 
836
 
837
        /* T0 = (yb + yd), T1 = (xb + xd) */
838
        T0 = __SSAT(T0 + U0, 16);
839
        T1 = __SSAT(T1 + U1, 16);
840
 
841
        /*  writing the butterfly processed i0 sample */
842
 
843
        /* xa' = xa + xb + xc + xd */
844
        /* ya' = ya + yb + yc + yd */
845
        out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
846
        out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
847
 
848
        pSrc16[i0 * 2U] = out1;
849
        pSrc16[(2U * i0) + 1U] = out2;
850
 
851
        /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
852
        R0 = (R0 >> 1U) - (T0 >> 1U);
853
        R1 = (R1 >> 1U) - (T1 >> 1U);
854
 
855
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
856
        out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
857
 
858
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
859
        out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
860
 
861
        /*  Reading i0+3fftLen/4 */
862
        /* Read yb (real), xb(imag) input */
863
        T0 = pSrc16[i1 * 2U];
864
        T1 = pSrc16[(i1 * 2U) + 1U];
865
 
866
        /*  writing the butterfly processed i0 + fftLen/4 sample */
867
        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
868
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
869
        pSrc16[i1 * 2U] = out1;
870
        pSrc16[(i1 * 2U) + 1U] = out2;
871
 
872
        /*  Butterfly calculations */
873
 
874
        /* Read yd (real), xd(imag) input */
875
        U0 = pSrc16[i3 * 2U];
876
        U1 = pSrc16[(i3 * 2U) + 1U];
877
 
878
        /* T0 = yb-yd, T1 = xb-xd */
879
        T0 = __SSAT(T0 - U0, 16);
880
        T1 = __SSAT(T1 - U1, 16);
881
 
882
        /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
883
        R0 = (S0 >> 1U) - (T1 >> 1U);
884
        R1 = (S1 >> 1U) + (T0 >> 1U);
885
 
886
        /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
887
        S0 = (S0 >> 1U) + (T1 >> 1U);
888
        S1 = (S1 >> 1U) - (T0 >> 1U);
889
 
890
        /*  Butterfly process for the i0+fftLen/2 sample */
891
        out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);
892
 
893
        out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);
894
 
895
        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
896
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
897
        pSrc16[i2 * 2U] = out1;
898
        pSrc16[(i2 * 2U) + 1U] = out2;
899
 
900
        /*  Butterfly process for the i0+3fftLen/4 sample */
901
        out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
902
 
903
        out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
904
        /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
905
        /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
906
        pSrc16[i3 * 2U] = out1;
907
        pSrc16[(i3 * 2U) + 1U] = out2;
908
      }
909
    }
910
    /*  Twiddle coefficients index modifier */
911
    twidCoefModifier <<= 2U;
912
  }
913
  /* end of middle stage process */
914
 
915
 
916
  /* data is in 10.6(q6) format for the 1024 point */
917
  /* data is in 8.8(q8) format for the 256 point */
918
  /* data is in 6.10(q10) format for the 64 point */
919
  /* data is in 4.12(q12) format for the 16 point */
920
 
921
  /*  Initializations for the last stage */
922
  n1 = n2;
923
  n2 >>= 2U;
924
 
925
  /* start of last stage process */
926
 
927
  /*  Butterfly implementation */
928
  for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
929
  {
930
    /*  index calculation for the input as, */
931
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
932
    i1 = i0 + n2;
933
    i2 = i1 + n2;
934
    i3 = i2 + n2;
935
 
936
    /*  Reading i0, i0+fftLen/2 inputs */
937
    /* Read ya (real), xa(imag) input */
938
    T0 = pSrc16[i0 * 2U];
939
    T1 = pSrc16[(i0 * 2U) + 1U];
940
 
941
    /* Read yc (real), xc(imag) input */
942
    S0 = pSrc16[i2 * 2U];
943
    S1 = pSrc16[(i2 * 2U) + 1U];
944
 
945
    /* R0 = (ya + yc), R1 = (xa + xc) */
946
    R0 = __SSAT(T0 + S0, 16U);
947
    R1 = __SSAT(T1 + S1, 16U);
948
 
949
    /* S0 = (ya - yc), S1 = (xa - xc) */
950
    S0 = __SSAT(T0 - S0, 16U);
951
    S1 = __SSAT(T1 - S1, 16U);
952
 
953
    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
954
    /* Read yb (real), xb(imag) input */
955
    T0 = pSrc16[i1 * 2U];
956
    T1 = pSrc16[(i1 * 2U) + 1U];
957
    /* Read yd (real), xd(imag) input */
958
    U0 = pSrc16[i3 * 2U];
959
    U1 = pSrc16[(i3 * 2U) + 1U];
960
 
961
    /* T0 = (yb + yd), T1 = (xb + xd)) */
962
    T0 = __SSAT(T0 + U0, 16U);
963
    T1 = __SSAT(T1 + U1, 16U);
964
 
965
    /*  writing the butterfly processed i0 sample */
966
    /* xa' = xa + xb + xc + xd */
967
    /* ya' = ya + yb + yc + yd */
968
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
969
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
970
 
971
    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
972
    R0 = (R0 >> 1U) - (T0 >> 1U);
973
    R1 = (R1 >> 1U) - (T1 >> 1U);
974
    /* Read yb (real), xb(imag) input */
975
    T0 = pSrc16[i1 * 2U];
976
    T1 = pSrc16[(i1 * 2U) + 1U];
977
 
978
    /*  writing the butterfly processed i0 + fftLen/4 sample */
979
    /* xc' = (xa-xb+xc-xd) */
980
    /* yc' = (ya-yb+yc-yd) */
981
    pSrc16[i1 * 2U] = R0;
982
    pSrc16[(i1 * 2U) + 1U] = R1;
983
 
984
    /* Read yd (real), xd(imag) input */
985
    U0 = pSrc16[i3 * 2U];
986
    U1 = pSrc16[(i3 * 2U) + 1U];
987
    /* T0 = (yb - yd), T1 = (xb - xd)  */
988
    T0 = __SSAT(T0 - U0, 16U);
989
    T1 = __SSAT(T1 - U1, 16U);
990
 
991
    /*  writing the butterfly processed i0 + fftLen/2 sample */
992
    /* xb' = (xa+yb-xc-yd) */
993
    /* yb' = (ya-xb-yc+xd) */
994
    pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
995
    pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
996
 
997
    /*  writing the butterfly processed i0 + 3fftLen/4 sample */
998
    /* xd' = (xa-yb-xc+yd) */
999
    /* yd' = (ya+xb-yc-xd) */
1000
    pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
1001
    pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
1002
 
1003
  }
1004
 
1005
  /* end of last stage process */
1006
 
1007
  /* output is in 11.5(q5) format for the 1024 point */
1008
  /* output is in 9.7(q7) format for the 256 point   */
1009
  /* output is in 7.9(q9) format for the 64 point  */
1010
  /* output is in 5.11(q11) format for the 16 point  */
1011
 
1012
#endif /* #if defined (ARM_MATH_DSP) */
1013
 
1014
}
1015
 
1016
 
1017
/**
1018
 * @brief  Core function for the Q15 CIFFT butterfly process.
1019
 * @param[in, out] *pSrc16          points to the in-place buffer of Q15 data type.
1020
 * @param[in]      fftLen           length of the FFT.
1021
 * @param[in]      *pCoef16         points to twiddle coefficient buffer.
1022
 * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
1023
 * @return none.
1024
 */
1025
 
1026
/*
1027
* Radix-4 IFFT algorithm used is :
1028
*
1029
* CIFFT uses same twiddle coefficients as CFFT function
1030
*  x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]
1031
*
1032
*
1033
* IFFT is implemented with following changes in equations from FFT
1034
*
1035
* Input real and imaginary data:
1036
* x(n) = xa + j * ya
1037
* x(n+N/4 ) = xb + j * yb
1038
* x(n+N/2 ) = xc + j * yc
1039
* x(n+3N/4) = xd + j * yd
1040
*
1041
*
1042
* Output real and imaginary data:
1043
* x(4r) = xa'+ j * ya'
1044
* x(4r+1) = xb'+ j * yb'
1045
* x(4r+2) = xc'+ j * yc'
1046
* x(4r+3) = xd'+ j * yd'
1047
*
1048
*
1049
* Twiddle factors for radix-4 IFFT:
1050
* Wn = co1 + j * (si1)
1051
* W2n = co2 + j * (si2)
1052
* W3n = co3 + j * (si3)
1053
 
1054
* The real and imaginary output values for the radix-4 butterfly are
1055
* xa' = xa + xb + xc + xd
1056
* ya' = ya + yb + yc + yd
1057
* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
1058
* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
1059
* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
1060
* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
1061
* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
1062
* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
1063
*
1064
*/
1065
 
1066
void arm_radix4_butterfly_inverse_q15(
1067
  q15_t * pSrc16,
1068
  uint32_t fftLen,
1069
  q15_t * pCoef16,
1070
  uint32_t twidCoefModifier)
1071
{
1072
 
1073
#if defined (ARM_MATH_DSP)
1074
 
1075
  /* Run the below code for Cortex-M4 and Cortex-M3 */
1076
 
1077
  q31_t R, S, T, U;
1078
  q31_t C1, C2, C3, out1, out2;
1079
  uint32_t n1, n2, ic, i0, j, k;
1080
 
1081
  q15_t *ptr1;
1082
  q15_t *pSi0;
1083
  q15_t *pSi1;
1084
  q15_t *pSi2;
1085
  q15_t *pSi3;
1086
 
1087
  q31_t xaya, xbyb, xcyc, xdyd;
1088
 
1089
  /* Total process is divided into three stages */
1090
 
1091
  /* process first stage, middle stages, & last stage */
1092
 
1093
  /*  Initializations for the first stage */
1094
  n2 = fftLen;
1095
  n1 = n2;
1096
 
1097
  /* n2 = fftLen/4 */
1098
  n2 >>= 2U;
1099
 
1100
  /* Index for twiddle coefficient */
1101
  ic = 0U;
1102
 
1103
  /* Index for input read and output write */
1104
  j = n2;
1105
 
1106
  pSi0 = pSrc16;
1107
  pSi1 = pSi0 + 2 * n2;
1108
  pSi2 = pSi1 + 2 * n2;
1109
  pSi3 = pSi2 + 2 * n2;
1110
 
1111
  /* Input is in 1.15(q15) format */
1112
 
1113
  /*  start of first stage process */
1114
  do
1115
  {
1116
    /*  Butterfly implementation */
1117
 
1118
    /*  Reading i0, i0+fftLen/2 inputs */
1119
    /* Read ya (real), xa(imag) input */
1120
    T = _SIMD32_OFFSET(pSi0);
1121
    T = __SHADD16(T, 0);
1122
    T = __SHADD16(T, 0);
1123
 
1124
    /* Read yc (real), xc(imag) input */
1125
    S = _SIMD32_OFFSET(pSi2);
1126
    S = __SHADD16(S, 0);
1127
    S = __SHADD16(S, 0);
1128
 
1129
    /* R = packed((ya + yc), (xa + xc) ) */
1130
    R = __QADD16(T, S);
1131
 
1132
    /* S = packed((ya - yc), (xa - xc) ) */
1133
    S = __QSUB16(T, S);
1134
 
1135
    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1136
    /* Read yb (real), xb(imag) input */
1137
    T = _SIMD32_OFFSET(pSi1);
1138
    T = __SHADD16(T, 0);
1139
    T = __SHADD16(T, 0);
1140
 
1141
    /* Read yd (real), xd(imag) input */
1142
    U = _SIMD32_OFFSET(pSi3);
1143
    U = __SHADD16(U, 0);
1144
    U = __SHADD16(U, 0);
1145
 
1146
    /* T = packed((yb + yd), (xb + xd) ) */
1147
    T = __QADD16(T, U);
1148
 
1149
    /*  writing the butterfly processed i0 sample */
1150
    /* xa' = xa + xb + xc + xd */
1151
    /* ya' = ya + yb + yc + yd */
1152
    _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
1153
    pSi0 += 2;
1154
 
1155
    /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
1156
    R = __QSUB16(R, T);
1157
 
1158
    /* co2 & si2 are read from SIMD Coefficient pointer */
1159
    C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
1160
 
1161
#ifndef ARM_MATH_BIG_ENDIAN
1162
 
1163
    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
1164
    out1 = __SMUSD(C2, R) >> 16U;
1165
    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1166
    out2 = __SMUADX(C2, R);
1167
 
1168
#else
1169
 
1170
    /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1171
    out1 = __SMUADX(C2, R) >> 16U;
1172
    /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
1173
    out2 = __SMUSD(__QSUB16(0, C2), R);
1174
 
1175
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1176
 
1177
    /*  Reading i0+fftLen/4 */
1178
    /* T = packed(yb, xb) */
1179
    T = _SIMD32_OFFSET(pSi1);
1180
    T = __SHADD16(T, 0);
1181
    T = __SHADD16(T, 0);
1182
 
1183
    /* writing the butterfly processed i0 + fftLen/4 sample */
1184
    /* writing output(xc', yc') in little endian format */
1185
    _SIMD32_OFFSET(pSi1) =
1186
      (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1187
    pSi1 += 2;
1188
 
1189
    /*  Butterfly calculations */
1190
    /* U = packed(yd, xd) */
1191
    U = _SIMD32_OFFSET(pSi3);
1192
    U = __SHADD16(U, 0);
1193
    U = __SHADD16(U, 0);
1194
 
1195
    /* T = packed(yb-yd, xb-xd) */
1196
    T = __QSUB16(T, U);
1197
 
1198
#ifndef ARM_MATH_BIG_ENDIAN
1199
 
1200
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1201
    R = __QSAX(S, T);
1202
    /* S = packed((ya-yc) + (xb- xd),  (xa-xc) - (yb-yd)) */
1203
    S = __QASX(S, T);
1204
 
1205
#else
1206
 
1207
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1208
    R = __QASX(S, T);
1209
    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
1210
    S = __QSAX(S, T);
1211
 
1212
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1213
 
1214
    /* co1 & si1 are read from SIMD Coefficient pointer */
1215
    C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
1216
    /*  Butterfly process for the i0+fftLen/2 sample */
1217
 
1218
#ifndef ARM_MATH_BIG_ENDIAN
1219
 
1220
    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
1221
    out1 = __SMUSD(C1, S) >> 16U;
1222
    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
1223
    out2 = __SMUADX(C1, S);
1224
 
1225
#else
1226
 
1227
    /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
1228
    out1 = __SMUADX(C1, S) >> 16U;
1229
    /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
1230
    out2 = __SMUSD(__QSUB16(0, C1), S);
1231
 
1232
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1233
 
1234
    /* writing output(xb', yb') in little endian format */
1235
    _SIMD32_OFFSET(pSi2) =
1236
      ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
1237
    pSi2 += 2;
1238
 
1239
 
1240
    /* co3 & si3 are read from SIMD Coefficient pointer */
1241
    C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
1242
    /*  Butterfly process for the i0+3fftLen/4 sample */
1243
 
1244
#ifndef ARM_MATH_BIG_ENDIAN
1245
 
1246
    /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
1247
    out1 = __SMUSD(C3, R) >> 16U;
1248
    /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
1249
    out2 = __SMUADX(C3, R);
1250
 
1251
#else
1252
 
1253
    /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
1254
    out1 = __SMUADX(C3, R) >> 16U;
1255
    /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
1256
    out2 = __SMUSD(__QSUB16(0, C3), R);
1257
 
1258
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1259
 
1260
    /* writing output(xd', yd') in little endian format */
1261
    _SIMD32_OFFSET(pSi3) =
1262
      ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1263
    pSi3 += 2;
1264
 
1265
    /*  Twiddle coefficients index modifier */
1266
    ic = ic + twidCoefModifier;
1267
 
1268
  } while (--j);
1269
  /* data is in 4.11(q11) format */
1270
 
1271
  /* end of first stage process */
1272
 
1273
 
1274
  /* start of middle stage process */
1275
 
1276
  /*  Twiddle coefficients index modifier */
1277
  twidCoefModifier <<= 2U;
1278
 
1279
  /*  Calculation of Middle stage */
1280
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
1281
  {
1282
    /*  Initializations for the middle stage */
1283
    n1 = n2;
1284
    n2 >>= 2U;
1285
    ic = 0U;
1286
 
1287
    for (j = 0U; j <= (n2 - 1U); j++)
1288
    {
1289
      /*  index calculation for the coefficients */
1290
      C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
1291
      C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
1292
      C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
1293
 
1294
      /*  Twiddle coefficients index modifier */
1295
      ic = ic + twidCoefModifier;
1296
 
1297
      pSi0 = pSrc16 + 2 * j;
1298
      pSi1 = pSi0 + 2 * n2;
1299
      pSi2 = pSi1 + 2 * n2;
1300
      pSi3 = pSi2 + 2 * n2;
1301
 
1302
      /*  Butterfly implementation */
1303
      for (i0 = j; i0 < fftLen; i0 += n1)
1304
      {
1305
        /*  Reading i0, i0+fftLen/2 inputs */
1306
        /* Read ya (real), xa(imag) input */
1307
        T = _SIMD32_OFFSET(pSi0);
1308
 
1309
        /* Read yc (real), xc(imag) input */
1310
        S = _SIMD32_OFFSET(pSi2);
1311
 
1312
        /* R = packed( (ya + yc), (xa + xc)) */
1313
        R = __QADD16(T, S);
1314
 
1315
        /* S = packed((ya - yc), (xa - xc)) */
1316
        S = __QSUB16(T, S);
1317
 
1318
        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1319
        /* Read yb (real), xb(imag) input */
1320
        T = _SIMD32_OFFSET(pSi1);
1321
 
1322
        /* Read yd (real), xd(imag) input */
1323
        U = _SIMD32_OFFSET(pSi3);
1324
 
1325
        /* T = packed( (yb + yd), (xb + xd)) */
1326
        T = __QADD16(T, U);
1327
 
1328
        /*  writing the butterfly processed i0 sample */
1329
 
1330
        /* xa' = xa + xb + xc + xd */
1331
        /* ya' = ya + yb + yc + yd */
1332
        out1 = __SHADD16(R, T);
1333
        out1 = __SHADD16(out1, 0);
1334
        _SIMD32_OFFSET(pSi0) = out1;
1335
        pSi0 += 2 * n1;
1336
 
1337
        /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
1338
        R = __SHSUB16(R, T);
1339
 
1340
#ifndef ARM_MATH_BIG_ENDIAN
1341
 
1342
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
1343
        out1 = __SMUSD(C2, R) >> 16U;
1344
 
1345
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1346
        out2 = __SMUADX(C2, R);
1347
 
1348
#else
1349
 
1350
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1351
        out1 = __SMUADX(R, C2) >> 16U;
1352
 
1353
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
1354
        out2 = __SMUSD(__QSUB16(0, C2), R);
1355
 
1356
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1357
 
1358
        /*  Reading i0+3fftLen/4 */
1359
        /* Read yb (real), xb(imag) input */
1360
        T = _SIMD32_OFFSET(pSi1);
1361
 
1362
        /*  writing the butterfly processed i0 + fftLen/4 sample */
1363
        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
1364
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1365
        _SIMD32_OFFSET(pSi1) =
1366
          ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1367
        pSi1 += 2 * n1;
1368
 
1369
        /*  Butterfly calculations */
1370
 
1371
        /* Read yd (real), xd(imag) input */
1372
        U = _SIMD32_OFFSET(pSi3);
1373
 
1374
        /* T = packed(yb-yd, xb-xd) */
1375
        T = __QSUB16(T, U);
1376
 
1377
#ifndef ARM_MATH_BIG_ENDIAN
1378
 
1379
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1380
        R = __SHSAX(S, T);
1381
 
1382
        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
1383
        S = __SHASX(S, T);
1384
 
1385
 
1386
        /*  Butterfly process for the i0+fftLen/2 sample */
1387
        out1 = __SMUSD(C1, S) >> 16U;
1388
        out2 = __SMUADX(C1, S);
1389
 
1390
#else
1391
 
1392
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1393
        R = __SHASX(S, T);
1394
 
1395
        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
1396
        S = __SHSAX(S, T);
1397
 
1398
 
1399
        /*  Butterfly process for the i0+fftLen/2 sample */
1400
        out1 = __SMUADX(S, C1) >> 16U;
1401
        out2 = __SMUSD(__QSUB16(0, C1), S);
1402
 
1403
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1404
 
1405
        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
1406
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
1407
        _SIMD32_OFFSET(pSi2) =
1408
          ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1409
        pSi2 += 2 * n1;
1410
 
1411
        /*  Butterfly process for the i0+3fftLen/4 sample */
1412
 
1413
#ifndef ARM_MATH_BIG_ENDIAN
1414
 
1415
        out1 = __SMUSD(C3, R) >> 16U;
1416
        out2 = __SMUADX(C3, R);
1417
 
1418
#else
1419
 
1420
        out1 = __SMUADX(C3, R) >> 16U;
1421
        out2 = __SMUSD(__QSUB16(0, C3), R);
1422
 
1423
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1424
 
1425
        /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
1426
        /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
1427
        _SIMD32_OFFSET(pSi3) =
1428
          ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1429
        pSi3 += 2 * n1;
1430
      }
1431
    }
1432
    /*  Twiddle coefficients index modifier */
1433
    twidCoefModifier <<= 2U;
1434
  }
1435
  /* end of middle stage process */
1436
 
1437
  /* data is in 10.6(q6) format for the 1024 point */
1438
  /* data is in 8.8(q8) format for the 256 point */
1439
  /* data is in 6.10(q10) format for the 64 point */
1440
  /* data is in 4.12(q12) format for the 16 point */
1441
 
1442
  /*  Initializations for the last stage */
1443
  j = fftLen >> 2;
1444
 
1445
  ptr1 = &pSrc16[0];
1446
 
1447
  /* start of last stage process */
1448
 
1449
  /*  Butterfly implementation */
1450
  do
1451
  {
1452
    /* Read xa (real), ya(imag) input */
1453
    xaya = *__SIMD32(ptr1)++;
1454
 
1455
    /* Read xb (real), yb(imag) input */
1456
    xbyb = *__SIMD32(ptr1)++;
1457
 
1458
    /* Read xc (real), yc(imag) input */
1459
    xcyc = *__SIMD32(ptr1)++;
1460
 
1461
    /* Read xd (real), yd(imag) input */
1462
    xdyd = *__SIMD32(ptr1)++;
1463
 
1464
    /* R = packed((ya + yc), (xa + xc)) */
1465
    R = __QADD16(xaya, xcyc);
1466
 
1467
    /* T = packed((yb + yd), (xb + xd)) */
1468
    T = __QADD16(xbyb, xdyd);
1469
 
1470
    /* pointer updation for writing */
1471
    ptr1 = ptr1 - 8U;
1472
 
1473
 
1474
    /* xa' = xa + xb + xc + xd */
1475
    /* ya' = ya + yb + yc + yd */
1476
    *__SIMD32(ptr1)++ = __SHADD16(R, T);
1477
 
1478
    /* T = packed((yb + yd), (xb + xd)) */
1479
    T = __QADD16(xbyb, xdyd);
1480
 
1481
    /* xc' = (xa-xb+xc-xd) */
1482
    /* yc' = (ya-yb+yc-yd) */
1483
    *__SIMD32(ptr1)++ = __SHSUB16(R, T);
1484
 
1485
    /* S = packed((ya - yc), (xa - xc)) */
1486
    S = __QSUB16(xaya, xcyc);
1487
 
1488
    /* Read yd (real), xd(imag) input */
1489
    /* T = packed( (yb - yd), (xb - xd))  */
1490
    U = __QSUB16(xbyb, xdyd);
1491
 
1492
#ifndef ARM_MATH_BIG_ENDIAN
1493
 
1494
    /* xb' = (xa+yb-xc-yd) */
1495
    /* yb' = (ya-xb-yc+xd) */
1496
    *__SIMD32(ptr1)++ = __SHASX(S, U);
1497
 
1498
 
1499
    /* xd' = (xa-yb-xc+yd) */
1500
    /* yd' = (ya+xb-yc-xd) */
1501
    *__SIMD32(ptr1)++ = __SHSAX(S, U);
1502
 
1503
#else
1504
 
1505
    /* xb' = (xa+yb-xc-yd) */
1506
    /* yb' = (ya-xb-yc+xd) */
1507
    *__SIMD32(ptr1)++ = __SHSAX(S, U);
1508
 
1509
 
1510
    /* xd' = (xa-yb-xc+yd) */
1511
    /* yd' = (ya+xb-yc-xd) */
1512
    *__SIMD32(ptr1)++ = __SHASX(S, U);
1513
 
1514
 
1515
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1516
 
1517
  } while (--j);
1518
 
1519
  /* end of last stage  process */
1520
 
1521
  /* output is in 11.5(q5) format for the 1024 point */
1522
  /* output is in 9.7(q7) format for the 256 point   */
1523
  /* output is in 7.9(q9) format for the 64 point  */
1524
  /* output is in 5.11(q11) format for the 16 point  */
1525
 
1526
 
1527
#else
1528
 
1529
  /* Run the below code for Cortex-M0 */
1530
 
1531
  q15_t R0, R1, S0, S1, T0, T1, U0, U1;
1532
  q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
1533
  uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
1534
 
1535
  /* Total process is divided into three stages */
1536
 
1537
  /* process first stage, middle stages, & last stage */
1538
 
1539
  /*  Initializations for the first stage */
1540
  n2 = fftLen;
1541
  n1 = n2;
1542
 
1543
  /* n2 = fftLen/4 */
1544
  n2 >>= 2U;
1545
 
1546
  /* Index for twiddle coefficient */
1547
  ic = 0U;
1548
 
1549
  /* Index for input read and output write */
1550
  i0 = 0U;
1551
 
1552
  j = n2;
1553
 
1554
  /* Input is in 1.15(q15) format */
1555
 
1556
  /*  Start of first stage process */
1557
  do
1558
  {
1559
    /*  Butterfly implementation */
1560
 
1561
    /*  index calculation for the input as, */
1562
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1563
    i1 = i0 + n2;
1564
    i2 = i1 + n2;
1565
    i3 = i2 + n2;
1566
 
1567
    /*  Reading i0, i0+fftLen/2 inputs */
1568
    /* input is down scale by 4 to avoid overflow */
1569
    /* Read ya (real), xa(imag) input */
1570
    T0 = pSrc16[i0 * 2U] >> 2U;
1571
    T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
1572
    /* input is down scale by 4 to avoid overflow */
1573
    /* Read yc (real), xc(imag) input */
1574
    S0 = pSrc16[i2 * 2U] >> 2U;
1575
    S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
1576
 
1577
    /* R0 = (ya + yc), R1 = (xa + xc) */
1578
    R0 = __SSAT(T0 + S0, 16U);
1579
    R1 = __SSAT(T1 + S1, 16U);
1580
    /* S0 = (ya - yc), S1 = (xa - xc) */
1581
    S0 = __SSAT(T0 - S0, 16U);
1582
    S1 = __SSAT(T1 - S1, 16U);
1583
 
1584
    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1585
    /* input is down scale by 4 to avoid overflow */
1586
    /* Read yb (real), xb(imag) input */
1587
    T0 = pSrc16[i1 * 2U] >> 2U;
1588
    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
1589
    /* Read yd (real), xd(imag) input */
1590
    /* input is down scale by 4 to avoid overflow */
1591
    U0 = pSrc16[i3 * 2U] >> 2U;
1592
    U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
1593
 
1594
    /* T0 = (yb + yd), T1 = (xb + xd) */
1595
    T0 = __SSAT(T0 + U0, 16U);
1596
    T1 = __SSAT(T1 + U1, 16U);
1597
 
1598
    /*  writing the butterfly processed i0 sample */
1599
    /* xa' = xa + xb + xc + xd */
1600
    /* ya' = ya + yb + yc + yd */
1601
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
1602
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
1603
 
1604
    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc)- (xb + xd) */
1605
    R0 = __SSAT(R0 - T0, 16U);
1606
    R1 = __SSAT(R1 - T1, 16U);
1607
    /* co2 & si2 are read from Coefficient pointer */
1608
    Co2 = pCoef16[2U * ic * 2U];
1609
    Si2 = pCoef16[(2U * ic * 2U) + 1U];
1610
    /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1611
    out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
1612
    /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1613
    out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);
1614
 
1615
    /*  Reading i0+fftLen/4 */
1616
    /* input is down scale by 4 to avoid overflow */
1617
    /* T0 = yb, T1 = xb */
1618
    T0 = pSrc16[i1 * 2U] >> 2U;
1619
    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
1620
 
1621
    /* writing the butterfly processed i0 + fftLen/4 sample */
1622
    /* writing output(xc', yc') in little endian format */
1623
    pSrc16[i1 * 2U] = out1;
1624
    pSrc16[(i1 * 2U) + 1U] = out2;
1625
 
1626
    /*  Butterfly calculations */
1627
    /* input is down scale by 4 to avoid overflow */
1628
    /* U0 = yd, U1 = xd) */
1629
    U0 = pSrc16[i3 * 2U] >> 2U;
1630
    U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
1631
 
1632
    /* T0 = yb-yd, T1 = xb-xd) */
1633
    T0 = __SSAT(T0 - U0, 16U);
1634
    T1 = __SSAT(T1 - U1, 16U);
1635
    /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
1636
    R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
1637
    R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
1638
    /* S = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
1639
    S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
1640
    S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
1641
 
1642
    /* co1 & si1 are read from Coefficient pointer */
1643
    Co1 = pCoef16[ic * 2U];
1644
    Si1 = pCoef16[(ic * 2U) + 1U];
1645
    /*  Butterfly process for the i0+fftLen/2 sample */
1646
    /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1647
    out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
1648
    /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1649
    out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
1650
    /* writing output(xb', yb') in little endian format */
1651
    pSrc16[i2 * 2U] = out1;
1652
    pSrc16[(i2 * 2U) + 1U] = out2;
1653
 
1654
    /* Co3 & si3 are read from Coefficient pointer */
1655
    Co3 = pCoef16[3U * ic * 2U];
1656
    Si3 = pCoef16[(3U * ic * 2U) + 1U];
1657
    /*  Butterfly process for the i0+3fftLen/4 sample */
1658
    /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1659
    out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
1660
    /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1661
    out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
1662
    /* writing output(xd', yd') in little endian format */
1663
    pSrc16[i3 * 2U] = out1;
1664
    pSrc16[(i3 * 2U) + 1U] = out2;
1665
 
1666
    /*  Twiddle coefficients index modifier */
1667
    ic = ic + twidCoefModifier;
1668
 
1669
    /*  Updating input index */
1670
    i0 = i0 + 1U;
1671
 
1672
  } while (--j);
1673
 
1674
  /*  End of first stage process */
1675
 
1676
  /* data is in 4.11(q11) format */
1677
 
1678
 
1679
  /*  Start of Middle stage process */
1680
 
1681
  /*  Twiddle coefficients index modifier */
1682
  twidCoefModifier <<= 2U;
1683
 
1684
  /*  Calculation of Middle stage */
1685
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
1686
  {
1687
    /*  Initializations for the middle stage */
1688
    n1 = n2;
1689
    n2 >>= 2U;
1690
    ic = 0U;
1691
 
1692
    for (j = 0U; j <= (n2 - 1U); j++)
1693
    {
1694
      /*  index calculation for the coefficients */
1695
      Co1 = pCoef16[ic * 2U];
1696
      Si1 = pCoef16[(ic * 2U) + 1U];
1697
      Co2 = pCoef16[2U * ic * 2U];
1698
      Si2 = pCoef16[2U * ic * 2U + 1U];
1699
      Co3 = pCoef16[3U * ic * 2U];
1700
      Si3 = pCoef16[(3U * ic * 2U) + 1U];
1701
 
1702
      /*  Twiddle coefficients index modifier */
1703
      ic = ic + twidCoefModifier;
1704
 
1705
      /*  Butterfly implementation */
1706
      for (i0 = j; i0 < fftLen; i0 += n1)
1707
      {
1708
        /*  index calculation for the input as, */
1709
        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1710
        i1 = i0 + n2;
1711
        i2 = i1 + n2;
1712
        i3 = i2 + n2;
1713
 
1714
        /*  Reading i0, i0+fftLen/2 inputs */
1715
        /* Read ya (real), xa(imag) input */
1716
        T0 = pSrc16[i0 * 2U];
1717
        T1 = pSrc16[(i0 * 2U) + 1U];
1718
 
1719
        /* Read yc (real), xc(imag) input */
1720
        S0 = pSrc16[i2 * 2U];
1721
        S1 = pSrc16[(i2 * 2U) + 1U];
1722
 
1723
 
1724
        /* R0 = (ya + yc), R1 = (xa + xc) */
1725
        R0 = __SSAT(T0 + S0, 16U);
1726
        R1 = __SSAT(T1 + S1, 16U);
1727
        /* S0 = (ya - yc), S1 = (xa - xc) */
1728
        S0 = __SSAT(T0 - S0, 16U);
1729
        S1 = __SSAT(T1 - S1, 16U);
1730
 
1731
        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1732
        /* Read yb (real), xb(imag) input */
1733
        T0 = pSrc16[i1 * 2U];
1734
        T1 = pSrc16[(i1 * 2U) + 1U];
1735
 
1736
        /* Read yd (real), xd(imag) input */
1737
        U0 = pSrc16[i3 * 2U];
1738
        U1 = pSrc16[(i3 * 2U) + 1U];
1739
 
1740
        /* T0 = (yb + yd), T1 = (xb + xd) */
1741
        T0 = __SSAT(T0 + U0, 16U);
1742
        T1 = __SSAT(T1 + U1, 16U);
1743
 
1744
        /*  writing the butterfly processed i0 sample */
1745
        /* xa' = xa + xb + xc + xd */
1746
        /* ya' = ya + yb + yc + yd */
1747
        pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
1748
        pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
1749
 
1750
        /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
1751
        R0 = (R0 >> 1U) - (T0 >> 1U);
1752
        R1 = (R1 >> 1U) - (T1 >> 1U);
1753
 
1754
        /* (ya-yb+yc-yd)* (si2) - (xa-xb+xc-xd)* co2 */
1755
        out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
1756
        /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1757
        out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
1758
 
1759
        /*  Reading i0+3fftLen/4 */
1760
        /* Read yb (real), xb(imag) input */
1761
        T0 = pSrc16[i1 * 2U];
1762
        T1 = pSrc16[(i1 * 2U) + 1U];
1763
 
1764
        /*  writing the butterfly processed i0 + fftLen/4 sample */
1765
        /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1766
        /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1767
        pSrc16[i1 * 2U] = out1;
1768
        pSrc16[(i1 * 2U) + 1U] = out2;
1769
 
1770
        /*  Butterfly calculations */
1771
        /* Read yd (real), xd(imag) input */
1772
        U0 = pSrc16[i3 * 2U];
1773
        U1 = pSrc16[(i3 * 2U) + 1U];
1774
 
1775
        /* T0 = yb-yd, T1 = xb-xd) */
1776
        T0 = __SSAT(T0 - U0, 16U);
1777
        T1 = __SSAT(T1 - U1, 16U);
1778
 
1779
        /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
1780
        R0 = (S0 >> 1U) + (T1 >> 1U);
1781
        R1 = (S1 >> 1U) - (T0 >> 1U);
1782
 
1783
        /* S1 = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
1784
        S0 = (S0 >> 1U) - (T1 >> 1U);
1785
        S1 = (S1 >> 1U) + (T0 >> 1U);
1786
 
1787
        /*  Butterfly process for the i0+fftLen/2 sample */
1788
        out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
1789
        out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
1790
        /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1791
        /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1792
        pSrc16[i2 * 2U] = out1;
1793
        pSrc16[(i2 * 2U) + 1U] = out2;
1794
 
1795
        /*  Butterfly process for the i0+3fftLen/4 sample */
1796
        out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
1797
 
1798
        out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
1799
        /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1800
        /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1801
        pSrc16[i3 * 2U] = out1;
1802
        pSrc16[(i3 * 2U) + 1U] = out2;
1803
 
1804
 
1805
      }
1806
    }
1807
    /*  Twiddle coefficients index modifier */
1808
    twidCoefModifier <<= 2U;
1809
  }
1810
  /*  End of Middle stages process */
1811
 
1812
 
1813
  /* data is in 10.6(q6) format for the 1024 point */
1814
  /* data is in 8.8(q8) format for the 256 point   */
1815
  /* data is in 6.10(q10) format for the 64 point  */
1816
  /* data is in 4.12(q12) format for the 16 point  */
1817
 
1818
  /* start of last stage process */
1819
 
1820
 
1821
  /*  Initializations for the last stage */
1822
  n1 = n2;
1823
  n2 >>= 2U;
1824
 
1825
  /*  Butterfly implementation */
1826
  for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
1827
  {
1828
    /*  index calculation for the input as, */
1829
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1830
    i1 = i0 + n2;
1831
    i2 = i1 + n2;
1832
    i3 = i2 + n2;
1833
 
1834
    /*  Reading i0, i0+fftLen/2 inputs */
1835
    /* Read ya (real), xa(imag) input */
1836
    T0 = pSrc16[i0 * 2U];
1837
    T1 = pSrc16[(i0 * 2U) + 1U];
1838
    /* Read yc (real), xc(imag) input */
1839
    S0 = pSrc16[i2 * 2U];
1840
    S1 = pSrc16[(i2 * 2U) + 1U];
1841
 
1842
    /* R0 = (ya + yc), R1 = (xa + xc) */
1843
    R0 = __SSAT(T0 + S0, 16U);
1844
    R1 = __SSAT(T1 + S1, 16U);
1845
    /* S0 = (ya - yc), S1 = (xa - xc) */
1846
    S0 = __SSAT(T0 - S0, 16U);
1847
    S1 = __SSAT(T1 - S1, 16U);
1848
 
1849
    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1850
    /* Read yb (real), xb(imag) input */
1851
    T0 = pSrc16[i1 * 2U];
1852
    T1 = pSrc16[(i1 * 2U) + 1U];
1853
    /* Read yd (real), xd(imag) input */
1854
    U0 = pSrc16[i3 * 2U];
1855
    U1 = pSrc16[(i3 * 2U) + 1U];
1856
 
1857
    /* T0 = (yb + yd), T1 = (xb + xd) */
1858
    T0 = __SSAT(T0 + U0, 16U);
1859
    T1 = __SSAT(T1 + U1, 16U);
1860
 
1861
    /*  writing the butterfly processed i0 sample */
1862
    /* xa' = xa + xb + xc + xd */
1863
    /* ya' = ya + yb + yc + yd */
1864
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
1865
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
1866
 
1867
    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
1868
    R0 = (R0 >> 1U) - (T0 >> 1U);
1869
    R1 = (R1 >> 1U) - (T1 >> 1U);
1870
 
1871
    /* Read yb (real), xb(imag) input */
1872
    T0 = pSrc16[i1 * 2U];
1873
    T1 = pSrc16[(i1 * 2U) + 1U];
1874
 
1875
    /*  writing the butterfly processed i0 + fftLen/4 sample */
1876
    /* xc' = (xa-xb+xc-xd) */
1877
    /* yc' = (ya-yb+yc-yd) */
1878
    pSrc16[i1 * 2U] = R0;
1879
    pSrc16[(i1 * 2U) + 1U] = R1;
1880
 
1881
    /* Read yd (real), xd(imag) input */
1882
    U0 = pSrc16[i3 * 2U];
1883
    U1 = pSrc16[(i3 * 2U) + 1U];
1884
    /* T0 = (yb - yd), T1 = (xb - xd) */
1885
    T0 = __SSAT(T0 - U0, 16U);
1886
    T1 = __SSAT(T1 - U1, 16U);
1887
 
1888
    /*  writing the butterfly processed i0 + fftLen/2 sample */
1889
    /* xb' = (xa-yb-xc+yd) */
1890
    /* yb' = (ya+xb-yc-xd) */
1891
    pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
1892
    pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
1893
 
1894
 
1895
    /*  writing the butterfly processed i0 + 3fftLen/4 sample */
1896
    /* xd' = (xa+yb-xc-yd) */
1897
    /* yd' = (ya-xb-yc+xd) */
1898
    pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
1899
    pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
1900
  }
1901
  /* end of last stage  process */
1902
 
1903
  /* output is in 11.5(q5) format for the 1024 point */
1904
  /* output is in 9.7(q7) format for the 256 point   */
1905
  /* output is in 7.9(q9) format for the 64 point  */
1906
  /* output is in 5.11(q11) format for the 16 point  */
1907
 
1908
#endif /* #if defined (ARM_MATH_DSP) */
1909
 
1910
}