Subversion Repositories LedShow

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 mjames 1
/* ----------------------------------------------------------------------    
2
* Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
3
*    
4
* $Date:        19. March 2015
5
* $Revision:    V.1.4.5  
6
*    
7
* Project:          CMSIS DSP Library    
8
* Title:            arm_cfft_radix4_q15.c    
9
*    
10
* Description:  This file has function definition of Radix-4 FFT & IFFT function and    
11
*                               In-place bit reversal using bit reversal table    
12
*    
13
* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14
*  
15
* Redistribution and use in source and binary forms, with or without
16
* modification, are permitted provided that the following conditions
17
* are met:
18
*   - Redistributions of source code must retain the above copyright
19
*     notice, this list of conditions and the following disclaimer.
20
*   - Redistributions in binary form must reproduce the above copyright
21
*     notice, this list of conditions and the following disclaimer in
22
*     the documentation and/or other materials provided with the
23
*     distribution.
24
*   - Neither the name of ARM LIMITED nor the names of its contributors
25
*     may be used to endorse or promote products derived from this
26
*     software without specific prior written permission.
27
*
28
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
29
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
31
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
32
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
33
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
34
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
35
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
36
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
38
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39
* POSSIBILITY OF SUCH DAMAGE.    
40
* -------------------------------------------------------------------- */
41
 
42
#include "arm_math.h"
43
 
44
 
45
void arm_radix4_butterfly_q15(
46
  q15_t * pSrc16,
47
  uint32_t fftLen,
48
  q15_t * pCoef16,
49
  uint32_t twidCoefModifier);
50
 
51
void arm_radix4_butterfly_inverse_q15(
52
  q15_t * pSrc16,
53
  uint32_t fftLen,
54
  q15_t * pCoef16,
55
  uint32_t twidCoefModifier);
56
 
57
void arm_bitreversal_q15(
58
  q15_t * pSrc,
59
  uint32_t fftLen,
60
  uint16_t bitRevFactor,
61
  uint16_t * pBitRevTab);
62
 
63
/**    
64
 * @ingroup groupTransforms    
65
 */
66
 
67
/**    
68
 * @addtogroup ComplexFFT    
69
 * @{    
70
 */
71
 
72
 
73
/**    
74
 * @details    
75
 * @brief Processing function for the Q15 CFFT/CIFFT.  
76
 * @deprecated Do not use this function.  It has been superseded by \ref arm_cfft_q15 and will be removed
77
 * @param[in]      *S    points to an instance of the Q15 CFFT/CIFFT structure.  
78
 * @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place.  
79
 * @return none.  
80
 *    
81
 * \par Input and output formats:    
82
 * \par    
83
 * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.  
84
 * Hence the output format is different for different FFT sizes.    
85
 * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:  
86
 * \par  
87
 * \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"    
88
 * \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"    
89
 */
90
 
91
void arm_cfft_radix4_q15(
92
  const arm_cfft_radix4_instance_q15 * S,
93
  q15_t * pSrc)
94
{
95
  if(S->ifftFlag == 1u)
96
  {
97
    /*  Complex IFFT radix-4  */
98
    arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle,
99
                                     S->twidCoefModifier);
100
  }
101
  else
102
  {
103
    /*  Complex FFT radix-4  */
104
    arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle,
105
                             S->twidCoefModifier);
106
  }
107
 
108
  if(S->bitReverseFlag == 1u)
109
  {
110
    /*  Bit Reversal */
111
    arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
112
  }
113
 
114
}
115
 
116
/**    
117
 * @} end of ComplexFFT group    
118
 */
119
 
120
/*    
121
* Radix-4 FFT algorithm used is :    
122
*    
123
* Input real and imaginary data:    
124
* x(n) = xa + j * ya    
125
* x(n+N/4 ) = xb + j * yb    
126
* x(n+N/2 ) = xc + j * yc    
127
* x(n+3N/4) = xd + j * yd    
128
*    
129
*    
130
* Output real and imaginary data:    
131
* x(4r) = xa'+ j * ya'    
132
* x(4r+1) = xb'+ j * yb'    
133
* x(4r+2) = xc'+ j * yc'    
134
* x(4r+3) = xd'+ j * yd'    
135
*    
136
*    
137
* Twiddle factors for radix-4 FFT:    
138
* Wn = co1 + j * (- si1)    
139
* W2n = co2 + j * (- si2)    
140
* W3n = co3 + j * (- si3)    
141
 
142
* The real and imaginary output values for the radix-4 butterfly are    
143
* xa' = xa + xb + xc + xd    
144
* ya' = ya + yb + yc + yd    
145
* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)    
146
* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)    
147
* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)    
148
* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)    
149
* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)    
150
* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)    
151
*    
152
*/
153
 
154
/**    
155
 * @brief  Core function for the Q15 CFFT butterfly process.  
156
 * @param[in, out] *pSrc16          points to the in-place buffer of Q15 data type.  
157
 * @param[in]      fftLen           length of the FFT.  
158
 * @param[in]      *pCoef16         points to twiddle coefficient buffer.  
159
 * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.  
160
 * @return none.  
161
 */
162
 
163
void arm_radix4_butterfly_q15(
164
  q15_t * pSrc16,
165
  uint32_t fftLen,
166
  q15_t * pCoef16,
167
  uint32_t twidCoefModifier)
168
{
169
 
170
#ifndef ARM_MATH_CM0_FAMILY
171
 
172
  /* Run the below code for Cortex-M4 and Cortex-M3 */
173
 
174
  q31_t R, S, T, U;
175
  q31_t C1, C2, C3, out1, out2;
176
  uint32_t n1, n2, ic, i0, j, k;
177
 
178
  q15_t *ptr1;
179
  q15_t *pSi0;
180
  q15_t *pSi1;
181
  q15_t *pSi2;
182
  q15_t *pSi3;
183
 
184
  q31_t xaya, xbyb, xcyc, xdyd;
185
 
186
  /* Total process is divided into three stages */
187
 
188
  /* process first stage, middle stages, & last stage */
189
 
190
  /*  Initializations for the first stage */
191
  n2 = fftLen;
192
  n1 = n2;
193
 
194
  /* n2 = fftLen/4 */
195
  n2 >>= 2u;
196
 
197
  /* Index for twiddle coefficient */
198
  ic = 0u;
199
 
200
  /* Index for input read and output write */
201
  j = n2;
202
 
203
  pSi0 = pSrc16;
204
  pSi1 = pSi0 + 2 * n2;
205
  pSi2 = pSi1 + 2 * n2;
206
  pSi3 = pSi2 + 2 * n2;
207
 
208
  /* Input is in 1.15(q15) format */
209
 
210
  /*  start of first stage process */
211
  do
212
  {
213
    /*  Butterfly implementation */
214
 
215
    /*  Reading i0, i0+fftLen/2 inputs */
216
    /* Read ya (real), xa(imag) input */
217
    T = _SIMD32_OFFSET(pSi0);
218
    T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
219
    T = __SHADD16(T, 0); // it turns out doing this twice is 2 cycles, the alternative takes 3 cycles
220
    //in = ((int16_t) (T & 0xFFFF)) >> 2;       // alternative code that takes 3 cycles
221
    //T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
222
 
223
    /* Read yc (real), xc(imag) input */
224
    S = _SIMD32_OFFSET(pSi2);
225
    S = __SHADD16(S, 0);
226
    S = __SHADD16(S, 0);
227
 
228
    /* R = packed((ya + yc), (xa + xc) ) */
229
    R = __QADD16(T, S);
230
 
231
    /* S = packed((ya - yc), (xa - xc) ) */
232
    S = __QSUB16(T, S);
233
 
234
    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
235
    /* Read yb (real), xb(imag) input */
236
    T = _SIMD32_OFFSET(pSi1);
237
    T = __SHADD16(T, 0);
238
    T = __SHADD16(T, 0);
239
 
240
    /* Read yd (real), xd(imag) input */
241
    U = _SIMD32_OFFSET(pSi3);
242
    U = __SHADD16(U, 0);
243
    U = __SHADD16(U, 0);
244
 
245
    /* T = packed((yb + yd), (xb + xd) ) */
246
    T = __QADD16(T, U);
247
 
248
    /*  writing the butterfly processed i0 sample */
249
    /* xa' = xa + xb + xc + xd */
250
    /* ya' = ya + yb + yc + yd */
251
    _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
252
    pSi0 += 2;
253
 
254
    /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
255
    R = __QSUB16(R, T);
256
 
257
    /* co2 & si2 are read from SIMD Coefficient pointer */
258
    C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
259
 
260
#ifndef ARM_MATH_BIG_ENDIAN
261
 
262
    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
263
    out1 = __SMUAD(C2, R) >> 16u;
264
    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
265
    out2 = __SMUSDX(C2, R);
266
 
267
#else
268
 
269
    /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
270
    out1 = __SMUSDX(R, C2) >> 16u;
271
    /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
272
    out2 = __SMUAD(C2, R);
273
 
274
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
275
 
276
    /*  Reading i0+fftLen/4 */
277
    /* T = packed(yb, xb) */
278
    T = _SIMD32_OFFSET(pSi1);
279
    T = __SHADD16(T, 0);
280
    T = __SHADD16(T, 0);
281
 
282
    /* writing the butterfly processed i0 + fftLen/4 sample */
283
    /* writing output(xc', yc') in little endian format */
284
    _SIMD32_OFFSET(pSi1) =
285
      (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
286
    pSi1 += 2;
287
 
288
    /*  Butterfly calculations */
289
    /* U = packed(yd, xd) */
290
    U = _SIMD32_OFFSET(pSi3);
291
    U = __SHADD16(U, 0);
292
    U = __SHADD16(U, 0);
293
 
294
    /* T = packed(yb-yd, xb-xd) */
295
    T = __QSUB16(T, U);
296
 
297
#ifndef ARM_MATH_BIG_ENDIAN
298
 
299
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
300
    R = __QASX(S, T);
301
    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
302
    S = __QSAX(S, T);
303
 
304
#else
305
 
306
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
307
    R = __QSAX(S, T);
308
    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
309
    S = __QASX(S, T);
310
 
311
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
312
 
313
    /* co1 & si1 are read from SIMD Coefficient pointer */
314
    C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
315
    /*  Butterfly process for the i0+fftLen/2 sample */
316
 
317
#ifndef ARM_MATH_BIG_ENDIAN
318
 
319
    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
320
    out1 = __SMUAD(C1, S) >> 16u;
321
    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
322
    out2 = __SMUSDX(C1, S);
323
 
324
#else
325
 
326
    /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
327
    out1 = __SMUSDX(S, C1) >> 16u;
328
    /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
329
    out2 = __SMUAD(C1, S);
330
 
331
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
332
 
333
    /* writing output(xb', yb') in little endian format */
334
    _SIMD32_OFFSET(pSi2) =
335
      ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
336
    pSi2 += 2;
337
 
338
 
339
    /* co3 & si3 are read from SIMD Coefficient pointer */
340
    C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
341
    /*  Butterfly process for the i0+3fftLen/4 sample */
342
 
343
#ifndef ARM_MATH_BIG_ENDIAN
344
 
345
    /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
346
    out1 = __SMUAD(C3, R) >> 16u;
347
    /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
348
    out2 = __SMUSDX(C3, R);
349
 
350
#else
351
 
352
    /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
353
    out1 = __SMUSDX(R, C3) >> 16u;
354
    /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
355
    out2 = __SMUAD(C3, R);
356
 
357
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
358
 
359
    /* writing output(xd', yd') in little endian format */
360
    _SIMD32_OFFSET(pSi3) =
361
      ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
362
    pSi3 += 2;
363
 
364
    /*  Twiddle coefficients index modifier */
365
    ic = ic + twidCoefModifier;
366
 
367
  } while(--j);
368
  /* data is in 4.11(q11) format */
369
 
370
  /* end of first stage process */
371
 
372
 
373
  /* start of middle stage process */
374
 
375
  /*  Twiddle coefficients index modifier */
376
  twidCoefModifier <<= 2u;
377
 
378
  /*  Calculation of Middle stage */
379
  for (k = fftLen / 4u; k > 4u; k >>= 2u)
380
  {
381
    /*  Initializations for the middle stage */
382
    n1 = n2;
383
    n2 >>= 2u;
384
    ic = 0u;
385
 
386
    for (j = 0u; j <= (n2 - 1u); j++)
387
    {
388
      /*  index calculation for the coefficients */
389
      C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
390
      C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
391
      C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
392
 
393
      /*  Twiddle coefficients index modifier */
394
      ic = ic + twidCoefModifier;
395
 
396
      pSi0 = pSrc16 + 2 * j;
397
      pSi1 = pSi0 + 2 * n2;
398
      pSi2 = pSi1 + 2 * n2;
399
      pSi3 = pSi2 + 2 * n2;
400
 
401
      /*  Butterfly implementation */
402
      for (i0 = j; i0 < fftLen; i0 += n1)
403
      {
404
        /*  Reading i0, i0+fftLen/2 inputs */
405
        /* Read ya (real), xa(imag) input */
406
        T = _SIMD32_OFFSET(pSi0);
407
 
408
        /* Read yc (real), xc(imag) input */
409
        S = _SIMD32_OFFSET(pSi2);
410
 
411
        /* R = packed( (ya + yc), (xa + xc)) */
412
        R = __QADD16(T, S);
413
 
414
        /* S = packed((ya - yc), (xa - xc)) */
415
        S = __QSUB16(T, S);
416
 
417
        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
418
        /* Read yb (real), xb(imag) input */
419
        T = _SIMD32_OFFSET(pSi1);
420
 
421
        /* Read yd (real), xd(imag) input */
422
        U = _SIMD32_OFFSET(pSi3);
423
 
424
        /* T = packed( (yb + yd), (xb + xd)) */
425
        T = __QADD16(T, U);
426
 
427
        /*  writing the butterfly processed i0 sample */
428
 
429
        /* xa' = xa + xb + xc + xd */
430
        /* ya' = ya + yb + yc + yd */
431
        out1 = __SHADD16(R, T);
432
        out1 = __SHADD16(out1, 0);
433
        _SIMD32_OFFSET(pSi0) = out1;
434
        pSi0 += 2 * n1;
435
 
436
        /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
437
        R = __SHSUB16(R, T);
438
 
439
#ifndef ARM_MATH_BIG_ENDIAN
440
 
441
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
442
        out1 = __SMUAD(C2, R) >> 16u;
443
 
444
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
445
        out2 = __SMUSDX(C2, R);
446
 
447
#else
448
 
449
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
450
        out1 = __SMUSDX(R, C2) >> 16u;
451
 
452
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
453
        out2 = __SMUAD(C2, R);
454
 
455
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
456
 
457
        /*  Reading i0+3fftLen/4 */
458
        /* Read yb (real), xb(imag) input */
459
        T = _SIMD32_OFFSET(pSi1);
460
 
461
        /*  writing the butterfly processed i0 + fftLen/4 sample */
462
        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
463
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
464
        _SIMD32_OFFSET(pSi1) =
465
          ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
466
        pSi1 += 2 * n1;
467
 
468
        /*  Butterfly calculations */
469
 
470
        /* Read yd (real), xd(imag) input */
471
        U = _SIMD32_OFFSET(pSi3);
472
 
473
        /* T = packed(yb-yd, xb-xd) */
474
        T = __QSUB16(T, U);
475
 
476
#ifndef ARM_MATH_BIG_ENDIAN
477
 
478
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
479
        R = __SHASX(S, T);
480
 
481
        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
482
        S = __SHSAX(S, T);
483
 
484
 
485
        /*  Butterfly process for the i0+fftLen/2 sample */
486
        out1 = __SMUAD(C1, S) >> 16u;
487
        out2 = __SMUSDX(C1, S);
488
 
489
#else
490
 
491
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
492
        R = __SHSAX(S, T);
493
 
494
        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
495
        S = __SHASX(S, T);
496
 
497
 
498
        /*  Butterfly process for the i0+fftLen/2 sample */
499
        out1 = __SMUSDX(S, C1) >> 16u;
500
        out2 = __SMUAD(C1, S);
501
 
502
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
503
 
504
        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
505
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
506
        _SIMD32_OFFSET(pSi2) =
507
          ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
508
        pSi2 += 2 * n1;
509
 
510
        /*  Butterfly process for the i0+3fftLen/4 sample */
511
 
512
#ifndef ARM_MATH_BIG_ENDIAN
513
 
514
        out1 = __SMUAD(C3, R) >> 16u;
515
        out2 = __SMUSDX(C3, R);
516
 
517
#else
518
 
519
        out1 = __SMUSDX(R, C3) >> 16u;
520
        out2 = __SMUAD(C3, R);
521
 
522
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
523
 
524
        /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
525
        /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
526
        _SIMD32_OFFSET(pSi3) =
527
          ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
528
        pSi3 += 2 * n1;
529
      }
530
    }
531
    /*  Twiddle coefficients index modifier */
532
    twidCoefModifier <<= 2u;
533
  }
534
  /* end of middle stage process */
535
 
536
 
537
  /* data is in 10.6(q6) format for the 1024 point */
538
  /* data is in 8.8(q8) format for the 256 point */
539
  /* data is in 6.10(q10) format for the 64 point */
540
  /* data is in 4.12(q12) format for the 16 point */
541
 
542
  /*  Initializations for the last stage */
543
  j = fftLen >> 2;
544
 
545
  ptr1 = &pSrc16[0];
546
 
547
  /* start of last stage process */
548
 
549
  /*  Butterfly implementation */
550
  do
551
  {
552
    /* Read xa (real), ya(imag) input */
553
    xaya = *__SIMD32(ptr1)++;
554
 
555
    /* Read xb (real), yb(imag) input */
556
    xbyb = *__SIMD32(ptr1)++;
557
 
558
    /* Read xc (real), yc(imag) input */
559
    xcyc = *__SIMD32(ptr1)++;
560
 
561
    /* Read xd (real), yd(imag) input */
562
    xdyd = *__SIMD32(ptr1)++;
563
 
564
    /* R = packed((ya + yc), (xa + xc)) */
565
    R = __QADD16(xaya, xcyc);
566
 
567
    /* T = packed((yb + yd), (xb + xd)) */
568
    T = __QADD16(xbyb, xdyd);
569
 
570
    /* pointer updation for writing */
571
    ptr1 = ptr1 - 8u;
572
 
573
 
574
    /* xa' = xa + xb + xc + xd */
575
    /* ya' = ya + yb + yc + yd */
576
    *__SIMD32(ptr1)++ = __SHADD16(R, T);
577
 
578
    /* T = packed((yb + yd), (xb + xd)) */
579
    T = __QADD16(xbyb, xdyd);
580
 
581
    /* xc' = (xa-xb+xc-xd) */
582
    /* yc' = (ya-yb+yc-yd) */
583
    *__SIMD32(ptr1)++ = __SHSUB16(R, T);
584
 
585
    /* S = packed((ya - yc), (xa - xc)) */
586
    S = __QSUB16(xaya, xcyc);
587
 
588
    /* Read yd (real), xd(imag) input */
589
    /* T = packed( (yb - yd), (xb - xd))  */
590
    U = __QSUB16(xbyb, xdyd);
591
 
592
#ifndef ARM_MATH_BIG_ENDIAN
593
 
594
    /* xb' = (xa+yb-xc-yd) */
595
    /* yb' = (ya-xb-yc+xd) */
596
    *__SIMD32(ptr1)++ = __SHSAX(S, U);
597
 
598
 
599
    /* xd' = (xa-yb-xc+yd) */
600
    /* yd' = (ya+xb-yc-xd) */
601
    *__SIMD32(ptr1)++ = __SHASX(S, U);
602
 
603
#else
604
 
605
    /* xb' = (xa+yb-xc-yd) */
606
    /* yb' = (ya-xb-yc+xd) */
607
    *__SIMD32(ptr1)++ = __SHASX(S, U);
608
 
609
 
610
    /* xd' = (xa-yb-xc+yd) */
611
    /* yd' = (ya+xb-yc-xd) */
612
    *__SIMD32(ptr1)++ = __SHSAX(S, U);
613
 
614
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
615
 
616
  } while(--j);
617
 
618
  /* end of last stage process */
619
 
620
  /* output is in 11.5(q5) format for the 1024 point */
621
  /* output is in 9.7(q7) format for the 256 point   */
622
  /* output is in 7.9(q9) format for the 64 point  */
623
  /* output is in 5.11(q11) format for the 16 point  */
624
 
625
 
626
#else
627
 
628
  /* Run the below code for Cortex-M0 */
629
 
630
  q15_t R0, R1, S0, S1, T0, T1, U0, U1;
631
  q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
632
  uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
633
 
634
  /* Total process is divided into three stages */
635
 
636
  /* process first stage, middle stages, & last stage */
637
 
638
  /*  Initializations for the first stage */
639
  n2 = fftLen;
640
  n1 = n2;
641
 
642
  /* n2 = fftLen/4 */
643
  n2 >>= 2u;
644
 
645
  /* Index for twiddle coefficient */
646
  ic = 0u;
647
 
648
  /* Index for input read and output write */
649
  i0 = 0u;
650
  j = n2;
651
 
652
  /* Input is in 1.15(q15) format */
653
 
654
  /*  start of first stage process */
655
  do
656
  {
657
    /*  Butterfly implementation */
658
 
659
    /*  index calculation for the input as, */
660
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
661
    i1 = i0 + n2;
662
    i2 = i1 + n2;
663
    i3 = i2 + n2;
664
 
665
    /*  Reading i0, i0+fftLen/2 inputs */
666
 
667
    /* input is down scale by 4 to avoid overflow */
668
    /* Read ya (real), xa(imag) input */
669
    T0 = pSrc16[i0 * 2u] >> 2u;
670
    T1 = pSrc16[(i0 * 2u) + 1u] >> 2u;
671
 
672
    /* input is down scale by 4 to avoid overflow */
673
    /* Read yc (real), xc(imag) input */
674
    S0 = pSrc16[i2 * 2u] >> 2u;
675
    S1 = pSrc16[(i2 * 2u) + 1u] >> 2u;
676
 
677
    /* R0 = (ya + yc) */
678
    R0 = __SSAT(T0 + S0, 16u);
679
    /* R1 = (xa + xc) */
680
    R1 = __SSAT(T1 + S1, 16u);
681
 
682
    /* S0 = (ya - yc) */
683
    S0 = __SSAT(T0 - S0, 16);
684
    /* S1 = (xa - xc) */
685
    S1 = __SSAT(T1 - S1, 16);
686
 
687
    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
688
    /* input is down scale by 4 to avoid overflow */
689
    /* Read yb (real), xb(imag) input */
690
    T0 = pSrc16[i1 * 2u] >> 2u;
691
    T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
692
 
693
    /* input is down scale by 4 to avoid overflow */
694
    /* Read yd (real), xd(imag) input */
695
    U0 = pSrc16[i3 * 2u] >> 2u;
696
    U1 = pSrc16[(i3 * 2u) + 1] >> 2u;
697
 
698
    /* T0 = (yb + yd) */
699
    T0 = __SSAT(T0 + U0, 16u);
700
    /* T1 = (xb + xd) */
701
    T1 = __SSAT(T1 + U1, 16u);
702
 
703
    /*  writing the butterfly processed i0 sample */
704
    /* ya' = ya + yb + yc + yd */
705
    /* xa' = xa + xb + xc + xd */
706
    pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
707
    pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
708
 
709
    /* R0 = (ya + yc) - (yb + yd) */
710
    /* R1 = (xa + xc) - (xb + xd) */
711
    R0 = __SSAT(R0 - T0, 16u);
712
    R1 = __SSAT(R1 - T1, 16u);
713
 
714
    /* co2 & si2 are read from Coefficient pointer */
715
    Co2 = pCoef16[2u * ic * 2u];
716
    Si2 = pCoef16[(2u * ic * 2u) + 1];
717
 
718
    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
719
    out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u);
720
    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
721
    out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u);
722
 
723
    /*  Reading i0+fftLen/4 */
724
    /* input is down scale by 4 to avoid overflow */
725
    /* T0 = yb, T1 =  xb */
726
    T0 = pSrc16[i1 * 2u] >> 2;
727
    T1 = pSrc16[(i1 * 2u) + 1] >> 2;
728
 
729
    /* writing the butterfly processed i0 + fftLen/4 sample */
730
    /* writing output(xc', yc') in little endian format */
731
    pSrc16[i1 * 2u] = out1;
732
    pSrc16[(i1 * 2u) + 1] = out2;
733
 
734
    /*  Butterfly calculations */
735
    /* input is down scale by 4 to avoid overflow */
736
    /* U0 = yd, U1 = xd */
737
    U0 = pSrc16[i3 * 2u] >> 2;
738
    U1 = pSrc16[(i3 * 2u) + 1] >> 2;
739
    /* T0 = yb-yd */
740
    T0 = __SSAT(T0 - U0, 16);
741
    /* T1 = xb-xd */
742
    T1 = __SSAT(T1 - U1, 16);
743
 
744
    /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
745
    R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
746
    R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
747
 
748
    /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
749
    S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16u);
750
    S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16u);
751
 
752
    /* co1 & si1 are read from Coefficient pointer */
753
    Co1 = pCoef16[ic * 2u];
754
    Si1 = pCoef16[(ic * 2u) + 1];
755
    /*  Butterfly process for the i0+fftLen/2 sample */
756
    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
757
    out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
758
    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
759
    out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
760
 
761
    /* writing output(xb', yb') in little endian format */
762
    pSrc16[i2 * 2u] = out1;
763
    pSrc16[(i2 * 2u) + 1] = out2;
764
 
765
    /* Co3 & si3 are read from Coefficient pointer */
766
    Co3 = pCoef16[3u * (ic * 2u)];
767
    Si3 = pCoef16[(3u * (ic * 2u)) + 1];
768
    /*  Butterfly process for the i0+3fftLen/4 sample */
769
    /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
770
    out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u);
771
    /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
772
    out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u);
773
    /* writing output(xd', yd') in little endian format */
774
    pSrc16[i3 * 2u] = out1;
775
    pSrc16[(i3 * 2u) + 1] = out2;
776
 
777
    /*  Twiddle coefficients index modifier */
778
    ic = ic + twidCoefModifier;
779
 
780
    /*  Updating input index */
781
    i0 = i0 + 1u;
782
 
783
  } while(--j);
784
  /* data is in 4.11(q11) format */
785
 
786
  /* end of first stage process */
787
 
788
 
789
  /* start of middle stage process */
790
 
791
  /*  Twiddle coefficients index modifier */
792
  twidCoefModifier <<= 2u;
793
 
794
  /*  Calculation of Middle stage */
795
  for (k = fftLen / 4u; k > 4u; k >>= 2u)
796
  {
797
    /*  Initializations for the middle stage */
798
    n1 = n2;
799
    n2 >>= 2u;
800
    ic = 0u;
801
 
802
    for (j = 0u; j <= (n2 - 1u); j++)
803
    {
804
      /*  index calculation for the coefficients */
805
      Co1 = pCoef16[ic * 2u];
806
      Si1 = pCoef16[(ic * 2u) + 1u];
807
      Co2 = pCoef16[2u * (ic * 2u)];
808
      Si2 = pCoef16[(2u * (ic * 2u)) + 1u];
809
      Co3 = pCoef16[3u * (ic * 2u)];
810
      Si3 = pCoef16[(3u * (ic * 2u)) + 1u];
811
 
812
      /*  Twiddle coefficients index modifier */
813
      ic = ic + twidCoefModifier;
814
 
815
      /*  Butterfly implementation */
816
      for (i0 = j; i0 < fftLen; i0 += n1)
817
      {
818
        /*  index calculation for the input as, */
819
        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
820
        i1 = i0 + n2;
821
        i2 = i1 + n2;
822
        i3 = i2 + n2;
823
 
824
        /*  Reading i0, i0+fftLen/2 inputs */
825
        /* Read ya (real), xa(imag) input */
826
        T0 = pSrc16[i0 * 2u];
827
        T1 = pSrc16[(i0 * 2u) + 1u];
828
 
829
        /* Read yc (real), xc(imag) input */
830
        S0 = pSrc16[i2 * 2u];
831
        S1 = pSrc16[(i2 * 2u) + 1u];
832
 
833
        /* R0 = (ya + yc), R1 = (xa + xc) */
834
        R0 = __SSAT(T0 + S0, 16);
835
        R1 = __SSAT(T1 + S1, 16);
836
 
837
        /* S0 = (ya - yc), S1 =(xa - xc) */
838
        S0 = __SSAT(T0 - S0, 16);
839
        S1 = __SSAT(T1 - S1, 16);
840
 
841
        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
842
        /* Read yb (real), xb(imag) input */
843
        T0 = pSrc16[i1 * 2u];
844
        T1 = pSrc16[(i1 * 2u) + 1u];
845
 
846
        /* Read yd (real), xd(imag) input */
847
        U0 = pSrc16[i3 * 2u];
848
        U1 = pSrc16[(i3 * 2u) + 1u];
849
 
850
 
851
        /* T0 = (yb + yd), T1 = (xb + xd) */
852
        T0 = __SSAT(T0 + U0, 16);
853
        T1 = __SSAT(T1 + U1, 16);
854
 
855
        /*  writing the butterfly processed i0 sample */
856
 
857
        /* xa' = xa + xb + xc + xd */
858
        /* ya' = ya + yb + yc + yd */
859
        out1 = ((R0 >> 1u) + (T0 >> 1u)) >> 1u;
860
        out2 = ((R1 >> 1u) + (T1 >> 1u)) >> 1u;
861
 
862
        pSrc16[i0 * 2u] = out1;
863
        pSrc16[(2u * i0) + 1u] = out2;
864
 
865
        /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
866
        R0 = (R0 >> 1u) - (T0 >> 1u);
867
        R1 = (R1 >> 1u) - (T1 >> 1u);
868
 
869
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
870
        out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u);
871
 
872
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
873
        out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u);
874
 
875
        /*  Reading i0+3fftLen/4 */
876
        /* Read yb (real), xb(imag) input */
877
        T0 = pSrc16[i1 * 2u];
878
        T1 = pSrc16[(i1 * 2u) + 1u];
879
 
880
        /*  writing the butterfly processed i0 + fftLen/4 sample */
881
        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
882
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
883
        pSrc16[i1 * 2u] = out1;
884
        pSrc16[(i1 * 2u) + 1u] = out2;
885
 
886
        /*  Butterfly calculations */
887
 
888
        /* Read yd (real), xd(imag) input */
889
        U0 = pSrc16[i3 * 2u];
890
        U1 = pSrc16[(i3 * 2u) + 1u];
891
 
892
        /* T0 = yb-yd, T1 = xb-xd */
893
        T0 = __SSAT(T0 - U0, 16);
894
        T1 = __SSAT(T1 - U1, 16);
895
 
896
        /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
897
        R0 = (S0 >> 1u) - (T1 >> 1u);
898
        R1 = (S1 >> 1u) + (T0 >> 1u);
899
 
900
        /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
901
        S0 = (S0 >> 1u) + (T1 >> 1u);
902
        S1 = (S1 >> 1u) - (T0 >> 1u);
903
 
904
        /*  Butterfly process for the i0+fftLen/2 sample */
905
        out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16u);
906
 
907
        out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16u);
908
 
909
        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
910
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
911
        pSrc16[i2 * 2u] = out1;
912
        pSrc16[(i2 * 2u) + 1u] = out2;
913
 
914
        /*  Butterfly process for the i0+3fftLen/4 sample */
915
        out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u);
916
 
917
        out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u);
918
        /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
919
        /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
920
        pSrc16[i3 * 2u] = out1;
921
        pSrc16[(i3 * 2u) + 1u] = out2;
922
      }
923
    }
924
    /*  Twiddle coefficients index modifier */
925
    twidCoefModifier <<= 2u;
926
  }
927
  /* end of middle stage process */
928
 
929
 
930
  /* data is in 10.6(q6) format for the 1024 point */
931
  /* data is in 8.8(q8) format for the 256 point */
932
  /* data is in 6.10(q10) format for the 64 point */
933
  /* data is in 4.12(q12) format for the 16 point */
934
 
935
  /*  Initializations for the last stage */
936
  n1 = n2;
937
  n2 >>= 2u;
938
 
939
  /* start of last stage process */
940
 
941
  /*  Butterfly implementation */
942
  for (i0 = 0u; i0 <= (fftLen - n1); i0 += n1)
943
  {
944
    /*  index calculation for the input as, */
945
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
946
    i1 = i0 + n2;
947
    i2 = i1 + n2;
948
    i3 = i2 + n2;
949
 
950
    /*  Reading i0, i0+fftLen/2 inputs */
951
    /* Read ya (real), xa(imag) input */
952
    T0 = pSrc16[i0 * 2u];
953
    T1 = pSrc16[(i0 * 2u) + 1u];
954
 
955
    /* Read yc (real), xc(imag) input */
956
    S0 = pSrc16[i2 * 2u];
957
    S1 = pSrc16[(i2 * 2u) + 1u];
958
 
959
    /* R0 = (ya + yc), R1 = (xa + xc) */
960
    R0 = __SSAT(T0 + S0, 16u);
961
    R1 = __SSAT(T1 + S1, 16u);
962
 
963
    /* S0 = (ya - yc), S1 = (xa - xc) */
964
    S0 = __SSAT(T0 - S0, 16u);
965
    S1 = __SSAT(T1 - S1, 16u);
966
 
967
    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
968
    /* Read yb (real), xb(imag) input */
969
    T0 = pSrc16[i1 * 2u];
970
    T1 = pSrc16[(i1 * 2u) + 1u];
971
    /* Read yd (real), xd(imag) input */
972
    U0 = pSrc16[i3 * 2u];
973
    U1 = pSrc16[(i3 * 2u) + 1u];
974
 
975
    /* T0 = (yb + yd), T1 = (xb + xd)) */
976
    T0 = __SSAT(T0 + U0, 16u);
977
    T1 = __SSAT(T1 + U1, 16u);
978
 
979
    /*  writing the butterfly processed i0 sample */
980
    /* xa' = xa + xb + xc + xd */
981
    /* ya' = ya + yb + yc + yd */
982
    pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
983
    pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
984
 
985
    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
986
    R0 = (R0 >> 1u) - (T0 >> 1u);
987
    R1 = (R1 >> 1u) - (T1 >> 1u);
988
    /* Read yb (real), xb(imag) input */
989
    T0 = pSrc16[i1 * 2u];
990
    T1 = pSrc16[(i1 * 2u) + 1u];
991
 
992
    /*  writing the butterfly processed i0 + fftLen/4 sample */
993
    /* xc' = (xa-xb+xc-xd) */
994
    /* yc' = (ya-yb+yc-yd) */
995
    pSrc16[i1 * 2u] = R0;
996
    pSrc16[(i1 * 2u) + 1u] = R1;
997
 
998
    /* Read yd (real), xd(imag) input */
999
    U0 = pSrc16[i3 * 2u];
1000
    U1 = pSrc16[(i3 * 2u) + 1u];
1001
    /* T0 = (yb - yd), T1 = (xb - xd)  */
1002
    T0 = __SSAT(T0 - U0, 16u);
1003
    T1 = __SSAT(T1 - U1, 16u);
1004
 
1005
    /*  writing the butterfly processed i0 + fftLen/2 sample */
1006
    /* xb' = (xa+yb-xc-yd) */
1007
    /* yb' = (ya-xb-yc+xd) */
1008
    pSrc16[i2 * 2u] = (S0 >> 1u) + (T1 >> 1u);
1009
    pSrc16[(i2 * 2u) + 1u] = (S1 >> 1u) - (T0 >> 1u);
1010
 
1011
    /*  writing the butterfly processed i0 + 3fftLen/4 sample */
1012
    /* xd' = (xa-yb-xc+yd) */
1013
    /* yd' = (ya+xb-yc-xd) */
1014
    pSrc16[i3 * 2u] = (S0 >> 1u) - (T1 >> 1u);
1015
    pSrc16[(i3 * 2u) + 1u] = (S1 >> 1u) + (T0 >> 1u);
1016
 
1017
  }
1018
 
1019
  /* end of last stage process */
1020
 
1021
  /* output is in 11.5(q5) format for the 1024 point */
1022
  /* output is in 9.7(q7) format for the 256 point   */
1023
  /* output is in 7.9(q9) format for the 64 point  */
1024
  /* output is in 5.11(q11) format for the 16 point  */
1025
 
1026
#endif /* #ifndef ARM_MATH_CM0_FAMILY */
1027
 
1028
}
1029
 
1030
 
1031
/**    
1032
 * @brief  Core function for the Q15 CIFFT butterfly process.  
1033
 * @param[in, out] *pSrc16          points to the in-place buffer of Q15 data type.  
1034
 * @param[in]      fftLen           length of the FFT.  
1035
 * @param[in]      *pCoef16         points to twiddle coefficient buffer.  
1036
 * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.  
1037
 * @return none.  
1038
 */
1039
 
1040
/*    
1041
* Radix-4 IFFT algorithm used is :    
1042
*    
1043
* CIFFT uses same twiddle coefficients as CFFT function    
1044
*  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]    
1045
*    
1046
*    
1047
* IFFT is implemented with following changes in equations from FFT    
1048
*    
1049
* Input real and imaginary data:    
1050
* x(n) = xa + j * ya    
1051
* x(n+N/4 ) = xb + j * yb    
1052
* x(n+N/2 ) = xc + j * yc    
1053
* x(n+3N 4) = xd + j * yd    
1054
*    
1055
*    
1056
* Output real and imaginary data:    
1057
* x(4r) = xa'+ j * ya'    
1058
* x(4r+1) = xb'+ j * yb'    
1059
* x(4r+2) = xc'+ j * yc'    
1060
* x(4r+3) = xd'+ j * yd'    
1061
*    
1062
*    
1063
* Twiddle factors for radix-4 IFFT:    
1064
* Wn = co1 + j * (si1)    
1065
* W2n = co2 + j * (si2)    
1066
* W3n = co3 + j * (si3)    
1067
 
1068
* The real and imaginary output values for the radix-4 butterfly are    
1069
* xa' = xa + xb + xc + xd    
1070
* ya' = ya + yb + yc + yd    
1071
* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)    
1072
* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)    
1073
* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)    
1074
* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)    
1075
* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)    
1076
* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)    
1077
*    
1078
*/
1079
 
1080
void arm_radix4_butterfly_inverse_q15(
1081
  q15_t * pSrc16,
1082
  uint32_t fftLen,
1083
  q15_t * pCoef16,
1084
  uint32_t twidCoefModifier)
1085
{
1086
 
1087
#ifndef ARM_MATH_CM0_FAMILY
1088
 
1089
  /* Run the below code for Cortex-M4 and Cortex-M3 */
1090
 
1091
  q31_t R, S, T, U;
1092
  q31_t C1, C2, C3, out1, out2;
1093
  uint32_t n1, n2, ic, i0, j, k;
1094
 
1095
  q15_t *ptr1;
1096
  q15_t *pSi0;
1097
  q15_t *pSi1;
1098
  q15_t *pSi2;
1099
  q15_t *pSi3;
1100
 
1101
  q31_t xaya, xbyb, xcyc, xdyd;
1102
 
1103
  /* Total process is divided into three stages */
1104
 
1105
  /* process first stage, middle stages, & last stage */
1106
 
1107
  /*  Initializations for the first stage */
1108
  n2 = fftLen;
1109
  n1 = n2;
1110
 
1111
  /* n2 = fftLen/4 */
1112
  n2 >>= 2u;
1113
 
1114
  /* Index for twiddle coefficient */
1115
  ic = 0u;
1116
 
1117
  /* Index for input read and output write */
1118
  j = n2;
1119
 
1120
  pSi0 = pSrc16;
1121
  pSi1 = pSi0 + 2 * n2;
1122
  pSi2 = pSi1 + 2 * n2;
1123
  pSi3 = pSi2 + 2 * n2;
1124
 
1125
  /* Input is in 1.15(q15) format */
1126
 
1127
  /*  start of first stage process */
1128
  do
1129
  {
1130
    /*  Butterfly implementation */
1131
 
1132
    /*  Reading i0, i0+fftLen/2 inputs */
1133
    /* Read ya (real), xa(imag) input */
1134
    T = _SIMD32_OFFSET(pSi0);
1135
    T = __SHADD16(T, 0);
1136
    T = __SHADD16(T, 0);
1137
 
1138
    /* Read yc (real), xc(imag) input */
1139
    S = _SIMD32_OFFSET(pSi2);
1140
    S = __SHADD16(S, 0);
1141
    S = __SHADD16(S, 0);
1142
 
1143
    /* R = packed((ya + yc), (xa + xc) ) */
1144
    R = __QADD16(T, S);
1145
 
1146
    /* S = packed((ya - yc), (xa - xc) ) */
1147
    S = __QSUB16(T, S);
1148
 
1149
    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1150
    /* Read yb (real), xb(imag) input */
1151
    T = _SIMD32_OFFSET(pSi1);
1152
    T = __SHADD16(T, 0);
1153
    T = __SHADD16(T, 0);
1154
 
1155
    /* Read yd (real), xd(imag) input */
1156
    U = _SIMD32_OFFSET(pSi3);
1157
    U = __SHADD16(U, 0);
1158
    U = __SHADD16(U, 0);
1159
 
1160
    /* T = packed((yb + yd), (xb + xd) ) */
1161
    T = __QADD16(T, U);
1162
 
1163
    /*  writing the butterfly processed i0 sample */
1164
    /* xa' = xa + xb + xc + xd */
1165
    /* ya' = ya + yb + yc + yd */
1166
    _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
1167
    pSi0 += 2;
1168
 
1169
    /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
1170
    R = __QSUB16(R, T);
1171
 
1172
    /* co2 & si2 are read from SIMD Coefficient pointer */
1173
    C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
1174
 
1175
#ifndef ARM_MATH_BIG_ENDIAN
1176
 
1177
    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
1178
    out1 = __SMUSD(C2, R) >> 16u;
1179
    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1180
    out2 = __SMUADX(C2, R);
1181
 
1182
#else
1183
 
1184
    /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1185
    out1 = __SMUADX(C2, R) >> 16u;
1186
    /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
1187
    out2 = __SMUSD(__QSUB16(0, C2), R);
1188
 
1189
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1190
 
1191
    /*  Reading i0+fftLen/4 */
1192
    /* T = packed(yb, xb) */
1193
    T = _SIMD32_OFFSET(pSi1);
1194
    T = __SHADD16(T, 0);
1195
    T = __SHADD16(T, 0);
1196
 
1197
    /* writing the butterfly processed i0 + fftLen/4 sample */
1198
    /* writing output(xc', yc') in little endian format */
1199
    _SIMD32_OFFSET(pSi1) =
1200
      (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1201
    pSi1 += 2;
1202
 
1203
    /*  Butterfly calculations */
1204
    /* U = packed(yd, xd) */
1205
    U = _SIMD32_OFFSET(pSi3);
1206
    U = __SHADD16(U, 0);
1207
    U = __SHADD16(U, 0);
1208
 
1209
    /* T = packed(yb-yd, xb-xd) */
1210
    T = __QSUB16(T, U);
1211
 
1212
#ifndef ARM_MATH_BIG_ENDIAN
1213
 
1214
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1215
    R = __QSAX(S, T);
1216
    /* S = packed((ya-yc) + (xb- xd),  (xa-xc) - (yb-yd)) */
1217
    S = __QASX(S, T);
1218
 
1219
#else
1220
 
1221
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1222
    R = __QASX(S, T);
1223
    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
1224
    S = __QSAX(S, T);
1225
 
1226
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1227
 
1228
    /* co1 & si1 are read from SIMD Coefficient pointer */
1229
    C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
1230
    /*  Butterfly process for the i0+fftLen/2 sample */
1231
 
1232
#ifndef ARM_MATH_BIG_ENDIAN
1233
 
1234
    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
1235
    out1 = __SMUSD(C1, S) >> 16u;
1236
    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
1237
    out2 = __SMUADX(C1, S);
1238
 
1239
#else
1240
 
1241
    /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
1242
    out1 = __SMUADX(C1, S) >> 16u;
1243
    /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
1244
    out2 = __SMUSD(__QSUB16(0, C1), S);
1245
 
1246
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1247
 
1248
    /* writing output(xb', yb') in little endian format */
1249
    _SIMD32_OFFSET(pSi2) =
1250
      ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
1251
    pSi2 += 2;
1252
 
1253
 
1254
    /* co3 & si3 are read from SIMD Coefficient pointer */
1255
    C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
1256
    /*  Butterfly process for the i0+3fftLen/4 sample */
1257
 
1258
#ifndef ARM_MATH_BIG_ENDIAN
1259
 
1260
    /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
1261
    out1 = __SMUSD(C3, R) >> 16u;
1262
    /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
1263
    out2 = __SMUADX(C3, R);
1264
 
1265
#else
1266
 
1267
    /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
1268
    out1 = __SMUADX(C3, R) >> 16u;
1269
    /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
1270
    out2 = __SMUSD(__QSUB16(0, C3), R);
1271
 
1272
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1273
 
1274
    /* writing output(xd', yd') in little endian format */
1275
    _SIMD32_OFFSET(pSi3) =
1276
      ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1277
    pSi3 += 2;
1278
 
1279
    /*  Twiddle coefficients index modifier */
1280
    ic = ic + twidCoefModifier;
1281
 
1282
  } while(--j);
1283
  /* data is in 4.11(q11) format */
1284
 
1285
  /* end of first stage process */
1286
 
1287
 
1288
  /* start of middle stage process */
1289
 
1290
  /*  Twiddle coefficients index modifier */
1291
  twidCoefModifier <<= 2u;
1292
 
1293
  /*  Calculation of Middle stage */
1294
  for (k = fftLen / 4u; k > 4u; k >>= 2u)
1295
  {
1296
    /*  Initializations for the middle stage */
1297
    n1 = n2;
1298
    n2 >>= 2u;
1299
    ic = 0u;
1300
 
1301
    for (j = 0u; j <= (n2 - 1u); j++)
1302
    {
1303
      /*  index calculation for the coefficients */
1304
      C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
1305
      C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
1306
      C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
1307
 
1308
      /*  Twiddle coefficients index modifier */
1309
      ic = ic + twidCoefModifier;
1310
 
1311
      pSi0 = pSrc16 + 2 * j;
1312
      pSi1 = pSi0 + 2 * n2;
1313
      pSi2 = pSi1 + 2 * n2;
1314
      pSi3 = pSi2 + 2 * n2;
1315
 
1316
      /*  Butterfly implementation */
1317
      for (i0 = j; i0 < fftLen; i0 += n1)
1318
      {
1319
        /*  Reading i0, i0+fftLen/2 inputs */
1320
        /* Read ya (real), xa(imag) input */
1321
        T = _SIMD32_OFFSET(pSi0);
1322
 
1323
        /* Read yc (real), xc(imag) input */
1324
        S = _SIMD32_OFFSET(pSi2);
1325
 
1326
        /* R = packed( (ya + yc), (xa + xc)) */
1327
        R = __QADD16(T, S);
1328
 
1329
        /* S = packed((ya - yc), (xa - xc)) */
1330
        S = __QSUB16(T, S);
1331
 
1332
        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1333
        /* Read yb (real), xb(imag) input */
1334
        T = _SIMD32_OFFSET(pSi1);
1335
 
1336
        /* Read yd (real), xd(imag) input */
1337
        U = _SIMD32_OFFSET(pSi3);
1338
 
1339
        /* T = packed( (yb + yd), (xb + xd)) */
1340
        T = __QADD16(T, U);
1341
 
1342
        /*  writing the butterfly processed i0 sample */
1343
 
1344
        /* xa' = xa + xb + xc + xd */
1345
        /* ya' = ya + yb + yc + yd */
1346
        out1 = __SHADD16(R, T);
1347
        out1 = __SHADD16(out1, 0);
1348
        _SIMD32_OFFSET(pSi0) = out1;
1349
        pSi0 += 2 * n1;
1350
 
1351
        /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
1352
        R = __SHSUB16(R, T);
1353
 
1354
#ifndef ARM_MATH_BIG_ENDIAN
1355
 
1356
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
1357
        out1 = __SMUSD(C2, R) >> 16u;
1358
 
1359
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1360
        out2 = __SMUADX(C2, R);
1361
 
1362
#else
1363
 
1364
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1365
        out1 = __SMUADX(R, C2) >> 16u;
1366
 
1367
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
1368
        out2 = __SMUSD(__QSUB16(0, C2), R);
1369
 
1370
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1371
 
1372
        /*  Reading i0+3fftLen/4 */
1373
        /* Read yb (real), xb(imag) input */
1374
        T = _SIMD32_OFFSET(pSi1);
1375
 
1376
        /*  writing the butterfly processed i0 + fftLen/4 sample */
1377
        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
1378
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1379
        _SIMD32_OFFSET(pSi1) =
1380
          ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1381
        pSi1 += 2 * n1;
1382
 
1383
        /*  Butterfly calculations */
1384
 
1385
        /* Read yd (real), xd(imag) input */
1386
        U = _SIMD32_OFFSET(pSi3);
1387
 
1388
        /* T = packed(yb-yd, xb-xd) */
1389
        T = __QSUB16(T, U);
1390
 
1391
#ifndef ARM_MATH_BIG_ENDIAN
1392
 
1393
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1394
        R = __SHSAX(S, T);
1395
 
1396
        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
1397
        S = __SHASX(S, T);
1398
 
1399
 
1400
        /*  Butterfly process for the i0+fftLen/2 sample */
1401
        out1 = __SMUSD(C1, S) >> 16u;
1402
        out2 = __SMUADX(C1, S);
1403
 
1404
#else
1405
 
1406
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1407
        R = __SHASX(S, T);
1408
 
1409
        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
1410
        S = __SHSAX(S, T);
1411
 
1412
 
1413
        /*  Butterfly process for the i0+fftLen/2 sample */
1414
        out1 = __SMUADX(S, C1) >> 16u;
1415
        out2 = __SMUSD(__QSUB16(0, C1), S);
1416
 
1417
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1418
 
1419
        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
1420
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
1421
        _SIMD32_OFFSET(pSi2) =
1422
          ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1423
        pSi2 += 2 * n1;
1424
 
1425
        /*  Butterfly process for the i0+3fftLen/4 sample */
1426
 
1427
#ifndef ARM_MATH_BIG_ENDIAN
1428
 
1429
        out1 = __SMUSD(C3, R) >> 16u;
1430
        out2 = __SMUADX(C3, R);
1431
 
1432
#else
1433
 
1434
        out1 = __SMUADX(C3, R) >> 16u;
1435
        out2 = __SMUSD(__QSUB16(0, C3), R);
1436
 
1437
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1438
 
1439
        /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
1440
        /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
1441
        _SIMD32_OFFSET(pSi3) =
1442
          ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1443
        pSi3 += 2 * n1;
1444
      }
1445
    }
1446
    /*  Twiddle coefficients index modifier */
1447
    twidCoefModifier <<= 2u;
1448
  }
1449
  /* end of middle stage process */
1450
 
1451
  /* data is in 10.6(q6) format for the 1024 point */
1452
  /* data is in 8.8(q8) format for the 256 point */
1453
  /* data is in 6.10(q10) format for the 64 point */
1454
  /* data is in 4.12(q12) format for the 16 point */
1455
 
1456
  /*  Initializations for the last stage */
1457
  j = fftLen >> 2;
1458
 
1459
  ptr1 = &pSrc16[0];
1460
 
1461
  /* start of last stage process */
1462
 
1463
  /*  Butterfly implementation */
1464
  do
1465
  {
1466
    /* Read xa (real), ya(imag) input */
1467
    xaya = *__SIMD32(ptr1)++;
1468
 
1469
    /* Read xb (real), yb(imag) input */
1470
    xbyb = *__SIMD32(ptr1)++;
1471
 
1472
    /* Read xc (real), yc(imag) input */
1473
    xcyc = *__SIMD32(ptr1)++;
1474
 
1475
    /* Read xd (real), yd(imag) input */
1476
    xdyd = *__SIMD32(ptr1)++;
1477
 
1478
    /* R = packed((ya + yc), (xa + xc)) */
1479
    R = __QADD16(xaya, xcyc);
1480
 
1481
    /* T = packed((yb + yd), (xb + xd)) */
1482
    T = __QADD16(xbyb, xdyd);
1483
 
1484
    /* pointer updation for writing */
1485
    ptr1 = ptr1 - 8u;
1486
 
1487
 
1488
    /* xa' = xa + xb + xc + xd */
1489
    /* ya' = ya + yb + yc + yd */
1490
    *__SIMD32(ptr1)++ = __SHADD16(R, T);
1491
 
1492
    /* T = packed((yb + yd), (xb + xd)) */
1493
    T = __QADD16(xbyb, xdyd);
1494
 
1495
    /* xc' = (xa-xb+xc-xd) */
1496
    /* yc' = (ya-yb+yc-yd) */
1497
    *__SIMD32(ptr1)++ = __SHSUB16(R, T);
1498
 
1499
    /* S = packed((ya - yc), (xa - xc)) */
1500
    S = __QSUB16(xaya, xcyc);
1501
 
1502
    /* Read yd (real), xd(imag) input */
1503
    /* T = packed( (yb - yd), (xb - xd))  */
1504
    U = __QSUB16(xbyb, xdyd);
1505
 
1506
#ifndef ARM_MATH_BIG_ENDIAN
1507
 
1508
    /* xb' = (xa+yb-xc-yd) */
1509
    /* yb' = (ya-xb-yc+xd) */
1510
    *__SIMD32(ptr1)++ = __SHASX(S, U);
1511
 
1512
 
1513
    /* xd' = (xa-yb-xc+yd) */
1514
    /* yd' = (ya+xb-yc-xd) */
1515
    *__SIMD32(ptr1)++ = __SHSAX(S, U);
1516
 
1517
#else
1518
 
1519
    /* xb' = (xa+yb-xc-yd) */
1520
    /* yb' = (ya-xb-yc+xd) */
1521
    *__SIMD32(ptr1)++ = __SHSAX(S, U);
1522
 
1523
 
1524
    /* xd' = (xa-yb-xc+yd) */
1525
    /* yd' = (ya+xb-yc-xd) */
1526
    *__SIMD32(ptr1)++ = __SHASX(S, U);
1527
 
1528
 
1529
#endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1530
 
1531
  } while(--j);
1532
 
1533
  /* end of last stage  process */
1534
 
1535
  /* output is in 11.5(q5) format for the 1024 point */
1536
  /* output is in 9.7(q7) format for the 256 point   */
1537
  /* output is in 7.9(q9) format for the 64 point  */
1538
  /* output is in 5.11(q11) format for the 16 point  */
1539
 
1540
 
1541
#else
1542
 
1543
  /* Run the below code for Cortex-M0 */
1544
 
1545
  q15_t R0, R1, S0, S1, T0, T1, U0, U1;
1546
  q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
1547
  uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
1548
 
1549
  /* Total process is divided into three stages */
1550
 
1551
  /* process first stage, middle stages, & last stage */
1552
 
1553
  /*  Initializations for the first stage */
1554
  n2 = fftLen;
1555
  n1 = n2;
1556
 
1557
  /* n2 = fftLen/4 */
1558
  n2 >>= 2u;
1559
 
1560
  /* Index for twiddle coefficient */
1561
  ic = 0u;
1562
 
1563
  /* Index for input read and output write */
1564
  i0 = 0u;
1565
 
1566
  j = n2;
1567
 
1568
  /* Input is in 1.15(q15) format */
1569
 
1570
  /*  Start of first stage process */
1571
  do
1572
  {
1573
    /*  Butterfly implementation */
1574
 
1575
    /*  index calculation for the input as, */
1576
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1577
    i1 = i0 + n2;
1578
    i2 = i1 + n2;
1579
    i3 = i2 + n2;
1580
 
1581
    /*  Reading i0, i0+fftLen/2 inputs */
1582
    /* input is down scale by 4 to avoid overflow */
1583
    /* Read ya (real), xa(imag) input */
1584
    T0 = pSrc16[i0 * 2u] >> 2u;
1585
    T1 = pSrc16[(i0 * 2u) + 1u] >> 2u;
1586
    /* input is down scale by 4 to avoid overflow */
1587
    /* Read yc (real), xc(imag) input */
1588
    S0 = pSrc16[i2 * 2u] >> 2u;
1589
    S1 = pSrc16[(i2 * 2u) + 1u] >> 2u;
1590
 
1591
    /* R0 = (ya + yc), R1 = (xa + xc) */
1592
    R0 = __SSAT(T0 + S0, 16u);
1593
    R1 = __SSAT(T1 + S1, 16u);
1594
    /* S0 = (ya - yc), S1 = (xa - xc) */
1595
    S0 = __SSAT(T0 - S0, 16u);
1596
    S1 = __SSAT(T1 - S1, 16u);
1597
 
1598
    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1599
    /* input is down scale by 4 to avoid overflow */
1600
    /* Read yb (real), xb(imag) input */
1601
    T0 = pSrc16[i1 * 2u] >> 2u;
1602
    T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
1603
    /* Read yd (real), xd(imag) input */
1604
    /* input is down scale by 4 to avoid overflow */
1605
    U0 = pSrc16[i3 * 2u] >> 2u;
1606
    U1 = pSrc16[(i3 * 2u) + 1u] >> 2u;
1607
 
1608
    /* T0 = (yb + yd), T1 = (xb + xd) */
1609
    T0 = __SSAT(T0 + U0, 16u);
1610
    T1 = __SSAT(T1 + U1, 16u);
1611
 
1612
    /*  writing the butterfly processed i0 sample */
1613
    /* xa' = xa + xb + xc + xd */
1614
    /* ya' = ya + yb + yc + yd */
1615
    pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
1616
    pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
1617
 
1618
    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc)- (xb + xd) */
1619
    R0 = __SSAT(R0 - T0, 16u);
1620
    R1 = __SSAT(R1 - T1, 16u);
1621
    /* co2 & si2 are read from Coefficient pointer */
1622
    Co2 = pCoef16[2u * ic * 2u];
1623
    Si2 = pCoef16[(2u * ic * 2u) + 1u];
1624
    /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1625
    out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16u);
1626
    /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1627
    out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16u);
1628
 
1629
    /*  Reading i0+fftLen/4 */
1630
    /* input is down scale by 4 to avoid overflow */
1631
    /* T0 = yb, T1 = xb */
1632
    T0 = pSrc16[i1 * 2u] >> 2u;
1633
    T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
1634
 
1635
    /* writing the butterfly processed i0 + fftLen/4 sample */
1636
    /* writing output(xc', yc') in little endian format */
1637
    pSrc16[i1 * 2u] = out1;
1638
    pSrc16[(i1 * 2u) + 1u] = out2;
1639
 
1640
    /*  Butterfly calculations */
1641
    /* input is down scale by 4 to avoid overflow */
1642
    /* U0 = yd, U1 = xd) */
1643
    U0 = pSrc16[i3 * 2u] >> 2u;
1644
    U1 = pSrc16[(i3 * 2u) + 1u] >> 2u;
1645
 
1646
    /* T0 = yb-yd, T1 = xb-xd) */
1647
    T0 = __SSAT(T0 - U0, 16u);
1648
    T1 = __SSAT(T1 - U1, 16u);
1649
    /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
1650
    R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
1651
    R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
1652
    /* S = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
1653
    S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
1654
    S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
1655
 
1656
    /* co1 & si1 are read from Coefficient pointer */
1657
    Co1 = pCoef16[ic * 2u];
1658
    Si1 = pCoef16[(ic * 2u) + 1u];
1659
    /*  Butterfly process for the i0+fftLen/2 sample */
1660
    /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1661
    out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u);
1662
    /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1663
    out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u);
1664
    /* writing output(xb', yb') in little endian format */
1665
    pSrc16[i2 * 2u] = out1;
1666
    pSrc16[(i2 * 2u) + 1u] = out2;
1667
 
1668
    /* Co3 & si3 are read from Coefficient pointer */
1669
    Co3 = pCoef16[3u * ic * 2u];
1670
    Si3 = pCoef16[(3u * ic * 2u) + 1u];
1671
    /*  Butterfly process for the i0+3fftLen/4 sample */
1672
    /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1673
    out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u);
1674
    /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1675
    out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u);
1676
    /* writing output(xd', yd') in little endian format */
1677
    pSrc16[i3 * 2u] = out1;
1678
    pSrc16[(i3 * 2u) + 1u] = out2;
1679
 
1680
    /*  Twiddle coefficients index modifier */
1681
    ic = ic + twidCoefModifier;
1682
 
1683
    /*  Updating input index */
1684
    i0 = i0 + 1u;
1685
 
1686
  } while(--j);
1687
 
1688
  /*  End of first stage process */
1689
 
1690
  /* data is in 4.11(q11) format */
1691
 
1692
 
1693
  /*  Start of Middle stage process */
1694
 
1695
  /*  Twiddle coefficients index modifier */
1696
  twidCoefModifier <<= 2u;
1697
 
1698
  /*  Calculation of Middle stage */
1699
  for (k = fftLen / 4u; k > 4u; k >>= 2u)
1700
  {
1701
    /*  Initializations for the middle stage */
1702
    n1 = n2;
1703
    n2 >>= 2u;
1704
    ic = 0u;
1705
 
1706
    for (j = 0u; j <= (n2 - 1u); j++)
1707
    {
1708
      /*  index calculation for the coefficients */
1709
      Co1 = pCoef16[ic * 2u];
1710
      Si1 = pCoef16[(ic * 2u) + 1u];
1711
      Co2 = pCoef16[2u * ic * 2u];
1712
      Si2 = pCoef16[2u * ic * 2u + 1u];
1713
      Co3 = pCoef16[3u * ic * 2u];
1714
      Si3 = pCoef16[(3u * ic * 2u) + 1u];
1715
 
1716
      /*  Twiddle coefficients index modifier */
1717
      ic = ic + twidCoefModifier;
1718
 
1719
      /*  Butterfly implementation */
1720
      for (i0 = j; i0 < fftLen; i0 += n1)
1721
      {
1722
        /*  index calculation for the input as, */
1723
        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1724
        i1 = i0 + n2;
1725
        i2 = i1 + n2;
1726
        i3 = i2 + n2;
1727
 
1728
        /*  Reading i0, i0+fftLen/2 inputs */
1729
        /* Read ya (real), xa(imag) input */
1730
        T0 = pSrc16[i0 * 2u];
1731
        T1 = pSrc16[(i0 * 2u) + 1u];
1732
 
1733
        /* Read yc (real), xc(imag) input */
1734
        S0 = pSrc16[i2 * 2u];
1735
        S1 = pSrc16[(i2 * 2u) + 1u];
1736
 
1737
 
1738
        /* R0 = (ya + yc), R1 = (xa + xc) */
1739
        R0 = __SSAT(T0 + S0, 16u);
1740
        R1 = __SSAT(T1 + S1, 16u);
1741
        /* S0 = (ya - yc), S1 = (xa - xc) */
1742
        S0 = __SSAT(T0 - S0, 16u);
1743
        S1 = __SSAT(T1 - S1, 16u);
1744
 
1745
        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1746
        /* Read yb (real), xb(imag) input */
1747
        T0 = pSrc16[i1 * 2u];
1748
        T1 = pSrc16[(i1 * 2u) + 1u];
1749
 
1750
        /* Read yd (real), xd(imag) input */
1751
        U0 = pSrc16[i3 * 2u];
1752
        U1 = pSrc16[(i3 * 2u) + 1u];
1753
 
1754
        /* T0 = (yb + yd), T1 = (xb + xd) */
1755
        T0 = __SSAT(T0 + U0, 16u);
1756
        T1 = __SSAT(T1 + U1, 16u);
1757
 
1758
        /*  writing the butterfly processed i0 sample */
1759
        /* xa' = xa + xb + xc + xd */
1760
        /* ya' = ya + yb + yc + yd */
1761
        pSrc16[i0 * 2u] = ((R0 >> 1u) + (T0 >> 1u)) >> 1u;
1762
        pSrc16[(i0 * 2u) + 1u] = ((R1 >> 1u) + (T1 >> 1u)) >> 1u;
1763
 
1764
        /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
1765
        R0 = (R0 >> 1u) - (T0 >> 1u);
1766
        R1 = (R1 >> 1u) - (T1 >> 1u);
1767
 
1768
        /* (ya-yb+yc-yd)* (si2) - (xa-xb+xc-xd)* co2 */
1769
        out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
1770
        /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1771
        out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
1772
 
1773
        /*  Reading i0+3fftLen/4 */
1774
        /* Read yb (real), xb(imag) input */
1775
        T0 = pSrc16[i1 * 2u];
1776
        T1 = pSrc16[(i1 * 2u) + 1u];
1777
 
1778
        /*  writing the butterfly processed i0 + fftLen/4 sample */
1779
        /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1780
        /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1781
        pSrc16[i1 * 2u] = out1;
1782
        pSrc16[(i1 * 2u) + 1u] = out2;
1783
 
1784
        /*  Butterfly calculations */
1785
        /* Read yd (real), xd(imag) input */
1786
        U0 = pSrc16[i3 * 2u];
1787
        U1 = pSrc16[(i3 * 2u) + 1u];
1788
 
1789
        /* T0 = yb-yd, T1 = xb-xd) */
1790
        T0 = __SSAT(T0 - U0, 16u);
1791
        T1 = __SSAT(T1 - U1, 16u);
1792
 
1793
        /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
1794
        R0 = (S0 >> 1u) + (T1 >> 1u);
1795
        R1 = (S1 >> 1u) - (T0 >> 1u);
1796
 
1797
        /* S1 = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
1798
        S0 = (S0 >> 1u) - (T1 >> 1u);
1799
        S1 = (S1 >> 1u) + (T0 >> 1u);
1800
 
1801
        /*  Butterfly process for the i0+fftLen/2 sample */
1802
        out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u);
1803
        out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u);
1804
        /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1805
        /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1806
        pSrc16[i2 * 2u] = out1;
1807
        pSrc16[(i2 * 2u) + 1u] = out2;
1808
 
1809
        /*  Butterfly process for the i0+3fftLen/4 sample */
1810
        out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u);
1811
 
1812
        out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u);
1813
        /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1814
        /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1815
        pSrc16[i3 * 2u] = out1;
1816
        pSrc16[(i3 * 2u) + 1u] = out2;
1817
 
1818
 
1819
      }
1820
    }
1821
    /*  Twiddle coefficients index modifier */
1822
    twidCoefModifier <<= 2u;
1823
  }
1824
  /*  End of Middle stages process */
1825
 
1826
 
1827
  /* data is in 10.6(q6) format for the 1024 point */
1828
  /* data is in 8.8(q8) format for the 256 point   */
1829
  /* data is in 6.10(q10) format for the 64 point  */
1830
  /* data is in 4.12(q12) format for the 16 point  */
1831
 
1832
  /* start of last stage process */
1833
 
1834
 
1835
  /*  Initializations for the last stage */
1836
  n1 = n2;
1837
  n2 >>= 2u;
1838
 
1839
  /*  Butterfly implementation */
1840
  for (i0 = 0u; i0 <= (fftLen - n1); i0 += n1)
1841
  {
1842
    /*  index calculation for the input as, */
1843
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1844
    i1 = i0 + n2;
1845
    i2 = i1 + n2;
1846
    i3 = i2 + n2;
1847
 
1848
    /*  Reading i0, i0+fftLen/2 inputs */
1849
    /* Read ya (real), xa(imag) input */
1850
    T0 = pSrc16[i0 * 2u];
1851
    T1 = pSrc16[(i0 * 2u) + 1u];
1852
    /* Read yc (real), xc(imag) input */
1853
    S0 = pSrc16[i2 * 2u];
1854
    S1 = pSrc16[(i2 * 2u) + 1u];
1855
 
1856
    /* R0 = (ya + yc), R1 = (xa + xc) */
1857
    R0 = __SSAT(T0 + S0, 16u);
1858
    R1 = __SSAT(T1 + S1, 16u);
1859
    /* S0 = (ya - yc), S1 = (xa - xc) */
1860
    S0 = __SSAT(T0 - S0, 16u);
1861
    S1 = __SSAT(T1 - S1, 16u);
1862
 
1863
    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1864
    /* Read yb (real), xb(imag) input */
1865
    T0 = pSrc16[i1 * 2u];
1866
    T1 = pSrc16[(i1 * 2u) + 1u];
1867
    /* Read yd (real), xd(imag) input */
1868
    U0 = pSrc16[i3 * 2u];
1869
    U1 = pSrc16[(i3 * 2u) + 1u];
1870
 
1871
    /* T0 = (yb + yd), T1 = (xb + xd) */
1872
    T0 = __SSAT(T0 + U0, 16u);
1873
    T1 = __SSAT(T1 + U1, 16u);
1874
 
1875
    /*  writing the butterfly processed i0 sample */
1876
    /* xa' = xa + xb + xc + xd */
1877
    /* ya' = ya + yb + yc + yd */
1878
    pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
1879
    pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
1880
 
1881
    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
1882
    R0 = (R0 >> 1u) - (T0 >> 1u);
1883
    R1 = (R1 >> 1u) - (T1 >> 1u);
1884
 
1885
    /* Read yb (real), xb(imag) input */
1886
    T0 = pSrc16[i1 * 2u];
1887
    T1 = pSrc16[(i1 * 2u) + 1u];
1888
 
1889
    /*  writing the butterfly processed i0 + fftLen/4 sample */
1890
    /* xc' = (xa-xb+xc-xd) */
1891
    /* yc' = (ya-yb+yc-yd) */
1892
    pSrc16[i1 * 2u] = R0;
1893
    pSrc16[(i1 * 2u) + 1u] = R1;
1894
 
1895
    /* Read yd (real), xd(imag) input */
1896
    U0 = pSrc16[i3 * 2u];
1897
    U1 = pSrc16[(i3 * 2u) + 1u];
1898
    /* T0 = (yb - yd), T1 = (xb - xd) */
1899
    T0 = __SSAT(T0 - U0, 16u);
1900
    T1 = __SSAT(T1 - U1, 16u);
1901
 
1902
    /*  writing the butterfly processed i0 + fftLen/2 sample */
1903
    /* xb' = (xa-yb-xc+yd) */
1904
    /* yb' = (ya+xb-yc-xd) */
1905
    pSrc16[i2 * 2u] = (S0 >> 1u) - (T1 >> 1u);
1906
    pSrc16[(i2 * 2u) + 1u] = (S1 >> 1u) + (T0 >> 1u);
1907
 
1908
 
1909
    /*  writing the butterfly processed i0 + 3fftLen/4 sample */
1910
    /* xd' = (xa+yb-xc-yd) */
1911
    /* yd' = (ya-xb-yc+xd) */
1912
    pSrc16[i3 * 2u] = (S0 >> 1u) + (T1 >> 1u);
1913
    pSrc16[(i3 * 2u) + 1u] = (S1 >> 1u) - (T0 >> 1u);
1914
  }
1915
  /* end of last stage  process */
1916
 
1917
  /* output is in 11.5(q5) format for the 1024 point */
1918
  /* output is in 9.7(q7) format for the 256 point   */
1919
  /* output is in 7.9(q9) format for the 64 point  */
1920
  /* output is in 5.11(q11) format for the 16 point  */
1921
 
1922
#endif /* #ifndef ARM_MATH_CM0_FAMILY */
1923
 
1924
}