WebSVN – AFRtranscoder – Blame – /trunk/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c

Rev	Author	Line No.	Line
2	mjames	1	/* ----------------------------------------------------------------------
		2	* Project: CMSIS DSP Library
		3	* Title: arm_cfft_radix4_q15.c
		4	* Description: This file has function definition of Radix-4 FFT & IFFT function and
		5	* In-place bit reversal using bit reversal table
		6	*
		7	* $Date: 27. January 2017
		8	* $Revision: V.1.5.1
		9	*
		10	* Target Processor: Cortex-M cores
		11	* -------------------------------------------------------------------- */
		12	/*
		13	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
		14	*
		15	* SPDX-License-Identifier: Apache-2.0
		16	*
		17	* Licensed under the Apache License, Version 2.0 (the License); you may
		18	* not use this file except in compliance with the License.
		19	* You may obtain a copy of the License at
		20	*
		21	* www.apache.org/licenses/LICENSE-2.0
		22	*
		23	* Unless required by applicable law or agreed to in writing, software
		24	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		25	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		26	* See the License for the specific language governing permissions and
		27	* limitations under the License.
		28	*/
		29
		30	#include "arm_math.h"
		31
		32
		/* Forward declaration of the forward (CFFT) radix-4 butterfly kernel; it is defined later in this file and called by arm_cfft_radix4_q15 when ifftFlag is not 1. */
		33	void arm_radix4_butterfly_q15(
		34	q15_t * pSrc16,
		35	uint32_t fftLen,
		36	q15_t * pCoef16,
		37	uint32_t twidCoefModifier);
		38
		/* Forward declaration of the inverse (CIFFT) radix-4 butterfly kernel; its definition begins later in this file and it is called by arm_cfft_radix4_q15 when ifftFlag is 1. */
		39	void arm_radix4_butterfly_inverse_q15(
		40	q15_t * pSrc16,
		41	uint32_t fftLen,
		42	q15_t * pCoef16,
		43	uint32_t twidCoefModifier);
		44
		/* Forward declaration of the in-place bit-reversal routine; called by arm_cfft_radix4_q15 after the butterflies when bitReverseFlag is 1. No definition is visible in this part of the file. */
		45	void arm_bitreversal_q15(
		46	q15_t * pSrc,
		47	uint32_t fftLen,
		48	uint16_t bitRevFactor,
		49	uint16_t * pBitRevTab);
		50
		51	/**
		52	* @ingroup groupTransforms
		53	*/
		54
		55	/**
		56	* @addtogroup ComplexFFT
		57	* @{
		58	*/
		59
		60
		61	/**
		62	* @details
		63	* @brief Processing function for the Q15 CFFT/CIFFT.
		64	* @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
		65	* @param[in] *S points to an instance of the Q15 CFFT/CIFFT structure.
		66	* @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place.
		67	* @return none.
		68	*
		69	* \par Input and output formats:
		70	* \par
		71	* Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
		72	* Hence the output format is different for different FFT sizes.
		73	* The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
		74	* \par
		75	* \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
		76	* \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
		77	*/
		78
		79	void arm_cfft_radix4_q15(
		80	const arm_cfft_radix4_instance_q15 * S,
		81	q15_t * pSrc)
		82	{
		83	if (S->ifftFlag == 1U)
		84	{
		85	/* Complex IFFT radix-4 */
		86	arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
		87	}
		88	else
		89	{
		90	/* Complex FFT radix-4 */
		91	arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
		92	}
		93
		94	if (S->bitReverseFlag == 1U)
		95	{
		96	/* Bit Reversal */
		97	arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
		98	}
		99
		100	}
		101
		102	/**
		103	* @} end of ComplexFFT group
		104	*/
		105
		106	/*
		107	* Radix-4 FFT algorithm used is :
		108	*
		109	* Input real and imaginary data:
		110	* x(n) = xa + j * ya
		111	* x(n+N/4 ) = xb + j * yb
		112	* x(n+N/2 ) = xc + j * yc
		113	* x(n+3N/4) = xd + j * yd
		114	*
		115	*
		116	* Output real and imaginary data:
		117	* x(4r) = xa'+ j * ya'
		118	* x(4r+1) = xb'+ j * yb'
		119	* x(4r+2) = xc'+ j * yc'
		120	* x(4r+3) = xd'+ j * yd'
		121	*
		122	*
		123	* Twiddle factors for radix-4 FFT:
		124	* Wn = co1 + j * (- si1)
		125	* W2n = co2 + j * (- si2)
		126	* W3n = co3 + j * (- si3)
		127
		128	* The real and imaginary output values for the radix-4 butterfly are
		129	* xa' = xa + xb + xc + xd
		130	* ya' = ya + yb + yc + yd
		131	* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
		132	* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
		133	* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
		134	* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
		135	* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
		136	* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
		137	*
		138	*/
		139
		140	/**
		141	* @brief Core function for the Q15 CFFT butterfly process: runs the forward radix-4 butterflies in place as a first stage (which also scales the input down by 4), the middle stages, and a last stage that uses no twiddle multiplications.
		142	* @param[in, out] *pSrc16 points to the in-place buffer of Q15 data type; each complex sample occupies two consecutive q15 values (indices 2*i and 2*i+1).
		143	* @param[in] fftLen length of the FFT, counted in complex samples. NOTE(review): every stage divides the span by 4, so this presumably has to be a power of 4 (16 or larger) - confirm against callers.
		144	* @param[in] *pCoef16 points to twiddle coefficient buffer; coefficient pairs are read at q15 offsets 2*ic, 4*ic and 6*ic.
		145	* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
		146	* @return none.
		147	*/
		148
		149	void arm_radix4_butterfly_q15(
		150	q15_t * pSrc16,
		151	uint32_t fftLen,
		152	q15_t * pCoef16,
		153	uint32_t twidCoefModifier)
		154	{
		155
		156	#if defined (ARM_MATH_DSP)
		157
		158	/* SIMD path: compiled when ARM_MATH_DSP is defined; works on packed pairs of 16-bit values via intrinsics such as __SHADD16, __QADD16 and __SMUAD */
		159
		160	q31_t R, S, T, U;
		161	q31_t C1, C2, C3, out1, out2;
		162	uint32_t n1, n2, ic, i0, j, k;
		163
		164	q15_t *ptr1;
		165	q15_t *pSi0;
		166	q15_t *pSi1;
		167	q15_t *pSi2;
		168	q15_t *pSi3;
		169
		170	q31_t xaya, xbyb, xcyc, xdyd;
		171
		172	/* Total process is divided into three stages */
		173
		174	/* process first stage, middle stages, & last stage */
		175
		176	/* Initializations for the first stage */
		177	n2 = fftLen;
		178	n1 = n2;
		179
		180	/* n2 = fftLen/4 */
		181	n2 >>= 2U;
		182
		183	/* Index for twiddle coefficient */
		184	ic = 0U;
		185
		186	/* Number of first-stage butterflies; j counts down to zero in the do/while below */
		187	j = n2;
		188
		189	pSi0 = pSrc16;
		190	pSi1 = pSi0 + 2 * n2;
		191	pSi2 = pSi1 + 2 * n2;
		192	pSi3 = pSi2 + 2 * n2;
		193
		194	/* Input is in 1.15(q15) format */
		195
		196	/* start of first stage process */
		197	do
		198	{
		199	/* Butterfly implementation */
		200
		201	/* Reading i0, i0+fftLen/2 inputs */
		202	/* Read ya (real), xa(imag) input */
		203	T = _SIMD32_OFFSET(pSi0);
		204	T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
		205	T = __SHADD16(T, 0); // it turns out doing this twice is 2 cycles, the alternative takes 3 cycles
		206	//in = ((int16_t) (T & 0xFFFF)) >> 2; // alternative code that takes 3 cycles
		207	//T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
		208
		209	/* Read yc (real), xc(imag) input */
		210	S = _SIMD32_OFFSET(pSi2);
		211	S = __SHADD16(S, 0);
		212	S = __SHADD16(S, 0);
		213
		214	/* R = packed((ya + yc), (xa + xc) ) */
		215	R = __QADD16(T, S);
		216
		217	/* S = packed((ya - yc), (xa - xc) ) */
		218	S = __QSUB16(T, S);
		219
		220	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
		221	/* Read yb (real), xb(imag) input */
		222	T = _SIMD32_OFFSET(pSi1);
		223	T = __SHADD16(T, 0);
		224	T = __SHADD16(T, 0);
		225
		226	/* Read yd (real), xd(imag) input */
		227	U = _SIMD32_OFFSET(pSi3);
		228	U = __SHADD16(U, 0);
		229	U = __SHADD16(U, 0);
		230
		231	/* T = packed((yb + yd), (xb + xd) ) */
		232	T = __QADD16(T, U);
		233
		234	/* writing the butterfly processed i0 sample */
		235	/* xa' = xa + xb + xc + xd */
		236	/* ya' = ya + yb + yc + yd */
		237	_SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
		238	pSi0 += 2;
		239
		240	/* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
		241	R = __QSUB16(R, T);
		242
		243	/* co2 & si2 are read from SIMD Coefficient pointer */
		244	C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
		245
		246	#ifndef ARM_MATH_BIG_ENDIAN
		247
		248	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
		249	out1 = __SMUAD(C2, R) >> 16U;
		250	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		251	out2 = __SMUSDX(C2, R);
		252
		253	#else
		254
		255	/* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		256	out1 = __SMUSDX(R, C2) >> 16U;
		257	/* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
		258	out2 = __SMUAD(C2, R);
		259
		260	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		261
		262	/* Reading i0+fftLen/4 again: T was overwritten with the (b + d) sum above */
		263	/* T = packed(yb, xb) */
		264	T = _SIMD32_OFFSET(pSi1);
		265	T = __SHADD16(T, 0);
		266	T = __SHADD16(T, 0);
		267
		268	/* writing the butterfly processed i0 + fftLen/4 sample */
		269	/* writing output(xc', yc') in little endian format */
		270	_SIMD32_OFFSET(pSi1) =
		271	(q31_t) ((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		272	pSi1 += 2;
		273
		274	/* Butterfly calculations */
		275	/* U = packed(yd, xd) */
		276	U = _SIMD32_OFFSET(pSi3);
		277	U = __SHADD16(U, 0);
		278	U = __SHADD16(U, 0);
		279
		280	/* T = packed(yb-yd, xb-xd) */
		281	T = __QSUB16(T, U);
		282
		283	#ifndef ARM_MATH_BIG_ENDIAN
		284
		285	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
		286	R = __QASX(S, T);
		287	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
		288	S = __QSAX(S, T);
		289
		290	#else
		291
		292	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
		293	R = __QSAX(S, T);
		294	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
		295	S = __QASX(S, T);
		296
		297	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		298
		299	/* co1 & si1 are read from SIMD Coefficient pointer */
		300	C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
		301	/* Butterfly process for the i0+fftLen/2 sample */
		302
		303	#ifndef ARM_MATH_BIG_ENDIAN
		304
		305	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
		306	out1 = __SMUAD(C1, S) >> 16U;
		307	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
		308	out2 = __SMUSDX(C1, S);
		309
		310	#else
		311
		312	/* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
		313	out1 = __SMUSDX(S, C1) >> 16U;
		314	/* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
		315	out2 = __SMUAD(C1, S);
		316
		317	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		318
		319	/* writing output(xb', yb') in little endian format */
		320	_SIMD32_OFFSET(pSi2) =
		321	((out2) & 0xFFFF0000) \| ((out1) & 0x0000FFFF);
		322	pSi2 += 2;
		323
		324
		325	/* co3 & si3 are read from SIMD Coefficient pointer */
		326	C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
		327	/* Butterfly process for the i0+3fftLen/4 sample */
		328
		329	#ifndef ARM_MATH_BIG_ENDIAN
		330
		331	/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
		332	out1 = __SMUAD(C3, R) >> 16U;
		333	/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
		334	out2 = __SMUSDX(C3, R);
		335
		336	#else
		337
		338	/* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
		339	out1 = __SMUSDX(R, C3) >> 16U;
		340	/* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
		341	out2 = __SMUAD(C3, R);
		342
		343	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		344
		345	/* writing output(xd', yd') in little endian format */
		346	_SIMD32_OFFSET(pSi3) =
		347	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		348	pSi3 += 2;
		349
		350	/* Twiddle coefficients index modifier */
		351	ic = ic + twidCoefModifier;
		352
		353	} while (--j);
		354	/* data is in 4.11(q11) format */
		355
		356	/* end of first stage process */
		357
		358
		359	/* start of middle stage process */
		360
		361	/* Twiddle coefficients index modifier: each later stage steps through the coefficient table 4 times faster */
		362	twidCoefModifier <<= 2U;
		363
		364	/* Calculation of Middle stage(s); the loop condition k > 4 means no middle stage runs when fftLen is 16 */
		365	for (k = fftLen / 4U; k > 4U; k >>= 2U)
		366	{
		367	/* Initializations for the middle stage */
		368	n1 = n2;
		369	n2 >>= 2U;
		370	ic = 0U;
		371
		372	for (j = 0U; j <= (n2 - 1U); j++)
		373	{
		374	/* index calculation for the coefficients */
		375	C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
		376	C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
		377	C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
		378
		379	/* Twiddle coefficients index modifier */
		380	ic = ic + twidCoefModifier;
		381
		382	pSi0 = pSrc16 + 2 * j;
		383	pSi1 = pSi0 + 2 * n2;
		384	pSi2 = pSi1 + 2 * n2;
		385	pSi3 = pSi2 + 2 * n2;
		386
		387	/* Butterfly implementation */
		388	for (i0 = j; i0 < fftLen; i0 += n1)
		389	{
		390	/* Reading i0, i0+2*n2 inputs (n2 is a quarter of this stage's butterfly span n1) */
		391	/* Read ya (real), xa(imag) input */
		392	T = _SIMD32_OFFSET(pSi0);
		393
		394	/* Read yc (real), xc(imag) input */
		395	S = _SIMD32_OFFSET(pSi2);
		396
		397	/* R = packed( (ya + yc), (xa + xc)) */
		398	R = __QADD16(T, S);
		399
		400	/* S = packed((ya - yc), (xa - xc)) */
		401	S = __QSUB16(T, S);
		402
		403	/* Reading i0+n2 , i0+3*n2 inputs */
		404	/* Read yb (real), xb(imag) input */
		405	T = _SIMD32_OFFSET(pSi1);
		406
		407	/* Read yd (real), xd(imag) input */
		408	U = _SIMD32_OFFSET(pSi3);
		409
		410	/* T = packed( (yb + yd), (xb + xd)) */
		411	T = __QADD16(T, U);
		412
		413	/* writing the butterfly processed i0 sample */
		414
		415	/* xa' = xa + xb + xc + xd */
		416	/* ya' = ya + yb + yc + yd */
		417	out1 = __SHADD16(R, T);
		418	out1 = __SHADD16(out1, 0);
		419	_SIMD32_OFFSET(pSi0) = out1;
		420	pSi0 += 2 * n1;
		421
		422	/* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
		423	R = __SHSUB16(R, T);
		424
		425	#ifndef ARM_MATH_BIG_ENDIAN
		426
		427	/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
		428	out1 = __SMUAD(C2, R) >> 16U;
		429
		430	/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		431	out2 = __SMUSDX(C2, R);
		432
		433	#else
		434
		435	/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		436	out1 = __SMUSDX(R, C2) >> 16U;
		437
		438	/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
		439	out2 = __SMUAD(C2, R);
		440
		441	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		442
		443	/* Reading i0+n2 again: T was overwritten with the (b + d) sum above */
		444	/* Read yb (real), xb(imag) input */
		445	T = _SIMD32_OFFSET(pSi1);
		446
		447	/* writing the butterfly processed i0 + n2 sample */
		448	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
		449	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		450	_SIMD32_OFFSET(pSi1) =
		451	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		452	pSi1 += 2 * n1;
		453
		454	/* Butterfly calculations */
		455
		456	/* Read yd (real), xd(imag) input */
		457	U = _SIMD32_OFFSET(pSi3);
		458
		459	/* T = packed(yb-yd, xb-xd) */
		460	T = __QSUB16(T, U);
		461
		462	#ifndef ARM_MATH_BIG_ENDIAN
		463
		464	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
		465	R = __SHASX(S, T);
		466
		467	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
		468	S = __SHSAX(S, T);
		469
		470
		471	/* Butterfly process for the i0+2*n2 sample */
		472	out1 = __SMUAD(C1, S) >> 16U;
		473	out2 = __SMUSDX(C1, S);
		474
		475	#else
		476
		477	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
		478	R = __SHSAX(S, T);
		479
		480	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
		481	S = __SHASX(S, T);
		482
		483
		484	/* Butterfly process for the i0+2*n2 sample */
		485	out1 = __SMUSDX(S, C1) >> 16U;
		486	out2 = __SMUAD(C1, S);
		487
		488	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		489
		490	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
		491	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
		492	_SIMD32_OFFSET(pSi2) =
		493	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		494	pSi2 += 2 * n1;
		495
		496	/* Butterfly process for the i0+3*n2 sample */
		497
		498	#ifndef ARM_MATH_BIG_ENDIAN
		499
		500	out1 = __SMUAD(C3, R) >> 16U;
		501	out2 = __SMUSDX(C3, R);
		502
		503	#else
		504
		505	out1 = __SMUSDX(R, C3) >> 16U;
		506	out2 = __SMUAD(C3, R);
		507
		508	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		509
		510	/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
		511	/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
		512	_SIMD32_OFFSET(pSi3) =
		513	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		514	pSi3 += 2 * n1;
		515	}
		516	}
		517	/* Twiddle coefficients index modifier */
		518	twidCoefModifier <<= 2U;
		519	}
		520	/* end of middle stage process */
		521
		522
		523	/* data is in 10.6(q6) format for the 1024 point */
		524	/* data is in 8.8(q8) format for the 256 point */
		525	/* data is in 6.10(q10) format for the 64 point */
		526	/* data is in 4.12(q12) format for the 16 point */
		527
		528	/* Initializations for the last stage */
		529	j = fftLen >> 2;
		530
		531	ptr1 = &pSrc16[0];
		532
		533	/* start of last stage process */
		534
		535	/* Butterfly implementation */
		536	do
		537	{
		538	/* Read xa (real), ya(imag) input */
		539	xaya = *__SIMD32(ptr1)++;
		540
		541	/* Read xb (real), yb(imag) input */
		542	xbyb = *__SIMD32(ptr1)++;
		543
		544	/* Read xc (real), yc(imag) input */
		545	xcyc = *__SIMD32(ptr1)++;
		546
		547	/* Read xd (real), yd(imag) input */
		548	xdyd = *__SIMD32(ptr1)++;
		549
		550	/* R = packed((ya + yc), (xa + xc)) */
		551	R = __QADD16(xaya, xcyc);
		552
		553	/* T = packed((yb + yd), (xb + xd)) */
		554	T = __QADD16(xbyb, xdyd);
		555
		556	/* step ptr1 back by 8 q15 elements so the results overwrite the four complex inputs just read (assumes each __SIMD32 read above advanced ptr1 by two q15 elements - TODO confirm against the macro definition) */
		557	ptr1 = ptr1 - 8U;
		558
		559
		560	/* xa' = xa + xb + xc + xd */
		561	/* ya' = ya + yb + yc + yd */
		562	*__SIMD32(ptr1)++ = __SHADD16(R, T);
		563
		564	/* T = packed((yb + yd), (xb + xd)); T already holds this value from the identical expression above */
		565	T = __QADD16(xbyb, xdyd);
		566
		567	/* xc' = (xa-xb+xc-xd) */
		568	/* yc' = (ya-yb+yc-yd) */
		569	*__SIMD32(ptr1)++ = __SHSUB16(R, T);
		570
		571	/* S = packed((ya - yc), (xa - xc)) */
		572	S = __QSUB16(xaya, xcyc);
		573
		574	/* Difference of the b and d inputs */
		575	/* U = packed( (yb - yd), (xb - xd)) */
		576	U = __QSUB16(xbyb, xdyd);
		577
		578	#ifndef ARM_MATH_BIG_ENDIAN
		579
		580	/* xb' = (xa+yb-xc-yd) */
		581	/* yb' = (ya-xb-yc+xd) */
		582	*__SIMD32(ptr1)++ = __SHSAX(S, U);
		583
		584
		585	/* xd' = (xa-yb-xc+yd) */
		586	/* yd' = (ya+xb-yc-xd) */
		587	*__SIMD32(ptr1)++ = __SHASX(S, U);
		588
		589	#else
		590
		591	/* xb' = (xa+yb-xc-yd) */
		592	/* yb' = (ya-xb-yc+xd) */
		593	*__SIMD32(ptr1)++ = __SHASX(S, U);
		594
		595
		596	/* xd' = (xa-yb-xc+yd) */
		597	/* yd' = (ya+xb-yc-xd) */
		598	*__SIMD32(ptr1)++ = __SHSAX(S, U);
		599
		600	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		601
		602	} while (--j);
		603
		604	/* end of last stage process */
		605
		606	/* output is in 11.5(q5) format for the 1024 point */
		607	/* output is in 9.7(q7) format for the 256 point */
		608	/* output is in 7.9(q9) format for the 64 point */
		609	/* output is in 5.11(q11) format for the 16 point */
		610
		611
		612	#else
		613
		614	/* Scalar path: compiled when ARM_MATH_DSP is not defined; same three phases using 16-bit scalar variables and __SSAT */
		615
		616	q15_t R0, R1, S0, S1, T0, T1, U0, U1;
		617	q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
		618	uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
		619
		620	/* Total process is divided into three stages */
		621
		622	/* process first stage, middle stages, & last stage */
		623
		624	/* Initializations for the first stage */
		625	n2 = fftLen;
		626	n1 = n2;
		627
		628	/* n2 = fftLen/4 */
		629	n2 >>= 2U;
		630
		631	/* Index for twiddle coefficient */
		632	ic = 0U;
		633
		634	/* Input/output sample index (i0) and butterfly down-counter (j) */
		635	i0 = 0U;
		636	j = n2;
		637
		638	/* Input is in 1.15(q15) format */
		639
		640	/* start of first stage process */
		641	do
		642	{
		643	/* Butterfly implementation */
		644
		645	/* index calculation for the input as, */
		646	/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
		647	i1 = i0 + n2;
		648	i2 = i1 + n2;
		649	i3 = i2 + n2;
		650
		651	/* Reading i0, i0+fftLen/2 inputs */
		652
		653	/* input is down scale by 4 to avoid overflow */
		654	/* Read ya (real), xa(imag) input */
		655	T0 = pSrc16[i0 * 2U] >> 2U;
		656	T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
		657
		658	/* input is down scale by 4 to avoid overflow */
		659	/* Read yc (real), xc(imag) input */
		660	S0 = pSrc16[i2 * 2U] >> 2U;
		661	S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
		662
		663	/* R0 = (ya + yc) */
		664	R0 = __SSAT(T0 + S0, 16U);
		665	/* R1 = (xa + xc) */
		666	R1 = __SSAT(T1 + S1, 16U);
		667
		668	/* S0 = (ya - yc) */
		669	S0 = __SSAT(T0 - S0, 16);
		670	/* S1 = (xa - xc) */
		671	S1 = __SSAT(T1 - S1, 16);
		672
		673	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
		674	/* input is down scale by 4 to avoid overflow */
		675	/* Read yb (real), xb(imag) input */
		676	T0 = pSrc16[i1 * 2U] >> 2U;
		677	T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
		678
		679	/* input is down scale by 4 to avoid overflow */
		680	/* Read yd (real), xd(imag) input */
		681	U0 = pSrc16[i3 * 2U] >> 2U;
		682	U1 = pSrc16[(i3 * 2U) + 1] >> 2U;
		683
		684	/* T0 = (yb + yd) */
		685	T0 = __SSAT(T0 + U0, 16U);
		686	/* T1 = (xb + xd) */
		687	T1 = __SSAT(T1 + U1, 16U);
		688
		689	/* writing the butterfly processed i0 sample */
		690	/* ya' = ya + yb + yc + yd */
		691	/* xa' = xa + xb + xc + xd */
		692	pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
		693	pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
		694
		695	/* R0 = (ya + yc) - (yb + yd) */
		696	/* R1 = (xa + xc) - (xb + xd) */
		697	R0 = __SSAT(R0 - T0, 16U);
		698	R1 = __SSAT(R1 - T1, 16U);
		699
		700	/* co2 & si2 are read from Coefficient pointer */
		701	Co2 = pCoef16[2U * ic * 2U];
		702	Si2 = pCoef16[(2U * ic * 2U) + 1];
		703
		704	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
		705	out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
		706	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		707	out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
		708
		709	/* Reading i0+fftLen/4 */
		710	/* input is down scale by 4 to avoid overflow */
		711	/* T0 = yb, T1 = xb */
		712	T0 = pSrc16[i1 * 2U] >> 2;
		713	T1 = pSrc16[(i1 * 2U) + 1] >> 2;
		714
		715	/* writing the butterfly processed i0 + fftLen/4 sample */
		716	/* writing output(xc', yc') */
		717	pSrc16[i1 * 2U] = out1;
		718	pSrc16[(i1 * 2U) + 1] = out2;
		719
		720	/* Butterfly calculations */
		721	/* input is down scale by 4 to avoid overflow */
		722	/* U0 = yd, U1 = xd */
		723	U0 = pSrc16[i3 * 2U] >> 2;
		724	U1 = pSrc16[(i3 * 2U) + 1] >> 2;
		725	/* T0 = yb-yd */
		726	T0 = __SSAT(T0 - U0, 16);
		727	/* T1 = xb-xd */
		728	T1 = __SSAT(T1 - U1, 16);
		729
		730	/* R0 = (ya-yc) - (xb-xd), R1 = (xa-xc) + (yb-yd) */
		731	R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
		732	R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
		733
		734	/* S0 = (ya-yc) + (xb-xd), S1 = (xa-xc) - (yb-yd) */
		735	S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
		736	S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);
		737
		738	/* co1 & si1 are read from Coefficient pointer */
		739	Co1 = pCoef16[ic * 2U];
		740	Si1 = pCoef16[(ic * 2U) + 1];
		741	/* Butterfly process for the i0+fftLen/2 sample */
		742	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
		743	out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
		744	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
		745	out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
		746
		747	/* writing output(xb', yb') */
		748	pSrc16[i2 * 2U] = out1;
		749	pSrc16[(i2 * 2U) + 1] = out2;
		750
		751	/* Co3 & si3 are read from Coefficient pointer */
		752	Co3 = pCoef16[3U * (ic * 2U)];
		753	Si3 = pCoef16[(3U * (ic * 2U)) + 1];
		754	/* Butterfly process for the i0+3fftLen/4 sample */
		755	/* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
		756	out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
		757	/* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
		758	out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
		759	/* writing output(xd', yd') */
		760	pSrc16[i3 * 2U] = out1;
		761	pSrc16[(i3 * 2U) + 1] = out2;
		762
		763	/* Twiddle coefficients index modifier */
		764	ic = ic + twidCoefModifier;
		765
		766	/* Updating input index */
		767	i0 = i0 + 1U;
		768
		769	} while (--j);
		770	/* data is in 4.11(q11) format */
		771
		772	/* end of first stage process */
		773
		774
		775	/* start of middle stage process */
		776
		777	/* Twiddle coefficients index modifier: each later stage steps through the coefficient table 4 times faster */
		778	twidCoefModifier <<= 2U;
		779
		780	/* Calculation of Middle stage(s); the loop condition k > 4 means no middle stage runs when fftLen is 16 */
		781	for (k = fftLen / 4U; k > 4U; k >>= 2U)
		782	{
		783	/* Initializations for the middle stage */
		784	n1 = n2;
		785	n2 >>= 2U;
		786	ic = 0U;
		787
		788	for (j = 0U; j <= (n2 - 1U); j++)
		789	{
		790	/* index calculation for the coefficients */
		791	Co1 = pCoef16[ic * 2U];
		792	Si1 = pCoef16[(ic * 2U) + 1U];
		793	Co2 = pCoef16[2U * (ic * 2U)];
		794	Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
		795	Co3 = pCoef16[3U * (ic * 2U)];
		796	Si3 = pCoef16[(3U * (ic * 2U)) + 1U];
		797
		798	/* Twiddle coefficients index modifier */
		799	ic = ic + twidCoefModifier;
		800
		801	/* Butterfly implementation */
		802	for (i0 = j; i0 < fftLen; i0 += n1)
		803	{
		804	/* index calculation for the input as, */
		805	/* pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
		806	i1 = i0 + n2;
		807	i2 = i1 + n2;
		808	i3 = i2 + n2;
		809
		810	/* Reading i0, i0+2*n2 inputs */
		811	/* Read ya (real), xa(imag) input */
		812	T0 = pSrc16[i0 * 2U];
		813	T1 = pSrc16[(i0 * 2U) + 1U];
		814
		815	/* Read yc (real), xc(imag) input */
		816	S0 = pSrc16[i2 * 2U];
		817	S1 = pSrc16[(i2 * 2U) + 1U];
		818
		819	/* R0 = (ya + yc), R1 = (xa + xc) */
		820	R0 = __SSAT(T0 + S0, 16);
		821	R1 = __SSAT(T1 + S1, 16);
		822
		823	/* S0 = (ya - yc), S1 =(xa - xc) */
		824	S0 = __SSAT(T0 - S0, 16);
		825	S1 = __SSAT(T1 - S1, 16);
		826
		827	/* Reading i0+n2 , i0+3*n2 inputs */
		828	/* Read yb (real), xb(imag) input */
		829	T0 = pSrc16[i1 * 2U];
		830	T1 = pSrc16[(i1 * 2U) + 1U];
		831
		832	/* Read yd (real), xd(imag) input */
		833	U0 = pSrc16[i3 * 2U];
		834	U1 = pSrc16[(i3 * 2U) + 1U];
		835
		836
		837	/* T0 = (yb + yd), T1 = (xb + xd) */
		838	T0 = __SSAT(T0 + U0, 16);
		839	T1 = __SSAT(T1 + U1, 16);
		840
		841	/* writing the butterfly processed i0 sample */
		842
		843	/* xa' = xa + xb + xc + xd */
		844	/* ya' = ya + yb + yc + yd */
		845	out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
		846	out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
		847
		848	pSrc16[i0 * 2U] = out1;
		849	pSrc16[(2U * i0) + 1U] = out2;
		850
		851	/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
		852	R0 = (R0 >> 1U) - (T0 >> 1U);
		853	R1 = (R1 >> 1U) - (T1 >> 1U);
		854
		855	/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
		856	out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
		857
		858	/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		859	out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
		860
		861	/* Reading i0+n2 again: T0/T1 were overwritten with the (b + d) sums above */
		862	/* Read yb (real), xb(imag) input */
		863	T0 = pSrc16[i1 * 2U];
		864	T1 = pSrc16[(i1 * 2U) + 1U];
		865
		866	/* writing the butterfly processed i0 + n2 sample */
		867	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
		868	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		869	pSrc16[i1 * 2U] = out1;
		870	pSrc16[(i1 * 2U) + 1U] = out2;
		871
		872	/* Butterfly calculations */
		873
		874	/* Read yd (real), xd(imag) input */
		875	U0 = pSrc16[i3 * 2U];
		876	U1 = pSrc16[(i3 * 2U) + 1U];
		877
		878	/* T0 = yb-yd, T1 = xb-xd */
		879	T0 = __SSAT(T0 - U0, 16);
		880	T1 = __SSAT(T1 - U1, 16);
		881
		882	/* R0 = (ya-yc) - (xb-xd), R1 = (xa-xc) + (yb-yd), with each operand halved before combining */
		883	R0 = (S0 >> 1U) - (T1 >> 1U);
		884	R1 = (S1 >> 1U) + (T0 >> 1U);
		885
		886	/* S0 = (ya-yc) + (xb-xd), S1 = (xa-xc) - (yb-yd), with each operand halved before combining */
		887	S0 = (S0 >> 1U) + (T1 >> 1U);
		888	S1 = (S1 >> 1U) - (T0 >> 1U);
		889
		890	/* Butterfly process for the i0+2*n2 sample */
		891	out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);
		892
		893	out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);
		894
		895	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
		896	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
		897	pSrc16[i2 * 2U] = out1;
		898	pSrc16[(i2 * 2U) + 1U] = out2;
		899
		900	/* Butterfly process for the i0+3*n2 sample */
		901	out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
		902
		903	out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
		904	/* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
		905	/* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
		906	pSrc16[i3 * 2U] = out1;
		907	pSrc16[(i3 * 2U) + 1U] = out2;
		908	}
		909	}
		910	/* Twiddle coefficients index modifier */
		911	twidCoefModifier <<= 2U;
		912	}
		913	/* end of middle stage process */
		914
		915
		916	/* data is in 10.6(q6) format for the 1024 point */
		917	/* data is in 8.8(q8) format for the 256 point */
		918	/* data is in 6.10(q10) format for the 64 point */
		919	/* data is in 4.12(q12) format for the 16 point */
		920
		921	/* Initializations for the last stage */
		922	n1 = n2;
		923	n2 >>= 2U;
		924
		925	/* start of last stage process */
		926
		927	/* Butterfly implementation */
		928	for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
		929	{
		930	/* index calculation for the input as, */
		931	/* pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
		932	i1 = i0 + n2;
		933	i2 = i1 + n2;
		934	i3 = i2 + n2;
		935
		936	/* Reading i0, i0+2*n2 inputs */
		937	/* Read ya (real), xa(imag) input */
		938	T0 = pSrc16[i0 * 2U];
		939	T1 = pSrc16[(i0 * 2U) + 1U];
		940
		941	/* Read yc (real), xc(imag) input */
		942	S0 = pSrc16[i2 * 2U];
		943	S1 = pSrc16[(i2 * 2U) + 1U];
		944
		945	/* R0 = (ya + yc), R1 = (xa + xc) */
		946	R0 = __SSAT(T0 + S0, 16U);
		947	R1 = __SSAT(T1 + S1, 16U);
		948
		949	/* S0 = (ya - yc), S1 = (xa - xc) */
		950	S0 = __SSAT(T0 - S0, 16U);
		951	S1 = __SSAT(T1 - S1, 16U);
		952
		953	/* Reading i0+n2 , i0+3*n2 inputs */
		954	/* Read yb (real), xb(imag) input */
		955	T0 = pSrc16[i1 * 2U];
		956	T1 = pSrc16[(i1 * 2U) + 1U];
		957	/* Read yd (real), xd(imag) input */
		958	U0 = pSrc16[i3 * 2U];
		959	U1 = pSrc16[(i3 * 2U) + 1U];
		960
		961	/* T0 = (yb + yd), T1 = (xb + xd)) */
		962	T0 = __SSAT(T0 + U0, 16U);
		963	T1 = __SSAT(T1 + U1, 16U);
		964
		965	/* writing the butterfly processed i0 sample */
		966	/* xa' = xa + xb + xc + xd */
		967	/* ya' = ya + yb + yc + yd */
		968	pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
		969	pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
		970
		971	/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
		972	R0 = (R0 >> 1U) - (T0 >> 1U);
		973	R1 = (R1 >> 1U) - (T1 >> 1U);
		974	/* Read yb (real), xb(imag) input again: T0/T1 were overwritten with the (b + d) sums above */
		975	T0 = pSrc16[i1 * 2U];
		976	T1 = pSrc16[(i1 * 2U) + 1U];
		977
		978	/* writing the butterfly processed i0 + n2 sample */
		979	/* xc' = (xa-xb+xc-xd) */
		980	/* yc' = (ya-yb+yc-yd) */
		981	pSrc16[i1 * 2U] = R0;
		982	pSrc16[(i1 * 2U) + 1U] = R1;
		983
		984	/* Read yd (real), xd(imag) input */
		985	U0 = pSrc16[i3 * 2U];
		986	U1 = pSrc16[(i3 * 2U) + 1U];
		987	/* T0 = (yb - yd), T1 = (xb - xd) */
		988	T0 = __SSAT(T0 - U0, 16U);
		989	T1 = __SSAT(T1 - U1, 16U);
		990
		991	/* writing the butterfly processed i0 + 2*n2 sample */
		992	/* xb' = (xa+yb-xc-yd) */
		993	/* yb' = (ya-xb-yc+xd) */
		994	pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
		995	pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
		996
		997	/* writing the butterfly processed i0 + 3*n2 sample */
		998	/* xd' = (xa-yb-xc+yd) */
		999	/* yd' = (ya+xb-yc-xd) */
		1000	pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
		1001	pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
		1002
		1003	}
		1004
		1005	/* end of last stage process */
		1006
		1007	/* output is in 11.5(q5) format for the 1024 point */
		1008	/* output is in 9.7(q7) format for the 256 point */
		1009	/* output is in 7.9(q9) format for the 64 point */
		1010	/* output is in 5.11(q11) format for the 16 point */
		1011
		1012	#endif /* #if defined (ARM_MATH_DSP) */
		1013
		1014	}
		1015
		1016
		1017	/**
		1018	* @brief Core function for the Q15 CIFFT butterfly process.
		1019	* @param[in, out] *pSrc16 points to the in-place buffer of Q15 data type.
		1020	* @param[in] fftLen length of the FFT.
		1021	* @param[in] *pCoef16 points to twiddle coefficient buffer.
		1022	* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
		1023	* @return none.
		1024	*/
		1025
		1026	/*
		1027	* Radix-4 IFFT algorithm used is :
		1028	*
		1029	* CIFFT uses same twiddle coefficients as CFFT function
		1030	* x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]
		1031	*
		1032	*
		1033	* IFFT is implemented with following changes in equations from FFT
		1034	*
		1035	* Input real and imaginary data:
		1036	* x(n) = xa + j * ya
		1037	* x(n+N/4 ) = xb + j * yb
		1038	* x(n+N/2 ) = xc + j * yc
		1039	* x(n+3N/4) = xd + j * yd
		1040	*
		1041	*
		1042	* Output real and imaginary data:
		1043	* x(4r) = xa'+ j * ya'
		1044	* x(4r+1) = xb'+ j * yb'
		1045	* x(4r+2) = xc'+ j * yc'
		1046	* x(4r+3) = xd'+ j * yd'
		1047	*
		1048	*
		1049	* Twiddle factors for radix-4 IFFT:
		1050	* Wn = co1 + j * (si1)
		1051	* W2n = co2 + j * (si2)
		1052	* W3n = co3 + j * (si3)
		1053
		1054	* The real and imaginary output values for the radix-4 butterfly are
		1055	* xa' = xa + xb + xc + xd
		1056	* ya' = ya + yb + yc + yd
		1057	* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
		1058	* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
		1059	* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
		1060	* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
		1061	* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
		1062	* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
		1063	*
		1064	*/
		1065
		1066	void arm_radix4_butterfly_inverse_q15(
		1067	q15_t * pSrc16,
		1068	uint32_t fftLen,
		1069	q15_t * pCoef16,
		1070	uint32_t twidCoefModifier)
		1071	{
		1072
		1073	#if defined (ARM_MATH_DSP)
		1074
		1075	/* DSP-extension variant (ARM_MATH_DSP defined; labelled upstream as the Cortex-M4 / Cortex-M3 path): every complex sample is read, combined and written as one packed 32-bit word through the SIMD intrinsics */
		1076
		1077	q31_t R, S, T, U;
		1078	q31_t C1, C2, C3, out1, out2;
		1079	uint32_t n1, n2, ic, i0, j, k;
		1080
		1081	q15_t *ptr1;
		1082	q15_t *pSi0;
		1083	q15_t *pSi1;
		1084	q15_t *pSi2;
		1085	q15_t *pSi3;
		1086
		1087	q31_t xaya, xbyb, xcyc, xdyd;
		1088
		1089	/* The transform runs in three phases: a first stage that also pre-scales the input, a loop of middle stages (not entered when fftLen / 4 <= 4), and a last stage that reads no twiddle coefficients */
		1090
		1091	/* process first stage, middle stages, & last stage */
		1092
		1093	/* Initializations for the first stage */
		1094	n2 = fftLen;
		1095	n1 = n2;
		1096
		1097	/* n2 = fftLen/4 */
		1098	n2 >>= 2U;
		1099
		1100	/* Index for twiddle coefficient */
		1101	ic = 0U;
		1102
		1103	/* Butterfly counter for the first stage: the do/while below runs fftLen/4 times */
		1104	j = n2;
		1105
		1106	pSi0 = pSrc16;
		1107	pSi1 = pSi0 + 2 * n2;
		1108	pSi2 = pSi1 + 2 * n2;
		1109	pSi3 = pSi2 + 2 * n2;
		1110
		1111	/* Input is in 1.15(q15) format */
		1112
		1113	/* start of first stage process */
		1114	do
		1115	{
		1116	/* Butterfly implementation */
		1117
		1118	/* Reading i0, i0+fftLen/2 inputs */
		1119	/* Read xa (real), ya (imag) input; the two halving adds with 0 scale it down by 4 */
		1120	T = _SIMD32_OFFSET(pSi0);
		1121	T = __SHADD16(T, 0);
		1122	T = __SHADD16(T, 0);
		1123
		1124	/* Read xc (real), yc (imag) input, scaled down by 4 the same way */
		1125	S = _SIMD32_OFFSET(pSi2);
		1126	S = __SHADD16(S, 0);
		1127	S = __SHADD16(S, 0);
		1128
		1129	/* R = packed((ya + yc), (xa + xc) ) */
		1130	R = __QADD16(T, S);
		1131
		1132	/* S = packed((ya - yc), (xa - xc) ) */
		1133	S = __QSUB16(T, S);
		1134
		1135	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
		1136	/* Read xb (real), yb (imag) input, scaled down by 4 */
		1137	T = _SIMD32_OFFSET(pSi1);
		1138	T = __SHADD16(T, 0);
		1139	T = __SHADD16(T, 0);
		1140
		1141	/* Read xd (real), yd (imag) input, scaled down by 4 */
		1142	U = _SIMD32_OFFSET(pSi3);
		1143	U = __SHADD16(U, 0);
		1144	U = __SHADD16(U, 0);
		1145
		1146	/* T = packed((yb + yd), (xb + xd) ) */
		1147	T = __QADD16(T, U);
		1148
		1149	/* writing the butterfly processed i0 sample (halving add of R and T) */
		1150	/* xa' = xa + xb + xc + xd */
		1151	/* ya' = ya + yb + yc + yd */
		1152	_SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
		1153	pSi0 += 2;
		1154
		1155	/* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
		1156	R = __QSUB16(R, T);
		1157
		1158	/* co2 & si2 are read from SIMD Coefficient pointer */
		1159	C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
		1160
		1161	#ifndef ARM_MATH_BIG_ENDIAN
		1162
		1163	/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
		1164	out1 = __SMUSD(C2, R) >> 16U;
		1165	/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
		1166	out2 = __SMUADX(C2, R);
		1167
		1168	#else
		1169
		1170	/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
		1171	out1 = __SMUADX(C2, R) >> 16U;
		1172	/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
		1173	out2 = __SMUSD(__QSUB16(0, C2), R);
		1174
		1175	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1176
		1177	/* Re-reading i0+fftLen/4 before it is overwritten below */
		1178	/* T = packed(yb, xb), scaled down by 4 */
		1179	T = _SIMD32_OFFSET(pSi1);
		1180	T = __SHADD16(T, 0);
		1181	T = __SHADD16(T, 0);
		1182
		1183	/* writing the butterfly processed i0 + fftLen/4 sample */
		1184	/* writing output(xc', yc'): upper half of out2 and lower half of out1 form one packed word */
		1185	_SIMD32_OFFSET(pSi1) =
		1186	(q31_t) ((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		1187	pSi1 += 2;
		1188
		1189	/* Butterfly calculations */
		1190	/* U = packed(yd, xd), scaled down by 4 */
		1191	U = _SIMD32_OFFSET(pSi3);
		1192	U = __SHADD16(U, 0);
		1193	U = __SHADD16(U, 0);
		1194
		1195	/* T = packed(yb-yd, xb-xd) */
		1196	T = __QSUB16(T, U);
		1197
		1198	#ifndef ARM_MATH_BIG_ENDIAN
		1199
		1200	/* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)) */
		1201	R = __QSAX(S, T);
		1202	/* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)) */
		1203	S = __QASX(S, T);
		1204
		1205	#else
		1206
		1207	/* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)) */
		1208	R = __QASX(S, T);
		1209	/* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)) */
		1210	S = __QSAX(S, T);
		1211
		1212	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1213
		1214	/* co1 & si1 are read from SIMD Coefficient pointer */
		1215	C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
		1216	/* Butterfly process for the i0+fftLen/2 sample */
		1217
		1218	#ifndef ARM_MATH_BIG_ENDIAN
		1219
		1220	/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
		1221	out1 = __SMUSD(C1, S) >> 16U;
		1222	/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
		1223	out2 = __SMUADX(C1, S);
		1224
		1225	#else
		1226
		1227	/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
		1228	out1 = __SMUADX(C1, S) >> 16U;
		1229	/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
		1230	out2 = __SMUSD(__QSUB16(0, C1), S);
		1231
		1232	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1233
		1234	/* writing output(xb', yb') as one packed word to the i0+fftLen/2 slot */
		1235	_SIMD32_OFFSET(pSi2) =
		1236	((out2) & 0xFFFF0000) \| ((out1) & 0x0000FFFF);
		1237	pSi2 += 2;
		1238
		1239
		1240	/* co3 & si3 are read from SIMD Coefficient pointer */
		1241	C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
		1242	/* Butterfly process for the i0+3fftLen/4 sample */
		1243
		1244	#ifndef ARM_MATH_BIG_ENDIAN
		1245
		1246	/* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
		1247	out1 = __SMUSD(C3, R) >> 16U;
		1248	/* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
		1249	out2 = __SMUADX(C3, R);
		1250
		1251	#else
		1252
		1253	/* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
		1254	out1 = __SMUADX(C3, R) >> 16U;
		1255	/* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
		1256	out2 = __SMUSD(__QSUB16(0, C3), R);
		1257
		1258	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1259
		1260	/* writing output(xd', yd') as one packed word to the i0+3fftLen/4 slot */
		1261	_SIMD32_OFFSET(pSi3) =
		1262	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		1263	pSi3 += 2;
		1264
		1265	/* Twiddle coefficients index modifier */
		1266	ic = ic + twidCoefModifier;
		1267
		1268	} while (--j);
		1269	/* data is in 4.11(q11) format */
		1270
		1271	/* end of first stage process */
		1272
		1273
		1274	/* start of middle stage process */
		1275
		1276	/* Twiddle coefficients index modifier: the stride through the table grows by 4 for each later stage */
		1277	twidCoefModifier <<= 2U;
		1278
		1279	/* Calculation of Middle stage */
		1280	for (k = fftLen / 4U; k > 4U; k >>= 2U)
		1281	{
		1282	/* Initializations for the middle stage: n1 is the spacing between butterfly groups, n2 the spacing between the four inputs of one butterfly */
		1283	n1 = n2;
		1284	n2 >>= 2U;
		1285	ic = 0U;
		1286
		1287	for (j = 0U; j <= (n2 - 1U); j++)
		1288	{
		1289	/* the three twiddle words are loaded once per j and reused for every group in the inner loop */
		1290	C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
		1291	C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
		1292	C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
		1293
		1294	/* Twiddle coefficients index modifier */
		1295	ic = ic + twidCoefModifier;
		1296
		1297	pSi0 = pSrc16 + 2 * j;
		1298	pSi1 = pSi0 + 2 * n2;
		1299	pSi2 = pSi1 + 2 * n2;
		1300	pSi3 = pSi2 + 2 * n2;
		1301
		1302	/* Butterfly implementation */
		1303	for (i0 = j; i0 < fftLen; i0 += n1)
		1304	{
		1305	/* Reading the first and third inputs of the butterfly (pSi0, pSi2) */
		1306	/* Read xa (real), ya (imag) input */
		1307	T = _SIMD32_OFFSET(pSi0);
		1308
		1309	/* Read xc (real), yc (imag) input */
		1310	S = _SIMD32_OFFSET(pSi2);
		1311
		1312	/* R = packed( (ya + yc), (xa + xc)) */
		1313	R = __QADD16(T, S);
		1314
		1315	/* S = packed((ya - yc), (xa - xc)) */
		1316	S = __QSUB16(T, S);
		1317
		1318	/* Reading the second and fourth inputs of the butterfly (pSi1, pSi3) */
		1319	/* Read xb (real), yb (imag) input */
		1320	T = _SIMD32_OFFSET(pSi1);
		1321
		1322	/* Read xd (real), yd (imag) input */
		1323	U = _SIMD32_OFFSET(pSi3);
		1324
		1325	/* T = packed( (yb + yd), (xb + xd)) */
		1326	T = __QADD16(T, U);
		1327
		1328	/* writing the butterfly processed i0 sample (two halving adds) */
		1329
		1330	/* xa' = xa + xb + xc + xd */
		1331	/* ya' = ya + yb + yc + yd */
		1332	out1 = __SHADD16(R, T);
		1333	out1 = __SHADD16(out1, 0);
		1334	_SIMD32_OFFSET(pSi0) = out1;
		1335	pSi0 += 2 * n1;
		1336
		1337	/* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)), via halving subtract */
		1338	R = __SHSUB16(R, T);
		1339
		1340	#ifndef ARM_MATH_BIG_ENDIAN
		1341
		1342	/* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
		1343	out1 = __SMUSD(C2, R) >> 16U;
		1344
		1345	/* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
		1346	out2 = __SMUADX(C2, R);
		1347
		1348	#else
		1349
		1350	/* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
		1351	out1 = __SMUADX(R, C2) >> 16U;
		1352
		1353	/* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
		1354	out2 = __SMUSD(__QSUB16(0, C2), R);
		1355
		1356	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1357
		1358	/* Re-reading the pSi1 sample before it is overwritten below */
		1359	/* Read xb (real), yb (imag) input */
		1360	T = _SIMD32_OFFSET(pSi1);
		1361
		1362	/* writing the butterfly processed second sample (pSi1) */
		1363	/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
		1364	/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
		1365	_SIMD32_OFFSET(pSi1) =
		1366	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		1367	pSi1 += 2 * n1;
		1368
		1369	/* Butterfly calculations */
		1370
		1371	/* Read xd (real), yd (imag) input */
		1372	U = _SIMD32_OFFSET(pSi3);
		1373
		1374	/* T = packed(yb-yd, xb-xd) */
		1375	T = __QSUB16(T, U);
		1376
		1377	#ifndef ARM_MATH_BIG_ENDIAN
		1378
		1379	/* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)), halved */
		1380	R = __SHSAX(S, T);
		1381
		1382	/* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)), halved */
		1383	S = __SHASX(S, T);
		1384
		1385
		1386	/* Butterfly process for the third sample (pSi2) */
		1387	out1 = __SMUSD(C1, S) >> 16U;
		1388	out2 = __SMUADX(C1, S);
		1389
		1390	#else
		1391
		1392	/* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)), halved */
		1393	R = __SHASX(S, T);
		1394
		1395	/* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)), halved */
		1396	S = __SHSAX(S, T);
		1397
		1398
		1399	/* Butterfly process for the third sample (pSi2) */
		1400	out1 = __SMUADX(S, C1) >> 16U;
		1401	out2 = __SMUSD(__QSUB16(0, C1), S);
		1402
		1403	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1404
		1405	/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
		1406	/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
		1407	_SIMD32_OFFSET(pSi2) =
		1408	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		1409	pSi2 += 2 * n1;
		1410
		1411	/* Butterfly process for the fourth sample (pSi3) */
		1412
		1413	#ifndef ARM_MATH_BIG_ENDIAN
		1414
		1415	out1 = __SMUSD(C3, R) >> 16U;
		1416	out2 = __SMUADX(C3, R);
		1417
		1418	#else
		1419
		1420	out1 = __SMUADX(C3, R) >> 16U;
		1421	out2 = __SMUSD(__QSUB16(0, C3), R);
		1422
		1423	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1424
		1425	/* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
		1426	/* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
		1427	_SIMD32_OFFSET(pSi3) =
		1428	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		1429	pSi3 += 2 * n1;
		1430	}
		1431	}
		1432	/* Twiddle coefficients index modifier */
		1433	twidCoefModifier <<= 2U;
		1434	}
		1435	/* end of middle stage process */
		1436
		1437	/* data is in 10.6(q6) format for the 1024 point */
		1438	/* data is in 8.8(q8) format for the 256 point */
		1439	/* data is in 6.10(q10) format for the 64 point */
		1440	/* data is in 4.12(q12) format for the 16 point */
		1441
		1442	/* Initializations for the last stage: fftLen/4 butterflies, each over four consecutive samples */
		1443	j = fftLen >> 2;
		1444
		1445	ptr1 = &pSrc16[0];
		1446
		1447	/* start of last stage process */
		1448
		1449	/* Butterfly implementation */
		1450	do
		1451	{
		1452	/* Read xa (real), ya(imag) input */
		1453	xaya = *__SIMD32(ptr1)++;
		1454
		1455	/* Read xb (real), yb(imag) input */
		1456	xbyb = *__SIMD32(ptr1)++;
		1457
		1458	/* Read xc (real), yc(imag) input */
		1459	xcyc = *__SIMD32(ptr1)++;
		1460
		1461	/* Read xd (real), yd(imag) input */
		1462	xdyd = *__SIMD32(ptr1)++;
		1463
		1464	/* R = packed((ya + yc), (xa + xc)) */
		1465	R = __QADD16(xaya, xcyc);
		1466
		1467	/* T = packed((yb + yd), (xb + xd)) */
		1468	T = __QADD16(xbyb, xdyd);
		1469
		1470	/* rewind past the four samples (8 q15 values) just read so the results are written in place */
		1471	ptr1 = ptr1 - 8U;
		1472
		1473
		1474	/* xa' = xa + xb + xc + xd */
		1475	/* ya' = ya + yb + yc + yd */
		1476	*__SIMD32(ptr1)++ = __SHADD16(R, T);
		1477
		1478	/* T = packed((yb + yd), (xb + xd)) -- same expression and inputs as the assignment to T above */
		1479	T = __QADD16(xbyb, xdyd);
		1480
		1481	/* xc' = (xa-xb+xc-xd) */
		1482	/* yc' = (ya-yb+yc-yd) */
		1483	*__SIMD32(ptr1)++ = __SHSUB16(R, T);
		1484
		1485	/* S = packed((ya - yc), (xa - xc)) */
		1486	S = __QSUB16(xaya, xcyc);
		1487
		1488	/* No memory read here: xbyb and xdyd were loaded at the top of the loop */
		1489	/* U = packed( (yb - yd), (xb - xd)) */
		1490	U = __QSUB16(xbyb, xdyd);
		1491
		1492	#ifndef ARM_MATH_BIG_ENDIAN
		1493
		1494	/* xb' = (xa-yb-xc+yd) */
		1495	/* yb' = (ya+xb-yc-xd) */
		1496	*__SIMD32(ptr1)++ = __SHASX(S, U);
		1497
		1498
		1499	/* xd' = (xa+yb-xc-yd) */
		1500	/* yd' = (ya-xb-yc+xd) */
		1501	*__SIMD32(ptr1)++ = __SHSAX(S, U);
		1502
		1503	#else
		1504
		1505	/* xb' = (xa-yb-xc+yd) */
		1506	/* yb' = (ya+xb-yc-xd) */
		1507	*__SIMD32(ptr1)++ = __SHSAX(S, U);
		1508
		1509
		1510	/* xd' = (xa+yb-xc-yd) */
		1511	/* yd' = (ya-xb-yc+xd) */
		1512	*__SIMD32(ptr1)++ = __SHASX(S, U);
		1513
		1514
		1515	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1516
		1517	} while (--j);
		1518
		1519	/* end of last stage process */
		1520
		1521	/* output is in 11.5(q5) format for the 1024 point */
		1522	/* output is in 9.7(q7) format for the 256 point */
		1523	/* output is in 7.9(q9) format for the 64 point */
		1524	/* output is in 5.11(q11) format for the 16 point */
		1525
		1526
		1527	#else
		1528
		1529	/* Variant built when ARM_MATH_DSP is not defined (labelled upstream as the Cortex-M0 path): real and imaginary parts are handled as separate q15 scalars */
		1530
		1531	q15_t R0, R1, S0, S1, T0, T1, U0, U1;
		1532	q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
		1533	uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
		1534
		1535	/* Same three phases as the other variant: scaling first stage, middle stages, twiddle-free last stage */
		1536
		1537	/* process first stage, middle stages, & last stage */
		1538
		1539	/* Initializations for the first stage */
		1540	n2 = fftLen;
		1541	n1 = n2;
		1542
		1543	/* n2 = fftLen/4 */
		1544	n2 >>= 2U;
		1545
		1546	/* Index for twiddle coefficient */
		1547	ic = 0U;
		1548
		1549	/* Index for input read and output write */
		1550	i0 = 0U;
		1551
		1552	j = n2;
		1553
		1554	/* Input is in 1.15(q15) format */
		1555
		1556	/* Start of first stage process */
		1557	do
		1558	{
		1559	/* Butterfly implementation */
		1560
		1561	/* index calculation for the input as, */
		1562	/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
		1563	i1 = i0 + n2;
		1564	i2 = i1 + n2;
		1565	i3 = i2 + n2;
		1566
		1567	/* Reading i0, i0+fftLen/2 inputs */
		1568	/* input is down scale by 4 to avoid overflow */
		1569	/* Read xa (real), ya (imag) input: T0 = xa, T1 = ya */
		1570	T0 = pSrc16[i0 * 2U] >> 2U;
		1571	T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
		1572	/* input is down scale by 4 to avoid overflow */
		1573	/* Read xc (real), yc (imag) input: S0 = xc, S1 = yc */
		1574	S0 = pSrc16[i2 * 2U] >> 2U;
		1575	S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
		1576
		1577	/* R0 = (xa + xc), R1 = (ya + yc) */
		1578	R0 = __SSAT(T0 + S0, 16U);
		1579	R1 = __SSAT(T1 + S1, 16U);
		1580	/* S0 = (xa - xc), S1 = (ya - yc) */
		1581	S0 = __SSAT(T0 - S0, 16U);
		1582	S1 = __SSAT(T1 - S1, 16U);
		1583
		1584	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
		1585	/* input is down scale by 4 to avoid overflow */
		1586	/* Read xb (real), yb (imag) input: T0 = xb, T1 = yb */
		1587	T0 = pSrc16[i1 * 2U] >> 2U;
		1588	T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
		1589	/* Read xd (real), yd (imag) input: U0 = xd, U1 = yd */
		1590	/* input is down scale by 4 to avoid overflow */
		1591	U0 = pSrc16[i3 * 2U] >> 2U;
		1592	U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
		1593
		1594	/* T0 = (xb + xd), T1 = (yb + yd) */
		1595	T0 = __SSAT(T0 + U0, 16U);
		1596	T1 = __SSAT(T1 + U1, 16U);
		1597
		1598	/* writing the butterfly processed i0 sample (each operand halved before the add) */
		1599	/* xa' = xa + xb + xc + xd */
		1600	/* ya' = ya + yb + yc + yd */
		1601	pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
		1602	pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
		1603
		1604	/* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd) */
		1605	R0 = __SSAT(R0 - T0, 16U);
		1606	R1 = __SSAT(R1 - T1, 16U);
		1607	/* co2 & si2 are read from Coefficient pointer */
		1608	Co2 = pCoef16[2U * ic * 2U];
		1609	Si2 = pCoef16[(2U * ic * 2U) + 1U];
		1610	/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
		1611	out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
		1612	/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
		1613	out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);
		1614
		1615	/* Re-reading i0+fftLen/4 before it is overwritten below */
		1616	/* input is down scale by 4 to avoid overflow */
		1617	/* T0 = xb, T1 = yb */
		1618	T0 = pSrc16[i1 * 2U] >> 2U;
		1619	T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
		1620
		1621	/* writing the butterfly processed i0 + fftLen/4 sample */
		1622	/* writing output(xc', yc'): real part at the even index, imaginary part at the odd index */
		1623	pSrc16[i1 * 2U] = out1;
		1624	pSrc16[(i1 * 2U) + 1U] = out2;
		1625
		1626	/* Butterfly calculations */
		1627	/* input is down scale by 4 to avoid overflow */
		1628	/* U0 = xd, U1 = yd */
		1629	U0 = pSrc16[i3 * 2U] >> 2U;
		1630	U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
		1631
		1632	/* T0 = xb-xd, T1 = yb-yd */
		1633	T0 = __SSAT(T0 - U0, 16U);
		1634	T1 = __SSAT(T1 - U1, 16U);
		1635	/* R0 = (xa-xc) + (yb-yd) , R1 = (ya-yc) - (xb-xd) */
		1636	R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
		1637	R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
		1638	/* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb-xd) */
		1639	S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
		1640	S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
		1641
		1642	/* co1 & si1 are read from Coefficient pointer */
		1643	Co1 = pCoef16[ic * 2U];
		1644	Si1 = pCoef16[(ic * 2U) + 1U];
		1645	/* Butterfly process for the i0+fftLen/2 sample */
		1646	/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
		1647	out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
		1648	/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
		1649	out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
		1650	/* writing output(xb', yb') to the i0+fftLen/2 slot */
		1651	pSrc16[i2 * 2U] = out1;
		1652	pSrc16[(i2 * 2U) + 1U] = out2;
		1653
		1654	/* Co3 & si3 are read from Coefficient pointer */
		1655	Co3 = pCoef16[3U * ic * 2U];
		1656	Si3 = pCoef16[(3U * ic * 2U) + 1U];
		1657	/* Butterfly process for the i0+3fftLen/4 sample */
		1658	/* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
		1659	out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
		1660	/* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
		1661	out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
		1662	/* writing output(xd', yd') to the i0+3fftLen/4 slot */
		1663	pSrc16[i3 * 2U] = out1;
		1664	pSrc16[(i3 * 2U) + 1U] = out2;
		1665
		1666	/* Twiddle coefficients index modifier */
		1667	ic = ic + twidCoefModifier;
		1668
		1669	/* Updating input index */
		1670	i0 = i0 + 1U;
		1671
		1672	} while (--j);
		1673
		1674	/* End of first stage process */
		1675
		1676	/* data is in 4.11(q11) format */
		1677
		1678
		1679	/* Start of Middle stage process */
		1680
		1681	/* Twiddle coefficients index modifier: the stride through the table grows by 4 for each later stage */
		1682	twidCoefModifier <<= 2U;
		1683
		1684	/* Calculation of Middle stage */
		1685	for (k = fftLen / 4U; k > 4U; k >>= 2U)
		1686	{
		1687	/* Initializations for the middle stage: n1 is the spacing between butterfly groups, n2 the spacing between the four inputs of one butterfly */
		1688	n1 = n2;
		1689	n2 >>= 2U;
		1690	ic = 0U;
		1691
		1692	for (j = 0U; j <= (n2 - 1U); j++)
		1693	{
		1694	/* the three twiddle pairs are loaded once per j and reused for every group in the inner loop */
		1695	Co1 = pCoef16[ic * 2U];
		1696	Si1 = pCoef16[(ic * 2U) + 1U];
		1697	Co2 = pCoef16[2U * ic * 2U];
		1698	Si2 = pCoef16[2U * ic * 2U + 1U];
		1699	Co3 = pCoef16[3U * ic * 2U];
		1700	Si3 = pCoef16[(3U * ic * 2U) + 1U];
		1701
		1702	/* Twiddle coefficients index modifier */
		1703	ic = ic + twidCoefModifier;
		1704
		1705	/* Butterfly implementation */
		1706	for (i0 = j; i0 < fftLen; i0 += n1)
		1707	{
		1708	/* index calculation for the input as, */
		1709	/* pSrc16[i0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
		1710	i1 = i0 + n2;
		1711	i2 = i1 + n2;
		1712	i3 = i2 + n2;
		1713
		1714	/* Reading i0, i2 inputs */
		1715	/* Read xa (real), ya (imag) input: T0 = xa, T1 = ya */
		1716	T0 = pSrc16[i0 * 2U];
		1717	T1 = pSrc16[(i0 * 2U) + 1U];
		1718
		1719	/* Read xc (real), yc (imag) input: S0 = xc, S1 = yc */
		1720	S0 = pSrc16[i2 * 2U];
		1721	S1 = pSrc16[(i2 * 2U) + 1U];
		1722
		1723
		1724	/* R0 = (xa + xc), R1 = (ya + yc) */
		1725	R0 = __SSAT(T0 + S0, 16U);
		1726	R1 = __SSAT(T1 + S1, 16U);
		1727	/* S0 = (xa - xc), S1 = (ya - yc) */
		1728	S0 = __SSAT(T0 - S0, 16U);
		1729	S1 = __SSAT(T1 - S1, 16U);
		1730
		1731	/* Reading i1, i3 inputs */
		1732	/* Read xb (real), yb (imag) input: T0 = xb, T1 = yb */
		1733	T0 = pSrc16[i1 * 2U];
		1734	T1 = pSrc16[(i1 * 2U) + 1U];
		1735
		1736	/* Read xd (real), yd (imag) input: U0 = xd, U1 = yd */
		1737	U0 = pSrc16[i3 * 2U];
		1738	U1 = pSrc16[(i3 * 2U) + 1U];
		1739
		1740	/* T0 = (xb + xd), T1 = (yb + yd) */
		1741	T0 = __SSAT(T0 + U0, 16U);
		1742	T1 = __SSAT(T1 + U1, 16U);
		1743
		1744	/* writing the butterfly processed i0 sample (operands halved, then the sum halved again) */
		1745	/* xa' = xa + xb + xc + xd */
		1746	/* ya' = ya + yb + yc + yd */
		1747	pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
		1748	pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
		1749
		1750	/* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each operand halved first */
		1751	R0 = (R0 >> 1U) - (T0 >> 1U);
		1752	R1 = (R1 >> 1U) - (T1 >> 1U);
		1753
		1754	/* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
		1755	out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
		1756	/* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
		1757	out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
		1758
		1759	/* Re-reading the i1 sample before it is overwritten below */
		1760	/* Read xb (real), yb (imag) input: T0 = xb, T1 = yb */
		1761	T0 = pSrc16[i1 * 2U];
		1762	T1 = pSrc16[(i1 * 2U) + 1U];
		1763
		1764	/* writing the butterfly processed i1 sample */
		1765	/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
		1766	/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
		1767	pSrc16[i1 * 2U] = out1;
		1768	pSrc16[(i1 * 2U) + 1U] = out2;
		1769
		1770	/* Butterfly calculations */
		1771	/* Read xd (real), yd (imag) input: U0 = xd, U1 = yd */
		1772	U0 = pSrc16[i3 * 2U];
		1773	U1 = pSrc16[(i3 * 2U) + 1U];
		1774
		1775	/* T0 = xb-xd, T1 = yb-yd */
		1776	T0 = __SSAT(T0 - U0, 16U);
		1777	T1 = __SSAT(T1 - U1, 16U);
		1778
		1779	/* R0 = (xa-xc) + (yb-yd) , R1 = (ya-yc) - (xb-xd), each operand halved first */
		1780	R0 = (S0 >> 1U) + (T1 >> 1U);
		1781	R1 = (S1 >> 1U) - (T0 >> 1U);
		1782
		1783	/* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb-xd), each operand halved first */
		1784	S0 = (S0 >> 1U) - (T1 >> 1U);
		1785	S1 = (S1 >> 1U) + (T0 >> 1U);
		1786
		1787	/* Butterfly process for the i2 sample */
		1788	out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
		1789	out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
		1790	/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
		1791	/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
		1792	pSrc16[i2 * 2U] = out1;
		1793	pSrc16[(i2 * 2U) + 1U] = out2;
		1794
		1795	/* Butterfly process for the i3 sample */
		1796	out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
		1797
		1798	out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
		1799	/* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
		1800	/* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
		1801	pSrc16[i3 * 2U] = out1;
		1802	pSrc16[(i3 * 2U) + 1U] = out2;
		1803
		1804
		1805	}
		1806	}
		1807	/* Twiddle coefficients index modifier */
		1808	twidCoefModifier <<= 2U;
		1809	}
		1810	/* End of Middle stages process */
		1811
		1812
		1813	/* data is in 10.6(q6) format for the 1024 point */
		1814	/* data is in 8.8(q8) format for the 256 point */
		1815	/* data is in 6.10(q10) format for the 64 point */
		1816	/* data is in 4.12(q12) format for the 16 point */
		1817
		1818	/* start of last stage process */
		1819
		1820
		1821	/* Initializations for the last stage */
		1822	n1 = n2;
		1823	n2 >>= 2U;
		1824
		1825	/* Butterfly implementation: no twiddle coefficients are read in this stage */
		1826	for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
		1827	{
		1828	/* index calculation for the input as, */
		1829	/* pSrc16[i0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
		1830	i1 = i0 + n2;
		1831	i2 = i1 + n2;
		1832	i3 = i2 + n2;
		1833
		1834	/* Reading i0, i2 inputs */
		1835	/* Read xa (real), ya (imag) input: T0 = xa, T1 = ya */
		1836	T0 = pSrc16[i0 * 2U];
		1837	T1 = pSrc16[(i0 * 2U) + 1U];
		1838	/* Read xc (real), yc (imag) input: S0 = xc, S1 = yc */
		1839	S0 = pSrc16[i2 * 2U];
		1840	S1 = pSrc16[(i2 * 2U) + 1U];
		1841
		1842	/* R0 = (xa + xc), R1 = (ya + yc) */
		1843	R0 = __SSAT(T0 + S0, 16U);
		1844	R1 = __SSAT(T1 + S1, 16U);
		1845	/* S0 = (xa - xc), S1 = (ya - yc) */
		1846	S0 = __SSAT(T0 - S0, 16U);
		1847	S1 = __SSAT(T1 - S1, 16U);
		1848
		1849	/* Reading i1, i3 inputs */
		1850	/* Read xb (real), yb (imag) input: T0 = xb, T1 = yb */
		1851	T0 = pSrc16[i1 * 2U];
		1852	T1 = pSrc16[(i1 * 2U) + 1U];
		1853	/* Read xd (real), yd (imag) input: U0 = xd, U1 = yd */
		1854	U0 = pSrc16[i3 * 2U];
		1855	U1 = pSrc16[(i3 * 2U) + 1U];
		1856
		1857	/* T0 = (xb + xd), T1 = (yb + yd) */
		1858	T0 = __SSAT(T0 + U0, 16U);
		1859	T1 = __SSAT(T1 + U1, 16U);
		1860
		1861	/* writing the butterfly processed i0 sample (each operand halved before the add) */
		1862	/* xa' = xa + xb + xc + xd */
		1863	/* ya' = ya + yb + yc + yd */
		1864	pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
		1865	pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
		1866
		1867	/* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each operand halved first */
		1868	R0 = (R0 >> 1U) - (T0 >> 1U);
		1869	R1 = (R1 >> 1U) - (T1 >> 1U);
		1870
		1871	/* Re-read xb (real), yb (imag) before the i1 slot is overwritten below */
		1872	T0 = pSrc16[i1 * 2U];
		1873	T1 = pSrc16[(i1 * 2U) + 1U];
		1874
		1875	/* writing the butterfly processed i1 sample */
		1876	/* xc' = (xa-xb+xc-xd) */
		1877	/* yc' = (ya-yb+yc-yd) */
		1878	pSrc16[i1 * 2U] = R0;
		1879	pSrc16[(i1 * 2U) + 1U] = R1;
		1880
		1881	/* Read xd (real), yd (imag) input: U0 = xd, U1 = yd */
		1882	U0 = pSrc16[i3 * 2U];
		1883	U1 = pSrc16[(i3 * 2U) + 1U];
		1884	/* T0 = (xb - xd), T1 = (yb - yd) */
		1885	T0 = __SSAT(T0 - U0, 16U);
		1886	T1 = __SSAT(T1 - U1, 16U);
		1887
		1888	/* writing the butterfly processed i2 sample (each operand halved first) */
		1889	/* xb' = (xa-yb-xc+yd) */
		1890	/* yb' = (ya+xb-yc-xd) */
		1891	pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
		1892	pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
		1893
		1894
		1895	/* writing the butterfly processed i3 sample (each operand halved first) */
		1896	/* xd' = (xa+yb-xc-yd) */
		1897	/* yd' = (ya-xb-yc+xd) */
		1898	pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
		1899	pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
		1900	}
		1901	/* end of last stage process */
		1902
		1903	/* output is in 11.5(q5) format for the 1024 point */
		1904	/* output is in 9.7(q7) format for the 256 point */
		1905	/* output is in 7.9(q9) format for the 64 point */
		1906	/* output is in 5.11(q11) format for the 16 point */
		1907
		1908	#endif /* #if defined (ARM_MATH_DSP) */
		1909
		1910	}

Subversion Repositories AFRtranscoder

(root)/trunk/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c – Rev 2