WebSVN – DashDisplay – Blame – /branches/Dashboard_L152/Drivers/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix4_q15.c

Rev	Author	Line No.	Line
2	mjames	1	/* ----------------------------------------------------------------------
		2	* Copyright (C) 2010-2014 ARM Limited. All rights reserved.
		3	*
		4	* $Date: 19. March 2015
		5	* $Revision: V.1.4.5
		6	*
		7	* Project: CMSIS DSP Library
		8	* Title: arm_cfft_radix4_q15.c
		9	*
		10	* Description: This file has function definition of Radix-4 FFT & IFFT function and
		11	* In-place bit reversal using bit reversal table
		12	*
		13	* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
		14	*
		15	* Redistribution and use in source and binary forms, with or without
		16	* modification, are permitted provided that the following conditions
		17	* are met:
		18	* - Redistributions of source code must retain the above copyright
		19	* notice, this list of conditions and the following disclaimer.
		20	* - Redistributions in binary form must reproduce the above copyright
		21	* notice, this list of conditions and the following disclaimer in
		22	* the documentation and/or other materials provided with the
		23	* distribution.
		24	* - Neither the name of ARM LIMITED nor the names of its contributors
		25	* may be used to endorse or promote products derived from this
		26	* software without specific prior written permission.
		27	*
		28	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
		29	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
		30	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
		31	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
		32	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
		33	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
		34	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
		35	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
		36	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
		37	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
		38	* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
		39	* POSSIBILITY OF SUCH DAMAGE.
		40	* -------------------------------------------------------------------- */
		41
		42	#include "arm_math.h"
		43
		44	/* Forward declarations of the helpers that arm_cfft_radix4_q15() calls: forward radix-4 butterfly. */
		45	void arm_radix4_butterfly_q15(
		46	q15_t * pSrc16,
		47	uint32_t fftLen,
		48	q15_t * pCoef16,
		49	uint32_t twidCoefModifier);
		50	/* Inverse-transform butterfly; takes the same parameter list as the forward butterfly. */
		51	void arm_radix4_butterfly_inverse_q15(
		52	q15_t * pSrc16,
		53	uint32_t fftLen,
		54	q15_t * pCoef16,
		55	uint32_t twidCoefModifier);
		56	/* Bit-reversal pass applied to the buffer after the butterflies, driven by a bit reversal table. */
		57	void arm_bitreversal_q15(
		58	q15_t * pSrc,
		59	uint32_t fftLen,
		60	uint16_t bitRevFactor,
		61	uint16_t * pBitRevTab);
		62
		63	/**
		64	* @ingroup groupTransforms
		65	*/
		66
		67	/**
		68	* @addtogroup ComplexFFT
		69	* @{
		70	*/
		71
		72
		73	/**
		74	* @details
		75	* @brief Processing function for the Q15 CFFT/CIFFT.
		76	* @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed.
		77	* @param[in] *S points to an instance of the Q15 CFFT/CIFFT structure. ifftFlag == 1 selects the inverse butterfly (otherwise the forward one runs); bitReverseFlag == 1 enables the bit reversal pass.
		78	* @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place.
		79	* @return none.
		80	*
		81	* \par Input and output formats:
		82	* \par
		83	* Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
		84	* Hence the output format is different for different FFT sizes.
		85	* The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
		86	* \par
		87	* \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
		88	* \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
		89	*/
		90
		91	void arm_cfft_radix4_q15(
		92	const arm_cfft_radix4_instance_q15 * S,
		93	q15_t * pSrc)
		94	{
		95	if(S->ifftFlag == 1u)
		96	{
		97	/* ifftFlag set: run the inverse (CIFFT) radix-4 butterflies on pSrc in place */
		98	arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle,
		99	S->twidCoefModifier);
		100	}
		101	else
		102	{
		103	/* ifftFlag not set: run the forward (CFFT) radix-4 butterflies on pSrc in place */
		104	arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle,
		105	S->twidCoefModifier);
		106	}
		107
		108	if(S->bitReverseFlag == 1u)
		109	{
		110	/* bitReverseFlag set: pass the butterfly output through the bit reversal table of the instance */
		111	arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
		112	}
		113
		114	}
		115
		116	/**
		117	* @} end of ComplexFFT group
		118	*/
		119
		120	/*
		121	* Radix-4 FFT algorithm used is :
		122	*
		123	* Input real and imaginary data:
		124	* x(n) = xa + j * ya
		125	* x(n+N/4 ) = xb + j * yb
		126	* x(n+N/2 ) = xc + j * yc
		127	* x(n+3N/4) = xd + j * yd
		128	*
		129	*
		130	* Output real and imaginary data:
		131	* x(4r) = xa'+ j * ya'
		132	* x(4r+1) = xb'+ j * yb'
		133	* x(4r+2) = xc'+ j * yc'
		134	* x(4r+3) = xd'+ j * yd'
		135	*
		136	*
		137	* Twiddle factors for radix-4 FFT:
		138	* Wn = co1 + j * (- si1)
		139	* W2n = co2 + j * (- si2)
		140	* W3n = co3 + j * (- si3)
		141
		142	* The real and imaginary output values for the radix-4 butterfly are
		143	* xa' = xa + xb + xc + xd
		144	* ya' = ya + yb + yc + yd
		145	* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
		146	* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
		147	* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
		148	* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
		149	* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
		150	* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
		151	*
		152	*/
		153
		154	/**
		155	* @brief Core function for the Q15 CFFT butterfly process: in-place radix-4 butterflies run as a first stage, zero or more middle stages, and a last stage.
		156	* @param[in, out] *pSrc16 points to the in-place buffer of Q15 data type.
		157	* @param[in] fftLen length of the FFT (number of complex samples, each stored as two consecutive q15 values in pSrc16).
		158	* @param[in] *pCoef16 points to twiddle coefficient buffer (read as consecutive co/si pairs).
		159	* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
		160	* @return none.
		161	*/
		162
		163	void arm_radix4_butterfly_q15(
		164	q15_t * pSrc16,
		165	uint32_t fftLen,
		166	q15_t * pCoef16,
		167	uint32_t twidCoefModifier)
		168	{
		169
		170	#ifndef ARM_MATH_CM0_FAMILY
		171
		172	/* Run the below code for Cortex-M4 and Cortex-M3 */
		173
		174	q31_t R, S, T, U;
		175	q31_t C1, C2, C3, out1, out2;
		176	uint32_t n1, n2, ic, i0, j, k;
		177
		178	q15_t *ptr1;
		179	q15_t *pSi0;
		180	q15_t *pSi1;
		181	q15_t *pSi2;
		182	q15_t *pSi3;
		183
		184	q31_t xaya, xbyb, xcyc, xdyd;
		185
		186	/* Total process is divided into three stages */
		187
		188	/* process first stage, middle stages, & last stage */
		189
		190	/* Initializations for the first stage */
		191	n2 = fftLen;
		192	n1 = n2;
		193
		194	/* n2 = fftLen/4 */
		195	n2 >>= 2u;
		196
		197	/* Index for twiddle coefficient */
		198	ic = 0u;
		199
		200	/* Loop counter: the first stage performs fftLen/4 butterflies */
		201	j = n2;
		202	/* Four in-place pointers spaced fftLen/4 complex samples (2 * n2 q15 values) apart */
		203	pSi0 = pSrc16;
		204	pSi1 = pSi0 + 2 * n2;
		205	pSi2 = pSi1 + 2 * n2;
		206	pSi3 = pSi2 + 2 * n2;
		207
		208	/* Input is in 1.15(q15) format */
		209
		210	/* start of first stage process */
		211	do
		212	{
		213	/* Butterfly implementation */
		214
		215	/* Reading i0, i0+fftLen/2 inputs */
		216	/* Read ya (real), xa(imag) input */
		217	T = _SIMD32_OFFSET(pSi0);
		218	T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
		219	T = __SHADD16(T, 0); // it turns out doing this twice is 2 cycles, the alternative takes 3 cycles
		220	//in = ((int16_t) (T & 0xFFFF)) >> 2; // alternative code that takes 3 cycles
		221	//T = ((T >> 2) & 0xFFFF0000) \| (in & 0xFFFF);
		222
		223	/* Read yc (real), xc(imag) input */
		224	S = _SIMD32_OFFSET(pSi2);
		225	S = __SHADD16(S, 0);
		226	S = __SHADD16(S, 0);
		227
		228	/* R = packed((ya + yc), (xa + xc) ) */
		229	R = __QADD16(T, S);
		230
		231	/* S = packed((ya - yc), (xa - xc) ) */
		232	S = __QSUB16(T, S);
		233
		234	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
		235	/* Read yb (real), xb(imag) input */
		236	T = _SIMD32_OFFSET(pSi1);
		237	T = __SHADD16(T, 0);
		238	T = __SHADD16(T, 0);
		239
		240	/* Read yd (real), xd(imag) input */
		241	U = _SIMD32_OFFSET(pSi3);
		242	U = __SHADD16(U, 0);
		243	U = __SHADD16(U, 0);
		244
		245	/* T = packed((yb + yd), (xb + xd) ) */
		246	T = __QADD16(T, U);
		247
		248	/* writing the butterfly processed i0 sample */
		249	/* xa' = xa + xb + xc + xd */
		250	/* ya' = ya + yb + yc + yd */
		251	_SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
		252	pSi0 += 2;
		253
		254	/* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
		255	R = __QSUB16(R, T);
		256
		257	/* co2 & si2 are read from SIMD Coefficient pointer */
		258	C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
		259
		260	#ifndef ARM_MATH_BIG_ENDIAN
		261
		262	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
		263	out1 = __SMUAD(C2, R) >> 16u;
		264	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		265	out2 = __SMUSDX(C2, R);
		266
		267	#else
		268
		269	/* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		270	out1 = __SMUSDX(R, C2) >> 16u;
		271	/* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
		272	out2 = __SMUAD(C2, R);
		273
		274	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		275
		276	/* Reading i0+fftLen/4 again: T was overwritten by the (b + d) sum above */
		277	/* T = packed(yb, xb) */
		278	T = _SIMD32_OFFSET(pSi1);
		279	T = __SHADD16(T, 0);
		280	T = __SHADD16(T, 0);
		281
		282	/* writing the butterfly processed i0 + fftLen/4 sample */
		283	/* writing output(xc', yc') in little endian format */
		284	_SIMD32_OFFSET(pSi1) =
		285	(q31_t) ((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		286	pSi1 += 2;
		287
		288	/* Butterfly calculations */
		289	/* U = packed(yd, xd) */
		290	U = _SIMD32_OFFSET(pSi3);
		291	U = __SHADD16(U, 0);
		292	U = __SHADD16(U, 0);
		293
		294	/* T = packed(yb-yd, xb-xd) */
		295	T = __QSUB16(T, U);
		296
		297	#ifndef ARM_MATH_BIG_ENDIAN
		298
		299	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
		300	R = __QASX(S, T);
		301	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
		302	S = __QSAX(S, T);
		303
		304	#else
		305
		306	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
		307	R = __QSAX(S, T);
		308	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
		309	S = __QASX(S, T);
		310
		311	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		312
		313	/* co1 & si1 are read from SIMD Coefficient pointer */
		314	C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
		315	/* Butterfly process for the i0+fftLen/2 sample */
		316
		317	#ifndef ARM_MATH_BIG_ENDIAN
		318
		319	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
		320	out1 = __SMUAD(C1, S) >> 16u;
		321	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
		322	out2 = __SMUSDX(C1, S);
		323
		324	#else
		325
		326	/* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
		327	out1 = __SMUSDX(S, C1) >> 16u;
		328	/* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
		329	out2 = __SMUAD(C1, S);
		330
		331	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		332
		333	/* writing output(xb', yb') in little endian format */
		334	_SIMD32_OFFSET(pSi2) =
		335	((out2) & 0xFFFF0000) \| ((out1) & 0x0000FFFF);
		336	pSi2 += 2;
		337
		338
		339	/* co3 & si3 are read from SIMD Coefficient pointer */
		340	C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
		341	/* Butterfly process for the i0+3fftLen/4 sample */
		342
		343	#ifndef ARM_MATH_BIG_ENDIAN
		344
		345	/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
		346	out1 = __SMUAD(C3, R) >> 16u;
		347	/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
		348	out2 = __SMUSDX(C3, R);
		349
		350	#else
		351
		352	/* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
		353	out1 = __SMUSDX(R, C3) >> 16u;
		354	/* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
		355	out2 = __SMUAD(C3, R);
		356
		357	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		358
		359	/* writing output(xd', yd') in little endian format */
		360	_SIMD32_OFFSET(pSi3) =
		361	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		362	pSi3 += 2;
		363
		364	/* Twiddle coefficients index modifier */
		365	ic = ic + twidCoefModifier;
		366
		367	} while(--j);
		368	/* data is in 4.11(q11) format */
		369
		370	/* end of first stage process */
		371
		372
		373	/* start of middle stage process */
		374
		375	/* Twiddle coefficients index modifier */
		376	twidCoefModifier <<= 2u;
		377
		378	/* Calculation of Middle stage */
		379	for (k = fftLen / 4u; k > 4u; k >>= 2u)
		380	{
		381	/* Initializations for the middle stage */
		382	n1 = n2;
		383	n2 >>= 2u;
		384	ic = 0u;
		385
		386	for (j = 0u; j <= (n2 - 1u); j++)
		387	{
		388	/* index calculation for the coefficients */
		389	C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
		390	C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
		391	C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
		392
		393	/* Twiddle coefficients index modifier */
		394	ic = ic + twidCoefModifier;
		395
		396	pSi0 = pSrc16 + 2 * j;
		397	pSi1 = pSi0 + 2 * n2;
		398	pSi2 = pSi1 + 2 * n2;
		399	pSi3 = pSi2 + 2 * n2;
		400
		401	/* Butterfly implementation */
		402	for (i0 = j; i0 < fftLen; i0 += n1)
		403	{
		404	/* Reading i0, i0 + 2*n2 inputs */
		405	/* Read ya (real), xa(imag) input */
		406	T = _SIMD32_OFFSET(pSi0);
		407
		408	/* Read yc (real), xc(imag) input */
		409	S = _SIMD32_OFFSET(pSi2);
		410
		411	/* R = packed( (ya + yc), (xa + xc)) */
		412	R = __QADD16(T, S);
		413
		414	/* S = packed((ya - yc), (xa - xc)) */
		415	S = __QSUB16(T, S);
		416
		417	/* Reading i0 + n2, i0 + 3*n2 inputs */
		418	/* Read yb (real), xb(imag) input */
		419	T = _SIMD32_OFFSET(pSi1);
		420
		421	/* Read yd (real), xd(imag) input */
		422	U = _SIMD32_OFFSET(pSi3);
		423
		424	/* T = packed( (yb + yd), (xb + xd)) */
		425	T = __QADD16(T, U);
		426
		427	/* writing the butterfly processed i0 sample */
		428
		429	/* xa' = xa + xb + xc + xd */
		430	/* ya' = ya + yb + yc + yd */
		431	out1 = __SHADD16(R, T);
		432	out1 = __SHADD16(out1, 0);
		433	_SIMD32_OFFSET(pSi0) = out1;
		434	pSi0 += 2 * n1;
		435
		436	/* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
		437	R = __SHSUB16(R, T);
		438
		439	#ifndef ARM_MATH_BIG_ENDIAN
		440
		441	/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
		442	out1 = __SMUAD(C2, R) >> 16u;
		443
		444	/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		445	out2 = __SMUSDX(C2, R);
		446
		447	#else
		448
		449	/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		450	out1 = __SMUSDX(R, C2) >> 16u;
		451
		452	/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
		453	out2 = __SMUAD(C2, R);
		454
		455	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		456
		457	/* Reading i0 + n2 again: T was overwritten by the (b + d) sum above */
		458	/* Read yb (real), xb(imag) input */
		459	T = _SIMD32_OFFSET(pSi1);
		460
		461	/* writing the butterfly processed i0 + n2 sample */
		462	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
		463	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		464	_SIMD32_OFFSET(pSi1) =
		465	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		466	pSi1 += 2 * n1;
		467
		468	/* Butterfly calculations */
		469
		470	/* Read yd (real), xd(imag) input */
		471	U = _SIMD32_OFFSET(pSi3);
		472
		473	/* T = packed(yb-yd, xb-xd) */
		474	T = __QSUB16(T, U);
		475
		476	#ifndef ARM_MATH_BIG_ENDIAN
		477
		478	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
		479	R = __SHASX(S, T);
		480
		481	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
		482	S = __SHSAX(S, T);
		483
		484
		485	/* Butterfly process for the i0 + 2*n2 sample */
		486	out1 = __SMUAD(C1, S) >> 16u;
		487	out2 = __SMUSDX(C1, S);
		488
		489	#else
		490
		491	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
		492	R = __SHSAX(S, T);
		493
		494	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
		495	S = __SHASX(S, T);
		496
		497
		498	/* Butterfly process for the i0 + 2*n2 sample */
		499	out1 = __SMUSDX(S, C1) >> 16u;
		500	out2 = __SMUAD(C1, S);
		501
		502	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		503
		504	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
		505	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
		506	_SIMD32_OFFSET(pSi2) =
		507	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		508	pSi2 += 2 * n1;
		509
		510	/* Butterfly process for the i0 + 3*n2 sample */
		511
		512	#ifndef ARM_MATH_BIG_ENDIAN
		513
		514	out1 = __SMUAD(C3, R) >> 16u;
		515	out2 = __SMUSDX(C3, R);
		516
		517	#else
		518
		519	out1 = __SMUSDX(R, C3) >> 16u;
		520	out2 = __SMUAD(C3, R);
		521
		522	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		523
		524	/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
		525	/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
		526	_SIMD32_OFFSET(pSi3) =
		527	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		528	pSi3 += 2 * n1;
		529	}
		530	}
		531	/* Twiddle coefficients index modifier */
		532	twidCoefModifier <<= 2u;
		533	}
		534	/* end of middle stage process */
		535
		536
		537	/* data is in 10.6(q6) format for the 1024 point */
		538	/* data is in 8.8(q8) format for the 256 point */
		539	/* data is in 6.10(q10) format for the 64 point */
		540	/* data is in 4.12(q12) format for the 16 point */
		541
		542	/* Initializations for the last stage */
		543	j = fftLen >> 2;
		544
		545	ptr1 = &pSrc16[0];
		546
		547	/* start of last stage process */
		548
		549	/* Butterfly implementation */
		550	do
		551	{
		552	/* Read xa (real), ya(imag) input */
		553	xaya = *__SIMD32(ptr1)++;
		554
		555	/* Read xb (real), yb(imag) input */
		556	xbyb = *__SIMD32(ptr1)++;
		557
		558	/* Read xc (real), yc(imag) input */
		559	xcyc = *__SIMD32(ptr1)++;
		560
		561	/* Read xd (real), yd(imag) input */
		562	xdyd = *__SIMD32(ptr1)++;
		563
		564	/* R = packed((ya + yc), (xa + xc)) */
		565	R = __QADD16(xaya, xcyc);
		566
		567	/* T = packed((yb + yd), (xb + xd)) */
		568	T = __QADD16(xbyb, xdyd);
		569
		570	/* rewind past the four complex samples (8 q15 values) just read so the results overwrite them */
		571	ptr1 = ptr1 - 8u;
		572
		573
		574	/* xa' = xa + xb + xc + xd */
		575	/* ya' = ya + yb + yc + yd */
		576	*__SIMD32(ptr1)++ = __SHADD16(R, T);
		577
		578	/* T = packed((yb + yd), (xb + xd)); same operands as the sum computed above */
		579	T = __QADD16(xbyb, xdyd);
		580
		581	/* xc' = (xa-xb+xc-xd) */
		582	/* yc' = (ya-yb+yc-yd) */
		583	*__SIMD32(ptr1)++ = __SHSUB16(R, T);
		584
		585	/* S = packed((ya - yc), (xa - xc)) */
		586	S = __QSUB16(xaya, xcyc);
		587
		588	/* Difference of the b and d samples read above */
		589	/* U = packed( (yb - yd), (xb - xd)) */
		590	U = __QSUB16(xbyb, xdyd);
		591
		592	#ifndef ARM_MATH_BIG_ENDIAN
		593
		594	/* xb' = (xa+yb-xc-yd) */
		595	/* yb' = (ya-xb-yc+xd) */
		596	*__SIMD32(ptr1)++ = __SHSAX(S, U);
		597
		598
		599	/* xd' = (xa-yb-xc+yd) */
		600	/* yd' = (ya+xb-yc-xd) */
		601	*__SIMD32(ptr1)++ = __SHASX(S, U);
		602
		603	#else
		604
		605	/* xb' = (xa+yb-xc-yd) */
		606	/* yb' = (ya-xb-yc+xd) */
		607	*__SIMD32(ptr1)++ = __SHASX(S, U);
		608
		609
		610	/* xd' = (xa-yb-xc+yd) */
		611	/* yd' = (ya+xb-yc-xd) */
		612	*__SIMD32(ptr1)++ = __SHSAX(S, U);
		613
		614	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		615
		616	} while(--j);
		617
		618	/* end of last stage process */
		619
		620	/* output is in 11.5(q5) format for the 1024 point */
		621	/* output is in 9.7(q7) format for the 256 point */
		622	/* output is in 7.9(q9) format for the 64 point */
		623	/* output is in 5.11(q11) format for the 16 point */
		624
		625
		626	#else
		627
		628	/* Run the below code for Cortex-M0 */
		629
		630	q15_t R0, R1, S0, S1, T0, T1, U0, U1;
		631	q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
		632	uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
		633
		634	/* Total process is divided into three stages */
		635
		636	/* process first stage, middle stages, & last stage */
		637
		638	/* Initializations for the first stage */
		639	n2 = fftLen;
		640	n1 = n2;
		641
		642	/* n2 = fftLen/4 */
		643	n2 >>= 2u;
		644
		645	/* Index for twiddle coefficient */
		646	ic = 0u;
		647
		648	/* i0: index for input read and output write; j: loop counter (fftLen/4 butterflies) */
		649	i0 = 0u;
		650	j = n2;
		651
		652	/* Input is in 1.15(q15) format */
		653
		654	/* start of first stage process */
		655	do
		656	{
		657	/* Butterfly implementation */
		658
		659	/* index calculation for the input as, */
		660	/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
		661	i1 = i0 + n2;
		662	i2 = i1 + n2;
		663	i3 = i2 + n2;
		664
		665	/* Reading i0, i0+fftLen/2 inputs */
		666
		667	/* input is down scale by 4 to avoid overflow */
		668	/* Read ya (real), xa(imag) input */
		669	T0 = pSrc16[i0 * 2u] >> 2u;
		670	T1 = pSrc16[(i0 * 2u) + 1u] >> 2u;
		671
		672	/* input is down scale by 4 to avoid overflow */
		673	/* Read yc (real), xc(imag) input */
		674	S0 = pSrc16[i2 * 2u] >> 2u;
		675	S1 = pSrc16[(i2 * 2u) + 1u] >> 2u;
		676
		677	/* R0 = (ya + yc) */
		678	R0 = __SSAT(T0 + S0, 16u);
		679	/* R1 = (xa + xc) */
		680	R1 = __SSAT(T1 + S1, 16u);
		681
		682	/* S0 = (ya - yc) */
		683	S0 = __SSAT(T0 - S0, 16);
		684	/* S1 = (xa - xc) */
		685	S1 = __SSAT(T1 - S1, 16);
		686
		687	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
		688	/* input is down scale by 4 to avoid overflow */
		689	/* Read yb (real), xb(imag) input */
		690	T0 = pSrc16[i1 * 2u] >> 2u;
		691	T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
		692
		693	/* input is down scale by 4 to avoid overflow */
		694	/* Read yd (real), xd(imag) input */
		695	U0 = pSrc16[i3 * 2u] >> 2u;
		696	U1 = pSrc16[(i3 * 2u) + 1] >> 2u;
		697
		698	/* T0 = (yb + yd) */
		699	T0 = __SSAT(T0 + U0, 16u);
		700	/* T1 = (xb + xd) */
		701	T1 = __SSAT(T1 + U1, 16u);
		702
		703	/* writing the butterfly processed i0 sample */
		704	/* ya' = ya + yb + yc + yd */
		705	/* xa' = xa + xb + xc + xd */
		706	pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
		707	pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
		708
		709	/* R0 = (ya + yc) - (yb + yd) */
		710	/* R1 = (xa + xc) - (xb + xd) */
		711	R0 = __SSAT(R0 - T0, 16u);
		712	R1 = __SSAT(R1 - T1, 16u);
		713
		714	/* co2 & si2 are read from Coefficient pointer */
		715	Co2 = pCoef16[2u * ic * 2u];
		716	Si2 = pCoef16[(2u * ic * 2u) + 1];
		717
		718	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
		719	out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u);
		720	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		721	out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u);
		722
		723	/* Reading i0+fftLen/4 again: T0/T1 were overwritten by the (b + d) sums above */
		724	/* input is down scale by 4 to avoid overflow */
		725	/* T0 = yb, T1 = xb */
		726	T0 = pSrc16[i1 * 2u] >> 2;
		727	T1 = pSrc16[(i1 * 2u) + 1] >> 2;
		728
		729	/* writing the butterfly processed i0 + fftLen/4 sample */
		730	/* writing output(xc', yc') in little endian format */
		731	pSrc16[i1 * 2u] = out1;
		732	pSrc16[(i1 * 2u) + 1] = out2;
		733
		734	/* Butterfly calculations */
		735	/* input is down scale by 4 to avoid overflow */
		736	/* U0 = yd, U1 = xd */
		737	U0 = pSrc16[i3 * 2u] >> 2;
		738	U1 = pSrc16[(i3 * 2u) + 1] >> 2;
		739	/* T0 = yb-yd */
		740	T0 = __SSAT(T0 - U0, 16);
		741	/* T1 = xb-xd */
		742	T1 = __SSAT(T1 - U1, 16);
		743
		744	/* R1 = (ya-yc) + (xb- xd), R0 = (xa-xc) - (yb-yd)) */
		745	R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
		746	R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
		747
		748	/* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
		749	S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16u);
		750	S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16u);
		751
		752	/* co1 & si1 are read from Coefficient pointer */
		753	Co1 = pCoef16[ic * 2u];
		754	Si1 = pCoef16[(ic * 2u) + 1];
		755	/* Butterfly process for the i0+fftLen/2 sample */
		756	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
		757	out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
		758	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
		759	out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
		760
		761	/* writing output(xb', yb') in little endian format */
		762	pSrc16[i2 * 2u] = out1;
		763	pSrc16[(i2 * 2u) + 1] = out2;
		764
		765	/* Co3 & si3 are read from Coefficient pointer */
		766	Co3 = pCoef16[3u * (ic * 2u)];
		767	Si3 = pCoef16[(3u * (ic * 2u)) + 1];
		768	/* Butterfly process for the i0+3fftLen/4 sample */
		769	/* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
		770	out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u);
		771	/* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
		772	out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u);
		773	/* writing output(xd', yd') in little endian format */
		774	pSrc16[i3 * 2u] = out1;
		775	pSrc16[(i3 * 2u) + 1] = out2;
		776
		777	/* Twiddle coefficients index modifier */
		778	ic = ic + twidCoefModifier;
		779
		780	/* Updating input index */
		781	i0 = i0 + 1u;
		782
		783	} while(--j);
		784	/* data is in 4.11(q11) format */
		785
		786	/* end of first stage process */
		787
		788
		789	/* start of middle stage process */
		790
		791	/* Twiddle coefficients index modifier */
		792	twidCoefModifier <<= 2u;
		793
		794	/* Calculation of Middle stage */
		795	for (k = fftLen / 4u; k > 4u; k >>= 2u)
		796	{
		797	/* Initializations for the middle stage */
		798	n1 = n2;
		799	n2 >>= 2u;
		800	ic = 0u;
		801
		802	for (j = 0u; j <= (n2 - 1u); j++)
		803	{
		804	/* index calculation for the coefficients */
		805	Co1 = pCoef16[ic * 2u];
		806	Si1 = pCoef16[(ic * 2u) + 1u];
		807	Co2 = pCoef16[2u * (ic * 2u)];
		808	Si2 = pCoef16[(2u * (ic * 2u)) + 1u];
		809	Co3 = pCoef16[3u * (ic * 2u)];
		810	Si3 = pCoef16[(3u * (ic * 2u)) + 1u];
		811
		812	/* Twiddle coefficients index modifier */
		813	ic = ic + twidCoefModifier;
		814
		815	/* Butterfly implementation */
		816	for (i0 = j; i0 < fftLen; i0 += n1)
		817	{
		818	/* index calculation for the input as, */
		819	/* pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
		820	i1 = i0 + n2;
		821	i2 = i1 + n2;
		822	i3 = i2 + n2;
		823
		824	/* Reading i0, i0 + 2*n2 inputs */
		825	/* Read ya (real), xa(imag) input */
		826	T0 = pSrc16[i0 * 2u];
		827	T1 = pSrc16[(i0 * 2u) + 1u];
		828
		829	/* Read yc (real), xc(imag) input */
		830	S0 = pSrc16[i2 * 2u];
		831	S1 = pSrc16[(i2 * 2u) + 1u];
		832
		833	/* R0 = (ya + yc), R1 = (xa + xc) */
		834	R0 = __SSAT(T0 + S0, 16);
		835	R1 = __SSAT(T1 + S1, 16);
		836
		837	/* S0 = (ya - yc), S1 =(xa - xc) */
		838	S0 = __SSAT(T0 - S0, 16);
		839	S1 = __SSAT(T1 - S1, 16);
		840
		841	/* Reading i0 + n2, i0 + 3*n2 inputs */
		842	/* Read yb (real), xb(imag) input */
		843	T0 = pSrc16[i1 * 2u];
		844	T1 = pSrc16[(i1 * 2u) + 1u];
		845
		846	/* Read yd (real), xd(imag) input */
		847	U0 = pSrc16[i3 * 2u];
		848	U1 = pSrc16[(i3 * 2u) + 1u];
		849
		850
		851	/* T0 = (yb + yd), T1 = (xb + xd) */
		852	T0 = __SSAT(T0 + U0, 16);
		853	T1 = __SSAT(T1 + U1, 16);
		854
		855	/* writing the butterfly processed i0 sample */
		856
		857	/* xa' = xa + xb + xc + xd */
		858	/* ya' = ya + yb + yc + yd */
		859	out1 = ((R0 >> 1u) + (T0 >> 1u)) >> 1u;
		860	out2 = ((R1 >> 1u) + (T1 >> 1u)) >> 1u;
		861
		862	pSrc16[i0 * 2u] = out1;
		863	pSrc16[(2u * i0) + 1u] = out2;
		864
		865	/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
		866	R0 = (R0 >> 1u) - (T0 >> 1u);
		867	R1 = (R1 >> 1u) - (T1 >> 1u);
		868
		869	/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
		870	out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u);
		871
		872	/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		873	out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u);
		874
		875	/* Reading i0 + n2 again: T0/T1 were overwritten by the (b + d) sums above */
		876	/* Read yb (real), xb(imag) input */
		877	T0 = pSrc16[i1 * 2u];
		878	T1 = pSrc16[(i1 * 2u) + 1u];
		879
		880	/* writing the butterfly processed i0 + n2 sample */
		881	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
		882	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		883	pSrc16[i1 * 2u] = out1;
		884	pSrc16[(i1 * 2u) + 1u] = out2;
		885
		886	/* Butterfly calculations */
		887
		888	/* Read yd (real), xd(imag) input */
		889	U0 = pSrc16[i3 * 2u];
		890	U1 = pSrc16[(i3 * 2u) + 1u];
		891
		892	/* T0 = yb-yd, T1 = xb-xd */
		893	T0 = __SSAT(T0 - U0, 16);
		894	T1 = __SSAT(T1 - U1, 16);
		895
		896	/* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
		897	R0 = (S0 >> 1u) - (T1 >> 1u);
		898	R1 = (S1 >> 1u) + (T0 >> 1u);
		899
		900	/* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
		901	S0 = (S0 >> 1u) + (T1 >> 1u);
		902	S1 = (S1 >> 1u) - (T0 >> 1u);
		903
		904	/* Butterfly process for the i0 + 2*n2 sample */
		905	out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16u);
		906
		907	out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16u);
		908
		909	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
		910	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
		911	pSrc16[i2 * 2u] = out1;
		912	pSrc16[(i2 * 2u) + 1u] = out2;
		913
		914	/* Butterfly process for the i0 + 3*n2 sample */
		915	out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u);
		916
		917	out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u);
		918	/* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
		919	/* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
		920	pSrc16[i3 * 2u] = out1;
		921	pSrc16[(i3 * 2u) + 1u] = out2;
		922	}
		923	}
		924	/* Twiddle coefficients index modifier */
		925	twidCoefModifier <<= 2u;
		926	}
		927	/* end of middle stage process */
		928
		929
		930	/* data is in 10.6(q6) format for the 1024 point */
		931	/* data is in 8.8(q8) format for the 256 point */
		932	/* data is in 6.10(q10) format for the 64 point */
		933	/* data is in 4.12(q12) format for the 16 point */
		934
		935	/* Initializations for the last stage */
		936	n1 = n2;
		937	n2 >>= 2u;
		938
		939	/* start of last stage process */
		940
		941	/* Butterfly implementation */
		942	for (i0 = 0u; i0 <= (fftLen - n1); i0 += n1)
		943	{
		944	/* index calculation for the input as, */
		945	/* pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
		946	i1 = i0 + n2;
		947	i2 = i1 + n2;
		948	i3 = i2 + n2;
		949
		950	/* Reading i0, i0 + 2*n2 inputs */
		951	/* Read ya (real), xa(imag) input */
		952	T0 = pSrc16[i0 * 2u];
		953	T1 = pSrc16[(i0 * 2u) + 1u];
		954
		955	/* Read yc (real), xc(imag) input */
		956	S0 = pSrc16[i2 * 2u];
		957	S1 = pSrc16[(i2 * 2u) + 1u];
		958
		959	/* R0 = (ya + yc), R1 = (xa + xc) */
		960	R0 = __SSAT(T0 + S0, 16u);
		961	R1 = __SSAT(T1 + S1, 16u);
		962
		963	/* S0 = (ya - yc), S1 = (xa - xc) */
		964	S0 = __SSAT(T0 - S0, 16u);
		965	S1 = __SSAT(T1 - S1, 16u);
		966
		967	/* Reading i0 + n2, i0 + 3*n2 inputs */
		968	/* Read yb (real), xb(imag) input */
		969	T0 = pSrc16[i1 * 2u];
		970	T1 = pSrc16[(i1 * 2u) + 1u];
		971	/* Read yd (real), xd(imag) input */
		972	U0 = pSrc16[i3 * 2u];
		973	U1 = pSrc16[(i3 * 2u) + 1u];
		974
		975	/* T0 = (yb + yd), T1 = (xb + xd)) */
		976	T0 = __SSAT(T0 + U0, 16u);
		977	T1 = __SSAT(T1 + U1, 16u);
		978
		979	/* writing the butterfly processed i0 sample */
		980	/* xa' = xa + xb + xc + xd */
		981	/* ya' = ya + yb + yc + yd */
		982	pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
		983	pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
		984
		985	/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
		986	R0 = (R0 >> 1u) - (T0 >> 1u);
		987	R1 = (R1 >> 1u) - (T1 >> 1u);
		988	/* Read yb (real), xb(imag) input again: T0/T1 were overwritten by the (b + d) sums above */
		989	T0 = pSrc16[i1 * 2u];
		990	T1 = pSrc16[(i1 * 2u) + 1u];
		991
		992	/* writing the butterfly processed i0 + n2 sample */
		993	/* xc' = (xa-xb+xc-xd) */
		994	/* yc' = (ya-yb+yc-yd) */
		995	pSrc16[i1 * 2u] = R0;
		996	pSrc16[(i1 * 2u) + 1u] = R1;
		997
		998	/* Read yd (real), xd(imag) input */
		999	U0 = pSrc16[i3 * 2u];
		1000	U1 = pSrc16[(i3 * 2u) + 1u];
		1001	/* T0 = (yb - yd), T1 = (xb - xd) */
		1002	T0 = __SSAT(T0 - U0, 16u);
		1003	T1 = __SSAT(T1 - U1, 16u);
		1004
		1005	/* writing the butterfly processed i0 + 2*n2 sample */
		1006	/* xb' = (xa+yb-xc-yd) */
		1007	/* yb' = (ya-xb-yc+xd) */
		1008	pSrc16[i2 * 2u] = (S0 >> 1u) + (T1 >> 1u);
		1009	pSrc16[(i2 * 2u) + 1u] = (S1 >> 1u) - (T0 >> 1u);
		1010
		1011	/* writing the butterfly processed i0 + 3*n2 sample */
		1012	/* xd' = (xa-yb-xc+yd) */
		1013	/* yd' = (ya+xb-yc-xd) */
		1014	pSrc16[i3 * 2u] = (S0 >> 1u) - (T1 >> 1u);
		1015	pSrc16[(i3 * 2u) + 1u] = (S1 >> 1u) + (T0 >> 1u);
		1016
		1017	}
		1018
		1019	/* end of last stage process */
		1020
		1021	/* output is in 11.5(q5) format for the 1024 point */
		1022	/* output is in 9.7(q7) format for the 256 point */
		1023	/* output is in 7.9(q9) format for the 64 point */
		1024	/* output is in 5.11(q11) format for the 16 point */
		1025
		1026	#endif /* #ifndef ARM_MATH_CM0_FAMILY */
		1027
		1028	}
		1029
		1030
		1031	/**
		1032	* @brief Core function for the Q15 CIFFT butterfly process.
		1033	* @param[in, out] *pSrc16 points to the in-place buffer of Q15 data type.
		1034	* @param[in] fftLen length of the FFT.
		1035	* @param[in] *pCoef16 points to twiddle coefficient buffer.
		1036	* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
		1037	* @return none.
		1038	*/
		1039
		1040	/*
		1041	* Radix-4 IFFT algorithm used is :
		1042	*
		1043	* CIFFT uses same twiddle coefficients as CFFT function
		1044	* x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]
		1045	*
		1046	*
		1047	* IFFT is implemented with following changes in equations from FFT
		1048	*
		1049	* Input real and imaginary data:
		1050	* x(n) = xa + j * ya
		1051	* x(n+N/4 ) = xb + j * yb
		1052	* x(n+N/2 ) = xc + j * yc
		1053	* x(n+3N/4) = xd + j * yd
		1054	*
		1055	*
		1056	* Output real and imaginary data:
		1057	* x(4r) = xa'+ j * ya'
		1058	* x(4r+1) = xb'+ j * yb'
		1059	* x(4r+2) = xc'+ j * yc'
		1060	* x(4r+3) = xd'+ j * yd'
		1061	*
		1062	*
		1063	* Twiddle factors for radix-4 IFFT:
		1064	* Wn = co1 + j * (si1)
		1065	* W2n = co2 + j * (si2)
		1066	* W3n = co3 + j * (si3)
		1067
		1068	* The real and imaginary output values for the radix-4 butterfly are
		1069	* xa' = xa + xb + xc + xd
		1070	* ya' = ya + yb + yc + yd
		1071	* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
		1072	* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
		1073	* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
		1074	* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
		1075	* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
		1076	* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
		1077	*
		1078	*/
		1079
		1080	void arm_radix4_butterfly_inverse_q15(
		1081	q15_t * pSrc16,
		1082	uint32_t fftLen,
		1083	q15_t * pCoef16,
		1084	uint32_t twidCoefModifier)
		1085	{
		1086	/* In-place radix-4 inverse butterfly in three parts: a first stage that scales every input down by 4, zero or more middle stages, and a last stage that applies no twiddle multiplication. */
		1087	#ifndef ARM_MATH_CM0_FAMILY
		1088
		1089	/* Run the below code for Cortex-M4 and Cortex-M3 */
		1090
		1091	q31_t R, S, T, U;
		1092	q31_t C1, C2, C3, out1, out2;
		1093	uint32_t n1, n2, ic, i0, j, k;
		1094
		1095	q15_t *ptr1;
		1096	q15_t *pSi0;
		1097	q15_t *pSi1;
		1098	q15_t *pSi2;
		1099	q15_t *pSi3;
		1100
		1101	q31_t xaya, xbyb, xcyc, xdyd;
		1102
		1103	/* Total process is divided into three stages */
		1104
		1105	/* process first stage, middle stages, & last stage */
		1106
		1107	/* Initializations for the first stage */
		1108	n2 = fftLen;
		1109	n1 = n2;
		1110
		1111	/* n2 = fftLen/4 */
		1112	n2 >>= 2u;
		1113
		1114	/* Index for twiddle coefficient */
		1115	ic = 0u;
		1116
		1117	/* Number of first-stage butterflies (loop down-counter) */
		1118	j = n2;
		1119
		1120	/* pSi0..pSi3 address the four butterfly inputs, spaced n2 complex samples (2 * n2 q15 values) apart */
		1121	pSi0 = pSrc16;
		1122	pSi1 = pSi0 + 2 * n2;
		1123	pSi2 = pSi1 + 2 * n2;
		1124	pSi3 = pSi2 + 2 * n2;
		1125
		1126	/* Input is in 1.15(q15) format */
		1127
		1128	/* start of first stage process */
		1129	do
		1130	{
		1131	/* Butterfly implementation */
		1132	/* Every first-stage input is scaled down by 4 through two halving adds with 0 (the Cortex-M0 path below does the same with >> 2u) */
		1133	/* Reading i0, i0+fftLen/2 inputs */
		1134	/* Read ya (real), xa(imag) input */
		1135	T = _SIMD32_OFFSET(pSi0);
		1136	T = __SHADD16(T, 0);
		1137	T = __SHADD16(T, 0);
		1138
		1139	/* Read yc (real), xc(imag) input */
		1140	S = _SIMD32_OFFSET(pSi2);
		1141	S = __SHADD16(S, 0);
		1142	S = __SHADD16(S, 0);
		1143
		1144	/* R = packed((ya + yc), (xa + xc) ) */
		1145	R = __QADD16(T, S);
		1146
		1147	/* S = packed((ya - yc), (xa - xc) ) */
		1148	S = __QSUB16(T, S);
		1149
		1150	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
		1151	/* Read yb (real), xb(imag) input */
		1152	T = _SIMD32_OFFSET(pSi1);
		1153	T = __SHADD16(T, 0);
		1154	T = __SHADD16(T, 0);
		1155
		1156	/* Read yd (real), xd(imag) input */
		1157	U = _SIMD32_OFFSET(pSi3);
		1158	U = __SHADD16(U, 0);
		1159	U = __SHADD16(U, 0);
		1160
		1161	/* T = packed((yb + yd), (xb + xd) ) */
		1162	T = __QADD16(T, U);
		1163
		1164	/* writing the butterfly processed i0 sample */
		1165	/* xa' = xa + xb + xc + xd */
		1166	/* ya' = ya + yb + yc + yd */
		1167	_SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
		1168	pSi0 += 2;
		1169
		1170	/* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
		1171	R = __QSUB16(R, T);
		1172
		1173	/* co2 & si2 are read from SIMD Coefficient pointer */
		1174	C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
		1175
		1176	#ifndef ARM_MATH_BIG_ENDIAN
		1177
		1178	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
		1179	out1 = __SMUSD(C2, R) >> 16u;
		1180	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		1181	out2 = __SMUADX(C2, R);
		1182
		1183	#else
		1184
		1185	/* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		1186	out1 = __SMUADX(C2, R) >> 16u;
		1187	/* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
		1188	out2 = __SMUSD(__QSUB16(0, C2), R);
		1189
		1190	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1191
		1192	/* Reading i0+fftLen/4 again: T was overwritten by the sum above and pSi1 is about to be overwritten */
		1193	/* T = packed(yb, xb) */
		1194	T = _SIMD32_OFFSET(pSi1);
		1195	T = __SHADD16(T, 0);
		1196	T = __SHADD16(T, 0);
		1197
		1198	/* writing the butterfly processed i0 + fftLen/4 sample */
		1199	/* writing output(xc', yc'): upper halfword taken from out2, lower halfword from out1 */
		1200	_SIMD32_OFFSET(pSi1) =
		1201	(q31_t) ((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		1202	pSi1 += 2;
		1203
		1204	/* Butterfly calculations */
		1205	/* U = packed(yd, xd) */
		1206	U = _SIMD32_OFFSET(pSi3);
		1207	U = __SHADD16(U, 0);
		1208	U = __SHADD16(U, 0);
		1209
		1210	/* T = packed(yb-yd, xb-xd) */
		1211	T = __QSUB16(T, U);
		1212
		1213	#ifndef ARM_MATH_BIG_ENDIAN
		1214
		1215	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
		1216	R = __QSAX(S, T);
		1217	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
		1218	S = __QASX(S, T);
		1219
		1220	#else
		1221
		1222	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
		1223	R = __QASX(S, T);
		1224	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
		1225	S = __QSAX(S, T);
		1226
		1227	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1228
		1229	/* co1 & si1 are read from SIMD Coefficient pointer */
		1230	C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
		1231	/* Butterfly process for the i0+fftLen/2 sample */
		1232
		1233	#ifndef ARM_MATH_BIG_ENDIAN
		1234
		1235	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
		1236	out1 = __SMUSD(C1, S) >> 16u;
		1237	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
		1238	out2 = __SMUADX(C1, S);
		1239
		1240	#else
		1241
		1242	/* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
		1243	out1 = __SMUADX(C1, S) >> 16u;
		1244	/* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
		1245	out2 = __SMUSD(__QSUB16(0, C1), S);
		1246
		1247	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1248
		1249	/* writing output(xb', yb') in little endian format */
		1250	_SIMD32_OFFSET(pSi2) =
		1251	((out2) & 0xFFFF0000) \| ((out1) & 0x0000FFFF);
		1252	pSi2 += 2;
		1253
		1254
		1255	/* co3 & si3 are read from SIMD Coefficient pointer */
		1256	C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
		1257	/* Butterfly process for the i0+3fftLen/4 sample */
		1258
		1259	#ifndef ARM_MATH_BIG_ENDIAN
		1260
		1261	/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
		1262	out1 = __SMUSD(C3, R) >> 16u;
		1263	/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
		1264	out2 = __SMUADX(C3, R);
		1265
		1266	#else
		1267
		1268	/* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
		1269	out1 = __SMUADX(C3, R) >> 16u;
		1270	/* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
		1271	out2 = __SMUSD(__QSUB16(0, C3), R);
		1272
		1273	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1274
		1275	/* writing output(xd', yd') in little endian format */
		1276	_SIMD32_OFFSET(pSi3) =
		1277	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		1278	pSi3 += 2;
		1279
		1280	/* Twiddle coefficients index modifier */
		1281	ic = ic + twidCoefModifier;
		1282
		1283	} while(--j);
		1284	/* data is in 4.11(q11) format */
		1285
		1286	/* end of first stage process */
		1287
		1288
		1289	/* start of middle stage process */
		1290
		1291	/* Twiddle coefficients index modifier */
		1292	twidCoefModifier <<= 2u;
		1293
		1294	/* Calculation of Middle stage: k is only a stage counter (not used in the body); no middle stage runs when fftLen / 4 is not greater than 4 */
		1295	for (k = fftLen / 4u; k > 4u; k >>= 2u)
		1296	{
		1297	/* Initializations for the middle stage */
		1298	n1 = n2;
		1299	n2 >>= 2u;
		1300	ic = 0u;
		1301
		1302	for (j = 0u; j <= (n2 - 1u); j++)
		1303	{
		1304	/* index calculation for the coefficients */
		1305	C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
		1306	C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
		1307	C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
		1308
		1309	/* Twiddle coefficients index modifier */
		1310	ic = ic + twidCoefModifier;
		1311
		1312	pSi0 = pSrc16 + 2 * j;
		1313	pSi1 = pSi0 + 2 * n2;
		1314	pSi2 = pSi1 + 2 * n2;
		1315	pSi3 = pSi2 + 2 * n2;
		1316
		1317	/* Butterfly implementation: i0 only bounds the loop; the data is addressed through pSi0..pSi3, each advanced by 2 * n1 per iteration */
		1318	for (i0 = j; i0 < fftLen; i0 += n1)
		1319	{
		1320	/* Reading i0, i0+fftLen/2 inputs */
		1321	/* Read ya (real), xa(imag) input */
		1322	T = _SIMD32_OFFSET(pSi0);
		1323
		1324	/* Read yc (real), xc(imag) input */
		1325	S = _SIMD32_OFFSET(pSi2);
		1326
		1327	/* R = packed( (ya + yc), (xa + xc)) */
		1328	R = __QADD16(T, S);
		1329
		1330	/* S = packed((ya - yc), (xa - xc)) */
		1331	S = __QSUB16(T, S);
		1332
		1333	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
		1334	/* Read yb (real), xb(imag) input */
		1335	T = _SIMD32_OFFSET(pSi1);
		1336
		1337	/* Read yd (real), xd(imag) input */
		1338	U = _SIMD32_OFFSET(pSi3);
		1339
		1340	/* T = packed( (yb + yd), (xb + xd)) */
		1341	T = __QADD16(T, U);
		1342
		1343	/* writing the butterfly processed i0 sample */
		1344
		1345	/* xa' = xa + xb + xc + xd */
		1346	/* ya' = ya + yb + yc + yd */
		1347	out1 = __SHADD16(R, T);
		1348	out1 = __SHADD16(out1, 0);
		1349	_SIMD32_OFFSET(pSi0) = out1;
		1350	pSi0 += 2 * n1;
		1351
		1352	/* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
		1353	R = __SHSUB16(R, T);
		1354
		1355	#ifndef ARM_MATH_BIG_ENDIAN
		1356
		1357	/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
		1358	out1 = __SMUSD(C2, R) >> 16u;
		1359
		1360	/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		1361	out2 = __SMUADX(C2, R);
		1362
		1363	#else
		1364
		1365	/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		1366	out1 = __SMUADX(R, C2) >> 16u;
		1367
		1368	/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
		1369	out2 = __SMUSD(__QSUB16(0, C2), R);
		1370
		1371	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1372	/* Reading i0+fftLen/4 (pSi1) again before it is overwritten below */
		1373	/* Read yb (real), xb(imag) input */
		1374	T = _SIMD32_OFFSET(pSi1);
		1375
		1376	/* writing the butterfly processed i0 + fftLen/4 sample */
		1377	/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
		1378	/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
		1379	_SIMD32_OFFSET(pSi1) =
		1380	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		1381	pSi1 += 2 * n1;
		1382
		1383	/* Butterfly calculations */
		1384
		1385	/* Read yd (real), xd(imag) input */
		1386	U = _SIMD32_OFFSET(pSi3);
		1387
		1388	/* T = packed(yb-yd, xb-xd) */
		1389	T = __QSUB16(T, U);
		1390
		1391	#ifndef ARM_MATH_BIG_ENDIAN
		1392
		1393	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
		1394	R = __SHSAX(S, T);
		1395
		1396	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
		1397	S = __SHASX(S, T);
		1398
		1399
		1400	/* Butterfly process for the i0+fftLen/2 sample */
		1401	out1 = __SMUSD(C1, S) >> 16u;
		1402	out2 = __SMUADX(C1, S);
		1403
		1404	#else
		1405
		1406	/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
		1407	R = __SHASX(S, T);
		1408
		1409	/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
		1410	S = __SHSAX(S, T);
		1411
		1412
		1413	/* Butterfly process for the i0+fftLen/2 sample */
		1414	out1 = __SMUADX(S, C1) >> 16u;
		1415	out2 = __SMUSD(__QSUB16(0, C1), S);
		1416
		1417	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1418
		1419	/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
		1420	/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
		1421	_SIMD32_OFFSET(pSi2) =
		1422	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		1423	pSi2 += 2 * n1;
		1424
		1425	/* Butterfly process for the i0+3fftLen/4 sample */
		1426
		1427	#ifndef ARM_MATH_BIG_ENDIAN
		1428
		1429	out1 = __SMUSD(C3, R) >> 16u;
		1430	out2 = __SMUADX(C3, R);
		1431
		1432	#else
		1433
		1434	out1 = __SMUADX(C3, R) >> 16u;
		1435	out2 = __SMUSD(__QSUB16(0, C3), R);
		1436
		1437	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1438
		1439	/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
		1440	/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
		1441	_SIMD32_OFFSET(pSi3) =
		1442	((out2) & 0xFFFF0000) \| (out1 & 0x0000FFFF);
		1443	pSi3 += 2 * n1;
		1444	}
		1445	}
		1446	/* Twiddle coefficients index modifier */
		1447	twidCoefModifier <<= 2u;
		1448	}
		1449	/* end of middle stage process */
		1450
		1451	/* data is in 10.6(q6) format for the 1024 point */
		1452	/* data is in 8.8(q8) format for the 256 point */
		1453	/* data is in 6.10(q10) format for the 64 point */
		1454	/* data is in 4.12(q12) format for the 16 point */
		1455
		1456	/* Initializations for the last stage: one iteration per group of four consecutive complex samples */
		1457	j = fftLen >> 2;
		1458
		1459	ptr1 = &pSrc16[0];
		1460
		1461	/* start of last stage process */
		1462
		1463	/* Butterfly implementation */
		1464	do
		1465	{
		1466	/* Read xa (real), ya(imag) input */
		1467	xaya = *__SIMD32(ptr1)++;
		1468
		1469	/* Read xb (real), yb(imag) input */
		1470	xbyb = *__SIMD32(ptr1)++;
		1471
		1472	/* Read xc (real), yc(imag) input */
		1473	xcyc = *__SIMD32(ptr1)++;
		1474
		1475	/* Read xd (real), yd(imag) input */
		1476	xdyd = *__SIMD32(ptr1)++;
		1477
		1478	/* R = packed((ya + yc), (xa + xc)) */
		1479	R = __QADD16(xaya, xcyc);
		1480
		1481	/* T = packed((yb + yd), (xb + xd)) */
		1482	T = __QADD16(xbyb, xdyd);
		1483
		1484	/* rewind ptr1 by 8 q15 elements, back to the first of the four samples just read, so the results are written in place */
		1485	ptr1 = ptr1 - 8u;
		1486
		1487
		1488	/* xa' = xa + xb + xc + xd */
		1489	/* ya' = ya + yb + yc + yd */
		1490	*__SIMD32(ptr1)++ = __SHADD16(R, T);
		1491
		1492	/* T = packed((yb + yd), (xb + xd)) -- recomputed from the same unchanged operands as above */
		1493	T = __QADD16(xbyb, xdyd);
		1494
		1495	/* xc' = (xa-xb+xc-xd) */
		1496	/* yc' = (ya-yb+yc-yd) */
		1497	*__SIMD32(ptr1)++ = __SHSUB16(R, T);
		1498
		1499	/* S = packed((ya - yc), (xa - xc)) */
		1500	S = __QSUB16(xaya, xcyc);
		1501
		1502	/* No new load here: xbyb and xdyd were read at the top of the loop */
		1503	/* U = packed( (yb - yd), (xb - xd)) */
		1504	U = __QSUB16(xbyb, xdyd);
		1505
		1506	#ifndef ARM_MATH_BIG_ENDIAN
		1507
		1508	/* xb' = (xa+yb-xc-yd) */
		1509	/* yb' = (ya-xb-yc+xd) */
		1510	*__SIMD32(ptr1)++ = __SHASX(S, U);
		1511
		1512
		1513	/* xd' = (xa-yb-xc+yd) */
		1514	/* yd' = (ya+xb-yc-xd) */
		1515	*__SIMD32(ptr1)++ = __SHSAX(S, U);
		1516
		1517	#else
		1518
		1519	/* xb' = (xa+yb-xc-yd) */
		1520	/* yb' = (ya-xb-yc+xd) */
		1521	*__SIMD32(ptr1)++ = __SHSAX(S, U);
		1522
		1523
		1524	/* xd' = (xa-yb-xc+yd) */
		1525	/* yd' = (ya+xb-yc-xd) */
		1526	*__SIMD32(ptr1)++ = __SHASX(S, U);
		1527
		1528
		1529	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		1530
		1531	} while(--j);
		1532
		1533	/* end of last stage process */
		1534
		1535	/* output is in 11.5(q5) format for the 1024 point */
		1536	/* output is in 9.7(q7) format for the 256 point */
		1537	/* output is in 7.9(q9) format for the 64 point */
		1538	/* output is in 5.11(q11) format for the 16 point */
		1539
		1540
		1541	#else
		1542
		1543	/* Run the below code for Cortex-M0 */
		1544
		1545	q15_t R0, R1, S0, S1, T0, T1, U0, U1;
		1546	q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
		1547	uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
		1548
		1549	/* Total process is divided into three stages */
		1550
		1551	/* process first stage, middle stages, & last stage */
		1552
		1553	/* Initializations for the first stage */
		1554	n2 = fftLen;
		1555	n1 = n2;
		1556
		1557	/* n2 = fftLen/4 */
		1558	n2 >>= 2u;
		1559
		1560	/* Index for twiddle coefficient */
		1561	ic = 0u;
		1562
		1563	/* Index for input read and output write */
		1564	i0 = 0u;
		1565
		1566	j = n2;
		1567
		1568	/* Input is in 1.15(q15) format */
		1569
		1570	/* Start of first stage process */
		1571	do
		1572	{
		1573	/* Butterfly implementation */
		1574
		1575	/* index calculation for the input as, */
		1576	/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
		1577	i1 = i0 + n2;
		1578	i2 = i1 + n2;
		1579	i3 = i2 + n2;
		1580
		1581	/* Reading i0, i0+fftLen/2 inputs */
		1582	/* input is down scale by 4 to avoid overflow */
		1583	/* Read ya (real), xa(imag) input */
		1584	T0 = pSrc16[i0 * 2u] >> 2u;
		1585	T1 = pSrc16[(i0 * 2u) + 1u] >> 2u;
		1586	/* input is down scale by 4 to avoid overflow */
		1587	/* Read yc (real), xc(imag) input */
		1588	S0 = pSrc16[i2 * 2u] >> 2u;
		1589	S1 = pSrc16[(i2 * 2u) + 1u] >> 2u;
		1590
		1591	/* R0 = (ya + yc), R1 = (xa + xc) */
		1592	R0 = __SSAT(T0 + S0, 16u);
		1593	R1 = __SSAT(T1 + S1, 16u);
		1594	/* S0 = (ya - yc), S1 = (xa - xc) */
		1595	S0 = __SSAT(T0 - S0, 16u);
		1596	S1 = __SSAT(T1 - S1, 16u);
		1597
		1598	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
		1599	/* input is down scale by 4 to avoid overflow */
		1600	/* Read yb (real), xb(imag) input */
		1601	T0 = pSrc16[i1 * 2u] >> 2u;
		1602	T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
		1603	/* Read yd (real), xd(imag) input */
		1604	/* input is down scale by 4 to avoid overflow */
		1605	U0 = pSrc16[i3 * 2u] >> 2u;
		1606	U1 = pSrc16[(i3 * 2u) + 1u] >> 2u;
		1607
		1608	/* T0 = (yb + yd), T1 = (xb + xd) */
		1609	T0 = __SSAT(T0 + U0, 16u);
		1610	T1 = __SSAT(T1 + U1, 16u);
		1611
		1612	/* writing the butterfly processed i0 sample */
		1613	/* xa' = xa + xb + xc + xd */
		1614	/* ya' = ya + yb + yc + yd */
		1615	pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
		1616	pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
		1617
		1618	/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc)- (xb + xd) */
		1619	R0 = __SSAT(R0 - T0, 16u);
		1620	R1 = __SSAT(R1 - T1, 16u);
		1621	/* co2 & si2 are read from Coefficient pointer */
		1622	Co2 = pCoef16[2u * ic * 2u];
		1623	Si2 = pCoef16[(2u * ic * 2u) + 1u];
		1624	/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
		1625	out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16u);
		1626	/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
		1627	out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16u);
		1628
		1629	/* Reading i0+fftLen/4 again: T0/T1 were overwritten by the sums above and this sample is about to be overwritten */
		1630	/* input is down scale by 4 to avoid overflow */
		1631	/* T0 = yb, T1 = xb */
		1632	T0 = pSrc16[i1 * 2u] >> 2u;
		1633	T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
		1634
		1635	/* writing the butterfly processed i0 + fftLen/4 sample */
		1636	/* writing output(xc', yc') in little endian format */
		1637	pSrc16[i1 * 2u] = out1;
		1638	pSrc16[(i1 * 2u) + 1u] = out2;
		1639
		1640	/* Butterfly calculations */
		1641	/* input is down scale by 4 to avoid overflow */
		1642	/* U0 = yd, U1 = xd) */
		1643	U0 = pSrc16[i3 * 2u] >> 2u;
		1644	U1 = pSrc16[(i3 * 2u) + 1u] >> 2u;
		1645
		1646	/* T0 = yb-yd, T1 = xb-xd) */
		1647	T0 = __SSAT(T0 - U0, 16u);
		1648	T1 = __SSAT(T1 - U1, 16u);
		1649	/* R0 = (ya-yc) + (xb- xd) , R1 = (xa-xc) - (yb-yd) */
		1650	R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
		1651	R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
		1652	/* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd) */
		1653	S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
		1654	S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
		1655
		1656	/* co1 & si1 are read from Coefficient pointer */
		1657	Co1 = pCoef16[ic * 2u];
		1658	Si1 = pCoef16[(ic * 2u) + 1u];
		1659	/* Butterfly process for the i0+fftLen/2 sample */
		1660	/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
		1661	out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u);
		1662	/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
		1663	out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u);
		1664	/* writing output(xb', yb') in little endian format */
		1665	pSrc16[i2 * 2u] = out1;
		1666	pSrc16[(i2 * 2u) + 1u] = out2;
		1667
		1668	/* Co3 & si3 are read from Coefficient pointer */
		1669	Co3 = pCoef16[3u * ic * 2u];
		1670	Si3 = pCoef16[(3u * ic * 2u) + 1u];
		1671	/* Butterfly process for the i0+3fftLen/4 sample */
		1672	/* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
		1673	out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u);
		1674	/* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
		1675	out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u);
		1676	/* writing output(xd', yd') in little endian format */
		1677	pSrc16[i3 * 2u] = out1;
		1678	pSrc16[(i3 * 2u) + 1u] = out2;
		1679
		1680	/* Twiddle coefficients index modifier */
		1681	ic = ic + twidCoefModifier;
		1682
		1683	/* Updating input index */
		1684	i0 = i0 + 1u;
		1685
		1686	} while(--j);
		1687
		1688	/* End of first stage process */
		1689
		1690	/* data is in 4.11(q11) format */
		1691
		1692
		1693	/* Start of Middle stage process */
		1694
		1695	/* Twiddle coefficients index modifier */
		1696	twidCoefModifier <<= 2u;
		1697
		1698	/* Calculation of Middle stage: k is only a stage counter (not used in the body); no middle stage runs when fftLen / 4 is not greater than 4 */
		1699	for (k = fftLen / 4u; k > 4u; k >>= 2u)
		1700	{
		1701	/* Initializations for the middle stage */
		1702	n1 = n2;
		1703	n2 >>= 2u;
		1704	ic = 0u;
		1705
		1706	for (j = 0u; j <= (n2 - 1u); j++)
		1707	{
		1708	/* index calculation for the coefficients */
		1709	Co1 = pCoef16[ic * 2u];
		1710	Si1 = pCoef16[(ic * 2u) + 1u];
		1711	Co2 = pCoef16[2u * ic * 2u];
		1712	Si2 = pCoef16[2u * ic * 2u + 1u];
		1713	Co3 = pCoef16[3u * ic * 2u];
		1714	Si3 = pCoef16[(3u * ic * 2u) + 1u];
		1715
		1716	/* Twiddle coefficients index modifier */
		1717	ic = ic + twidCoefModifier;
		1718
		1719	/* Butterfly implementation */
		1720	for (i0 = j; i0 < fftLen; i0 += n1)
		1721	{
		1722	/* index calculation for the input as, */
		1723	/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
		1724	i1 = i0 + n2;
		1725	i2 = i1 + n2;
		1726	i3 = i2 + n2;
		1727
		1728	/* Reading i0, i0+fftLen/2 inputs */
		1729	/* Read ya (real), xa(imag) input */
		1730	T0 = pSrc16[i0 * 2u];
		1731	T1 = pSrc16[(i0 * 2u) + 1u];
		1732
		1733	/* Read yc (real), xc(imag) input */
		1734	S0 = pSrc16[i2 * 2u];
		1735	S1 = pSrc16[(i2 * 2u) + 1u];
		1736
		1737
		1738	/* R0 = (ya + yc), R1 = (xa + xc) */
		1739	R0 = __SSAT(T0 + S0, 16u);
		1740	R1 = __SSAT(T1 + S1, 16u);
		1741	/* S0 = (ya - yc), S1 = (xa - xc) */
		1742	S0 = __SSAT(T0 - S0, 16u);
		1743	S1 = __SSAT(T1 - S1, 16u);
		1744
		1745	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
		1746	/* Read yb (real), xb(imag) input */
		1747	T0 = pSrc16[i1 * 2u];
		1748	T1 = pSrc16[(i1 * 2u) + 1u];
		1749
		1750	/* Read yd (real), xd(imag) input */
		1751	U0 = pSrc16[i3 * 2u];
		1752	U1 = pSrc16[(i3 * 2u) + 1u];
		1753
		1754	/* T0 = (yb + yd), T1 = (xb + xd) */
		1755	T0 = __SSAT(T0 + U0, 16u);
		1756	T1 = __SSAT(T1 + U1, 16u);
		1757
		1758	/* writing the butterfly processed i0 sample */
		1759	/* xa' = xa + xb + xc + xd */
		1760	/* ya' = ya + yb + yc + yd */
		1761	pSrc16[i0 * 2u] = ((R0 >> 1u) + (T0 >> 1u)) >> 1u;
		1762	pSrc16[(i0 * 2u) + 1u] = ((R1 >> 1u) + (T1 >> 1u)) >> 1u;
		1763
		1764	/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
		1765	R0 = (R0 >> 1u) - (T0 >> 1u);
		1766	R1 = (R1 >> 1u) - (T1 >> 1u);
		1767
		1768	/* (ya-yb+yc-yd)* (si2) - (xa-xb+xc-xd)* co2 */
		1769	out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
		1770	/* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
		1771	out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
		1772
		1773	/* Reading i0+fftLen/4 (index i1) again before it is overwritten below */
		1774	/* Read yb (real), xb(imag) input */
		1775	T0 = pSrc16[i1 * 2u];
		1776	T1 = pSrc16[(i1 * 2u) + 1u];
		1777
		1778	/* writing the butterfly processed i0 + fftLen/4 sample */
		1779	/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
		1780	/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
		1781	pSrc16[i1 * 2u] = out1;
		1782	pSrc16[(i1 * 2u) + 1u] = out2;
		1783
		1784	/* Butterfly calculations */
		1785	/* Read yd (real), xd(imag) input */
		1786	U0 = pSrc16[i3 * 2u];
		1787	U1 = pSrc16[(i3 * 2u) + 1u];
		1788
		1789	/* T0 = yb-yd, T1 = xb-xd) */
		1790	T0 = __SSAT(T0 - U0, 16u);
		1791	T1 = __SSAT(T1 - U1, 16u);
		1792
		1793	/* R0 = (ya-yc) + (xb- xd) , R1 = (xa-xc) - (yb-yd), each operand halved before combining */
		1794	R0 = (S0 >> 1u) + (T1 >> 1u);
		1795	R1 = (S1 >> 1u) - (T0 >> 1u);
		1796
		1797	/* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd), each operand halved before combining */
		1798	S0 = (S0 >> 1u) - (T1 >> 1u);
		1799	S1 = (S1 >> 1u) + (T0 >> 1u);
		1800
		1801	/* Butterfly process for the i0+fftLen/2 sample */
		1802	out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u);
		1803	out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u);
		1804	/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
		1805	/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
		1806	pSrc16[i2 * 2u] = out1;
		1807	pSrc16[(i2 * 2u) + 1u] = out2;
		1808
		1809	/* Butterfly process for the i0+3fftLen/4 sample */
		1810	out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u);
		1811
		1812	out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u);
		1813	/* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
		1814	/* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
		1815	pSrc16[i3 * 2u] = out1;
		1816	pSrc16[(i3 * 2u) + 1u] = out2;
		1817
		1818
		1819	}
		1820	}
		1821	/* Twiddle coefficients index modifier */
		1822	twidCoefModifier <<= 2u;
		1823	}
		1824	/* End of Middle stages process */
		1825
		1826
		1827	/* data is in 10.6(q6) format for the 1024 point */
		1828	/* data is in 8.8(q8) format for the 256 point */
		1829	/* data is in 6.10(q10) format for the 64 point */
		1830	/* data is in 4.12(q12) format for the 16 point */
		1831
		1832	/* start of last stage process */
		1833
		1834
		1835	/* Initializations for the last stage */
		1836	n1 = n2;
		1837	n2 >>= 2u;
		1838
		1839	/* Butterfly implementation: no twiddle multiplication is applied in this stage */
		1840	for (i0 = 0u; i0 <= (fftLen - n1); i0 += n1)
		1841	{
		1842	/* index calculation for the input as, */
		1843	/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
		1844	i1 = i0 + n2;
		1845	i2 = i1 + n2;
		1846	i3 = i2 + n2;
		1847
		1848	/* Reading i0, i0+fftLen/2 inputs */
		1849	/* Read ya (real), xa(imag) input */
		1850	T0 = pSrc16[i0 * 2u];
		1851	T1 = pSrc16[(i0 * 2u) + 1u];
		1852	/* Read yc (real), xc(imag) input */
		1853	S0 = pSrc16[i2 * 2u];
		1854	S1 = pSrc16[(i2 * 2u) + 1u];
		1855
		1856	/* R0 = (ya + yc), R1 = (xa + xc) */
		1857	R0 = __SSAT(T0 + S0, 16u);
		1858	R1 = __SSAT(T1 + S1, 16u);
		1859	/* S0 = (ya - yc), S1 = (xa - xc) */
		1860	S0 = __SSAT(T0 - S0, 16u);
		1861	S1 = __SSAT(T1 - S1, 16u);
		1862
		1863	/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
		1864	/* Read yb (real), xb(imag) input */
		1865	T0 = pSrc16[i1 * 2u];
		1866	T1 = pSrc16[(i1 * 2u) + 1u];
		1867	/* Read yd (real), xd(imag) input */
		1868	U0 = pSrc16[i3 * 2u];
		1869	U1 = pSrc16[(i3 * 2u) + 1u];
		1870
		1871	/* T0 = (yb + yd), T1 = (xb + xd) */
		1872	T0 = __SSAT(T0 + U0, 16u);
		1873	T1 = __SSAT(T1 + U1, 16u);
		1874
		1875	/* writing the butterfly processed i0 sample */
		1876	/* xa' = xa + xb + xc + xd */
		1877	/* ya' = ya + yb + yc + yd */
		1878	pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
		1879	pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
		1880
		1881	/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
		1882	R0 = (R0 >> 1u) - (T0 >> 1u);
		1883	R1 = (R1 >> 1u) - (T1 >> 1u);
		1884
		1885	/* Read yb (real), xb(imag) input */
		1886	T0 = pSrc16[i1 * 2u];
		1887	T1 = pSrc16[(i1 * 2u) + 1u];
		1888
		1889	/* writing the butterfly processed i0 + fftLen/4 sample */
		1890	/* xc' = (xa-xb+xc-xd) */
		1891	/* yc' = (ya-yb+yc-yd) */
		1892	pSrc16[i1 * 2u] = R0;
		1893	pSrc16[(i1 * 2u) + 1u] = R1;
		1894
		1895	/* Read yd (real), xd(imag) input */
		1896	U0 = pSrc16[i3 * 2u];
		1897	U1 = pSrc16[(i3 * 2u) + 1u];
		1898	/* T0 = (yb - yd), T1 = (xb - xd) */
		1899	T0 = __SSAT(T0 - U0, 16u);
		1900	T1 = __SSAT(T1 - U1, 16u);
		1901
		1902	/* writing the butterfly processed i0 + fftLen/2 sample */
		1903	/* xb' = (xa-yb-xc+yd) */
		1904	/* yb' = (ya+xb-yc-xd) */
		1905	pSrc16[i2 * 2u] = (S0 >> 1u) - (T1 >> 1u);
		1906	pSrc16[(i2 * 2u) + 1u] = (S1 >> 1u) + (T0 >> 1u);
		1907
		1908
		1909	/* writing the butterfly processed i0 + 3fftLen/4 sample */
		1910	/* xd' = (xa+yb-xc-yd) */
		1911	/* yd' = (ya-xb-yc+xd) */
		1912	pSrc16[i3 * 2u] = (S0 >> 1u) + (T1 >> 1u);
		1913	pSrc16[(i3 * 2u) + 1u] = (S1 >> 1u) - (T0 >> 1u);
		1914	}
		1915	/* end of last stage process */
		1916
		1917	/* output is in 11.5(q5) format for the 1024 point */
		1918	/* output is in 9.7(q7) format for the 256 point */
		1919	/* output is in 7.9(q9) format for the 64 point */
		1920	/* output is in 5.11(q11) format for the 16 point */
		1921
		1922	#endif /* #ifndef ARM_MATH_CM0_FAMILY */
		1923
		1924	}

Subversion Repositories DashDisplay

(root)/branches/Dashboard_L152/Drivers/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix4_q15.c – Rev 28