WebSVN – AFRtranscoder – Blame – /trunk/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c

Rev	Author	Line No.	Line
2	mjames	1	/* ----------------------------------------------------------------------
		2	* Project: CMSIS DSP Library
		3	* Title: arm_cfft_radix4_q31.c
		4	* Description: This file has function definition of Radix-4 FFT & IFFT function and
		5	* In-place bit reversal using bit reversal table
		6	*
		7	* $Date: 27. January 2017
		8	* $Revision: V.1.5.1
		9	*
		10	* Target Processor: Cortex-M cores
		11	* -------------------------------------------------------------------- */
		12	/*
		13	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
		14	*
		15	* SPDX-License-Identifier: Apache-2.0
		16	*
		17	* Licensed under the Apache License, Version 2.0 (the License); you may
		18	* not use this file except in compliance with the License.
		19	* You may obtain a copy of the License at
		20	*
		21	* www.apache.org/licenses/LICENSE-2.0
		22	*
		23	* Unless required by applicable law or agreed to in writing, software
		24	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		25	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		26	* See the License for the specific language governing permissions and
		27	* limitations under the License.
		28	*/
		29
		30	#include "arm_math.h"
		31
		32	void arm_radix4_butterfly_inverse_q31(
		33	q31_t * pSrc,
		34	uint32_t fftLen,
		35	q31_t * pCoef,
		36	uint32_t twidCoefModifier);
		37
		38	void arm_radix4_butterfly_q31(
		39	q31_t * pSrc,
		40	uint32_t fftLen,
		41	q31_t * pCoef,
		42	uint32_t twidCoefModifier);
		43
		44	void arm_bitreversal_q31(
		45	q31_t * pSrc,
		46	uint32_t fftLen,
		47	uint16_t bitRevFactor,
		48	uint16_t * pBitRevTab);
		49
		50	/**
		51	* @ingroup groupTransforms
		52	*/
		53
		54	/**
		55	* @addtogroup ComplexFFT
		56	* @{
		57	*/
		58
		59	/**
		60	* @details
		61	* @brief Processing function for the Q31 CFFT/CIFFT.
		62	* @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
		63	* @param[in] *S points to an instance of the Q31 CFFT/CIFFT structure.
		64	* @param[in, out] *pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place.
		65	* @return none.
		66	*
		67	* \par Input and output formats:
		68	* \par
		69	* Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
		70	* Hence the output format is different for different FFT sizes.
		71	* The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
		72	* \par
		73	* \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"
		74	* \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"
		75	*
		76	*/
		77
		78	void arm_cfft_radix4_q31(
		79	const arm_cfft_radix4_instance_q31 * S,
		80	q31_t * pSrc)
		81	{
		82	if (S->ifftFlag == 1U)
		83	{
		84	/* Complex IFFT radix-4 */
		85	arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
		86	}
		87	else
		88	{
		89	/* Complex FFT radix-4 */
		90	arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
		91	}
		92
		93	if (S->bitReverseFlag == 1U)
		94	{
		95	/* Bit Reversal */
		96	arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
		97	}
		98
		99	}
		100
		101	/**
		102	* @} end of ComplexFFT group
		103	*/
		104
		105	/*
		106	* Radix-4 FFT algorithm used is :
		107	*
		108	* Input real and imaginary data:
		109	* x(n) = xa + j * ya
		110	* x(n+N/4 ) = xb + j * yb
		111	* x(n+N/2 ) = xc + j * yc
		112	* x(n+3N/4) = xd + j * yd
		113	*
		114	*
		115	* Output real and imaginary data:
		116	* x(4r) = xa'+ j * ya'
		117	* x(4r+1) = xb'+ j * yb'
		118	* x(4r+2) = xc'+ j * yc'
		119	* x(4r+3) = xd'+ j * yd'
		120	*
		121	*
		122	* Twiddle factors for radix-4 FFT:
		123	* Wn = co1 + j * (- si1)
		124	* W2n = co2 + j * (- si2)
		125	* W3n = co3 + j * (- si3)
		126	*
		127	* Butterfly implementation:
		128	* xa' = xa + xb + xc + xd
		129	* ya' = ya + yb + yc + yd
		130	* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
		131	* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
		132	* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
		133	* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
		134	* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
		135	* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
		136	*
		137	*/
		138
		139	/**
		140	* @brief Core function for the Q31 CFFT butterfly process.
		141	* @param[in, out] *pSrc points to the in-place buffer of Q31 data type.
		142	* @param[in] fftLen length of the FFT.
		143	* @param[in] *pCoef points to twiddle coefficient buffer.
		144	* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
		145	* @return none.
		146	*/
		147
		148	void arm_radix4_butterfly_q31(
		149	q31_t * pSrc,
		150	uint32_t fftLen,
		151	q31_t * pCoef,
		152	uint32_t twidCoefModifier)
		153	{
		154	#if defined(ARM_MATH_CM7)
		155	uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
		156	q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
		157
		158	q31_t xa, xb, xc, xd;
		159	q31_t ya, yb, yc, yd;
		160	q31_t xa_out, xb_out, xc_out, xd_out;
		161	q31_t ya_out, yb_out, yc_out, yd_out;
		162
		163	q31_t *ptr1;
		164	q63_t xaya, xbyb, xcyc, xdyd;
		165	/* Total process is divided into three stages */
		166
		167	/* process first stage, middle stages, & last stage */
		168
		169
		170	/* start of first stage process */
		171
		172	/* Initializations for the first stage */
		173	n2 = fftLen;
		174	n1 = n2;
		175	/* n2 = fftLen/4 */
		176	n2 >>= 2U;
		177	i0 = 0U;
		178	ia1 = 0U;
		179
		180	j = n2;
		181
		182	/* Calculation of first stage */
		183	do
		184	{
		185	/* index calculation for the input as, */
		186	/* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
		187	i1 = i0 + n2;
		188	i2 = i1 + n2;
		189	i3 = i2 + n2;
		190
		191	/* input is in 1.31(q31) format and provide 4 guard bits for the input */
		192
		193	/* Butterfly implementation */
		194	/* xa + xc */
		195	r1 = (pSrc[(2U * i0)] >> 4U) + (pSrc[(2U * i2)] >> 4U);
		196	/* xa - xc */
		197	r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
		198
		199	/* xb + xd */
		200	t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
		201
		202	/* ya + yc */
		203	s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
		204	/* ya - yc */
		205	s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
		206
		207	/* xa' = xa + xb + xc + xd */
		208	pSrc[2U * i0] = (r1 + t1);
		209	/* (xa + xc) - (xb + xd) */
		210	r1 = r1 - t1;
		211	/* yb + yd */
		212	t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
		213
		214	/* ya' = ya + yb + yc + yd */
		215	pSrc[(2U * i0) + 1U] = (s1 + t2);
		216
		217	/* (ya + yc) - (yb + yd) */
		218	s1 = s1 - t2;
		219
		220	/* yb - yd */
		221	t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
		222	/* xb - xd */
		223	t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
		224
		225	/* index calculation for the coefficients */
		226	ia2 = 2U * ia1;
		227	co2 = pCoef[ia2 * 2U];
		228	si2 = pCoef[(ia2 * 2U) + 1U];
		229
		230	/* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
		231	pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
		232	((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
		233
		234	/* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
		235	pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
		236	((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
		237
		238	/* (xa - xc) + (yb - yd) */
		239	r1 = r2 + t1;
		240	/* (xa - xc) - (yb - yd) */
		241	r2 = r2 - t1;
		242
		243	/* (ya - yc) - (xb - xd) */
		244	s1 = s2 - t2;
		245	/* (ya - yc) + (xb - xd) */
		246	s2 = s2 + t2;
		247
		248	co1 = pCoef[ia1 * 2U];
		249	si1 = pCoef[(ia1 * 2U) + 1U];
		250
		251	/* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
		252	pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
		253	((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
		254
		255	/* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
		256	pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
		257	((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
		258
		259	/* index calculation for the coefficients */
		260	ia3 = 3U * ia1;
		261	co3 = pCoef[ia3 * 2U];
		262	si3 = pCoef[(ia3 * 2U) + 1U];
		263
		264	/* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
		265	pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
		266	((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
		267
		268	/* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
		269	pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
		270	((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
		271
		272	/* Twiddle coefficients index modifier */
		273	ia1 = ia1 + twidCoefModifier;
		274
		275	/* Updating input index */
		276	i0 = i0 + 1U;
		277
		278	} while (--j);
		279
		280	/* end of first stage process */
		281
		282	/* data is in 5.27(q27) format */
		283
		284
		285	/* start of Middle stages process */
		286
		287
		288	/* each stage in middle stages provides two down scaling of the input */
		289
		290	twidCoefModifier <<= 2U;
		291
		292
		293	for (k = fftLen / 4U; k > 4U; k >>= 2U)
		294	{
		295	/* Initializations for the first stage */
		296	n1 = n2;
		297	n2 >>= 2U;
		298	ia1 = 0U;
		299
		300	/* Calculation of first stage */
		301	for (j = 0U; j <= (n2 - 1U); j++)
		302	{
		303	/* index calculation for the coefficients */
		304	ia2 = ia1 + ia1;
		305	ia3 = ia2 + ia1;
		306	co1 = pCoef[ia1 * 2U];
		307	si1 = pCoef[(ia1 * 2U) + 1U];
		308	co2 = pCoef[ia2 * 2U];
		309	si2 = pCoef[(ia2 * 2U) + 1U];
		310	co3 = pCoef[ia3 * 2U];
		311	si3 = pCoef[(ia3 * 2U) + 1U];
		312	/* Twiddle coefficients index modifier */
		313	ia1 = ia1 + twidCoefModifier;
		314
		315	for (i0 = j; i0 < fftLen; i0 += n1)
		316	{
		317	/* index calculation for the input as, */
		318	/* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
		319	i1 = i0 + n2;
		320	i2 = i1 + n2;
		321	i3 = i2 + n2;
		322
		323	/* Butterfly implementation */
		324	/* xa + xc */
		325	r1 = pSrc[2U * i0] + pSrc[2U * i2];
		326	/* xa - xc */
		327	r2 = pSrc[2U * i0] - pSrc[2U * i2];
		328
		329	/* ya + yc */
		330	s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
		331	/* ya - yc */
		332	s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
		333
		334	/* xb + xd */
		335	t1 = pSrc[2U * i1] + pSrc[2U * i3];
		336
		337	/* xa' = xa + xb + xc + xd */
		338	pSrc[2U * i0] = (r1 + t1) >> 2U;
		339	/* xa + xc -(xb + xd) */
		340	r1 = r1 - t1;
		341
		342	/* yb + yd */
		343	t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
		344	/* ya' = ya + yb + yc + yd */
		345	pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
		346
		347	/* (ya + yc) - (yb + yd) */
		348	s1 = s1 - t2;
		349
		350	/* (yb - yd) */
		351	t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
		352	/* (xb - xd) */
		353	t2 = pSrc[2U * i1] - pSrc[2U * i3];
		354
		355	/* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
		356	pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
		357	((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
		358
		359	/* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
		360	pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
		361	((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
		362
		363	/* (xa - xc) + (yb - yd) */
		364	r1 = r2 + t1;
		365	/* (xa - xc) - (yb - yd) */
		366	r2 = r2 - t1;
		367
		368	/* (ya - yc) - (xb - xd) */
		369	s1 = s2 - t2;
		370	/* (ya - yc) + (xb - xd) */
		371	s2 = s2 + t2;
		372
		373	/* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
		374	pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
		375	((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
		376
		377	/* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
		378	pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
		379	((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
		380
		381	/* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
		382	pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
		383	((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
		384
		385	/* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
		386	pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
		387	((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
		388	}
		389	}
		390	twidCoefModifier <<= 2U;
		391	}
		392	#else
		393	uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
		394	q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
		395
		396	q31_t xa, xb, xc, xd;
		397	q31_t ya, yb, yc, yd;
		398	q31_t xa_out, xb_out, xc_out, xd_out;
		399	q31_t ya_out, yb_out, yc_out, yd_out;
		400
		401	q31_t *ptr1;
		402	q31_t *pSi0;
		403	q31_t *pSi1;
		404	q31_t *pSi2;
		405	q31_t *pSi3;
		406	q63_t xaya, xbyb, xcyc, xdyd;
		407	/* Total process is divided into three stages */
		408
		409	/* process first stage, middle stages, & last stage */
		410
		411
		412	/* start of first stage process */
		413
		414	/* Initializations for the first stage */
		415	n2 = fftLen;
		416	n1 = n2;
		417	/* n2 = fftLen/4 */
		418	n2 >>= 2U;
		419
		420	ia1 = 0U;
		421
		422	j = n2;
		423
		424	pSi0 = pSrc;
		425	pSi1 = pSi0 + 2 * n2;
		426	pSi2 = pSi1 + 2 * n2;
		427	pSi3 = pSi2 + 2 * n2;
		428
		429	/* Calculation of first stage */
		430	do
		431	{
		432	/* input is in 1.31(q31) format and provide 4 guard bits for the input */
		433
		434	/* Butterfly implementation */
		435	/* xa + xc */
		436	r1 = (pSi0[0] >> 4U) + (pSi2[0] >> 4U);
		437	/* xa - xc */
		438	r2 = (pSi0[0] >> 4U) - (pSi2[0] >> 4U);
		439
		440	/* xb + xd */
		441	t1 = (pSi1[0] >> 4U) + (pSi3[0] >> 4U);
		442
		443	/* ya + yc */
		444	s1 = (pSi0[1] >> 4U) + (pSi2[1] >> 4U);
		445	/* ya - yc */
		446	s2 = (pSi0[1] >> 4U) - (pSi2[1] >> 4U);
		447
		448	/* xa' = xa + xb + xc + xd */
		449	*pSi0++ = (r1 + t1);
		450	/* (xa + xc) - (xb + xd) */
		451	r1 = r1 - t1;
		452	/* yb + yd */
		453	t2 = (pSi1[1] >> 4U) + (pSi3[1] >> 4U);
		454
		455	/* ya' = ya + yb + yc + yd */
		456	*pSi0++ = (s1 + t2);
		457
		458	/* (ya + yc) - (yb + yd) */
		459	s1 = s1 - t2;
		460
		461	/* yb - yd */
		462	t1 = (pSi1[1] >> 4U) - (pSi3[1] >> 4U);
		463	/* xb - xd */
		464	t2 = (pSi1[0] >> 4U) - (pSi3[0] >> 4U);
		465
		466	/* index calculation for the coefficients */
		467	ia2 = 2U * ia1;
		468	co2 = pCoef[ia2 * 2U];
		469	si2 = pCoef[(ia2 * 2U) + 1U];
		470
		471	/* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
		472	pSi1++ = (((int32_t) (((q63_t) r1 co2) >> 32)) +
		473	((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
		474
		475	/* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
		476	pSi1++ = (((int32_t) (((q63_t) s1 co2) >> 32)) -
		477	((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
		478
		479	/* (xa - xc) + (yb - yd) */
		480	r1 = r2 + t1;
		481	/* (xa - xc) - (yb - yd) */
		482	r2 = r2 - t1;
		483
		484	/* (ya - yc) - (xb - xd) */
		485	s1 = s2 - t2;
		486	/* (ya - yc) + (xb - xd) */
		487	s2 = s2 + t2;
		488
		489	co1 = pCoef[ia1 * 2U];
		490	si1 = pCoef[(ia1 * 2U) + 1U];
		491
		492	/* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
		493	pSi2++ = (((int32_t) (((q63_t) r1 co1) >> 32)) +
		494	((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
		495
		496	/* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
		497	pSi2++ = (((int32_t) (((q63_t) s1 co1) >> 32)) -
		498	((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
		499
		500	/* index calculation for the coefficients */
		501	ia3 = 3U * ia1;
		502	co3 = pCoef[ia3 * 2U];
		503	si3 = pCoef[(ia3 * 2U) + 1U];
		504
		505	/* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
		506	pSi3++ = (((int32_t) (((q63_t) r2 co3) >> 32)) +
		507	((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
		508
		509	/* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
		510	pSi3++ = (((int32_t) (((q63_t) s2 co3) >> 32)) -
		511	((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
		512
		513	/* Twiddle coefficients index modifier */
		514	ia1 = ia1 + twidCoefModifier;
		515
		516	} while (--j);
		517
		518	/* end of first stage process */
		519
		520	/* data is in 5.27(q27) format */
		521
		522
		523	/* start of Middle stages process */
		524
		525
		526	/* each stage in middle stages provides two down scaling of the input */
		527
		528	twidCoefModifier <<= 2U;
		529
		530
		531	for (k = fftLen / 4U; k > 4U; k >>= 2U)
		532	{
		533	/* Initializations for the first stage */
		534	n1 = n2;
		535	n2 >>= 2U;
		536	ia1 = 0U;
		537
		538	/* Calculation of first stage */
		539	for (j = 0U; j <= (n2 - 1U); j++)
		540	{
		541	/* index calculation for the coefficients */
		542	ia2 = ia1 + ia1;
		543	ia3 = ia2 + ia1;
		544	co1 = pCoef[ia1 * 2U];
		545	si1 = pCoef[(ia1 * 2U) + 1U];
		546	co2 = pCoef[ia2 * 2U];
		547	si2 = pCoef[(ia2 * 2U) + 1U];
		548	co3 = pCoef[ia3 * 2U];
		549	si3 = pCoef[(ia3 * 2U) + 1U];
		550	/* Twiddle coefficients index modifier */
		551	ia1 = ia1 + twidCoefModifier;
		552
		553	pSi0 = pSrc + 2 * j;
		554	pSi1 = pSi0 + 2 * n2;
		555	pSi2 = pSi1 + 2 * n2;
		556	pSi3 = pSi2 + 2 * n2;
		557
		558	for (i0 = j; i0 < fftLen; i0 += n1)
		559	{
		560	/* Butterfly implementation */
		561	/* xa + xc */
		562	r1 = pSi0[0] + pSi2[0];
		563
		564	/* xa - xc */
		565	r2 = pSi0[0] - pSi2[0];
		566
		567
		568	/* ya + yc */
		569	s1 = pSi0[1] + pSi2[1];
		570
		571	/* ya - yc */
		572	s2 = pSi0[1] - pSi2[1];
		573
		574
		575	/* xb + xd */
		576	t1 = pSi1[0] + pSi3[0];
		577
		578
		579	/* xa' = xa + xb + xc + xd */
		580	pSi0[0] = (r1 + t1) >> 2U;
		581	/* xa + xc -(xb + xd) */
		582	r1 = r1 - t1;
		583
		584	/* yb + yd */
		585	t2 = pSi1[1] + pSi3[1];
		586
		587	/* ya' = ya + yb + yc + yd */
		588	pSi0[1] = (s1 + t2) >> 2U;
		589	pSi0 += 2 * n1;
		590
		591	/* (ya + yc) - (yb + yd) */
		592	s1 = s1 - t2;
		593
		594	/* (yb - yd) */
		595	t1 = pSi1[1] - pSi3[1];
		596
		597	/* (xb - xd) */
		598	t2 = pSi1[0] - pSi3[0];
		599
		600
		601	/* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
		602	pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
		603	((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
		604
		605	/* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
		606	pSi1[1] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
		607	((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
		608	pSi1 += 2 * n1;
		609
		610	/* (xa - xc) + (yb - yd) */
		611	r1 = r2 + t1;
		612	/* (xa - xc) - (yb - yd) */
		613	r2 = r2 - t1;
		614
		615	/* (ya - yc) - (xb - xd) */
		616	s1 = s2 - t2;
		617	/* (ya - yc) + (xb - xd) */
		618	s2 = s2 + t2;
		619
		620	/* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
		621	pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
		622	((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
		623
		624	/* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
		625	pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
		626	((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
		627	pSi2 += 2 * n1;
		628
		629	/* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
		630	pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
		631	((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
		632
		633	/* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
		634	pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
		635	((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
		636	pSi3 += 2 * n1;
		637	}
		638	}
		639	twidCoefModifier <<= 2U;
		640	}
		641	#endif
		642
		643	/* End of Middle stages process */
		644
		645	/* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
		646	/* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
		647	/* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
		648	/* data is in 5.27(q27) format for the 16 point as there are no middle stages */
		649
		650
		651	/* start of Last stage process */
		652	/* Initializations for the last stage */
		653	j = fftLen >> 2;
		654	ptr1 = &pSrc[0];
		655
		656	/* Calculations of last stage */
		657	do
		658	{
		659
		660	#ifndef ARM_MATH_BIG_ENDIAN
		661
		662	/* Read xa (real), ya(imag) input */
		663	xaya = *__SIMD64(ptr1)++;
		664	xa = (q31_t) xaya;
		665	ya = (q31_t) (xaya >> 32);
		666
		667	/* Read xb (real), yb(imag) input */
		668	xbyb = *__SIMD64(ptr1)++;
		669	xb = (q31_t) xbyb;
		670	yb = (q31_t) (xbyb >> 32);
		671
		672	/* Read xc (real), yc(imag) input */
		673	xcyc = *__SIMD64(ptr1)++;
		674	xc = (q31_t) xcyc;
		675	yc = (q31_t) (xcyc >> 32);
		676
		677	/* Read xc (real), yc(imag) input */
		678	xdyd = *__SIMD64(ptr1)++;
		679	xd = (q31_t) xdyd;
		680	yd = (q31_t) (xdyd >> 32);
		681
		682	#else
		683
		684	/* Read xa (real), ya(imag) input */
		685	xaya = *__SIMD64(ptr1)++;
		686	ya = (q31_t) xaya;
		687	xa = (q31_t) (xaya >> 32);
		688
		689	/* Read xb (real), yb(imag) input */
		690	xbyb = *__SIMD64(ptr1)++;
		691	yb = (q31_t) xbyb;
		692	xb = (q31_t) (xbyb >> 32);
		693
		694	/* Read xc (real), yc(imag) input */
		695	xcyc = *__SIMD64(ptr1)++;
		696	yc = (q31_t) xcyc;
		697	xc = (q31_t) (xcyc >> 32);
		698
		699	/* Read xc (real), yc(imag) input */
		700	xdyd = *__SIMD64(ptr1)++;
		701	yd = (q31_t) xdyd;
		702	xd = (q31_t) (xdyd >> 32);
		703
		704
		705	#endif
		706
		707	/* xa' = xa + xb + xc + xd */
		708	xa_out = xa + xb + xc + xd;
		709
		710	/* ya' = ya + yb + yc + yd */
		711	ya_out = ya + yb + yc + yd;
		712
		713	/* pointer updation for writing */
		714	ptr1 = ptr1 - 8U;
		715
		716	/* writing xa' and ya' */
		717	*ptr1++ = xa_out;
		718	*ptr1++ = ya_out;
		719
		720	xc_out = (xa - xb + xc - xd);
		721	yc_out = (ya - yb + yc - yd);
		722
		723	/* writing xc' and yc' */
		724	*ptr1++ = xc_out;
		725	*ptr1++ = yc_out;
		726
		727	xb_out = (xa + yb - xc - yd);
		728	yb_out = (ya - xb - yc + xd);
		729
		730	/* writing xb' and yb' */
		731	*ptr1++ = xb_out;
		732	*ptr1++ = yb_out;
		733
		734	xd_out = (xa - yb - xc + yd);
		735	yd_out = (ya + xb - yc - xd);
		736
		737	/* writing xd' and yd' */
		738	*ptr1++ = xd_out;
		739	*ptr1++ = yd_out;
		740
		741
		742	} while (--j);
		743
		744	/* output is in 11.21(q21) format for the 1024 point */
		745	/* output is in 9.23(q23) format for the 256 point */
		746	/* output is in 7.25(q25) format for the 64 point */
		747	/* output is in 5.27(q27) format for the 16 point */
		748
		749	/* End of last stage process */
		750
		751	}
		752
		753
		754	/**
		755	* @brief Core function for the Q31 CIFFT butterfly process.
		756	* @param[in, out] *pSrc points to the in-place buffer of Q31 data type.
		757	* @param[in] fftLen length of the FFT.
		758	* @param[in] *pCoef points to twiddle coefficient buffer.
		759	* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
		760	* @return none.
		761	*/
		762
		763
		764	/*
		765	* Radix-4 IFFT algorithm used is :
		766	*
		767	* CIFFT uses same twiddle coefficients as CFFT Function
		768	* x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]
		769	*
		770	*
		771	* IFFT is implemented with following changes in equations from FFT
		772	*
		773	* Input real and imaginary data:
		774	* x(n) = xa + j * ya
		775	* x(n+N/4 ) = xb + j * yb
		776	* x(n+N/2 ) = xc + j * yc
		777	* x(n+3N/4) = xd + j * yd
		778	*
		779	*
		780	* Output real and imaginary data:
		781	* x(4r) = xa'+ j * ya'
		782	* x(4r+1) = xb'+ j * yb'
		783	* x(4r+2) = xc'+ j * yc'
		784	* x(4r+3) = xd'+ j * yd'
		785	*
		786	*
		787	* Twiddle factors for radix-4 IFFT:
		788	* Wn = co1 + j * (si1)
		789	* W2n = co2 + j * (si2)
		790	* W3n = co3 + j * (si3)
		791
		792	* The real and imaginary output values for the radix-4 butterfly are
		793	* xa' = xa + xb + xc + xd
		794	* ya' = ya + yb + yc + yd
		795	* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
		796	* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
		797	* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
		798	* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
		799	* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
		800	* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
		801	*
		802	*/
		803
		804	void arm_radix4_butterfly_inverse_q31(
		805	q31_t * pSrc,
		806	uint32_t fftLen,
		807	q31_t * pCoef,
		808	uint32_t twidCoefModifier)
		809	{
		810	#if defined(ARM_MATH_CM7)
		811	uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
		812	q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
		813	q31_t xa, xb, xc, xd;
		814	q31_t ya, yb, yc, yd;
		815	q31_t xa_out, xb_out, xc_out, xd_out;
		816	q31_t ya_out, yb_out, yc_out, yd_out;
		817
		818	q31_t *ptr1;
		819	q63_t xaya, xbyb, xcyc, xdyd;
		820
		821	/* input is be 1.31(q31) format for all FFT sizes */
		822	/* Total process is divided into three stages */
		823	/* process first stage, middle stages, & last stage */
		824
		825	/* Start of first stage process */
		826
		827	/* Initializations for the first stage */
		828	n2 = fftLen;
		829	n1 = n2;
		830	/* n2 = fftLen/4 */
		831	n2 >>= 2U;
		832	i0 = 0U;
		833	ia1 = 0U;
		834
		835	j = n2;
		836
		837	do
		838	{
		839
		840	/* input is in 1.31(q31) format and provide 4 guard bits for the input */
		841
		842	/* index calculation for the input as, */
		843	/* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
		844	i1 = i0 + n2;
		845	i2 = i1 + n2;
		846	i3 = i2 + n2;
		847
		848	/* Butterfly implementation */
		849	/* xa + xc */
		850	r1 = (pSrc[2U * i0] >> 4U) + (pSrc[2U * i2] >> 4U);
		851	/* xa - xc */
		852	r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
		853
		854	/* xb + xd */
		855	t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
		856
		857	/* ya + yc */
		858	s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
		859	/* ya - yc */
		860	s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
		861
		862	/* xa' = xa + xb + xc + xd */
		863	pSrc[2U * i0] = (r1 + t1);
		864	/* (xa + xc) - (xb + xd) */
		865	r1 = r1 - t1;
		866	/* yb + yd */
		867	t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
		868	/* ya' = ya + yb + yc + yd */
		869	pSrc[(2U * i0) + 1U] = (s1 + t2);
		870
		871	/* (ya + yc) - (yb + yd) */
		872	s1 = s1 - t2;
		873
		874	/* yb - yd */
		875	t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
		876	/* xb - xd */
		877	t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
		878
		879	/* index calculation for the coefficients */
		880	ia2 = 2U * ia1;
		881	co2 = pCoef[ia2 * 2U];
		882	si2 = pCoef[(ia2 * 2U) + 1U];
		883
		884	/* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
		885	pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
		886	((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
		887
		888	/* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
		889	pSrc[2U * i1 + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
		890	((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
		891
		892	/* (xa - xc) - (yb - yd) */
		893	r1 = r2 - t1;
		894	/* (xa - xc) + (yb - yd) */
		895	r2 = r2 + t1;
		896
		897	/* (ya - yc) + (xb - xd) */
		898	s1 = s2 + t2;
		899	/* (ya - yc) - (xb - xd) */
		900	s2 = s2 - t2;
		901
		902	co1 = pCoef[ia1 * 2U];
		903	si1 = pCoef[(ia1 * 2U) + 1U];
		904
		905	/* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
		906	pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
		907	((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
		908
		909	/* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
		910	pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
		911	((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
		912
		913	/* index calculation for the coefficients */
		914	ia3 = 3U * ia1;
		915	co3 = pCoef[ia3 * 2U];
		916	si3 = pCoef[(ia3 * 2U) + 1U];
		917
		918	/* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
		919	pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
		920	((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
		921
		922	/* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
		923	pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
		924	((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
		925
		926	/* Twiddle coefficients index modifier */
		927	ia1 = ia1 + twidCoefModifier;
		928
		929	/* Updating input index */
		930	i0 = i0 + 1U;
		931
		932	} while (--j);
		933
		934	/* data is in 5.27(q27) format */
		935	/* each stage provides two down scaling of the input */
		936
		937
		938	/* Start of Middle stages process */
		939
		940	twidCoefModifier <<= 2U;
		941
		942	/* Calculation of second stage to excluding last stage */
		943	for (k = fftLen / 4U; k > 4U; k >>= 2U)
		944	{
		945	/* Initializations for the first stage */
		946	n1 = n2;
		947	n2 >>= 2U;
		948	ia1 = 0U;
		949
		950	for (j = 0; j <= (n2 - 1U); j++)
		951	{
		952	/* index calculation for the coefficients */
		953	ia2 = ia1 + ia1;
		954	ia3 = ia2 + ia1;
		955	co1 = pCoef[ia1 * 2U];
		956	si1 = pCoef[(ia1 * 2U) + 1U];
		957	co2 = pCoef[ia2 * 2U];
		958	si2 = pCoef[(ia2 * 2U) + 1U];
		959	co3 = pCoef[ia3 * 2U];
		960	si3 = pCoef[(ia3 * 2U) + 1U];
		961	/* Twiddle coefficients index modifier */
		962	ia1 = ia1 + twidCoefModifier;
		963
		964	for (i0 = j; i0 < fftLen; i0 += n1)
		965	{
		966	/* index calculation for the input as, */
		967	/* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
		968	i1 = i0 + n2;
		969	i2 = i1 + n2;
		970	i3 = i2 + n2;
		971
		972	/* Butterfly implementation */
		973	/* xa + xc */
		974	r1 = pSrc[2U * i0] + pSrc[2U * i2];
		975	/* xa - xc */
		976	r2 = pSrc[2U * i0] - pSrc[2U * i2];
		977
		978	/* ya + yc */
		979	s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
		980	/* ya - yc */
		981	s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
		982
		983	/* xb + xd */
		984	t1 = pSrc[2U * i1] + pSrc[2U * i3];
		985
		986	/* xa' = xa + xb + xc + xd */
		987	pSrc[2U * i0] = (r1 + t1) >> 2U;
		988	/* xa + xc -(xb + xd) */
		989	r1 = r1 - t1;
		990	/* yb + yd */
		991	t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
		992	/* ya' = ya + yb + yc + yd */
		993	pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
		994
		995	/* (ya + yc) - (yb + yd) */
		996	s1 = s1 - t2;
		997
		998	/* (yb - yd) */
		999	t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
		1000	/* (xb - xd) */
		1001	t2 = pSrc[2U * i1] - pSrc[2U * i3];
		1002
		1003	/* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
		1004	pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
		1005	((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
		1006
		1007	/* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
		1008	pSrc[(2U * i1) + 1U] =
		1009	(((int32_t) (((q63_t) s1 * co2) >> 32U)) +
		1010	((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
		1011
		1012	/* (xa - xc) - (yb - yd) */
		1013	r1 = r2 - t1;
		1014	/* (xa - xc) + (yb - yd) */
		1015	r2 = r2 + t1;
		1016
		1017	/* (ya - yc) + (xb - xd) */
		1018	s1 = s2 + t2;
		1019	/* (ya - yc) - (xb - xd) */
		1020	s2 = s2 - t2;
		1021
		1022	/* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
		1023	pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
		1024	((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
		1025
		1026	/* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
		1027	pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
		1028	((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
		1029
		1030	/* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
		1031	pSrc[(2U * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
		1032	((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
		1033
		1034	/* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
		1035	pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
		1036	((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
		1037	}
		1038	}
		1039	twidCoefModifier <<= 2U;
		1040	}
		1041	#else
		1042	uint32_t n1, n2, ia1, ia2, ia3, i0, j, k;
		1043	q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
		1044	q31_t xa, xb, xc, xd;
		1045	q31_t ya, yb, yc, yd;
		1046	q31_t xa_out, xb_out, xc_out, xd_out;
		1047	q31_t ya_out, yb_out, yc_out, yd_out;
		1048
		1049	q31_t *ptr1;
		1050	q31_t *pSi0;
		1051	q31_t *pSi1;
		1052	q31_t *pSi2;
		1053	q31_t *pSi3;
		1054	q63_t xaya, xbyb, xcyc, xdyd;
		1055
		1056	/* input is be 1.31(q31) format for all FFT sizes */
		1057	/* Total process is divided into three stages */
		1058	/* process first stage, middle stages, & last stage */
		1059
		1060	/* Start of first stage process */
		1061
		1062	/* Initializations for the first stage */
		1063	n2 = fftLen;
		1064	n1 = n2;
		1065	/* n2 = fftLen/4 */
		1066	n2 >>= 2U;
		1067
		1068	ia1 = 0U;
		1069
		1070	j = n2;
		1071
		1072	pSi0 = pSrc;
		1073	pSi1 = pSi0 + 2 * n2;
		1074	pSi2 = pSi1 + 2 * n2;
		1075	pSi3 = pSi2 + 2 * n2;
		1076
		1077	do
		1078	{
		1079	/* Butterfly implementation */
		1080	/* xa + xc */
		1081	r1 = (pSi0[0] >> 4U) + (pSi2[0] >> 4U);
		1082	/* xa - xc */
		1083	r2 = (pSi0[0] >> 4U) - (pSi2[0] >> 4U);
		1084
		1085	/* xb + xd */
		1086	t1 = (pSi1[0] >> 4U) + (pSi3[0] >> 4U);
		1087
		1088	/* ya + yc */
		1089	s1 = (pSi0[1] >> 4U) + (pSi2[1] >> 4U);
		1090	/* ya - yc */
		1091	s2 = (pSi0[1] >> 4U) - (pSi2[1] >> 4U);
		1092
		1093	/* xa' = xa + xb + xc + xd */
		1094	*pSi0++ = (r1 + t1);
		1095	/* (xa + xc) - (xb + xd) */
		1096	r1 = r1 - t1;
		1097	/* yb + yd */
		1098	t2 = (pSi1[1] >> 4U) + (pSi3[1] >> 4U);
		1099	/* ya' = ya + yb + yc + yd */
		1100	*pSi0++ = (s1 + t2);
		1101
		1102	/* (ya + yc) - (yb + yd) */
		1103	s1 = s1 - t2;
		1104
		1105	/* yb - yd */
		1106	t1 = (pSi1[1] >> 4U) - (pSi3[1] >> 4U);
		1107	/* xb - xd */
		1108	t2 = (pSi1[0] >> 4U) - (pSi3[0] >> 4U);
		1109
		1110	/* index calculation for the coefficients */
		1111	ia2 = 2U * ia1;
		1112	co2 = pCoef[ia2 * 2U];
		1113	si2 = pCoef[(ia2 * 2U) + 1U];
		1114
		1115	/* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
		1116	pSi1++ = (((int32_t) (((q63_t) r1 co2) >> 32)) -
		1117	((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
		1118
		1119	/* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
		1120	pSi1++ = (((int32_t) (((q63_t) s1 co2) >> 32)) +
		1121	((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
		1122
		1123	/* (xa - xc) - (yb - yd) */
		1124	r1 = r2 - t1;
		1125	/* (xa - xc) + (yb - yd) */
		1126	r2 = r2 + t1;
		1127
		1128	/* (ya - yc) + (xb - xd) */
		1129	s1 = s2 + t2;
		1130	/* (ya - yc) - (xb - xd) */
		1131	s2 = s2 - t2;
		1132
		1133	co1 = pCoef[ia1 * 2U];
		1134	si1 = pCoef[(ia1 * 2U) + 1U];
		1135
		1136	/* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
		1137	pSi2++ = (((int32_t) (((q63_t) r1 co1) >> 32)) -
		1138	((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
		1139
		1140	/* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
		1141	pSi2++ = (((int32_t) (((q63_t) s1 co1) >> 32)) +
		1142	((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
		1143
		1144	/* index calculation for the coefficients */
		1145	ia3 = 3U * ia1;
		1146	co3 = pCoef[ia3 * 2U];
		1147	si3 = pCoef[(ia3 * 2U) + 1U];
		1148
		1149	/* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
		1150	pSi3++ = (((int32_t) (((q63_t) r2 co3) >> 32)) -
		1151	((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
		1152
		1153	/* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
		1154	pSi3++ = (((int32_t) (((q63_t) s2 co3) >> 32)) +
		1155	((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
		1156
		1157	/* Twiddle coefficients index modifier */
		1158	ia1 = ia1 + twidCoefModifier;
		1159
		1160	} while (--j);
		1161
		1162	/* data is in 5.27(q27) format */
		1163	/* each stage provides two down scaling of the input */
		1164
		1165
		1166	/* Start of Middle stages process */
		1167
		1168	twidCoefModifier <<= 2U;
		1169
		1170	/* Calculation of the second stage up to, but excluding, the last stage */
		1171	for (k = fftLen / 4U; k > 4U; k >>= 2U)
		1172	{
		1173	/* Initializations for the current middle stage */
		1174	n1 = n2;
		1175	n2 >>= 2U;
		1176	ia1 = 0U;
		1177
		1178	for (j = 0; j <= (n2 - 1U); j++)
		1179	{
		1180	/* index calculation for the coefficients */
		1181	ia2 = ia1 + ia1;
		1182	ia3 = ia2 + ia1;
		1183	co1 = pCoef[ia1 * 2U];
		1184	si1 = pCoef[(ia1 * 2U) + 1U];
		1185	co2 = pCoef[ia2 * 2U];
		1186	si2 = pCoef[(ia2 * 2U) + 1U];
		1187	co3 = pCoef[ia3 * 2U];
		1188	si3 = pCoef[(ia3 * 2U) + 1U];
		1189	/* Twiddle coefficients index modifier */
		1190	ia1 = ia1 + twidCoefModifier;
		1191
		1192	pSi0 = pSrc + 2 * j;
		1193	pSi1 = pSi0 + 2 * n2;
		1194	pSi2 = pSi1 + 2 * n2;
		1195	pSi3 = pSi2 + 2 * n2;
		1196
		1197	for (i0 = j; i0 < fftLen; i0 += n1)
		1198	{
		1199	/* Butterfly implementation */
		1200	/* xa + xc */
		1201	r1 = pSi0[0] + pSi2[0];
		1202
		1203	/* xa - xc */
		1204	r2 = pSi0[0] - pSi2[0];
		1205
		1206
		1207	/* ya + yc */
		1208	s1 = pSi0[1] + pSi2[1];
		1209
		1210	/* ya - yc */
		1211	s2 = pSi0[1] - pSi2[1];
		1212
		1213
		1214	/* xb + xd */
		1215	t1 = pSi1[0] + pSi3[0];
		1216
		1217
		1218	/* xa' = xa + xb + xc + xd */
		1219	pSi0[0] = (r1 + t1) >> 2U;
		1220	/* xa + xc -(xb + xd) */
		1221	r1 = r1 - t1;
		1222	/* yb + yd */
		1223	t2 = pSi1[1] + pSi3[1];
		1224
		1225	/* ya' = ya + yb + yc + yd */
		1226	pSi0[1] = (s1 + t2) >> 2U;
		1227	pSi0 += 2 * n1;
		1228
		1229	/* (ya + yc) - (yb + yd) */
		1230	s1 = s1 - t2;
		1231
		1232	/* (yb - yd) */
		1233	t1 = pSi1[1] - pSi3[1];
		1234
		1235	/* (xb - xd) */
		1236	t2 = pSi1[0] - pSi3[0];
		1237
		1238
		1239	/* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
		1240	pSi1[0] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
		1241	((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
		1242
		1243	/* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
		1244	pSi1[1] =
		1245
		1246	(((int32_t) (((q63_t) s1 * co2) >> 32U)) +
		1247	((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
		1248	pSi1 += 2 * n1;
		1249
		1250	/* (xa - xc) - (yb - yd) */
		1251	r1 = r2 - t1;
		1252	/* (xa - xc) + (yb - yd) */
		1253	r2 = r2 + t1;
		1254
		1255	/* (ya - yc) + (xb - xd) */
		1256	s1 = s2 + t2;
		1257	/* (ya - yc) - (xb - xd) */
		1258	s2 = s2 - t2;
		1259
		1260	/* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
		1261	pSi2[0] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
		1262	((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
		1263
		1264	/* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
		1265	pSi2[1] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
		1266	((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
		1267	pSi2 += 2 * n1;
		1268
		1269	/* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
		1270	pSi3[0] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
		1271	((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
		1272
		1273	/* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
		1274	pSi3[1] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
		1275	((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
		1276	pSi3 += 2 * n1;
		1277	}
		1278	}
		1279	twidCoefModifier <<= 2U;
		1280	}
		1281	#endif
		1282
		1283	/* End of Middle stages process */
		1284
		1285	/* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
		1286	/* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
		1287	/* data is in 7.25(q25) format for the 64 point as there is 1 middle stage */
		1288	/* data is in 5.27(q27) format for the 16 point as there are no middle stages */
		1289
		1290
		1291	/* Start of last stage process */
		1292
		1293
		1294	/* Initializations for the last stage */
		1295	j = fftLen >> 2;
		1296	ptr1 = &pSrc[0];
		1297
		1298	/* Calculations of last stage */
		1299	do
		1300	{
		1301	#ifndef ARM_MATH_BIG_ENDIAN
		1302	/* Read xa (real), ya(imag) input */
		1303	xaya = *__SIMD64(ptr1)++;
		1304	xa = (q31_t) xaya;
		1305	ya = (q31_t) (xaya >> 32);
		1306
		1307	/* Read xb (real), yb(imag) input */
		1308	xbyb = *__SIMD64(ptr1)++;
		1309	xb = (q31_t) xbyb;
		1310	yb = (q31_t) (xbyb >> 32);
		1311
		1312	/* Read xc (real), yc(imag) input */
		1313	xcyc = *__SIMD64(ptr1)++;
		1314	xc = (q31_t) xcyc;
		1315	yc = (q31_t) (xcyc >> 32);
		1316
		1317	/* Read xd (real), yd(imag) input */
		1318	xdyd = *__SIMD64(ptr1)++;
		1319	xd = (q31_t) xdyd;
		1320	yd = (q31_t) (xdyd >> 32);
		1321
		1322	#else
		1323
		1324	/* Read xa (real), ya(imag) input */
		1325	xaya = *__SIMD64(ptr1)++;
		1326	ya = (q31_t) xaya;
		1327	xa = (q31_t) (xaya >> 32);
		1328
		1329	/* Read xb (real), yb(imag) input */
		1330	xbyb = *__SIMD64(ptr1)++;
		1331	yb = (q31_t) xbyb;
		1332	xb = (q31_t) (xbyb >> 32);
		1333
		1334	/* Read xc (real), yc(imag) input */
		1335	xcyc = *__SIMD64(ptr1)++;
		1336	yc = (q31_t) xcyc;
		1337	xc = (q31_t) (xcyc >> 32);
		1338
		1339	/* Read xd (real), yd(imag) input */
		1340	xdyd = *__SIMD64(ptr1)++;
		1341	yd = (q31_t) xdyd;
		1342	xd = (q31_t) (xdyd >> 32);
		1343
		1344
		1345	#endif
		1346
		1347	/* xa' = xa + xb + xc + xd */
		1348	xa_out = xa + xb + xc + xd;
		1349
		1350	/* ya' = ya + yb + yc + yd */
		1351	ya_out = ya + yb + yc + yd;
		1352
		1353	/* pointer update for writing */
		1354	ptr1 = ptr1 - 8U;
		1355
		1356	/* writing xa' and ya' */
		1357	*ptr1++ = xa_out;
		1358	*ptr1++ = ya_out;
		1359
		1360	xc_out = (xa - xb + xc - xd);
		1361	yc_out = (ya - yb + yc - yd);
		1362
		1363	/* writing xc' and yc' */
		1364	*ptr1++ = xc_out;
		1365	*ptr1++ = yc_out;
		1366
		1367	xb_out = (xa - yb - xc + yd);
		1368	yb_out = (ya + xb - yc - xd);
		1369
		1370	/* writing xb' and yb' */
		1371	*ptr1++ = xb_out;
		1372	*ptr1++ = yb_out;
		1373
		1374	xd_out = (xa + yb - xc - yd);
		1375	yd_out = (ya - xb - yc + xd);
		1376
		1377	/* writing xd' and yd' */
		1378	*ptr1++ = xd_out;
		1379	*ptr1++ = yd_out;
		1380
		1381	} while (--j);
		1382
		1383	/* output is in 11.21(q21) format for the 1024 point */
		1384	/* output is in 9.23(q23) format for the 256 point */
		1385	/* output is in 7.25(q25) format for the 64 point */
		1386	/* output is in 5.27(q27) format for the 16 point */
		1387
		1388	/* End of last stage process */
		1389	}

Subversion Repositories AFRtranscoder

(root)/trunk/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c – Rev 2