WebSVN – AFRtranscoder – Blame – /trunk/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c

Rev	Author	Line No.	Line
2	mjames	1	/* ----------------------------------------------------------------------
		2	* Project: CMSIS DSP Library
		3	* Title: arm_dct4_q31.c
		4	* Description: Processing function of DCT4 & IDCT4 Q31
		5	*
		6	* $Date: 27. January 2017
		7	* $Revision: V.1.5.1
		8	*
		9	* Target Processor: Cortex-M cores
		10	* -------------------------------------------------------------------- */
		11	/*
		12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
		13	*
		14	* SPDX-License-Identifier: Apache-2.0
		15	*
		16	* Licensed under the Apache License, Version 2.0 (the License); you may
		17	* not use this file except in compliance with the License.
		18	* You may obtain a copy of the License at
		19	*
		20	* www.apache.org/licenses/LICENSE-2.0
		21	*
		22	* Unless required by applicable law or agreed to in writing, software
		23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		25	* See the License for the specific language governing permissions and
		26	* limitations under the License.
		27	*/
		28
		29	#include "arm_math.h"
		30
		31	/**
		32	* @addtogroup DCT4_IDCT4
		33	* @{
		34	*/
		35
		36	/**
		37	* @brief Processing function for the Q31 DCT4/IDCT4.
		38	* @param[in] *S points to an instance of the Q31 DCT4 structure.
		39	* @param[in] *pState points to state buffer.
		40	* @param[in,out] *pInlineBuffer points to the in-place input and output buffer.
		41	* @return none.
		42	* \par Input an output formats:
		43	* Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process,
		44	* as the conversion from DCT2 to DCT4 involves one subtraction.
		45	* Internally inputs are downscaled in the RFFT process function to avoid overflows.
		46	* Number of bits downscaled, depends on the size of the transform.
		47	* The input and output formats for different DCT sizes and number of bits to upscale are mentioned in the table below:
		48	*
		49	* \image html dct4FormatsQ31Table.gif
		50	*/
		51
		52	void arm_dct4_q31(
		53	const arm_dct4_instance_q31 * S,
		54	q31_t * pState,
		55	q31_t * pInlineBuffer)
		56	{
		57	uint16_t i; /* Loop counter */
		58	q31_t weights = S->pTwiddle; / Pointer to the Weights table */
		59	q31_t cosFact = S->pCosFactor; / Pointer to the cos factors table */
		60	q31_t pS1, pS2, pbuff; / Temporary pointers for input buffer and pState buffer */
		61	q31_t in; /* Temporary variable */
		62
		63
		64	/* DCT4 computation involves DCT2 (which is calculated using RFFT)
		65	* along with some pre-processing and post-processing.
		66	* Computational procedure is explained as follows:
		67	* (a) Pre-processing involves multiplying input with cos factor,
		68	* r(n) = 2 * u(n) * cos(pi(2n+1)/(4*n))
		69	* where,
		70	* r(n) -- output of preprocessing
		71	* u(n) -- input to preprocessing(actual Source buffer)
		72	* (b) Calculation of DCT2 using FFT is divided into three steps:
		73	* Step1: Re-ordering of even and odd elements of input.
		74	* Step2: Calculating FFT of the re-ordered input.
		75	* Step3: Taking the real part of the product of FFT output and weights.
		76	* (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
		77	* Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
		78	* where,
		79	* Y4 -- DCT4 output, Y2 -- DCT2 output
		80	* (d) Multiplying the output with the normalizing factor sqrt(2/N).
		81	*/
		82
		83	/-------- Pre-processing ------------/
		84	/* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi(2n+1)/(4n)) /
		85	arm_mult_q31(pInlineBuffer, cosFact, pInlineBuffer, S->N);
		86	arm_shift_q31(pInlineBuffer, 1, pInlineBuffer, S->N);
		87
		88	/* ----------------------------------------------------------------
		89	* Step1: Re-ordering of even and odd elements as
		90	* pState[i] = pInlineBuffer[2*i] and
		91	* pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
		92	---------------------------------------------------------------------*/
		93
		94	/* pS1 initialized to pState */
		95	pS1 = pState;
		96
		97	/* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
		98	pS2 = pState + (S->N - 1U);
		99
		100	/* pbuff initialized to input buffer */
		101	pbuff = pInlineBuffer;
		102
		103	#if defined (ARM_MATH_DSP)
		104
		105	/* Run the below code for Cortex-M4 and Cortex-M3 */
		106
		107	/* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
		108	i = S->Nby2 >> 2U;
		109
		110	/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
		111	** a second loop below computes the remaining 1 to 3 samples. */
		112	do
		113	{
		114	/* Re-ordering of even and odd elements */
		115	/* pState[i] = pInlineBuffer[2i] /
		116	pS1++ = pbuff++;
		117	/* pState[N-i-1] = pInlineBuffer[2i+1] /
		118	pS2-- = pbuff++;
		119
		120	pS1++ = pbuff++;
		121	pS2-- = pbuff++;
		122
		123	pS1++ = pbuff++;
		124	pS2-- = pbuff++;
		125
		126	pS1++ = pbuff++;
		127	pS2-- = pbuff++;
		128
		129	/* Decrement the loop counter */
		130	i--;
		131	} while (i > 0U);
		132
		133	/* pbuff initialized to input buffer */
		134	pbuff = pInlineBuffer;
		135
		136	/* pS1 initialized to pState */
		137	pS1 = pState;
		138
		139	/* Initializing the loop counter to N/4 instead of N for loop unrolling */
		140	i = S->N >> 2U;
		141
		142	/* Processing with loop unrolling 4 times as N is always multiple of 4.
		143	* Compute 4 outputs at a time */
		144	do
		145	{
		146	/* Writing the re-ordered output back to inplace input buffer */
		147	pbuff++ = pS1++;
		148	pbuff++ = pS1++;
		149	pbuff++ = pS1++;
		150	pbuff++ = pS1++;
		151
		152	/* Decrement the loop counter */
		153	i--;
		154	} while (i > 0U);
		155
		156
		157	/* ---------------------------------------------------------
		158	* Step2: Calculate RFFT for N-point input
		159	* ---------------------------------------------------------- */
		160	/* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
		161	arm_rfft_q31(S->pRfft, pInlineBuffer, pState);
		162
		163	/*----------------------------------------------------------------------
		164	* Step3: Multiply the FFT output with the weights.
		165	----------------------------------------------------------------------/
		166	arm_cmplx_mult_cmplx_q31(pState, weights, pState, S->N);
		167
		168	/* The output of complex multiplication is in 3.29 format.
		169	* Hence changing the format of N (i.e. 2N elements) complex numbers to 1.31 format by shifting left by 2 bits. /
		170	arm_shift_q31(pState, 2, pState, S->N * 2);
		171
		172	/* ----------- Post-processing ---------- */
		173	/* DCT-IV can be obtained from DCT-II by the equation,
		174	* Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
		175	* Hence, Y4(0) = Y2(0)/2 */
		176	/* Getting only real part from the output and Converting to DCT-IV */
		177
		178	/* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
		179	i = (S->N - 1U) >> 2U;
		180
		181	/* pbuff initialized to input buffer. */
		182	pbuff = pInlineBuffer;
		183
		184	/* pS1 initialized to pState */
		185	pS1 = pState;
		186
		187	/* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
		188	in = *pS1++ >> 1U;
		189	/* input buffer acts as inplace, so output values are stored in the input itself. */
		190	*pbuff++ = in;
		191
		192	/* pState pointer is incremented twice as the real values are located alternatively in the array */
		193	pS1++;
		194
		195	/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
		196	** a second loop below computes the remaining 1 to 3 samples. */
		197	do
		198	{
		199	/* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
		200	/* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
		201	in = *pS1++ - in;
		202	*pbuff++ = in;
		203	/* points to the next real value */
		204	pS1++;
		205
		206	in = *pS1++ - in;
		207	*pbuff++ = in;
		208	pS1++;
		209
		210	in = *pS1++ - in;
		211	*pbuff++ = in;
		212	pS1++;
		213
		214	in = *pS1++ - in;
		215	*pbuff++ = in;
		216	pS1++;
		217
		218	/* Decrement the loop counter */
		219	i--;
		220	} while (i > 0U);
		221
		222	/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
		223	** No loop unrolling is used. */
		224	i = (S->N - 1U) % 0x4U;
		225
		226	while (i > 0U)
		227	{
		228	/* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
		229	/* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
		230	in = *pS1++ - in;
		231	*pbuff++ = in;
		232	/* points to the next real value */
		233	pS1++;
		234
		235	/* Decrement the loop counter */
		236	i--;
		237	}
		238
		239
		240	/------------ Normalizing the output by multiplying with the normalizing factor ----------/
		241
		242	/* Initializing the loop counter to N/4 instead of N for loop unrolling */
		243	i = S->N >> 2U;
		244
		245	/* pbuff initialized to the pInlineBuffer(now contains the output values) */
		246	pbuff = pInlineBuffer;
		247
		248	/* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */
		249	do
		250	{
		251	/* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
		252	in = *pbuff;
		253	pbuff++ = ((q31_t) (((q63_t) in S->normalize) >> 31));
		254
		255	in = *pbuff;
		256	pbuff++ = ((q31_t) (((q63_t) in S->normalize) >> 31));
		257
		258	in = *pbuff;
		259	pbuff++ = ((q31_t) (((q63_t) in S->normalize) >> 31));
		260
		261	in = *pbuff;
		262	pbuff++ = ((q31_t) (((q63_t) in S->normalize) >> 31));
		263
		264	/* Decrement the loop counter */
		265	i--;
		266	} while (i > 0U);
		267
		268
		269	#else
		270
		271	/* Run the below code for Cortex-M0 */
		272
		273	/* Initializing the loop counter to N/2 */
		274	i = S->Nby2;
		275
		276	do
		277	{
		278	/* Re-ordering of even and odd elements */
		279	/* pState[i] = pInlineBuffer[2i] /
		280	pS1++ = pbuff++;
		281	/* pState[N-i-1] = pInlineBuffer[2i+1] /
		282	pS2-- = pbuff++;
		283
		284	/* Decrement the loop counter */
		285	i--;
		286	} while (i > 0U);
		287
		288	/* pbuff initialized to input buffer */
		289	pbuff = pInlineBuffer;
		290
		291	/* pS1 initialized to pState */
		292	pS1 = pState;
		293
		294	/* Initializing the loop counter */
		295	i = S->N;
		296
		297	do
		298	{
		299	/* Writing the re-ordered output back to inplace input buffer */
		300	pbuff++ = pS1++;
		301
		302	/* Decrement the loop counter */
		303	i--;
		304	} while (i > 0U);
		305
		306
		307	/* ---------------------------------------------------------
		308	* Step2: Calculate RFFT for N-point input
		309	* ---------------------------------------------------------- */
		310	/* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
		311	arm_rfft_q31(S->pRfft, pInlineBuffer, pState);
		312
		313	/*----------------------------------------------------------------------
		314	* Step3: Multiply the FFT output with the weights.
		315	----------------------------------------------------------------------/
		316	arm_cmplx_mult_cmplx_q31(pState, weights, pState, S->N);
		317
		318	/* The output of complex multiplication is in 3.29 format.
		319	* Hence changing the format of N (i.e. 2N elements) complex numbers to 1.31 format by shifting left by 2 bits. /
		320	arm_shift_q31(pState, 2, pState, S->N * 2);
		321
		322	/* ----------- Post-processing ---------- */
		323	/* DCT-IV can be obtained from DCT-II by the equation,
		324	* Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
		325	* Hence, Y4(0) = Y2(0)/2 */
		326	/* Getting only real part from the output and Converting to DCT-IV */
		327
		328	/* pbuff initialized to input buffer. */
		329	pbuff = pInlineBuffer;
		330
		331	/* pS1 initialized to pState */
		332	pS1 = pState;
		333
		334	/* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
		335	in = *pS1++ >> 1U;
		336	/* input buffer acts as inplace, so output values are stored in the input itself. */
		337	*pbuff++ = in;
		338
		339	/* pState pointer is incremented twice as the real values are located alternatively in the array */
		340	pS1++;
		341
		342	/* Initializing the loop counter */
		343	i = (S->N - 1U);
		344
		345	while (i > 0U)
		346	{
		347	/* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
		348	/* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
		349	in = *pS1++ - in;
		350	*pbuff++ = in;
		351	/* points to the next real value */
		352	pS1++;
		353
		354	/* Decrement the loop counter */
		355	i--;
		356	}
		357
		358
		359	/------------ Normalizing the output by multiplying with the normalizing factor ----------/
		360
		361	/* Initializing the loop counter */
		362	i = S->N;
		363
		364	/* pbuff initialized to the pInlineBuffer(now contains the output values) */
		365	pbuff = pInlineBuffer;
		366
		367	do
		368	{
		369	/* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
		370	in = *pbuff;
		371	pbuff++ = ((q31_t) (((q63_t) in S->normalize) >> 31));
		372
		373	/* Decrement the loop counter */
		374	i--;
		375	} while (i > 0U);
		376
		377	#endif /* #if defined (ARM_MATH_DSP) */
		378
		379	}
		380
		381	/**
		382	* @} end of DCT4_IDCT4 group
		383	*/

Subversion Repositories AFRtranscoder

(root)/trunk/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c – Rev 2