WebSVN – AFRtranscoder – Blame – /trunk/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c

Rev	Author	Line No.	Line
2	mjames	1	/* ----------------------------------------------------------------------
		2	* Project: CMSIS DSP Library
		3	* Title: arm_dct4_f32.c
		4	* Description: Processing function of DCT4 & IDCT4 F32
		5	*
		6	* $Date: 27. January 2017
		7	* $Revision: V.1.5.1
		8	*
		9	* Target Processor: Cortex-M cores
		10	* -------------------------------------------------------------------- */
		11	/*
		12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
		13	*
		14	* SPDX-License-Identifier: Apache-2.0
		15	*
		16	* Licensed under the Apache License, Version 2.0 (the License); you may
		17	* not use this file except in compliance with the License.
		18	* You may obtain a copy of the License at
		19	*
		20	* www.apache.org/licenses/LICENSE-2.0
		21	*
		22	* Unless required by applicable law or agreed to in writing, software
		23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		25	* See the License for the specific language governing permissions and
		26	* limitations under the License.
		27	*/
		28
		29	#include "arm_math.h"
		30
		31	/**
		32	* @ingroup groupTransforms
		33	*/
		34
		35	/**
		36	* @defgroup DCT4_IDCT4 DCT Type IV Functions
		37	* Representation of signals by minimum number of values is important for storage and transmission.
		38	* The possibility of large discontinuity between the beginning and end of a period of a signal
		39	* in DFT can be avoided by extending the signal so that it is even-symmetric.
		40	* Discrete Cosine Transform (DCT) is constructed such that its energy is heavily concentrated in the lower part of the
		41	* spectrum and is very widely used in signal and image coding applications.
		42	* The family of DCTs (DCT type- 1,2,3,4) is the outcome of different combinations of homogeneous boundary conditions.
		43	* DCT has an excellent energy-packing capability, hence has many applications and in data compression in particular.
		44	*
		45	* DCT is essentially the Discrete Fourier Transform(DFT) of an even-extended real signal.
		46	* Reordering of the input data makes the computation of DCT just a problem of
		47	* computing the DFT of a real signal with a few additional operations.
		48	* This approach provides regular, simple, and very efficient DCT algorithms for practical hardware and software implementations.
		49	*
		50	* DCT type-II can be implemented using Fast fourier transform (FFT) internally, as the transform is applied on real values, Real FFT can be used.
		51	* DCT4 is implemented using DCT2 as their implementations are similar except with some added pre-processing and post-processing.
		52	* DCT2 implementation can be described in the following steps:
		53	* - Re-ordering input
		54	* - Calculating Real FFT
		55	* - Multiplication of weights and Real FFT output and getting real part from the product.
		56	*
		57	* This process is explained by the block diagram below:
		58	* \image html DCT4.gif "Discrete Cosine Transform - type-IV"
		59	*
		60	* \par Algorithm:
		61	* The N-point type-IV DCT is defined as a real, linear transformation by the formula:
		62	* \image html DCT4Equation.gif
		63	* where <code>k = 0,1,2,.....N-1</code>
		64	*\par
		65	* Its inverse is defined as follows:
		66	* \image html IDCT4Equation.gif
		67	* where <code>n = 0,1,2,.....N-1</code>
		68	*\par
		69	* The DCT4 matrices become involutory (i.e. they are self-inverse) by multiplying with an overall scale factor of sqrt(2/N).
		70	* The symmetry of the transform matrix indicates that the fast algorithms for the forward
		71	* and inverse transform computation are identical.
		72	* Note that the implementation of Inverse DCT4 and DCT4 is same, hence same process function can be used for both.
		73	*
		74	* \par Lengths supported by the transform:
		75	* As DCT4 internally uses Real FFT, it supports all the lengths 128, 512, 2048 and 8192.
		76	* The library provides separate functions for Q15, Q31, and floating-point data types.
		77	* \par Instance Structure
		78	* The instances for Real FFT and FFT, cosine values table and twiddle factor table are stored in an instance data structure.
		79	* A separate instance structure must be defined for each transform.
		80	* There are separate instance structure declarations for each of the 3 supported data types.
		81	*
		82	* \par Initialization Functions
		83	* There is also an associated initialization function for each data type.
		84	* The initialization function performs the following operations:
		85	* - Sets the values of the internal structure fields.
		86	* - Initializes Real FFT as its process function is used internally in DCT4, by calling arm_rfft_init_f32().
		87	* \par
		88	* Use of the initialization function is optional.
		89	* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
		90	* To place an instance structure into a const data section, the instance structure must be manually initialized.
		91	* Manually initialize the instance structure as follows:
		92	* <pre>
		93	*arm_dct4_instance_f32 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
		94	*arm_dct4_instance_q31 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
		95	*arm_dct4_instance_q15 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
		96	* </pre>
		97	* where \c N is the length of the DCT4; \c Nby2 is half of the length of the DCT4;
		98	* \c normalize is normalizing factor used and is equal to <code>sqrt(2/N)</code>;
		99	* \c pTwiddle points to the twiddle factor table;
		100	* \c pCosFactor points to the cosFactor table;
		101	* \c pRfft points to the real FFT instance;
		102	* \c pCfft points to the complex FFT instance;
		103	* The CFFT and RFFT structures also needs to be initialized, refer to arm_cfft_radix4_f32()
		104	* and arm_rfft_f32() respectively for details regarding static initialization.
		105	*
		106	* \par Fixed-Point Behavior
		107	* Care must be taken when using the fixed-point versions of the DCT4 transform functions.
		108	* In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
		109	* Refer to the function specific documentation below for usage guidelines.
		110	*/
		111
		112	/**
		113	* @addtogroup DCT4_IDCT4
		114	* @{
		115	*/
		116
		117	/**
		118	* @brief Processing function for the floating-point DCT4/IDCT4.
		119	* @param[in] *S points to an instance of the floating-point DCT4/IDCT4 structure.
		120	* @param[in] *pState points to state buffer.
		121	* @param[in,out] *pInlineBuffer points to the in-place input and output buffer.
		122	* @return none.
		123	*/
		124
		125	void arm_dct4_f32(
		126	const arm_dct4_instance_f32 * S,
		127	float32_t * pState,
		128	float32_t * pInlineBuffer)
		129	{
		130	uint32_t i; /* Loop counter */
		131	float32_t weights = S->pTwiddle; / Pointer to the Weights table */
		132	float32_t cosFact = S->pCosFactor; / Pointer to the cos factors table */
		133	float32_t pS1, pS2, pbuff; / Temporary pointers for input buffer and pState buffer */
		134	float32_t in; /* Temporary variable */
		135
		136
		137	/* DCT4 computation involves DCT2 (which is calculated using RFFT)
		138	* along with some pre-processing and post-processing.
		139	* Computational procedure is explained as follows:
		140	* (a) Pre-processing involves multiplying input with cos factor,
		141	* r(n) = 2 * u(n) * cos(pi(2n+1)/(4*n))
		142	* where,
		143	* r(n) -- output of preprocessing
		144	* u(n) -- input to preprocessing(actual Source buffer)
		145	* (b) Calculation of DCT2 using FFT is divided into three steps:
		146	* Step1: Re-ordering of even and odd elements of input.
		147	* Step2: Calculating FFT of the re-ordered input.
		148	* Step3: Taking the real part of the product of FFT output and weights.
		149	* (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
		150	* Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
		151	* where,
		152	* Y4 -- DCT4 output, Y2 -- DCT2 output
		153	* (d) Multiplying the output with the normalizing factor sqrt(2/N).
		154	*/
		155
		156	/-------- Pre-processing ------------/
		157	/* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi(2n+1)/(4n)) /
		158	arm_scale_f32(pInlineBuffer, 2.0f, pInlineBuffer, S->N);
		159	arm_mult_f32(pInlineBuffer, cosFact, pInlineBuffer, S->N);
		160
		161	/* ----------------------------------------------------------------
		162	* Step1: Re-ordering of even and odd elements as,
		163	* pState[i] = pInlineBuffer[2*i] and
		164	* pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
		165	---------------------------------------------------------------------*/
		166
		167	/* pS1 initialized to pState */
		168	pS1 = pState;
		169
		170	/* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
		171	pS2 = pState + (S->N - 1U);
		172
		173	/* pbuff initialized to input buffer */
		174	pbuff = pInlineBuffer;
		175
		176	#if defined (ARM_MATH_DSP)
		177
		178	/* Run the below code for Cortex-M4 and Cortex-M3 */
		179
		180	/* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
		181	i = (uint32_t) S->Nby2 >> 2U;
		182
		183	/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
		184	** a second loop below computes the remaining 1 to 3 samples. */
		185	do
		186	{
		187	/* Re-ordering of even and odd elements */
		188	/* pState[i] = pInlineBuffer[2i] /
		189	pS1++ = pbuff++;
		190	/* pState[N-i-1] = pInlineBuffer[2i+1] /
		191	pS2-- = pbuff++;
		192
		193	pS1++ = pbuff++;
		194	pS2-- = pbuff++;
		195
		196	pS1++ = pbuff++;
		197	pS2-- = pbuff++;
		198
		199	pS1++ = pbuff++;
		200	pS2-- = pbuff++;
		201
		202	/* Decrement the loop counter */
		203	i--;
		204	} while (i > 0U);
		205
		206	/* pbuff initialized to input buffer */
		207	pbuff = pInlineBuffer;
		208
		209	/* pS1 initialized to pState */
		210	pS1 = pState;
		211
		212	/* Initializing the loop counter to N/4 instead of N for loop unrolling */
		213	i = (uint32_t) S->N >> 2U;
		214
		215	/* Processing with loop unrolling 4 times as N is always multiple of 4.
		216	* Compute 4 outputs at a time */
		217	do
		218	{
		219	/* Writing the re-ordered output back to inplace input buffer */
		220	pbuff++ = pS1++;
		221	pbuff++ = pS1++;
		222	pbuff++ = pS1++;
		223	pbuff++ = pS1++;
		224
		225	/* Decrement the loop counter */
		226	i--;
		227	} while (i > 0U);
		228
		229
		230	/* ---------------------------------------------------------
		231	* Step2: Calculate RFFT for N-point input
		232	* ---------------------------------------------------------- */
		233	/* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
		234	arm_rfft_f32(S->pRfft, pInlineBuffer, pState);
		235
		236	/*----------------------------------------------------------------------
		237	* Step3: Multiply the FFT output with the weights.
		238	----------------------------------------------------------------------/
		239	arm_cmplx_mult_cmplx_f32(pState, weights, pState, S->N);
		240
		241	/* ----------- Post-processing ---------- */
		242	/* DCT-IV can be obtained from DCT-II by the equation,
		243	* Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
		244	* Hence, Y4(0) = Y2(0)/2 */
		245	/* Getting only real part from the output and Converting to DCT-IV */
		246
		247	/* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
		248	i = ((uint32_t) S->N - 1U) >> 2U;
		249
		250	/* pbuff initialized to input buffer. */
		251	pbuff = pInlineBuffer;
		252
		253	/* pS1 initialized to pState */
		254	pS1 = pState;
		255
		256	/* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
		257	in = pS1++ (float32_t) 0.5;
		258	/* input buffer acts as inplace, so output values are stored in the input itself. */
		259	*pbuff++ = in;
		260
		261	/* pState pointer is incremented twice as the real values are located alternatively in the array */
		262	pS1++;
		263
		264	/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
		265	** a second loop below computes the remaining 1 to 3 samples. */
		266	do
		267	{
		268	/* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
		269	/* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
		270	in = *pS1++ - in;
		271	*pbuff++ = in;
		272	/* points to the next real value */
		273	pS1++;
		274
		275	in = *pS1++ - in;
		276	*pbuff++ = in;
		277	pS1++;
		278
		279	in = *pS1++ - in;
		280	*pbuff++ = in;
		281	pS1++;
		282
		283	in = *pS1++ - in;
		284	*pbuff++ = in;
		285	pS1++;
		286
		287	/* Decrement the loop counter */
		288	i--;
		289	} while (i > 0U);
		290
		291	/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
		292	** No loop unrolling is used. */
		293	i = ((uint32_t) S->N - 1U) % 0x4U;
		294
		295	while (i > 0U)
		296	{
		297	/* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
		298	/* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
		299	in = *pS1++ - in;
		300	*pbuff++ = in;
		301	/* points to the next real value */
		302	pS1++;
		303
		304	/* Decrement the loop counter */
		305	i--;
		306	}
		307
		308
		309	/------------ Normalizing the output by multiplying with the normalizing factor ----------/
		310
		311	/* Initializing the loop counter to N/4 instead of N for loop unrolling */
		312	i = (uint32_t) S->N >> 2U;
		313
		314	/* pbuff initialized to the pInlineBuffer(now contains the output values) */
		315	pbuff = pInlineBuffer;
		316
		317	/* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */
		318	do
		319	{
		320	/* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
		321	in = *pbuff;
		322	pbuff++ = in S->normalize;
		323
		324	in = *pbuff;
		325	pbuff++ = in S->normalize;
		326
		327	in = *pbuff;
		328	pbuff++ = in S->normalize;
		329
		330	in = *pbuff;
		331	pbuff++ = in S->normalize;
		332
		333	/* Decrement the loop counter */
		334	i--;
		335	} while (i > 0U);
		336
		337
		338	#else
		339
		340	/* Run the below code for Cortex-M0 */
		341
		342	/* Initializing the loop counter to N/2 */
		343	i = (uint32_t) S->Nby2;
		344
		345	do
		346	{
		347	/* Re-ordering of even and odd elements */
		348	/* pState[i] = pInlineBuffer[2i] /
		349	pS1++ = pbuff++;
		350	/* pState[N-i-1] = pInlineBuffer[2i+1] /
		351	pS2-- = pbuff++;
		352
		353	/* Decrement the loop counter */
		354	i--;
		355	} while (i > 0U);
		356
		357	/* pbuff initialized to input buffer */
		358	pbuff = pInlineBuffer;
		359
		360	/* pS1 initialized to pState */
		361	pS1 = pState;
		362
		363	/* Initializing the loop counter */
		364	i = (uint32_t) S->N;
		365
		366	do
		367	{
		368	/* Writing the re-ordered output back to inplace input buffer */
		369	pbuff++ = pS1++;
		370
		371	/* Decrement the loop counter */
		372	i--;
		373	} while (i > 0U);
		374
		375
		376	/* ---------------------------------------------------------
		377	* Step2: Calculate RFFT for N-point input
		378	* ---------------------------------------------------------- */
		379	/* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
		380	arm_rfft_f32(S->pRfft, pInlineBuffer, pState);
		381
		382	/*----------------------------------------------------------------------
		383	* Step3: Multiply the FFT output with the weights.
		384	----------------------------------------------------------------------/
		385	arm_cmplx_mult_cmplx_f32(pState, weights, pState, S->N);
		386
		387	/* ----------- Post-processing ---------- */
		388	/* DCT-IV can be obtained from DCT-II by the equation,
		389	* Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
		390	* Hence, Y4(0) = Y2(0)/2 */
		391	/* Getting only real part from the output and Converting to DCT-IV */
		392
		393	/* pbuff initialized to input buffer. */
		394	pbuff = pInlineBuffer;
		395
		396	/* pS1 initialized to pState */
		397	pS1 = pState;
		398
		399	/* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
		400	in = pS1++ (float32_t) 0.5;
		401	/* input buffer acts as inplace, so output values are stored in the input itself. */
		402	*pbuff++ = in;
		403
		404	/* pState pointer is incremented twice as the real values are located alternatively in the array */
		405	pS1++;
		406
		407	/* Initializing the loop counter */
		408	i = ((uint32_t) S->N - 1U);
		409
		410	do
		411	{
		412	/* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
		413	/* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
		414	in = *pS1++ - in;
		415	*pbuff++ = in;
		416	/* points to the next real value */
		417	pS1++;
		418
		419
		420	/* Decrement the loop counter */
		421	i--;
		422	} while (i > 0U);
		423
		424
		425	/------------ Normalizing the output by multiplying with the normalizing factor ----------/
		426
		427	/* Initializing the loop counter */
		428	i = (uint32_t) S->N;
		429
		430	/* pbuff initialized to the pInlineBuffer(now contains the output values) */
		431	pbuff = pInlineBuffer;
		432
		433	do
		434	{
		435	/* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
		436	in = *pbuff;
		437	pbuff++ = in S->normalize;
		438
		439	/* Decrement the loop counter */
		440	i--;
		441	} while (i > 0U);
		442
		443	#endif /* #if defined (ARM_MATH_DSP) */
		444
		445	}
		446
		447	/**
		448	* @} end of DCT4_IDCT4 group
		449	*/

Subversion Repositories AFRtranscoder

(root)/trunk/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c – Rev 2