WebSVN – LedShow – Blame – /trunk/Drivers/CMSIS/DSP_Lib/Source/TransformFunctions/arm_dct4_f32.c

Rev	Author	Line No.	Line
2	mjames	1	/* ----------------------------------------------------------------------
		2	* Copyright (C) 2010-2014 ARM Limited. All rights reserved.
		3	*
		4	* $Date: 19. March 2015
		5	* $Revision: V.1.4.5
		6	*
		7	* Project: CMSIS DSP Library
		8	* Title: arm_dct4_f32.c
		9	*
		10	* Description: Processing function of DCT4 & IDCT4 F32.
		11	*
		12	* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
		13	*
		14	* Redistribution and use in source and binary forms, with or without
		15	* modification, are permitted provided that the following conditions
		16	* are met:
		17	* - Redistributions of source code must retain the above copyright
		18	* notice, this list of conditions and the following disclaimer.
		19	* - Redistributions in binary form must reproduce the above copyright
		20	* notice, this list of conditions and the following disclaimer in
		21	* the documentation and/or other materials provided with the
		22	* distribution.
		23	* - Neither the name of ARM LIMITED nor the names of its contributors
		24	* may be used to endorse or promote products derived from this
		25	* software without specific prior written permission.
		26	*
		27	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
		28	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
		29	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
		30	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
		31	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
		32	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
		33	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
		34	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
		35	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
		36	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
		37	* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
		38	* POSSIBILITY OF SUCH DAMAGE.
		39	* -------------------------------------------------------------------- */
		40
		41	#include "arm_math.h"
		42
		43	/**
		44	* @ingroup groupTransforms
		45	*/
		46
		47	/**
		48	* @defgroup DCT4_IDCT4 DCT Type IV Functions
		49	* Representation of signals by minimum number of values is important for storage and transmission.
		50	* The possibility of large discontinuity between the beginning and end of a period of a signal
		51	* in DFT can be avoided by extending the signal so that it is even-symmetric.
		52	* Discrete Cosine Transform (DCT) is constructed such that its energy is heavily concentrated in the lower part of the
		53	* spectrum and is very widely used in signal and image coding applications.
		54	* The family of DCTs (DCT type- 1,2,3,4) is the outcome of different combinations of homogeneous boundary conditions.
		55	* DCT has an excellent energy-packing capability, hence has many applications and in data compression in particular.
		56	*
		57	* DCT is essentially the Discrete Fourier Transform(DFT) of an even-extended real signal.
		58	* Reordering of the input data makes the computation of DCT just a problem of
		59	* computing the DFT of a real signal with a few additional operations.
		60	* This approach provides regular, simple, and very efficient DCT algorithms for practical hardware and software implementations.
		61	*
		62	* DCT type-II can be implemented using Fast fourier transform (FFT) internally, as the transform is applied on real values, Real FFT can be used.
		63	* DCT4 is implemented using DCT2 as their implementations are similar except with some added pre-processing and post-processing.
		64	* DCT2 implementation can be described in the following steps:
		65	* - Re-ordering input
		66	* - Calculating Real FFT
		67	* - Multiplication of weights and Real FFT output and getting real part from the product.
		68	*
		69	* This process is explained by the block diagram below:
		70	* \image html DCT4.gif "Discrete Cosine Transform - type-IV"
		71	*
		72	* \par Algorithm:
		73	* The N-point type-IV DCT is defined as a real, linear transformation by the formula:
		74	* \image html DCT4Equation.gif
		75	* where <code>k = 0,1,2,.....N-1</code>
		76	*\par
		77	* Its inverse is defined as follows:
		78	* \image html IDCT4Equation.gif
		79	* where <code>n = 0,1,2,.....N-1</code>
		80	*\par
		81	* The DCT4 matrices become involutory (i.e. they are self-inverse) by multiplying with an overall scale factor of sqrt(2/N).
		82	* The symmetry of the transform matrix indicates that the fast algorithms for the forward
		83	* and inverse transform computation are identical.
		84	* Note that the implementation of Inverse DCT4 and DCT4 is same, hence same process function can be used for both.
		85	*
		86	* \par Lengths supported by the transform:
		87	* As DCT4 internally uses Real FFT, it supports all the lengths supported by arm_rfft_f32().
		88	* The library provides separate functions for Q15, Q31, and floating-point data types.
		89	* \par Instance Structure
		90	* The instances for Real FFT and FFT, cosine values table and twiddle factor table are stored in an instance data structure.
		91	* A separate instance structure must be defined for each transform.
		92	* There are separate instance structure declarations for each of the 3 supported data types.
		93	*
		94	* \par Initialization Functions
		95	* There is also an associated initialization function for each data type.
		96	* The initialization function performs the following operations:
		97	* - Sets the values of the internal structure fields.
		98	* - Initializes Real FFT as its process function is used internally in DCT4, by calling arm_rfft_init_f32().
		99	* \par
		100	* Use of the initialization function is optional.
		101	* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
		102	* To place an instance structure into a const data section, the instance structure must be manually initialized.
		103	* Manually initialize the instance structure as follows:
		104	* <pre>
		105	*arm_dct4_instance_f32 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
		106	*arm_dct4_instance_q31 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
		107	*arm_dct4_instance_q15 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
		108	* </pre>
		109	* where \c N is the length of the DCT4; \c Nby2 is half of the length of the DCT4;
		110	* \c normalize is normalizing factor used and is equal to <code>sqrt(2/N)</code>;
		111	* \c pTwiddle points to the twiddle factor table;
		112	* \c pCosFactor points to the cosFactor table;
		113	* \c pRfft points to the real FFT instance;
		114	* \c pCfft points to the complex FFT instance;
		115	* The CFFT and RFFT structures also needs to be initialized, refer to arm_cfft_radix4_f32()
		116	* and arm_rfft_f32() respectively for details regarding static initialization.
		117	*
		118	* \par Fixed-Point Behavior
		119	* Care must be taken when using the fixed-point versions of the DCT4 transform functions.
		120	* In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
		121	* Refer to the function specific documentation below for usage guidelines.
		122	*/
		123
		124	/**
		125	* @addtogroup DCT4_IDCT4
		126	* @{
		127	*/
		128
		129	/**
		130	* @brief Processing function for the floating-point DCT4/IDCT4.
		131	* @param[in] *S points to an instance of the floating-point DCT4/IDCT4 structure.
		132	* @param[in] *pState points to state buffer.
		133	* @param[in,out] *pInlineBuffer points to the in-place input and output buffer.
		134	* @return none.
		135	*/
		136
		137	void arm_dct4_f32(
		138	const arm_dct4_instance_f32 * S,
		139	float32_t * pState,
		140	float32_t * pInlineBuffer)
		141	{
		142	uint32_t i; /* Loop counter */
		143	float32_t weights = S->pTwiddle; / Pointer to the Weights table */
		144	float32_t cosFact = S->pCosFactor; / Pointer to the cos factors table */
		145	float32_t pS1, pS2, pbuff; / Temporary pointers for input buffer and pState buffer */
		146	float32_t in; /* Temporary variable */
		147
		148
		149	/* DCT4 computation involves DCT2 (which is calculated using RFFT)
		150	* along with some pre-processing and post-processing.
		151	* Computational procedure is explained as follows:
		152	* (a) Pre-processing involves multiplying input with cos factor,
		153	* r(n) = 2 * u(n) * cos(pi(2n+1)/(4*n))
		154	* where,
		155	* r(n) -- output of preprocessing
		156	* u(n) -- input to preprocessing(actual Source buffer)
		157	* (b) Calculation of DCT2 using FFT is divided into three steps:
		158	* Step1: Re-ordering of even and odd elements of input.
		159	* Step2: Calculating FFT of the re-ordered input.
		160	* Step3: Taking the real part of the product of FFT output and weights.
		161	* (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
		162	* Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
		163	* where,
		164	* Y4 -- DCT4 output, Y2 -- DCT2 output
		165	* (d) Multiplying the output with the normalizing factor sqrt(2/N).
		166	*/
		167
		168	/-------- Pre-processing ------------/
		169	/* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi(2n+1)/(4n)) /
		170	arm_scale_f32(pInlineBuffer, 2.0f, pInlineBuffer, S->N);
		171	arm_mult_f32(pInlineBuffer, cosFact, pInlineBuffer, S->N);
		172
		173	/* ----------------------------------------------------------------
		174	* Step1: Re-ordering of even and odd elements as,
		175	* pState[i] = pInlineBuffer[2*i] and
		176	* pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
		177	---------------------------------------------------------------------*/
		178
		179	/* pS1 initialized to pState */
		180	pS1 = pState;
		181
		182	/* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
		183	pS2 = pState + (S->N - 1u);
		184
		185	/* pbuff initialized to input buffer */
		186	pbuff = pInlineBuffer;
		187
		188	#ifndef ARM_MATH_CM0_FAMILY
		189
		190	/* Run the below code for Cortex-M4 and Cortex-M3 */
		191
		192	/* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
		193	i = (uint32_t) S->Nby2 >> 2u;
		194
		195	/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
		196	** a second loop below computes the remaining 1 to 3 samples. */
		197	do
		198	{
		199	/* Re-ordering of even and odd elements */
		200	/* pState[i] = pInlineBuffer[2i] /
		201	pS1++ = pbuff++;
		202	/* pState[N-i-1] = pInlineBuffer[2i+1] /
		203	pS2-- = pbuff++;
		204
		205	pS1++ = pbuff++;
		206	pS2-- = pbuff++;
		207
		208	pS1++ = pbuff++;
		209	pS2-- = pbuff++;
		210
		211	pS1++ = pbuff++;
		212	pS2-- = pbuff++;
		213
		214	/* Decrement the loop counter */
		215	i--;
		216	} while(i > 0u);
		217
		218	/* pbuff initialized to input buffer */
		219	pbuff = pInlineBuffer;
		220
		221	/* pS1 initialized to pState */
		222	pS1 = pState;
		223
		224	/* Initializing the loop counter to N/4 instead of N for loop unrolling */
		225	i = (uint32_t) S->N >> 2u;
		226
		227	/* Processing with loop unrolling 4 times as N is always multiple of 4.
		228	* Compute 4 outputs at a time */
		229	do
		230	{
		231	/* Writing the re-ordered output back to inplace input buffer */
		232	pbuff++ = pS1++;
		233	pbuff++ = pS1++;
		234	pbuff++ = pS1++;
		235	pbuff++ = pS1++;
		236
		237	/* Decrement the loop counter */
		238	i--;
		239	} while(i > 0u);
		240
		241
		242	/* ---------------------------------------------------------
		243	* Step2: Calculate RFFT for N-point input
		244	* ---------------------------------------------------------- */
		245	/* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
		246	arm_rfft_f32(S->pRfft, pInlineBuffer, pState);
		247
		248	/*----------------------------------------------------------------------
		249	* Step3: Multiply the FFT output with the weights.
		250	----------------------------------------------------------------------/
		251	arm_cmplx_mult_cmplx_f32(pState, weights, pState, S->N);
		252
		253	/* ----------- Post-processing ---------- */
		254	/* DCT-IV can be obtained from DCT-II by the equation,
		255	* Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
		256	* Hence, Y4(0) = Y2(0)/2 */
		257	/* Getting only real part from the output and Converting to DCT-IV */
		258
		259	/* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
		260	i = ((uint32_t) S->N - 1u) >> 2u;
		261
		262	/* pbuff initialized to input buffer. */
		263	pbuff = pInlineBuffer;
		264
		265	/* pS1 initialized to pState */
		266	pS1 = pState;
		267
		268	/* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
		269	in = pS1++ (float32_t) 0.5;
		270	/* input buffer acts as inplace, so output values are stored in the input itself. */
		271	*pbuff++ = in;
		272
		273	/* pState pointer is incremented twice as the real values are located alternatively in the array */
		274	pS1++;
		275
		276	/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
		277	** a second loop below computes the remaining 1 to 3 samples. */
		278	do
		279	{
		280	/* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
		281	/* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
		282	in = *pS1++ - in;
		283	*pbuff++ = in;
		284	/* points to the next real value */
		285	pS1++;
		286
		287	in = *pS1++ - in;
		288	*pbuff++ = in;
		289	pS1++;
		290
		291	in = *pS1++ - in;
		292	*pbuff++ = in;
		293	pS1++;
		294
		295	in = *pS1++ - in;
		296	*pbuff++ = in;
		297	pS1++;
		298
		299	/* Decrement the loop counter */
		300	i--;
		301	} while(i > 0u);
		302
		303	/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
		304	** No loop unrolling is used. */
		305	i = ((uint32_t) S->N - 1u) % 0x4u;
		306
		307	while(i > 0u)
		308	{
		309	/* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
		310	/* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
		311	in = *pS1++ - in;
		312	*pbuff++ = in;
		313	/* points to the next real value */
		314	pS1++;
		315
		316	/* Decrement the loop counter */
		317	i--;
		318	}
		319
		320
		321	/------------ Normalizing the output by multiplying with the normalizing factor ----------/
		322
		323	/* Initializing the loop counter to N/4 instead of N for loop unrolling */
		324	i = (uint32_t) S->N >> 2u;
		325
		326	/* pbuff initialized to the pInlineBuffer(now contains the output values) */
		327	pbuff = pInlineBuffer;
		328
		329	/* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */
		330	do
		331	{
		332	/* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
		333	in = *pbuff;
		334	pbuff++ = in S->normalize;
		335
		336	in = *pbuff;
		337	pbuff++ = in S->normalize;
		338
		339	in = *pbuff;
		340	pbuff++ = in S->normalize;
		341
		342	in = *pbuff;
		343	pbuff++ = in S->normalize;
		344
		345	/* Decrement the loop counter */
		346	i--;
		347	} while(i > 0u);
		348
		349
		350	#else
		351
		352	/* Run the below code for Cortex-M0 */
		353
		354	/* Initializing the loop counter to N/2 */
		355	i = (uint32_t) S->Nby2;
		356
		357	do
		358	{
		359	/* Re-ordering of even and odd elements */
		360	/* pState[i] = pInlineBuffer[2i] /
		361	pS1++ = pbuff++;
		362	/* pState[N-i-1] = pInlineBuffer[2i+1] /
		363	pS2-- = pbuff++;
		364
		365	/* Decrement the loop counter */
		366	i--;
		367	} while(i > 0u);
		368
		369	/* pbuff initialized to input buffer */
		370	pbuff = pInlineBuffer;
		371
		372	/* pS1 initialized to pState */
		373	pS1 = pState;
		374
		375	/* Initializing the loop counter */
		376	i = (uint32_t) S->N;
		377
		378	do
		379	{
		380	/* Writing the re-ordered output back to inplace input buffer */
		381	pbuff++ = pS1++;
		382
		383	/* Decrement the loop counter */
		384	i--;
		385	} while(i > 0u);
		386
		387
		388	/* ---------------------------------------------------------
		389	* Step2: Calculate RFFT for N-point input
		390	* ---------------------------------------------------------- */
		391	/* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
		392	arm_rfft_f32(S->pRfft, pInlineBuffer, pState);
		393
		394	/*----------------------------------------------------------------------
		395	* Step3: Multiply the FFT output with the weights.
		396	----------------------------------------------------------------------/
		397	arm_cmplx_mult_cmplx_f32(pState, weights, pState, S->N);
		398
		399	/* ----------- Post-processing ---------- */
		400	/* DCT-IV can be obtained from DCT-II by the equation,
		401	* Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
		402	* Hence, Y4(0) = Y2(0)/2 */
		403	/* Getting only real part from the output and Converting to DCT-IV */
		404
		405	/* pbuff initialized to input buffer. */
		406	pbuff = pInlineBuffer;
		407
		408	/* pS1 initialized to pState */
		409	pS1 = pState;
		410
		411	/* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
		412	in = pS1++ (float32_t) 0.5;
		413	/* input buffer acts as inplace, so output values are stored in the input itself. */
		414	*pbuff++ = in;
		415
		416	/* pState pointer is incremented twice as the real values are located alternatively in the array */
		417	pS1++;
		418
		419	/* Initializing the loop counter */
		420	i = ((uint32_t) S->N - 1u);
		421
		422	do
		423	{
		424	/* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
		425	/* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
		426	in = *pS1++ - in;
		427	*pbuff++ = in;
		428	/* points to the next real value */
		429	pS1++;
		430
		431
		432	/* Decrement the loop counter */
		433	i--;
		434	} while(i > 0u);
		435
		436
		437	/------------ Normalizing the output by multiplying with the normalizing factor ----------/
		438
		439	/* Initializing the loop counter */
		440	i = (uint32_t) S->N;
		441
		442	/* pbuff initialized to the pInlineBuffer(now contains the output values) */
		443	pbuff = pInlineBuffer;
		444
		445	do
		446	{
		447	/* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
		448	in = *pbuff;
		449	pbuff++ = in S->normalize;
		450
		451	/* Decrement the loop counter */
		452	i--;
		453	} while(i > 0u);
		454
		455	#endif /* #ifndef ARM_MATH_CM0_FAMILY */
		456
		457	}
		458
		459	/**
		460	* @} end of DCT4_IDCT4 group
		461	*/

Subversion Repositories LedShow

(root)/trunk/Drivers/CMSIS/DSP_Lib/Source/TransformFunctions/arm_dct4_f32.c – Rev 2