WebSVN – dashGPS – Blame – /branches/dashGPS-bmp/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c

Rev	Author	Line No.	Line
2	mjames	1	/* ----------------------------------------------------------------------
		2	* Project: CMSIS DSP Library
		3	* Title: arm_cmplx_mat_mult_q15.c
		4	* Description: Q15 complex matrix multiplication
		5	*
		6	* $Date: 27. January 2017
		7	* $Revision: V.1.5.1
		8	*
		9	* Target Processor: Cortex-M cores
		10	* -------------------------------------------------------------------- */
		11	/*
		12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
		13	*
		14	* SPDX-License-Identifier: Apache-2.0
		15	*
		16	* Licensed under the Apache License, Version 2.0 (the License); you may
		17	* not use this file except in compliance with the License.
		18	* You may obtain a copy of the License at
		19	*
		20	* www.apache.org/licenses/LICENSE-2.0
		21	*
		22	* Unless required by applicable law or agreed to in writing, software
		23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		25	* See the License for the specific language governing permissions and
		26	* limitations under the License.
		27	*/
		28
		29	#include "arm_math.h"
		30
		31	/**
		32	* @ingroup groupMatrix
		33	*/
		34
		35	/**
		36	* @addtogroup CmplxMatrixMult
		37	* @{
		38	*/
		39
		40
		41	/**
		42	* @brief Q15 Complex matrix multiplication
		43	* @param[in] *pSrcA points to the first input complex matrix structure
		44	* @param[in] *pSrcB points to the second input complex matrix structure
		45	* @param[out] *pDst points to output complex matrix structure
		46	* @param[in] *pScratch points to the array for storing intermediate results
		47	* @return The function returns either
		48	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		49	*
		50	* \par Conditions for optimum performance
		51	* Input, output and state buffers should be aligned by 32-bit
		52	*
		53	* \par Restrictions
		54	* If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
		55	* In this case input, output, scratch buffers should be aligned by 32-bit
		56	*
		57	* @details
		58	* <b>Scaling and Overflow Behavior:</b>
		59	*
		60	* \par
		61	* The function is implemented using a 64-bit internal accumulator. The inputs to the
		62	* multiplications are in 1.15 format and multiplications yield a 2.30 result.
		63	* The 2.30 intermediate
		64	* results are accumulated in a 64-bit accumulator in 34.30 format. This approach
		65	* provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
		66	* truncated to 34.15 format by discarding the low 15 bits and then saturated to
		67	* 1.15 format.
		68	*
		69	* \par
		70	* Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function.
		71	*
		72	*/
		73
		74
		75
		76
		77	arm_status arm_mat_cmplx_mult_q15(
		78	const arm_matrix_instance_q15 * pSrcA,
		79	const arm_matrix_instance_q15 * pSrcB,
		80	arm_matrix_instance_q15 * pDst,
		81	q15_t * pScratch)
		82	{
		83	/* accumulator */
		84	q15_t pSrcBT = pScratch; / input data matrix pointer for transpose */
		85	q15_t pInA = pSrcA->pData; / input data matrix pointer A of Q15 type */
		86	q15_t pInB = pSrcB->pData; / input data matrix pointer B of Q15 type */
		87	q15_t px; / Temporary output data matrix pointer */
		88	uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
		89	uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
		90	uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
		91	uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
		92	uint16_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
		93	arm_status status; /* status of matrix multiplication */
		94	q63_t sumReal, sumImag;
		95
		96	#ifdef UNALIGNED_SUPPORT_DISABLE
		97	q15_t in; /* Temporary variable to hold the input value */
		98	q15_t a, b, c, d;
		99	#else
		100	q31_t in; /* Temporary variable to hold the input value */
		101	q31_t prod1, prod2;
		102	q31_t pSourceA, pSourceB;
		103	#endif
		104
		105	#ifdef ARM_MATH_MATRIX_CHECK
		106	/* Check for matrix mismatch condition */
		107	if ((pSrcA->numCols != pSrcB->numRows) \|\|
		108	(pSrcA->numRows != pDst->numRows) \|\| (pSrcB->numCols != pDst->numCols))
		109	{
		110	/* Set status as ARM_MATH_SIZE_MISMATCH */
		111	status = ARM_MATH_SIZE_MISMATCH;
		112	}
		113	else
		114	#endif
		115	{
		116	/* Matrix transpose */
		117	do
		118	{
		119	/* Apply loop unrolling and exchange the columns with row elements */
		120	col = numColsB >> 2;
		121
		122	/* The pointer px is set to starting address of the column being processed */
		123	px = pSrcBT + i;
		124
		125	/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
		126	** a second loop below computes the remaining 1 to 3 samples. */
		127	while (col > 0U)
		128	{
		129	#ifdef UNALIGNED_SUPPORT_DISABLE
		130	/* Read two elements from the row */
		131	in = *pInB++;
		132	*px = in;
		133	in = *pInB++;
		134	px[1] = in;
		135
		136	/* Update the pointer px to point to the next row of the transposed matrix */
		137	px += numRowsB * 2;
		138
		139	/* Read two elements from the row */
		140	in = *pInB++;
		141	*px = in;
		142	in = *pInB++;
		143	px[1] = in;
		144
		145	/* Update the pointer px to point to the next row of the transposed matrix */
		146	px += numRowsB * 2;
		147
		148	/* Read two elements from the row */
		149	in = *pInB++;
		150	*px = in;
		151	in = *pInB++;
		152	px[1] = in;
		153
		154	/* Update the pointer px to point to the next row of the transposed matrix */
		155	px += numRowsB * 2;
		156
		157	/* Read two elements from the row */
		158	in = *pInB++;
		159	*px = in;
		160	in = *pInB++;
		161	px[1] = in;
		162
		163	/* Update the pointer px to point to the next row of the transposed matrix */
		164	px += numRowsB * 2;
		165
		166	/* Decrement the column loop counter */
		167	col--;
		168	}
		169
		170	/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
		171	** No loop unrolling is used. */
		172	col = numColsB % 0x4U;
		173
		174	while (col > 0U)
		175	{
		176	/* Read two elements from the row */
		177	in = *pInB++;
		178	*px = in;
		179	in = *pInB++;
		180	px[1] = in;
		181	#else
		182
		183	/* Read two elements from the row */
		184	in = *__SIMD32(pInB)++;
		185
		186	*__SIMD32(px) = in;
		187
		188	/* Update the pointer px to point to the next row of the transposed matrix */
		189	px += numRowsB * 2;
		190
		191
		192	/* Read two elements from the row */
		193	in = *__SIMD32(pInB)++;
		194
		195	*__SIMD32(px) = in;
		196
		197	/* Update the pointer px to point to the next row of the transposed matrix */
		198	px += numRowsB * 2;
		199
		200	/* Read two elements from the row */
		201	in = *__SIMD32(pInB)++;
		202
		203	*__SIMD32(px) = in;
		204
		205	/* Update the pointer px to point to the next row of the transposed matrix */
		206	px += numRowsB * 2;
		207
		208	/* Read two elements from the row */
		209	in = *__SIMD32(pInB)++;
		210
		211	*__SIMD32(px) = in;
		212
		213	/* Update the pointer px to point to the next row of the transposed matrix */
		214	px += numRowsB * 2;
		215
		216	/* Decrement the column loop counter */
		217	col--;
		218	}
		219
		220	/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
		221	** No loop unrolling is used. */
		222	col = numColsB % 0x4U;
		223
		224	while (col > 0U)
		225	{
		226	/* Read two elements from the row */
		227	in = *__SIMD32(pInB)++;
		228
		229	*__SIMD32(px) = in;
		230	#endif
		231
		232	/* Update the pointer px to point to the next row of the transposed matrix */
		233	px += numRowsB * 2;
		234
		235	/* Decrement the column loop counter */
		236	col--;
		237	}
		238
		239	i = i + 2U;
		240
		241	/* Decrement the row loop counter */
		242	row--;
		243
		244	} while (row > 0U);
		245
		246	/* Reset the variables for the usage in the following multiplication process */
		247	row = numRowsA;
		248	i = 0U;
		249	px = pDst->pData;
		250
		251	/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
		252	/* row loop */
		253	do
		254	{
		255	/* For every row wise process, the column loop counter is to be initiated */
		256	col = numColsB;
		257
		258	/* For every row wise process, the pIn2 pointer is set
		259	** to the starting address of the transposed pSrcB data */
		260	pInB = pSrcBT;
		261
		262	/* column loop */
		263	do
		264	{
		265	/* Set the variable sum, that acts as accumulator, to zero */
		266	sumReal = 0;
		267	sumImag = 0;
		268
		269	/* Apply loop unrolling and compute 2 MACs simultaneously. */
		270	colCnt = numColsA >> 1;
		271
		272	/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
		273	pInA = pSrcA->pData + i * 2;
		274
		275
		276	/* matrix multiplication */
		277	while (colCnt > 0U)
		278	{
		279	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
		280
		281	#ifdef UNALIGNED_SUPPORT_DISABLE
		282
		283	/* read real and imag values from pSrcA buffer */
		284	a = *pInA;
		285	b = *(pInA + 1U);
		286	/* read real and imag values from pSrcB buffer */
		287	c = *pInB;
		288	d = *(pInB + 1U);
		289
		290	/* Multiply and Accumlates */
		291	sumReal += (q31_t) a *c;
		292	sumImag += (q31_t) a *d;
		293	sumReal -= (q31_t) b *d;
		294	sumImag += (q31_t) b *c;
		295
		296	/* read next real and imag values from pSrcA buffer */
		297	a = *(pInA + 2U);
		298	b = *(pInA + 3U);
		299	/* read next real and imag values from pSrcB buffer */
		300	c = *(pInB + 2U);
		301	d = *(pInB + 3U);
		302
		303	/* update pointer */
		304	pInA += 4U;
		305
		306	/* Multiply and Accumlates */
		307	sumReal += (q31_t) a *c;
		308	sumImag += (q31_t) a *d;
		309	sumReal -= (q31_t) b *d;
		310	sumImag += (q31_t) b *c;
		311	/* update pointer */
		312	pInB += 4U;
		313	#else
		314	/* read real and imag values from pSrcA and pSrcB buffer */
		315	pSourceA = *__SIMD32(pInA)++;
		316	pSourceB = *__SIMD32(pInB)++;
		317
		318	/* Multiply and Accumlates */
		319	#ifdef ARM_MATH_BIG_ENDIAN
		320	prod1 = -__SMUSD(pSourceA, pSourceB);
		321	#else
		322	prod1 = __SMUSD(pSourceA, pSourceB);
		323	#endif
		324	prod2 = __SMUADX(pSourceA, pSourceB);
		325	sumReal += (q63_t) prod1;
		326	sumImag += (q63_t) prod2;
		327
		328	/* read real and imag values from pSrcA and pSrcB buffer */
		329	pSourceA = *__SIMD32(pInA)++;
		330	pSourceB = *__SIMD32(pInB)++;
		331
		332	/* Multiply and Accumlates */
		333	#ifdef ARM_MATH_BIG_ENDIAN
		334	prod1 = -__SMUSD(pSourceA, pSourceB);
		335	#else
		336	prod1 = __SMUSD(pSourceA, pSourceB);
		337	#endif
		338	prod2 = __SMUADX(pSourceA, pSourceB);
		339	sumReal += (q63_t) prod1;
		340	sumImag += (q63_t) prod2;
		341
		342	#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
		343
		344	/* Decrement the loop counter */
		345	colCnt--;
		346	}
		347
		348	/* process odd column samples */
		349	if ((numColsA & 0x1U) > 0U)
		350	{
		351	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
		352
		353	#ifdef UNALIGNED_SUPPORT_DISABLE
		354
		355	/* read real and imag values from pSrcA and pSrcB buffer */
		356	a = *pInA++;
		357	b = *pInA++;
		358	c = *pInB++;
		359	d = *pInB++;
		360
		361	/* Multiply and Accumlates */
		362	sumReal += (q31_t) a *c;
		363	sumImag += (q31_t) a *d;
		364	sumReal -= (q31_t) b *d;
		365	sumImag += (q31_t) b *c;
		366
		367	#else
		368	/* read real and imag values from pSrcA and pSrcB buffer */
		369	pSourceA = *__SIMD32(pInA)++;
		370	pSourceB = *__SIMD32(pInB)++;
		371
		372	/* Multiply and Accumlates */
		373	#ifdef ARM_MATH_BIG_ENDIAN
		374	prod1 = -__SMUSD(pSourceA, pSourceB);
		375	#else
		376	prod1 = __SMUSD(pSourceA, pSourceB);
		377	#endif
		378	prod2 = __SMUADX(pSourceA, pSourceB);
		379	sumReal += (q63_t) prod1;
		380	sumImag += (q63_t) prod2;
		381
		382	#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
		383
		384	}
		385
		386	/* Saturate and store the result in the destination buffer */
		387
		388	*px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
		389	*px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
		390
		391	/* Decrement the column loop counter */
		392	col--;
		393
		394	} while (col > 0U);
		395
		396	i = i + numColsA;
		397
		398	/* Decrement the row loop counter */
		399	row--;
		400
		401	} while (row > 0U);
		402
		403	/* set status as ARM_MATH_SUCCESS */
		404	status = ARM_MATH_SUCCESS;
		405	}
		406
		407	/* Return to application */
		408	return (status);
		409	}
		410
		411	/**
		412	* @} end of MatrixMult group
		413	*/

Subversion Repositories dashGPS

(root)/branches/dashGPS-bmp/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c – Rev 18