WebSVN – dashGPS – Blame – /branches/dashGPS-bmp/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c

Rev	Author	Line No.	Line
2	mjames	1	/* ----------------------------------------------------------------------
		2	* Project: CMSIS DSP Library
		3	* Title: arm_mat_cmplx_mult_q31.c
		4	* Description: Q31 complex matrix multiplication
		5	*
		6	* $Date: 27. January 2017
		7	* $Revision: V.1.5.1
		8	*
		9	* Target Processor: Cortex-M cores
		10	* -------------------------------------------------------------------- */
		11	/*
		12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
		13	*
		14	* SPDX-License-Identifier: Apache-2.0
		15	*
		16	* Licensed under the Apache License, Version 2.0 (the License); you may
		17	* not use this file except in compliance with the License.
		18	* You may obtain a copy of the License at
		19	*
		20	* www.apache.org/licenses/LICENSE-2.0
		21	*
		22	* Unless required by applicable law or agreed to in writing, software
		23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		25	* See the License for the specific language governing permissions and
		26	* limitations under the License.
		27	*/
		28
		29	#include "arm_math.h"
		30
		31	/**
		32	* @ingroup groupMatrix
		33	*/
		34
		35	/**
		36	* @addtogroup CmplxMatrixMult
		37	* @{
		38	*/
		39
		40	/**
		41	* @brief Q31 Complex matrix multiplication
		42	* @param[in] *pSrcA points to the first input complex matrix structure
		43	* @param[in] *pSrcB points to the second input complex matrix structure
		44	* @param[out] *pDst points to output complex matrix structure
		45	* @return The function returns either
		46	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		47	*
		48	* @details
		49	* <b>Scaling and Overflow Behavior:</b>
		50	*
		51	* \par
		52	* The function is implemented using an internal 64-bit accumulator.
		53	* The accumulator has a 2.62 format and maintains full precision of the intermediate
		54	* multiplication results but provides only a single guard bit. There is no saturation
		55	* on intermediate additions. Thus, if the accumulator overflows it wraps around and
		56	* distorts the result. The input signals should be scaled down to avoid intermediate
		57	* overflows. The input is thus scaled down by log2(numColsA) bits
		58	* to avoid overflows, as a total of numColsA additions are performed internally.
		59	* The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
		60	*
		61	*
		62	*/
		63
		64	arm_status arm_mat_cmplx_mult_q31(
		65	const arm_matrix_instance_q31 * pSrcA,
		66	const arm_matrix_instance_q31 * pSrcB,
		67	arm_matrix_instance_q31 * pDst)
		68	{
		69	q31_t pIn1 = pSrcA->pData; / input data matrix pointer A */
		70	q31_t pIn2 = pSrcB->pData; / input data matrix pointer B */
		71	q31_t pInA = pSrcA->pData; / input data matrix pointer A */
		72	q31_t pOut = pDst->pData; / output data matrix pointer */
		73	q31_t px; / Temporary output data matrix pointer */
		74	uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
		75	uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
		76	uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
		77	q63_t sumReal1, sumImag1; /* accumulator */
		78	q31_t a0, b0, c0, d0;
		79	q31_t a1, b1, c1, d1;
		80
		81
		82	/* Run the below code for Cortex-M4 and Cortex-M3 */
		83
		84	uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
		85	arm_status status; /* status of matrix multiplication */
		86
		87	#ifdef ARM_MATH_MATRIX_CHECK
		88
		89
		90	/* Check for matrix mismatch condition */
		91	if ((pSrcA->numCols != pSrcB->numRows) \|\|
		92	(pSrcA->numRows != pDst->numRows) \|\| (pSrcB->numCols != pDst->numCols))
		93	{
		94
		95	/* Set status as ARM_MATH_SIZE_MISMATCH */
		96	status = ARM_MATH_SIZE_MISMATCH;
		97	}
		98	else
		99	#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
		100
		101	{
		102	/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
		103	/* row loop */
		104	do
		105	{
		106	/* Output pointer is set to starting address of the row being processed */
		107	px = pOut + 2 * i;
		108
		109	/* For every row wise process, the column loop counter is to be initiated */
		110	col = numColsB;
		111
		112	/* For every row wise process, the pIn2 pointer is set
		113	** to the starting address of the pSrcB data */
		114	pIn2 = pSrcB->pData;
		115
		116	j = 0U;
		117
		118	/* column loop */
		119	do
		120	{
		121	/* Set the variable sum, that acts as accumulator, to zero */
		122	sumReal1 = 0.0;
		123	sumImag1 = 0.0;
		124
		125	/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
		126	pIn1 = pInA;
		127
		128	/* Apply loop unrolling and compute 4 MACs simultaneously. */
		129	colCnt = numColsA >> 2;
		130
		131	/* matrix multiplication */
		132	while (colCnt > 0U)
		133	{
		134
		135	/* Reading real part of complex matrix A */
		136	a0 = *pIn1;
		137
		138	/* Reading real part of complex matrix B */
		139	c0 = *pIn2;
		140
		141	/* Reading imaginary part of complex matrix A */
		142	b0 = *(pIn1 + 1U);
		143
		144	/* Reading imaginary part of complex matrix B */
		145	d0 = *(pIn2 + 1U);
		146
		147	/* Multiply and Accumlates */
		148	sumReal1 += (q63_t) a0 *c0;
		149	sumImag1 += (q63_t) b0 *c0;
		150
		151	/* update pointers */
		152	pIn1 += 2U;
		153	pIn2 += 2 * numColsB;
		154
		155	/* Multiply and Accumlates */
		156	sumReal1 -= (q63_t) b0 *d0;
		157	sumImag1 += (q63_t) a0 *d0;
		158
		159	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
		160
		161	/* read real and imag values from pSrcA and pSrcB buffer */
		162	a1 = *pIn1;
		163	c1 = *pIn2;
		164	b1 = *(pIn1 + 1U);
		165	d1 = *(pIn2 + 1U);
		166
		167	/* Multiply and Accumlates */
		168	sumReal1 += (q63_t) a1 *c1;
		169	sumImag1 += (q63_t) b1 *c1;
		170
		171	/* update pointers */
		172	pIn1 += 2U;
		173	pIn2 += 2 * numColsB;
		174
		175	/* Multiply and Accumlates */
		176	sumReal1 -= (q63_t) b1 *d1;
		177	sumImag1 += (q63_t) a1 *d1;
		178
		179	a0 = *pIn1;
		180	c0 = *pIn2;
		181
		182	b0 = *(pIn1 + 1U);
		183	d0 = *(pIn2 + 1U);
		184
		185	/* Multiply and Accumlates */
		186	sumReal1 += (q63_t) a0 *c0;
		187	sumImag1 += (q63_t) b0 *c0;
		188
		189	/* update pointers */
		190	pIn1 += 2U;
		191	pIn2 += 2 * numColsB;
		192
		193	/* Multiply and Accumlates */
		194	sumReal1 -= (q63_t) b0 *d0;
		195	sumImag1 += (q63_t) a0 *d0;
		196
		197	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
		198
		199	a1 = *pIn1;
		200	c1 = *pIn2;
		201
		202	b1 = *(pIn1 + 1U);
		203	d1 = *(pIn2 + 1U);
		204
		205	/* Multiply and Accumlates */
		206	sumReal1 += (q63_t) a1 *c1;
		207	sumImag1 += (q63_t) b1 *c1;
		208
		209	/* update pointers */
		210	pIn1 += 2U;
		211	pIn2 += 2 * numColsB;
		212
		213	/* Multiply and Accumlates */
		214	sumReal1 -= (q63_t) b1 *d1;
		215	sumImag1 += (q63_t) a1 *d1;
		216
		217	/* Decrement the loop count */
		218	colCnt--;
		219	}
		220
		221	/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
		222	** No loop unrolling is used. */
		223	colCnt = numColsA % 0x4U;
		224
		225	while (colCnt > 0U)
		226	{
		227	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
		228	a1 = *pIn1;
		229	c1 = *pIn2;
		230
		231	b1 = *(pIn1 + 1U);
		232	d1 = *(pIn2 + 1U);
		233
		234	/* Multiply and Accumlates */
		235	sumReal1 += (q63_t) a1 *c1;
		236	sumImag1 += (q63_t) b1 *c1;
		237
		238	/* update pointers */
		239	pIn1 += 2U;
		240	pIn2 += 2 * numColsB;
		241
		242	/* Multiply and Accumlates */
		243	sumReal1 -= (q63_t) b1 *d1;
		244	sumImag1 += (q63_t) a1 *d1;
		245
		246	/* Decrement the loop counter */
		247	colCnt--;
		248	}
		249
		250	/* Store the result in the destination buffer */
		251	*px++ = (q31_t) clip_q63_to_q31(sumReal1 >> 31);
		252	*px++ = (q31_t) clip_q63_to_q31(sumImag1 >> 31);
		253
		254	/* Update the pointer pIn2 to point to the starting address of the next column */
		255	j++;
		256	pIn2 = pSrcB->pData + 2U * j;
		257
		258	/* Decrement the column loop counter */
		259	col--;
		260
		261	} while (col > 0U);
		262
		263	/* Update the pointer pInA to point to the starting address of the next row */
		264	i = i + numColsB;
		265	pInA = pInA + 2 * numColsA;
		266
		267	/* Decrement the row loop counter */
		268	row--;
		269
		270	} while (row > 0U);
		271
		272	/* Set status as ARM_MATH_SUCCESS */
		273	status = ARM_MATH_SUCCESS;
		274	}
		275
		276	/* Return to application */
		277	return (status);
		278	}
		279
		280	/**
		281	* @} end of CmplxMatrixMult group
		282	*/

Subversion Repositories dashGPS

(root)/branches/dashGPS-bmp/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c – Rev 18