WebSVN – LedShow – Blame – /trunk/Drivers/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_q31.c

Rev	Author	Line No.	Line
2	mjames	1	/* ----------------------------------------------------------------------
		2	* Copyright (C) 2010-2014 ARM Limited. All rights reserved.
		3	*
		4	* $Date: 19. March 2015
		5	* $Revision: V.1.4.5
		6	*
		7	* Project: CMSIS DSP Library
		8	* Title: arm_mat_mult_q31.c
		9	*
		10	* Description: Q31 matrix multiplication.
		11	*
		12	* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
		13	*
		14	* Redistribution and use in source and binary forms, with or without
		15	* modification, are permitted provided that the following conditions
		16	* are met:
		17	* - Redistributions of source code must retain the above copyright
		18	* notice, this list of conditions and the following disclaimer.
		19	* - Redistributions in binary form must reproduce the above copyright
		20	* notice, this list of conditions and the following disclaimer in
		21	* the documentation and/or other materials provided with the
		22	* distribution.
		23	* - Neither the name of ARM LIMITED nor the names of its contributors
		24	* may be used to endorse or promote products derived from this
		25	* software without specific prior written permission.
		26	*
		27	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
		28	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
		29	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
		30	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
		31	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
		32	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
		33	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
		34	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
		35	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
		36	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
		37	* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
		38	* POSSIBILITY OF SUCH DAMAGE.
		39	* -------------------------------------------------------------------- */
		40
		41	#include "arm_math.h"
		42
		43	/**
		44	* @ingroup groupMatrix
		45	*/
		46
		47	/**
		48	* @addtogroup MatrixMult
		49	* @{
		50	*/
		51
		52	/**
		53	* @brief Q31 matrix multiplication
		54	* @param[in] *pSrcA points to the first input matrix structure
		55	* @param[in] *pSrcB points to the second input matrix structure
		56	* @param[out] *pDst points to output matrix structure
		57	* @return The function returns either
		58	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		59	*
		60	* @details
		61	* <b>Scaling and Overflow Behavior:</b>
		62	*
		63	* \par
		64	* The function is implemented using an internal 64-bit accumulator.
		65	* The accumulator has a 2.62 format and maintains full precision of the intermediate
		66	* multiplication results but provides only a single guard bit. There is no saturation
		67	* on intermediate additions. Thus, if the accumulator overflows it wraps around and
		68	* distorts the result. The input signals should be scaled down to avoid intermediate
		69	* overflows. The input is thus scaled down by log2(numColsA) bits
		70	* to avoid overflows, as a total of numColsA additions are performed internally.
		71	* The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
		72	*
		73	* \par
		74	* See <code>arm_mat_mult_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
		75	*
		76	*/
		77
		78	arm_status arm_mat_mult_q31(
		79	const arm_matrix_instance_q31 * pSrcA,
		80	const arm_matrix_instance_q31 * pSrcB,
		81	arm_matrix_instance_q31 * pDst)
		82	{
		83	q31_t pIn1 = pSrcA->pData; / input data matrix pointer A */
		84	q31_t pIn2 = pSrcB->pData; / input data matrix pointer B */
		85	q31_t pInA = pSrcA->pData; / input data matrix pointer A */
		86	q31_t pOut = pDst->pData; / output data matrix pointer */
		87	q31_t px; / Temporary output data matrix pointer */
		88	q63_t sum; /* Accumulator */
		89	uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
		90	uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
		91	uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
		92
		93	#ifndef ARM_MATH_CM0_FAMILY
		94
		95	/* Run the below code for Cortex-M4 and Cortex-M3 */
		96
		97	uint16_t col, i = 0u, j, row = numRowsA, colCnt; /* loop counters */
		98	arm_status status; /* status of matrix multiplication */
		99	q31_t a0, a1, a2, a3, b0, b1, b2, b3;
		100
		101	#ifdef ARM_MATH_MATRIX_CHECK
		102
		103
		104	/* Check for matrix mismatch condition */
		105	if((pSrcA->numCols != pSrcB->numRows) \|\|
		106	(pSrcA->numRows != pDst->numRows) \|\| (pSrcB->numCols != pDst->numCols))
		107	{
		108	/* Set status as ARM_MATH_SIZE_MISMATCH */
		109	status = ARM_MATH_SIZE_MISMATCH;
		110	}
		111	else
		112	#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
		113
		114	{
		115	/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
		116	/* row loop */
		117	do
		118	{
		119	/* Output pointer is set to starting address of the row being processed */
		120	px = pOut + i;
		121
		122	/* For every row wise process, the column loop counter is to be initiated */
		123	col = numColsB;
		124
		125	/* For every row wise process, the pIn2 pointer is set
		126	** to the starting address of the pSrcB data */
		127	pIn2 = pSrcB->pData;
		128
		129	j = 0u;
		130
		131	/* column loop */
		132	do
		133	{
		134	/* Set the variable sum, that acts as accumulator, to zero */
		135	sum = 0;
		136
		137	/* Initiate the pointer pIn1 to point to the starting address of pInA */
		138	pIn1 = pInA;
		139
		140	/* Apply loop unrolling and compute 4 MACs simultaneously. */
		141	colCnt = numColsA >> 2;
		142
		143
		144	/* matrix multiplication */
		145	while(colCnt > 0u)
		146	{
		147	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
		148	/* Perform the multiply-accumulates */
		149	b0 = *pIn2;
		150	pIn2 += numColsB;
		151
		152	a0 = *pIn1++;
		153	a1 = *pIn1++;
		154
		155	b1 = *pIn2;
		156	pIn2 += numColsB;
		157	b2 = *pIn2;
		158	pIn2 += numColsB;
		159
		160	sum += (q63_t) a0 *b0;
		161	sum += (q63_t) a1 *b1;
		162
		163	a2 = *pIn1++;
		164	a3 = *pIn1++;
		165
		166	b3 = *pIn2;
		167	pIn2 += numColsB;
		168
		169	sum += (q63_t) a2 *b2;
		170	sum += (q63_t) a3 *b3;
		171
		172	/* Decrement the loop counter */
		173	colCnt--;
		174	}
		175
		176	/* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here.
		177	** No loop unrolling is used. */
		178	colCnt = numColsA % 0x4u;
		179
		180	while(colCnt > 0u)
		181	{
		182	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
		183	/* Perform the multiply-accumulates */
		184	sum += (q63_t) * pIn1++ * *pIn2;
		185	pIn2 += numColsB;
		186
		187	/* Decrement the loop counter */
		188	colCnt--;
		189	}
		190
		191	/* Convert the result from 2.62 to 1.31 format and store in destination buffer */
		192	*px++ = (q31_t) (sum >> 31);
		193
		194	/* Update the pointer pIn2 to point to the starting address of the next column */
		195	j++;
		196	pIn2 = (pSrcB->pData) + j;
		197
		198	/* Decrement the column loop counter */
		199	col--;
		200
		201	} while(col > 0u);
		202
		203	#else
		204
		205	/* Run the below code for Cortex-M0 */
		206
		207	q31_t pInB = pSrcB->pData; / input data matrix pointer B */
		208	uint16_t col, i = 0u, row = numRowsA, colCnt; /* loop counters */
		209	arm_status status; /* status of matrix multiplication */
		210
		211
		212	#ifdef ARM_MATH_MATRIX_CHECK
		213
		214	/* Check for matrix mismatch condition */
		215	if((pSrcA->numCols != pSrcB->numRows) \|\|
		216	(pSrcA->numRows != pDst->numRows) \|\| (pSrcB->numCols != pDst->numCols))
		217	{
		218	/* Set status as ARM_MATH_SIZE_MISMATCH */
		219	status = ARM_MATH_SIZE_MISMATCH;
		220	}
		221	else
		222	#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
		223
		224	{
		225	/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
		226	/* row loop */
		227	do
		228	{
		229	/* Output pointer is set to starting address of the row being processed */
		230	px = pOut + i;
		231
		232	/* For every row wise process, the column loop counter is to be initiated */
		233	col = numColsB;
		234
		235	/* For every row wise process, the pIn2 pointer is set
		236	** to the starting address of the pSrcB data */
		237	pIn2 = pSrcB->pData;
		238
		239	/* column loop */
		240	do
		241	{
		242	/* Set the variable sum, that acts as accumulator, to zero */
		243	sum = 0;
		244
		245	/* Initiate the pointer pIn1 to point to the starting address of pInA */
		246	pIn1 = pInA;
		247
		248	/* Matrix A columns number of MAC operations are to be performed */
		249	colCnt = numColsA;
		250
		251	/* matrix multiplication */
		252	while(colCnt > 0u)
		253	{
		254	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
		255	/* Perform the multiply-accumulates */
		256	sum += (q63_t) * pIn1++ * *pIn2;
		257	pIn2 += numColsB;
		258
		259	/* Decrement the loop counter */
		260	colCnt--;
		261	}
		262
		263	/* Convert the result from 2.62 to 1.31 format and store in destination buffer */
		264	*px++ = (q31_t) clip_q63_to_q31(sum >> 31);
		265
		266	/* Decrement the column loop counter */
		267	col--;
		268
		269	/* Update the pointer pIn2 to point to the starting address of the next column */
		270	pIn2 = pInB + (numColsB - col);
		271
		272	} while(col > 0u);
		273
		274	#endif
		275
		276	/* Update the pointer pInA to point to the starting address of the next row */
		277	i = i + numColsB;
		278	pInA = pInA + numColsA;
		279
		280	/* Decrement the row loop counter */
		281	row--;
		282
		283	} while(row > 0u);
		284
		285	/* set status as ARM_MATH_SUCCESS */
		286	status = ARM_MATH_SUCCESS;
		287	}
		288	/* Return to application */
		289	return (status);
		290	}
		291
		292	/**
		293	* @} end of MatrixMult group
		294	*/

Subversion Repositories LedShow

(root)/trunk/Drivers/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_q31.c – Rev 2