WebSVN – ScreenTimer – Blame – /trunk/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c

Rev	Author	Line No.	Line
2	mjames	1	/* ----------------------------------------------------------------------
		2	* Project: CMSIS DSP Library
		3	* Title: arm_mat_mult_q15.c
		4	* Description: Q15 matrix multiplication
		5	*
		6	* $Date: 27. January 2017
		7	* $Revision: V.1.5.1
		8	*
		9	* Target Processor: Cortex-M cores
		10	* -------------------------------------------------------------------- */
		11	/*
		12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
		13	*
		14	* SPDX-License-Identifier: Apache-2.0
		15	*
		16	* Licensed under the Apache License, Version 2.0 (the License); you may
		17	* not use this file except in compliance with the License.
		18	* You may obtain a copy of the License at
		19	*
		20	* www.apache.org/licenses/LICENSE-2.0
		21	*
		22	* Unless required by applicable law or agreed to in writing, software
		23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		25	* See the License for the specific language governing permissions and
		26	* limitations under the License.
		27	*/
		28
		29	#include "arm_math.h"
		30
		31	/**
		32	* @ingroup groupMatrix
		33	*/
		34
		35	/**
		36	* @addtogroup MatrixMult
		37	* @{
		38	*/
		39
		40
		41	/**
		42	* @brief Q15 matrix multiplication
		43	* @param[in] *pSrcA points to the first input matrix structure
		44	* @param[in] *pSrcB points to the second input matrix structure
		45	* @param[out] *pDst points to output matrix structure
		46	* @param[in] *pState points to the array for storing intermediate results (Unused)
		47	* @return The function returns either
		48	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		49	*
		50	* @details
		51	* <b>Scaling and Overflow Behavior:</b>
		52	*
		53	* \par
		54	* The function is implemented using a 64-bit internal accumulator. The inputs to the
		55	* multiplications are in 1.15 format and multiplications yield a 2.30 result.
		56	* The 2.30 intermediate
		57	* results are accumulated in a 64-bit accumulator in 34.30 format. This approach
		58	* provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
		59	* truncated to 34.15 format by discarding the low 15 bits and then saturated to
		60	* 1.15 format.
		61	*
		62	* \par
		63	* Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
		64	*
		65	*/
		66
		67	arm_status arm_mat_mult_q15(
		68	const arm_matrix_instance_q15 * pSrcA,
		69	const arm_matrix_instance_q15 * pSrcB,
		70	arm_matrix_instance_q15 * pDst,
		71	q15_t * pState)
		72	{
		73	q63_t sum; /* accumulator */
		74
		75	#if defined (ARM_MATH_DSP)
		76
		77	/* Run the below code for Cortex-M4 and Cortex-M3 */
		78
		79	q15_t pSrcBT = pState; / input data matrix pointer for transpose */
		80	q15_t pInA = pSrcA->pData; / input data matrix pointer A of Q15 type */
		81	q15_t pInB = pSrcB->pData; / input data matrix pointer B of Q15 type */
		82	q15_t px; / Temporary output data matrix pointer */
		83	uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
		84	uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
		85	uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
		86	uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
		87	uint16_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
		88	arm_status status; /* status of matrix multiplication */
		89
		90	#ifndef UNALIGNED_SUPPORT_DISABLE
		91
		92	q31_t in; /* Temporary variable to hold the input value */
		93	q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2;
		94
		95	#else
		96
		97	q15_t in; /* Temporary variable to hold the input value */
		98	q15_t inA1, inB1, inA2, inB2;
		99
		100	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
		101
		102	#ifdef ARM_MATH_MATRIX_CHECK
		103	/* Check for matrix mismatch condition */
		104	if ((pSrcA->numCols != pSrcB->numRows) \|\|
		105	(pSrcA->numRows != pDst->numRows) \|\| (pSrcB->numCols != pDst->numCols))
		106	{
		107	/* Set status as ARM_MATH_SIZE_MISMATCH */
		108	status = ARM_MATH_SIZE_MISMATCH;
		109	}
		110	else
		111	#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
		112	{
		113	/* Matrix transpose */
		114	do
		115	{
		116	/* Apply loop unrolling and exchange the columns with row elements */
		117	col = numColsB >> 2;
		118
		119	/* The pointer px is set to starting address of the column being processed */
		120	px = pSrcBT + i;
		121
		122	/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
		123	** a second loop below computes the remaining 1 to 3 samples. */
		124	while (col > 0U)
		125	{
		126	#ifndef UNALIGNED_SUPPORT_DISABLE
		127
		128	/* Read two elements from the row */
		129	in = *__SIMD32(pInB)++;
		130
		131	/* Unpack and store one element in the destination */
		132	#ifndef ARM_MATH_BIG_ENDIAN
		133
		134	*px = (q15_t) in;
		135
		136	#else
		137
		138	*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
		139
		140	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		141
		142	/* Update the pointer px to point to the next row of the transposed matrix */
		143	px += numRowsB;
		144
		145	/* Unpack and store the second element in the destination */
		146	#ifndef ARM_MATH_BIG_ENDIAN
		147
		148	*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
		149
		150	#else
		151
		152	*px = (q15_t) in;
		153
		154	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		155
		156	/* Update the pointer px to point to the next row of the transposed matrix */
		157	px += numRowsB;
		158
		159	/* Read two elements from the row */
		160	in = *__SIMD32(pInB)++;
		161
		162	/* Unpack and store one element in the destination */
		163	#ifndef ARM_MATH_BIG_ENDIAN
		164
		165	*px = (q15_t) in;
		166
		167	#else
		168
		169	*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
		170
		171	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		172
		173	/* Update the pointer px to point to the next row of the transposed matrix */
		174	px += numRowsB;
		175
		176	/* Unpack and store the second element in the destination */
		177
		178	#ifndef ARM_MATH_BIG_ENDIAN
		179
		180	*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
		181
		182	#else
		183
		184	*px = (q15_t) in;
		185
		186	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		187
		188	/* Update the pointer px to point to the next row of the transposed matrix */
		189	px += numRowsB;
		190
		191	#else
		192
		193	/* Read one element from the row */
		194	in = *pInB++;
		195
		196	/* Store one element in the destination */
		197	*px = in;
		198
		199	/* Update the pointer px to point to the next row of the transposed matrix */
		200	px += numRowsB;
		201
		202	/* Read one element from the row */
		203	in = *pInB++;
		204
		205	/* Store one element in the destination */
		206	*px = in;
		207
		208	/* Update the pointer px to point to the next row of the transposed matrix */
		209	px += numRowsB;
		210
		211	/* Read one element from the row */
		212	in = *pInB++;
		213
		214	/* Store one element in the destination */
		215	*px = in;
		216
		217	/* Update the pointer px to point to the next row of the transposed matrix */
		218	px += numRowsB;
		219
		220	/* Read one element from the row */
		221	in = *pInB++;
		222
		223	/* Store one element in the destination */
		224	*px = in;
		225
		226	/* Update the pointer px to point to the next row of the transposed matrix */
		227	px += numRowsB;
		228
		229	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
		230
		231	/* Decrement the column loop counter */
		232	col--;
		233	}
		234
		235	/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
		236	** No loop unrolling is used. */
		237	col = numColsB % 0x4U;
		238
		239	while (col > 0U)
		240	{
		241	/* Read and store the input element in the destination */
		242	px = pInB++;
		243
		244	/* Update the pointer px to point to the next row of the transposed matrix */
		245	px += numRowsB;
		246
		247	/* Decrement the column loop counter */
		248	col--;
		249	}
		250
		251	i++;
		252
		253	/* Decrement the row loop counter */
		254	row--;
		255
		256	} while (row > 0U);
		257
		258	/* Reset the variables for the usage in the following multiplication process */
		259	row = numRowsA;
		260	i = 0U;
		261	px = pDst->pData;
		262
		263	/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
		264	/* row loop */
		265	do
		266	{
		267	/* For every row wise process, the column loop counter is to be initiated */
		268	col = numColsB;
		269
		270	/* For every row wise process, the pIn2 pointer is set
		271	** to the starting address of the transposed pSrcB data */
		272	pInB = pSrcBT;
		273
		274	/* column loop */
		275	do
		276	{
		277	/* Set the variable sum, that acts as accumulator, to zero */
		278	sum = 0;
		279
		280	/* Apply loop unrolling and compute 2 MACs simultaneously. */
		281	colCnt = numColsA >> 2;
		282
		283	/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
		284	pInA = pSrcA->pData + i;
		285
		286
		287	/* matrix multiplication */
		288	while (colCnt > 0U)
		289	{
		290	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
		291	#ifndef UNALIGNED_SUPPORT_DISABLE
		292
		293	/* read real and imag values from pSrcA and pSrcB buffer */
		294	pSourceA1 = *__SIMD32(pInA)++;
		295	pSourceB1 = *__SIMD32(pInB)++;
		296
		297	pSourceA2 = *__SIMD32(pInA)++;
		298	pSourceB2 = *__SIMD32(pInB)++;
		299
		300	/* Multiply and Accumlates */
		301	sum = __SMLALD(pSourceA1, pSourceB1, sum);
		302	sum = __SMLALD(pSourceA2, pSourceB2, sum);
		303
		304	#else
		305	/* read real and imag values from pSrcA and pSrcB buffer */
		306	inA1 = *pInA++;
		307	inB1 = *pInB++;
		308	inA2 = *pInA++;
		309	/* Multiply and Accumlates */
		310	sum += inA1 * inB1;
		311	inB2 = *pInB++;
		312
		313	inA1 = *pInA++;
		314	inB1 = *pInB++;
		315	/* Multiply and Accumlates */
		316	sum += inA2 * inB2;
		317	inA2 = *pInA++;
		318	inB2 = *pInB++;
		319
		320	/* Multiply and Accumlates */
		321	sum += inA1 * inB1;
		322	sum += inA2 * inB2;
		323
		324	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
		325
		326	/* Decrement the loop counter */
		327	colCnt--;
		328	}
		329
		330	/* process remaining column samples */
		331	colCnt = numColsA & 3U;
		332
		333	while (colCnt > 0U)
		334	{
		335	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
		336	sum += pInA++ *pInB++;
		337
		338	/* Decrement the loop counter */
		339	colCnt--;
		340	}
		341
		342	/* Saturate and store the result in the destination buffer */
		343	*px = (q15_t) (__SSAT((sum >> 15), 16));
		344	px++;
		345
		346	/* Decrement the column loop counter */
		347	col--;
		348
		349	} while (col > 0U);
		350
		351	i = i + numColsA;
		352
		353	/* Decrement the row loop counter */
		354	row--;
		355
		356	} while (row > 0U);
		357
		358	#else
		359
		360	/* Run the below code for Cortex-M0 */
		361
		362	q15_t pIn1 = pSrcA->pData; / input data matrix pointer A */
		363	q15_t pIn2 = pSrcB->pData; / input data matrix pointer B */
		364	q15_t pInA = pSrcA->pData; / input data matrix pointer A of Q15 type */
		365	q15_t pInB = pSrcB->pData; / input data matrix pointer B of Q15 type */
		366	q15_t pOut = pDst->pData; / output data matrix pointer */
		367	q15_t px; / Temporary output data matrix pointer */
		368	uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
		369	uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
		370	uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
		371	uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
		372	arm_status status; /* status of matrix multiplication */
		373
		374	#ifdef ARM_MATH_MATRIX_CHECK
		375
		376	/* Check for matrix mismatch condition */
		377	if ((pSrcA->numCols != pSrcB->numRows) \|\|
		378	(pSrcA->numRows != pDst->numRows) \|\| (pSrcB->numCols != pDst->numCols))
		379	{
		380	/* Set status as ARM_MATH_SIZE_MISMATCH */
		381	status = ARM_MATH_SIZE_MISMATCH;
		382	}
		383	else
		384	#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
		385
		386	{
		387	/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
		388	/* row loop */
		389	do
		390	{
		391	/* Output pointer is set to starting address of the row being processed */
		392	px = pOut + i;
		393
		394	/* For every row wise process, the column loop counter is to be initiated */
		395	col = numColsB;
		396
		397	/* For every row wise process, the pIn2 pointer is set
		398	** to the starting address of the pSrcB data */
		399	pIn2 = pSrcB->pData;
		400
		401	/* column loop */
		402	do
		403	{
		404	/* Set the variable sum, that acts as accumulator, to zero */
		405	sum = 0;
		406
		407	/* Initiate the pointer pIn1 to point to the starting address of pSrcA */
		408	pIn1 = pInA;
		409
		410	/* Matrix A columns number of MAC operations are to be performed */
		411	colCnt = numColsA;
		412
		413	/* matrix multiplication */
		414	while (colCnt > 0U)
		415	{
		416	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
		417	/* Perform the multiply-accumulates */
		418	sum += (q31_t) * pIn1++ * *pIn2;
		419	pIn2 += numColsB;
		420
		421	/* Decrement the loop counter */
		422	colCnt--;
		423	}
		424
		425	/* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
		426	/* Saturate and store the result in the destination buffer */
		427	*px++ = (q15_t) __SSAT((sum >> 15), 16);
		428
		429	/* Decrement the column loop counter */
		430	col--;
		431
		432	/* Update the pointer pIn2 to point to the starting address of the next column */
		433	pIn2 = pInB + (numColsB - col);
		434
		435	} while (col > 0U);
		436
		437	/* Update the pointer pSrcA to point to the starting address of the next row */
		438	i = i + numColsB;
		439	pInA = pInA + numColsA;
		440
		441	/* Decrement the row loop counter */
		442	row--;
		443
		444	} while (row > 0U);
		445
		446	#endif /* #if defined (ARM_MATH_DSP) */
		447	/* set status as ARM_MATH_SUCCESS */
		448	status = ARM_MATH_SUCCESS;
		449	}
		450
		451	/* Return to application */
		452	return (status);
		453	}
		454
		455	/**
		456	* @} end of MatrixMult group
		457	*/

Subversion Repositories ScreenTimer

(root)/trunk/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c – Rev 2