WebSVN – DashDisplay – Blame – /branches/Dashboard_L152_v2/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c

Rev	Author	Line No.	Line
56	mjames	1	/* ----------------------------------------------------------------------
		2	* Project: CMSIS DSP Library
		3	* Title: arm_mat_mult_fast_q15.c
		4	* Description: Q15 matrix multiplication (fast variant)
		5	*
		6	* $Date: 27. January 2017
		7	* $Revision: V.1.5.1
		8	*
		9	* Target Processor: Cortex-M cores
		10	* -------------------------------------------------------------------- */
		11	/*
		12	* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
		13	*
		14	* SPDX-License-Identifier: Apache-2.0
		15	*
		16	* Licensed under the Apache License, Version 2.0 (the License); you may
		17	* not use this file except in compliance with the License.
		18	* You may obtain a copy of the License at
		19	*
		20	* www.apache.org/licenses/LICENSE-2.0
		21	*
		22	* Unless required by applicable law or agreed to in writing, software
		23	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		24	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		25	* See the License for the specific language governing permissions and
		26	* limitations under the License.
		27	*/
		28
		29	#include "arm_math.h"
		30
		31	/**
		32	* @ingroup groupMatrix
		33	*/
		34
		35	/**
		36	* @addtogroup MatrixMult
		37	* @{
		38	*/
		39
		40
		41	/**
		42	* @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
		43	* @param[in] *pSrcA points to the first input matrix structure
		44	* @param[in] *pSrcB points to the second input matrix structure
		45	* @param[out] *pDst points to output matrix structure
		46	* @param[in] *pState points to the array for storing intermediate results
		47	* @return The function returns either
		48	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		49	*
		50	* @details
		51	* <b>Scaling and Overflow Behavior:</b>
		52	*
		53	* \par
		54	* The difference between the function arm_mat_mult_q15() and this fast variant is that
		55	* the fast variant uses a 32-bit rather than a 64-bit accumulator.
		56	* The result of each 1.15 x 1.15 multiplication is truncated to
		57	* 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
		58	* format. Finally, the accumulator is saturated and converted to a 1.15 result.
		59	*
		60	* \par
		61	* The fast version has the same overflow behavior as the standard version but provides
		62	* less precision since it discards the low 16 bits of each multiplication result.
		63	* In order to avoid overflows completely the input signals must be scaled down.
		64	* Scale down one of the input matrices by log2(numColsA) bits to
		65	* avoid overflows, as a total of numColsA additions are computed internally for each
		66	* output element.
		67	*
		68	* \par
		69	* See <code>arm_mat_mult_q15()</code> for a slower implementation of this function
		70	* which uses 64-bit accumulation to provide higher precision.
		71	*/
		72
		73	arm_status arm_mat_mult_fast_q15(
		74	const arm_matrix_instance_q15 * pSrcA,
		75	const arm_matrix_instance_q15 * pSrcB,
		76	arm_matrix_instance_q15 * pDst,
		77	q15_t * pState)
		78	{
		79	q31_t sum; /* accumulator */
		80	q15_t pSrcBT = pState; / input data matrix pointer for transpose */
		81	q15_t pInA = pSrcA->pData; / input data matrix pointer A of Q15 type */
		82	q15_t pInB = pSrcB->pData; / input data matrix pointer B of Q15 type */
		83	q15_t px; / Temporary output data matrix pointer */
		84	uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
		85	uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
		86	uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
		87	uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
		88	uint32_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
		89	arm_status status; /* status of matrix multiplication */
		90
		91	#ifndef UNALIGNED_SUPPORT_DISABLE
		92
		93	q31_t in; /* Temporary variable to hold the input value */
		94	q31_t inA1, inA2, inB1, inB2;
		95	q31_t sum2, sum3, sum4;
		96	q15_t pInA2, pInB2, *px2;
		97	uint32_t j = 0;
		98
		99	#else
		100
		101	q15_t in; /* Temporary variable to hold the input value */
		102	q15_t inA1, inA2, inB1, inB2;
		103
		104	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
		105
		106	#ifdef ARM_MATH_MATRIX_CHECK
		107	/* Check for matrix mismatch condition */
		108	if ((pSrcA->numCols != pSrcB->numRows) \|\|
		109	(pSrcA->numRows != pDst->numRows) \|\| (pSrcB->numCols != pDst->numCols))
		110	{
		111	/* Set status as ARM_MATH_SIZE_MISMATCH */
		112	status = ARM_MATH_SIZE_MISMATCH;
		113	}
		114	else
		115	#endif
		116	{
		117	/* Matrix transpose */
		118	do
		119	{
		120	/* Apply loop unrolling and exchange the columns with row elements */
		121	col = numColsB >> 2;
		122
		123	/* The pointer px is set to starting address of the column being processed */
		124	px = pSrcBT + i;
		125
		126	/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
		127	** a second loop below computes the remaining 1 to 3 samples. */
		128	while (col > 0U)
		129	{
		130	#ifndef UNALIGNED_SUPPORT_DISABLE
		131	/* Read two elements from the row */
		132	in = *__SIMD32(pInB)++;
		133
		134	/* Unpack and store one element in the destination */
		135	#ifndef ARM_MATH_BIG_ENDIAN
		136
		137	*px = (q15_t) in;
		138
		139	#else
		140
		141	*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
		142
		143	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		144
		145	/* Update the pointer px to point to the next row of the transposed matrix */
		146	px += numRowsB;
		147
		148	/* Unpack and store the second element in the destination */
		149	#ifndef ARM_MATH_BIG_ENDIAN
		150
		151	*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
		152
		153	#else
		154
		155	*px = (q15_t) in;
		156
		157	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		158
		159	/* Update the pointer px to point to the next row of the transposed matrix */
		160	px += numRowsB;
		161
		162	/* Read two elements from the row */
		163	in = *__SIMD32(pInB)++;
		164
		165	/* Unpack and store one element in the destination */
		166	#ifndef ARM_MATH_BIG_ENDIAN
		167
		168	*px = (q15_t) in;
		169
		170	#else
		171
		172	*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
		173
		174	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		175
		176	/* Update the pointer px to point to the next row of the transposed matrix */
		177	px += numRowsB;
		178
		179	/* Unpack and store the second element in the destination */
		180
		181	#ifndef ARM_MATH_BIG_ENDIAN
		182
		183	*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
		184
		185	#else
		186
		187	*px = (q15_t) in;
		188
		189	#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
		190
		191	#else
		192
		193	/* Read one element from the row */
		194	in = *pInB++;
		195
		196	/* Store one element in the destination */
		197	*px = in;
		198
		199	/* Update the pointer px to point to the next row of the transposed matrix */
		200	px += numRowsB;
		201
		202	/* Read one element from the row */
		203	in = *pInB++;
		204
		205	/* Store one element in the destination */
		206	*px = in;
		207
		208	/* Update the pointer px to point to the next row of the transposed matrix */
		209	px += numRowsB;
		210
		211	/* Read one element from the row */
		212	in = *pInB++;
		213
		214	/* Store one element in the destination */
		215	*px = in;
		216
		217	/* Update the pointer px to point to the next row of the transposed matrix */
		218	px += numRowsB;
		219
		220	/* Read one element from the row */
		221	in = *pInB++;
		222
		223	/* Store one element in the destination */
		224	*px = in;
		225
		226	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
		227
		228	/* Update the pointer px to point to the next row of the transposed matrix */
		229	px += numRowsB;
		230
		231	/* Decrement the column loop counter */
		232	col--;
		233	}
		234
		235	/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
		236	** No loop unrolling is used. */
		237	col = numColsB % 0x4U;
		238
		239	while (col > 0U)
		240	{
		241	/* Read and store the input element in the destination */
		242	px = pInB++;
		243
		244	/* Update the pointer px to point to the next row of the transposed matrix */
		245	px += numRowsB;
		246
		247	/* Decrement the column loop counter */
		248	col--;
		249	}
		250
		251	i++;
		252
		253	/* Decrement the row loop counter */
		254	row--;
		255
		256	} while (row > 0U);
		257
		258	/* Reset the variables for the usage in the following multiplication process */
		259	row = numRowsA;
		260	i = 0U;
		261	px = pDst->pData;
		262
		263	#ifndef UNALIGNED_SUPPORT_DISABLE
		264	/* Process two rows from matrix A at a time and output two rows at a time */
		265	row = row >> 1;
		266	px2 = px + numColsB;
		267	#endif
		268
		269	/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
		270	/* row loop */
		271	while (row > 0U)
		272	{
		273	/* For every row wise process, the column loop counter is to be initiated */
		274	col = numColsB;
		275
		276	/* For every row wise process, the pIn2 pointer is set
		277	** to the starting address of the transposed pSrcB data */
		278	pInB = pSrcBT;
		279
		280	#ifndef UNALIGNED_SUPPORT_DISABLE
		281	/* Process two (transposed) columns from matrix B at a time */
		282	col = col >> 1;
		283	j = 0;
		284	#endif
		285
		286	/* column loop */
		287	while (col > 0U)
		288	{
		289	/* Set the variable sum, that acts as accumulator, to zero */
		290	sum = 0;
		291
		292	/* Initiate the pointer pInA to point to the starting address of the column being processed */
		293	pInA = pSrcA->pData + i;
		294
		295	#ifndef UNALIGNED_SUPPORT_DISABLE
		296	sum2 = 0;
		297	sum3 = 0;
		298	sum4 = 0;
		299	pInB = pSrcBT + j;
		300	pInA2 = pInA + numColsA;
		301	pInB2 = pInB + numRowsB;
		302
		303	/* Read in two elements at once - alows dual MAC instruction */
		304	colCnt = numColsA >> 1;
		305	#else
		306	colCnt = numColsA >> 2;
		307	#endif
		308
		309	/* matrix multiplication */
		310	while (colCnt > 0U)
		311	{
		312	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
		313	#ifndef UNALIGNED_SUPPORT_DISABLE
		314
		315	inA1 = *__SIMD32(pInA)++;
		316	inB1 = *__SIMD32(pInB)++;
		317	inA2 = *__SIMD32(pInA2)++;
		318	inB2 = *__SIMD32(pInB2)++;
		319
		320	sum = __SMLAD(inA1, inB1, sum);
		321	sum2 = __SMLAD(inA1, inB2, sum2);
		322	sum3 = __SMLAD(inA2, inB1, sum3);
		323	sum4 = __SMLAD(inA2, inB2, sum4);
		324
		325	#else
		326
		327	inA1 = *pInA;
		328	inB1 = *pInB;
		329	sum += inA1 * inB1;
		330
		331	inA2 = pInA[1];
		332	inB2 = pInB[1];
		333	sum += inA2 * inB2;
		334
		335	inA1 = pInA[2];
		336	inB1 = pInB[2];
		337	sum += inA1 * inB1;
		338
		339	inA2 = pInA[3];
		340	inB2 = pInB[3];
		341	sum += inA2 * inB2;
		342
		343	pInA += 4;
		344	pInB += 4;
		345
		346	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
		347
		348	/* Decrement the loop counter */
		349	colCnt--;
		350	}
		351
		352	/* process odd column samples */
		353	#ifndef UNALIGNED_SUPPORT_DISABLE
		354	if (numColsA & 1U) {
		355	inA1 = *pInA++;
		356	inB1 = *pInB++;
		357	inA2 = *pInA2++;
		358	inB2 = *pInB2++;
		359	sum += inA1 * inB1;
		360	sum2 += inA1 * inB2;
		361	sum3 += inA2 * inB1;
		362	sum4 += inA2 * inB2;
		363	}
		364	#else
		365	colCnt = numColsA % 0x4U;
		366
		367	while (colCnt > 0U)
		368	{
		369	/* c(m,n) = a(1,1)b(1,1) + a(1,2) b(2,1) + .... + a(m,p)b(p,n) /
		370	sum += (q31_t) (pInA++) (*pInB++);
		371
		372	colCnt--;
		373	}
		374	#endif
		375
		376	/* Saturate and store the result in the destination buffer */
		377	*px++ = (q15_t) (sum >> 15);
		378
		379	#ifndef UNALIGNED_SUPPORT_DISABLE
		380	*px++ = (q15_t) (sum2 >> 15);
		381	*px2++ = (q15_t) (sum3 >> 15);
		382	*px2++ = (q15_t) (sum4 >> 15);
		383	j += numRowsB * 2;
		384	#endif
		385
		386	/* Decrement the column loop counter */
		387	col--;
		388
		389	}
		390
		391	i = i + numColsA;
		392
		393	#ifndef UNALIGNED_SUPPORT_DISABLE
		394	i = i + numColsA;
		395	px = px2 + (numColsB & 1U);
		396	px2 = px + numColsB;
		397	#endif
		398
		399	/* Decrement the row loop counter */
		400	row--;
		401
		402	}
		403
		404	/* Compute any remaining odd row/column below */
		405
		406	#ifndef UNALIGNED_SUPPORT_DISABLE
		407
		408	/* Compute remaining output column */
		409	if (numColsB & 1U) {
		410
		411	/* Avoid redundant computation of last element */
		412	row = numRowsA & (~0x1);
		413
		414	/* Point to remaining unfilled column in output matrix */
		415	px = pDst->pData+numColsB-1;
		416	pInA = pSrcA->pData;
		417
		418	/* row loop */
		419	while (row > 0)
		420	{
		421
		422	/* point to last column in matrix B */
		423	pInB = pSrcBT + numRowsB*(numColsB-1);
		424
		425	/* Set the variable sum, that acts as accumulator, to zero */
		426	sum = 0;
		427
		428	/* Compute 4 columns at once */
		429	colCnt = numColsA >> 2;
		430
		431	/* matrix multiplication */
		432	while (colCnt > 0U)
		433	{
		434	inA1 = *__SIMD32(pInA)++;
		435	inA2 = *__SIMD32(pInA)++;
		436	inB1 = *__SIMD32(pInB)++;
		437	inB2 = *__SIMD32(pInB)++;
		438
		439	sum = __SMLAD(inA1, inB1, sum);
		440	sum = __SMLAD(inA2, inB2, sum);
		441
		442	/* Decrement the loop counter */
		443	colCnt--;
		444	}
		445
		446	colCnt = numColsA & 3U;
		447	while (colCnt > 0U) {
		448	sum += (q31_t) (pInA++) (*pInB++);
		449	colCnt--;
		450	}
		451
		452	/* Store the result in the destination buffer */
		453	*px = (q15_t) (sum >> 15);
		454	px += numColsB;
		455
		456	/* Decrement the row loop counter */
		457	row--;
		458	}
		459	}
		460
		461	/* Compute remaining output row */
		462	if (numRowsA & 1U) {
		463
		464	/* point to last row in output matrix */
		465	px = pDst->pData+(numColsB)*(numRowsA-1);
		466
		467	pInB = pSrcBT;
		468	col = numColsB;
		469	i = 0U;
		470
		471	/* col loop */
		472	while (col > 0)
		473	{
		474
		475	/* point to last row in matrix A */
		476	pInA = pSrcA->pData + (numRowsA-1)*numColsA;
		477
		478	/* Set the variable sum, that acts as accumulator, to zero */
		479	sum = 0;
		480
		481	/* Compute 4 columns at once */
		482	colCnt = numColsA >> 2;
		483
		484	/* matrix multiplication */
		485	while (colCnt > 0U)
		486	{
		487	inA1 = *__SIMD32(pInA)++;
		488	inA2 = *__SIMD32(pInA)++;
		489	inB1 = *__SIMD32(pInB)++;
		490	inB2 = *__SIMD32(pInB)++;
		491
		492	sum = __SMLAD(inA1, inB1, sum);
		493	sum = __SMLAD(inA2, inB2, sum);
		494
		495	/* Decrement the loop counter */
		496	colCnt--;
		497	}
		498
		499	colCnt = numColsA & 3U;
		500	while (colCnt > 0U) {
		501	sum += (q31_t) (pInA++) (*pInB++);
		502	colCnt--;
		503	}
		504
		505	/* Store the result in the destination buffer */
		506	*px++ = (q15_t) (sum >> 15);
		507
		508	/* Decrement the col loop counter */
		509	col--;
		510	}
		511	}
		512
		513	#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
		514
		515	/* set status as ARM_MATH_SUCCESS */
		516	status = ARM_MATH_SUCCESS;
		517	}
		518
		519	/* Return to application */
		520	return (status);
		521	}
		522
		523	/**
		524	* @} end of MatrixMult group
		525	*/

Subversion Repositories DashDisplay

(root)/branches/Dashboard_L152_v2/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c – Rev 56