WebSVN – testOled – Blame – /trunk/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c

Rev	Author	Line No.	Line
2	mjames	1	/*
		2	* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
		3	*
		4	* SPDX-License-Identifier: Apache-2.0
		5	*
		6	* Licensed under the Apache License, Version 2.0 (the License); you may
		7	* not use this file except in compliance with the License.
		8	* You may obtain a copy of the License at
		9	*
		10	* www.apache.org/licenses/LICENSE-2.0
		11	*
		12	* Unless required by applicable law or agreed to in writing, software
		13	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		14	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		15	* See the License for the specific language governing permissions and
		16	* limitations under the License.
		17	*/
		18
		19	/* ----------------------------------------------------------------------
		20	* Project: CMSIS NN Library
		21	* Title: arm_fully_connected_q7.c
		22	* Description: Q7 basic fully-connected layer function
		23	*
		24	* $Date: 17. January 2018
		25	* $Revision: V.1.0.0
		26	*
		27	* Target Processor: Cortex-M cores
		28	*
		29	* -------------------------------------------------------------------- */
		30
		31	#include "arm_math.h"
		32	#include "arm_nnfunctions.h"
		33
		34	/**
		35	* @ingroup groupNN
		36	*/
		37
		38	/**
		39	* @addtogroup FC
		40	* @{
		41	*/
		42
		43	/**
		44	* @brief Q7 basic fully-connected layer function
		45	* @param[in] pV pointer to input vector
		46	* @param[in] pM pointer to matrix weights
		47	* @param[in] dim_vec length of the vector
		48	* @param[in] num_of_rows number of rows in weight matrix
		49	* @param[in] bias_shift amount of left-shift for bias
		50	* @param[in] out_shift amount of right-shift for output
		51	* @param[in] bias pointer to bias
		52	* @param[in,out] pOut pointer to output vector
		53	* @param[in,out] vec_buffer pointer to buffer space for input
		54	* @return The function returns <code>ARM_MATH_SUCCESS</code>
		55	*
		56	* @details
		57	*
		58	* <b>Buffer size:</b>
		59	*
		60	* vec_buffer size: dim_vec
		61	*
		62	* This basic function is designed to work with regular weight
		63	* matrix without interleaving.
		64	*
		65	*/
		66
		67	arm_status
		68	arm_fully_connected_q7(const q7_t * pV,
		69	const q7_t * pM,
		70	const uint16_t dim_vec,
		71	const uint16_t num_of_rows,
		72	const uint16_t bias_shift,
		73	const uint16_t out_shift, const q7_t * bias, q7_t * pOut, q15_t * vec_buffer)
		74	{
		75
		76	#if defined (ARM_MATH_DSP)
		77	/* Run the following code for Cortex-M4 and Cortex-M7 */
		78
		79	const q7_t *pB = pM;
		80	const q7_t *pB2;
		81	q7_t *pO = pOut;
		82	const q7_t *pBias = bias;
		83	q15_t *pA;
		84	uint16_t rowCnt = num_of_rows >> 1;
		85
		86	/* expand the vector into the buffer */
		87	arm_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec);
		88
		89	while (rowCnt)
		90	{
		91	q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
		92	q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
		93	uint16_t colCnt = dim_vec >> 2;
		94
		95	pA = vec_buffer;
		96	pB2 = pB + dim_vec;
		97
		98	while (colCnt)
		99	{
		100	q31_t inV, inM11, inM12, inM21, inM22;
		101	pB = (q7_t ) read_and_pad_reordered((void )pB, &inM11, &inM12);
		102	pB2 = (q7_t ) read_and_pad_reordered((void )pB2, &inM21, &inM22);
		103
		104	inV = *__SIMD32(pA)++;
		105
		106	sum = __SMLAD(inV, inM11, sum);
		107	sum2 = __SMLAD(inV, inM21, sum2);
		108
		109	inV = *__SIMD32(pA)++;
		110
		111	sum = __SMLAD(inV, inM12, sum);
		112	sum2 = __SMLAD(inV, inM22, sum2);
		113
		114	colCnt--;
		115	}
		116	colCnt = dim_vec & 0x3;
		117	while (colCnt)
		118	{
		119	q7_t inV = *pA++;
		120	q15_t inM = *pB++;
		121	q15_t inM2 = *pB2++;
		122
		123	sum += inV * inM;
		124	sum2 += inV * inM2;
		125	colCnt--;
		126	} /* while over colCnt */
		127	*pO++ = (q7_t) (__SSAT((sum >> out_shift), 8));
		128	*pO++ = (q7_t) (__SSAT((sum2 >> out_shift), 8));
		129
		130	/* adjust the pointers and counters */
		131	pB += dim_vec;
		132	rowCnt--;
		133	}
		134
		135	/* left-over part of the rows */
		136	rowCnt = num_of_rows & 0x1;
		137
		138	while (rowCnt)
		139	{
		140	uint16_t colCnt = dim_vec >> 2;
		141	q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
		142
		143	pA = vec_buffer;
		144
		145	while (colCnt)
		146	{
		147	q31_t inV1, inV2, inM11, inM12;
		148
		149	pB = (q7_t ) read_and_pad_reordered((void )pB, &inM11, &inM12);
		150
		151	inV1 = *__SIMD32(pA)++;
		152	sum = __SMLAD(inV1, inM11, sum);
		153
		154	inV2 = *__SIMD32(pA)++;
		155	sum = __SMLAD(inV2, inM12, sum);
		156
		157	colCnt--;
		158	}
		159
		160	/* left-over of the vector */
		161	colCnt = dim_vec & 0x3;
		162	while (colCnt)
		163	{
		164	q7_t inV = *pA++;
		165	q15_t inM = *pB++;
		166	sum += inV * inM;
		167	colCnt--;
		168	}
		169
		170	*pO++ = (q7_t) (__SSAT((sum >> out_shift), 8));
		171
		172	rowCnt--;
		173	}
		174
		175	#else
		176	int i, j;
		177
		178	/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
		179	for (i = 0; i < num_of_rows; i++)
		180	{
		181	int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
		182	for (j = 0; j < dim_vec; j++)
		183	{
		184	ip_out += pV[j] * pM[i * dim_vec + j];
		185	}
		186	pOut[i] = (q7_t) __SSAT((ip_out >> out_shift), 8);
		187	}
		188
		189	#endif /* ARM_MATH_DSP */
		190
		191	/* Return to ARM_MATH_SUCCESS */
		192	return (ARM_MATH_SUCCESS);
		193
		194	}
		195
		196	/**
		197	* @} end of FC group
		198	*/

Subversion Repositories testOled

(root)/trunk/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c – Rev 2