WebSVN – AFRtranscoder – Blame – /trunk/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c

Rev	Author	Line No.	Line
2	mjames	1	/*
		2	* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
		3	*
		4	* SPDX-License-Identifier: Apache-2.0
		5	*
		6	* Licensed under the Apache License, Version 2.0 (the License); you may
		7	* not use this file except in compliance with the License.
		8	* You may obtain a copy of the License at
		9	*
		10	* www.apache.org/licenses/LICENSE-2.0
		11	*
		12	* Unless required by applicable law or agreed to in writing, software
		13	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		14	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		15	* See the License for the specific language governing permissions and
		16	* limitations under the License.
		17	*/
		18
		19	/* ----------------------------------------------------------------------
		20	* Project: CMSIS NN Library
		21	* Title: arm_convolve_1x1_HWC_q7_fast_nonsquare.c
		22	* Description: Fast Q7 version of 1x1 convolution (non-square shape)
		23	*
		24	* $Date: 17. January 2018
		25	* $Revision: V.1.0.0
		26	*
		27	* Target Processor: Cortex-M cores
		28	*
		29	* -------------------------------------------------------------------- */
		30
		31	#include "arm_math.h"
		32	#include "arm_nnfunctions.h"
		33
		34	/**
		35	* @ingroup groupNN
		36	*/
		37
		38	/**
		39	* @addtogroup NNConv
		40	* @{
		41	*/
		42
		43	/**
		44	* @brief Fast Q7 version of 1x1 convolution (non-sqaure shape)
		45	* @param[in] Im_in pointer to input tensor
		46	* @param[in] dim_im_in_x input tensor dimention x
		47	* @param[in] dim_im_in_y input tensor dimention y
		48	* @param[in] ch_im_in number of input tensor channels
		49	* @param[in] wt pointer to kernel weights
		50	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		51	* @param[in] dim_kernel_x filter kernel size x
		52	* @param[in] dim_kernel_y filter kernel size y
		53	* @param[in] padding_x padding size x
		54	* @param[in] padding_y padding size y
		55	* @param[in] stride_x convolution stride x
		56	* @param[in] stride_y convolution stride y
		57	* @param[in] bias pointer to bias
		58	* @param[in] bias_shift amount of left-shift for bias
		59	* @param[in] out_shift amount of right-shift for output
		60	* @param[in,out] Im_out pointer to output tensor
		61	* @param[in] dim_im_out_x output tensor dimension x
		62	* @param[in] dim_im_out_y output tensor dimension y
		63	* @param[in,out] bufferA pointer to buffer space for input
		64	* @param[in,out] bufferB pointer to buffer space for output
		65	* @return The function returns either
		66	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		67	*
		68	* This function is optimized for convolution with 1x1 kernel size (i.e., dim_kernel_x=1
		69	* and dim_kernel_y=1). It can be used for the second half of MobileNets [1] after depthwise
		70	* separable convolution.
		71	*
		72	* This function is the version with full list of optimization tricks, but with
		73	* some contraints:
		74	* ch_im_in is multiple of 4
		75	* ch_im_out is multiple of 2
		76	*
		77	* [1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications
		78	* https://arxiv.org/abs/1704.04861
		79	*/
		80
		81	arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
		82	const uint16_t dim_im_in_x,
		83	const uint16_t dim_im_in_y,
		84	const uint16_t ch_im_in,
		85	const q7_t * wt,
		86	const uint16_t ch_im_out,
		87	const uint16_t dim_kernel_x,
		88	const uint16_t dim_kernel_y,
		89	const uint16_t padding_x,
		90	const uint16_t padding_y,
		91	const uint16_t stride_x,
		92	const uint16_t stride_y,
		93	const q7_t * bias,
		94	const uint16_t bias_shift,
		95	const uint16_t out_shift,
		96	q7_t * Im_out,
		97	const uint16_t dim_im_out_x,
		98	const uint16_t dim_im_out_y,
		99	q15_t * bufferA,
		100	q7_t * bufferB)
		101	{
		102
		103	#if defined (ARM_MATH_DSP)
		104	/* Run the following code for Cortex-M4 and Cortex-M7 */
		105
		106	int16_t i_out_y, i_out_x;
		107	int16_t i_ch_out;
		108
		109	/* -----------------------
		110	* Here we use bufferA as q15_t internally as computation are done with q15_t level
		111	* im2col are done to output in q15_t format from q7_t input
		112	*/
		113
		114	q15_t *pBuffer = bufferA;
		115	q7_t *pOut = Im_out;
		116
		117	if (ch_im_in % 4 != 0 \|\| ch_im_out % 2 != 0 \|\| dim_kernel_x != 1 \|\| dim_kernel_y != 1
		118	\|\| padding_x != 0 \|\| padding_y != 0 \|\| stride_x != 1 \|\| stride_y != 1)
		119	{
		120	/* check if the input dimension meets the constraints */
		121	return ARM_MATH_SIZE_MISMATCH;
		122	}
		123
		124	for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
		125	{
		126	for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
		127	{
		128	/* This part implements the im2col function */
		129	arm_q7_to_q15_reordered_no_shift((q7_t ) Im_in + (i_out_y dim_im_in_x + i_out_x) * ch_im_in, pBuffer,
		130	ch_im_in);
		131	pBuffer += ch_im_in;
		132
		133	if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
		134	{
		135	pOut =
		136	arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in, bias_shift, out_shift, bias, pOut);
		137	/* counter reset */
		138	pBuffer = bufferA;
		139	}
		140	}
		141	}
		142
		143	/* check if there is left-over for compute */
		144	if (pBuffer != bufferA)
		145	{
		146	const q7_t *pA = wt;
		147	for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
		148	{
		149	q31_t sum = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
		150	q15_t *pB = bufferA;
		151	/* basically each time it process 4 entries */
		152	uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;
		153
		154	while (colCnt)
		155	{
		156
		157	q31_t inA1, inA2;
		158	q31_t inB1, inB2;
		159
		160	pA = (const q7_t )read_and_pad_reordered((void )pA, &inA1, &inA2);
		161
		162	inB1 = *__SIMD32(pB)++;
		163	sum = __SMLAD(inA1, inB1, sum);
		164	inB2 = *__SIMD32(pB)++;
		165	sum = __SMLAD(inA2, inB2, sum);
		166
		167	colCnt--;
		168	}
		169	colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3;
		170	while (colCnt)
		171	{
		172	q7_t inA1 = *pA++;
		173	q15_t inB1 = *pB++;
		174	sum += inA1 * inB1;
		175	colCnt--;
		176	}
		177	*pOut = (q7_t) __SSAT((sum >> out_shift), 8);
		178	pOut++;
		179
		180	}
		181
		182	}
		183
		184	#else
		185	/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
		186
		187	int i, j, k, l, m, n;
		188	int conv_out;
		189	int in_row, in_col;
		190
		191	if (ch_im_in % 4 != 0 \|\| ch_im_out % 2 != 0 \|\| dim_kernel_x != 1 \|\| dim_kernel_y != 1
		192	\|\| padding_x != 0 \|\| padding_y != 0 \|\| stride_x != 1 \|\| stride_y != 1)
		193	{
		194	/* check if the input dimension meets the constraints */
		195	return ARM_MATH_SIZE_MISMATCH;
		196	}
		197
		198	for (i = 0; i < ch_im_out; i++)
		199	{
		200	for (j = 0; j < dim_im_out_y; j++)
		201	{
		202	for (k = 0; k < dim_im_out_x; k++)
		203	{
		204	conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
		205	for (m = 0; m < dim_kernel_y; m++)
		206	{
		207	for (n = 0; n < dim_kernel_x; n++)
		208	{
		209	// if-for implementation
		210	in_row = stride_y * j + m - padding_y;
		211	in_col = stride_x * k + n - padding_x;
		212	if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
		213	{
		214	for (l = 0; l < ch_im_in; l++)
		215	{
		216	conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
		217	wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_y + n) * ch_im_in + l];
		218	}
		219	}
		220	}
		221	}
		222	Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
		223	}
		224	}
		225	}
		226
		227	#endif /* ARM_MATH_DSP */
		228
		229	/* Return to application */
		230	return ARM_MATH_SUCCESS;
		231	}
		232
		233	/**
		234	* @} end of NNConv group
		235	*/

Subversion Repositories AFRtranscoder

(root)/trunk/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c – Rev 2