WebSVN – testOled – Blame – /trunk/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c

Rev	Author	Line No.	Line
2	mjames	1	/*
		2	* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
		3	*
		4	* SPDX-License-Identifier: Apache-2.0
		5	*
		6	* Licensed under the Apache License, Version 2.0 (the License); you may
		7	* not use this file except in compliance with the License.
		8	* You may obtain a copy of the License at
		9	*
		10	* www.apache.org/licenses/LICENSE-2.0
		11	*
		12	* Unless required by applicable law or agreed to in writing, software
		13	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		14	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		15	* See the License for the specific language governing permissions and
		16	* limitations under the License.
		17	*/
		18
		19	/* ----------------------------------------------------------------------
		20	* Project: CMSIS NN Library
		21	* Title: arm_convolve_HWC_q7_fast_nonsquare.c
		22	* Description: Fast Q7 version of convolution (non-square shape)
		23	*
		24	* $Date: 17. January 2018
		25	* $Revision: V.1.0.0
		26	*
		27	* Target Processor: Cortex-M cores
		28	*
		29	* -------------------------------------------------------------------- */
		30
		31	#include "arm_math.h"
		32	#include "arm_nnfunctions.h"
		33
		34	/**
		35	* @ingroup groupNN
		36	*/
		37
		38	/**
		39	* @addtogroup NNConv
		40	* @{
		41	*/
		42
		43	/**
		44	* @brief Fast Q7 convolution function (non-sqaure shape)
		45	* @param[in] Im_in pointer to input tensor
		46	* @param[in] dim_im_in_x input tensor dimention x
		47	* @param[in] dim_im_in_y input tensor dimention y
		48	* @param[in] ch_im_in number of input tensor channels
		49	* @param[in] wt pointer to kernel weights
		50	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		51	* @param[in] dim_kernel_x filter kernel size x
		52	* @param[in] dim_kernel_y filter kernel size y
		53	* @param[in] padding_x padding size x
		54	* @param[in] padding_y padding size y
		55	* @param[in] stride_x convolution stride x
		56	* @param[in] stride_y convolution stride y
		57	* @param[in] bias pointer to bias
		58	* @param[in] bias_shift amount of left-shift for bias
		59	* @param[in] out_shift amount of right-shift for output
		60	* @param[in,out] Im_out pointer to output tensor
		61	* @param[in] dim_im_out_x output tensor dimension x
		62	* @param[in] dim_im_out_y output tensor dimension y
		63	* @param[in,out] bufferA pointer to buffer space for input
		64	* @param[in,out] bufferB pointer to buffer space for output
		65	* @return The function returns either
		66	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		67	*
		68	* This function is the version with full list of optimization tricks, but with
		69	* some contraints:
		70	* ch_im_in is multiple of 4
		71	* ch_im_out is multiple of 2
		72	*/
		73
		74	arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
		75	const uint16_t dim_im_in_x,
		76	const uint16_t dim_im_in_y,
		77	const uint16_t ch_im_in,
		78	const q7_t * wt,
		79	const uint16_t ch_im_out,
		80	const uint16_t dim_kernel_x,
		81	const uint16_t dim_kernel_y,
		82	const uint16_t padding_x,
		83	const uint16_t padding_y,
		84	const uint16_t stride_x,
		85	const uint16_t stride_y,
		86	const q7_t * bias,
		87	const uint16_t bias_shift,
		88	const uint16_t out_shift,
		89	q7_t * Im_out,
		90	const uint16_t dim_im_out_x,
		91	const uint16_t dim_im_out_y,
		92	q15_t * bufferA,
		93	q7_t * bufferB)
		94	{
		95
		96	#if defined (ARM_MATH_DSP)
		97	/* Run the following code for Cortex-M4 and Cortex-M7 */
		98
		99	int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
		100
		101	/* -----------------------
		102	* Here we use bufferA as q15_t internally as computation are done with q15_t level
		103	* im2col are done to output in q15_t format from q7_t input
		104	*/
		105
		106	q15_t *pBuffer = bufferA;
		107	q7_t *pOut = Im_out;
		108
		109	if (ch_im_in % 4 != 0 \|\| ch_im_out % 2 != 0)
		110	{
		111	/* check if the input dimension meets the constraints */
		112	return ARM_MATH_SIZE_MISMATCH;
		113	}
		114
		115	/*
		116	* Here we split the entire matrix into three regions depending on the padding situation
		117	* Top: i_out_y from 0 to padding - 1
		118	* Middle: i_out_y from padding to dim_im_out-padding-1
		119	* Bottom: i_out_y from dim_im_out-padding to dim_im_out-1
		120	*/
		121
		122	/* top part */
		123	for (i_out_y = 0; i_out_y < padding_y; i_out_y++)
		124	{
		125	for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
		126	{
		127	/* This part implements the im2col function */
		128	for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
		129	i_ker_y++)
		130	{
		131	for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
		132	i_ker_x++)
		133	{
		134	if (i_ker_y < 0 \|\| i_ker_y >= dim_im_in_y \|\| i_ker_x < 0 \|\| i_ker_x >= dim_im_in_x)
		135	{
		136	/* arm_fill_q15(0, pBuffer, ch_im_in); */
		137	memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
		138	} else
		139	{
		140	arm_q7_to_q15_reordered_no_shift((q7_t ) Im_in + (i_ker_y dim_im_in_x + i_ker_x) * ch_im_in,
		141	pBuffer, ch_im_in);
		142	}
		143	pBuffer += ch_im_in;
		144	}
		145	}
		146
		147	if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
		148	{
		149	pOut =
		150	arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
		151	bias_shift, out_shift, bias, pOut);
		152	/* counter reset */
		153	pBuffer = bufferA;
		154	}
		155	}
		156	}
		157
		158	/* middle part, here we also divide the x into left, mid and right */
		159	for (; i_out_y < dim_im_out_y - padding_y; i_out_y++)
		160	{
		161
		162	/* left part */
		163	for (i_out_x = 0; i_out_x < padding_x; i_out_x++)
		164	{
		165	/* This part implements the im2col function */
		166	for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
		167	i_ker_y++)
		168	{
		169	for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
		170	i_ker_x++)
		171	{
		172	if (i_ker_x < 0 \|\| i_ker_x >= dim_im_in_x)
		173	{
		174	/* arm_fill_q15(0, pBuffer, ch_im_in); */
		175	memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
		176	} else
		177	{
		178	arm_q7_to_q15_reordered_no_shift((q7_t ) Im_in + (i_ker_y dim_im_in_x + i_ker_x) * ch_im_in,
		179	pBuffer, ch_im_in);
		180	}
		181	pBuffer += ch_im_in;
		182	}
		183	}
		184
		185	if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
		186	{
		187	pOut =
		188	arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
		189	bias_shift, out_shift, bias, pOut);
		190	/* counter reset */
		191	pBuffer = bufferA;
		192	}
		193	}
		194
		195	/* mid part */
		196	for (; i_out_x < dim_im_out_x - padding_x; i_out_x++)
		197	{
		198	/* This part implements the im2col function */
		199	for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
		200	i_ker_y++)
		201	{
		202	arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in +
		203	(i_ker_y * dim_im_in_x + i_out_x * stride_x - padding_x) * ch_im_in,
		204	pBuffer, ch_im_in * dim_kernel_x);
		205	pBuffer += ch_im_in * dim_kernel_x;
		206	}
		207
		208	if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
		209	{
		210	pOut =
		211	arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
		212	bias_shift, out_shift, bias, pOut);
		213	/* counter reset */
		214	pBuffer = bufferA;
		215	}
		216	}
		217
		218	/* right part */
		219	for (; i_out_x < dim_im_out_x; i_out_x++)
		220	{
		221	/* This part implements the im2col function */
		222	for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
		223	i_ker_y++)
		224	{
		225	for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
		226	i_ker_x++)
		227	{
		228	if (i_ker_x < 0 \|\| i_ker_x >= dim_im_in_x)
		229	{
		230	/* arm_fill_q15(0, pBuffer, ch_im_in); */
		231	memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
		232	} else
		233	{
		234	arm_q7_to_q15_reordered_no_shift((q7_t ) Im_in + (i_ker_y dim_im_in_x + i_ker_x) * ch_im_in,
		235	pBuffer, ch_im_in);
		236	}
		237	pBuffer += ch_im_in;
		238	}
		239	}
		240
		241	if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
		242	{
		243	pOut =
		244	arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
		245	bias_shift, out_shift, bias, pOut);
		246	/* counter reset */
		247	pBuffer = bufferA;
		248	}
		249	}
		250	}
		251
		252	for (; i_out_y < dim_im_out_y; i_out_y++)
		253	{
		254	for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
		255	{
		256	/* This part implements the im2col function */
		257	for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
		258	i_ker_y++)
		259	{
		260	for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
		261	i_ker_x++)
		262	{
		263	if (i_ker_y < 0 \|\| i_ker_y >= dim_im_in_y \|\| i_ker_x < 0 \|\| i_ker_x >= dim_im_in_x)
		264	{
		265	/* arm_fill_q15(0, pBuffer, ch_im_in); */
		266	memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
		267	} else
		268	{
		269	arm_q7_to_q15_reordered_no_shift((q7_t ) Im_in + (i_ker_y dim_im_in_x + i_ker_x) * ch_im_in,
		270	pBuffer, ch_im_in);
		271	}
		272	pBuffer += ch_im_in;
		273	}
		274	}
		275
		276	if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
		277	{
		278	pOut =
		279	arm_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y,
		280	bias_shift, out_shift, bias, pOut);
		281	/* counter reset */
		282	pBuffer = bufferA;
		283	}
		284	}
		285	}
		286
		287	/* check if there is left-over for compute */
		288	if (pBuffer != bufferA)
		289	{
		290	const q7_t *pA = wt;
		291	int i;
		292	for (i = 0; i < ch_im_out; i++)
		293	{
		294	q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
		295	q15_t *pB = bufferA;
		296	/* basically each time it process 4 entries */
		297	uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;
		298
		299	while (colCnt)
		300	{
		301
		302	q31_t inA1, inA2;
		303	q31_t inB1, inB2;
		304
		305	pA = (const q7_t )read_and_pad_reordered((void )pA, &inA1, &inA2);
		306
		307	inB1 = *__SIMD32(pB)++;
		308	sum = __SMLAD(inA1, inB1, sum);
		309	inB2 = *__SIMD32(pB)++;
		310	sum = __SMLAD(inA2, inB2, sum);
		311
		312	colCnt--;
		313	}
		314	colCnt = (ch_im_in * dim_kernel_y * dim_kernel_x) & 0x3;
		315	while (colCnt)
		316	{
		317	q7_t inA1 = *pA++;
		318	q15_t inB1 = *pB++;
		319	sum += inA1 * inB1;
		320	colCnt--;
		321	}
		322	*pOut = (q7_t) __SSAT((sum >> out_shift), 8);
		323	pOut++;
		324
		325	}
		326
		327	}
		328
		329	#else
		330	/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
		331	int i, j, k, l, m, n;
		332	int conv_out;
		333	int in_row, in_col;
		334
		335	if (ch_im_in % 4 != 0 \|\| ch_im_out % 2 != 0)
		336	{
		337	/* check if the input dimension meets the constraints */
		338	return ARM_MATH_SIZE_MISMATCH;
		339	}
		340
		341	for (i = 0; i < ch_im_out; i++)
		342	{
		343	for (j = 0; j < dim_im_out_y; j++)
		344	{
		345	for (k = 0; k < dim_im_out_x; k++)
		346	{
		347	conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
		348	for (m = 0; m < dim_kernel_y; m++)
		349	{
		350	for (n = 0; n < dim_kernel_x; n++)
		351	{
		352	/* if-for implementation */
		353	in_row = stride_y * j + m - padding_y;
		354	in_col = stride_x * k + n - padding_x;
		355	if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
		356	{
		357	for (l = 0; l < ch_im_in; l++)
		358	{
		359	conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
		360	wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + l];
		361	}
		362	}
		363	}
		364	}
		365	Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
		366	}
		367	}
		368	}
		369
		370
		371	#endif /* ARM_MATH_DSP */
		372
		373	/* Return to application */
		374	return ARM_MATH_SUCCESS;
		375	}
		376
		377	/**
		378	* @} end of NNConv group
		379	*/

Subversion Repositories testOled

(root)/trunk/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c – Rev 2