WebSVN – testOled – Blame – /trunk/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c

Rev	Author	Line No.	Line
2	mjames	1	/*
		2	* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
		3	*
		4	* SPDX-License-Identifier: Apache-2.0
		5	*
		6	* Licensed under the Apache License, Version 2.0 (the License); you may
		7	* not use this file except in compliance with the License.
		8	* You may obtain a copy of the License at
		9	*
		10	* www.apache.org/licenses/LICENSE-2.0
		11	*
		12	* Unless required by applicable law or agreed to in writing, software
		13	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		14	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		15	* See the License for the specific language governing permissions and
		16	* limitations under the License.
		17	*/
		18
		19	/* ----------------------------------------------------------------------
		20	* Project: CMSIS NN Library
		21	* Title: arm_convolve_HWC_q7_fast.c
		22	* Description: Fast Q7 version of convolution
		23	*
		24	* $Date: 17. January 2018
		25	* $Revision: V.1.0.0
		26	*
		27	* Target Processor: Cortex-M cores
		28	*
		29	* -------------------------------------------------------------------- */
		30
		31	#include "arm_math.h"
		32	#include "arm_nnfunctions.h"
		33
		34	/**
		35	* @ingroup groupNN
		36	*/
		37
		38	/**
		39	* @addtogroup NNConv
		40	* @{
		41	*/
		42
		43	/**
		44	* @brief Fast Q7 convolution function
		45	* @param[in] Im_in pointer to input tensor
		46	* @param[in] dim_im_in input tensor dimention
		47	* @param[in] ch_im_in number of input tensor channels
		48	* @param[in] wt pointer to kernel weights
		49	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		50	* @param[in] dim_kernel filter kernel size
		51	* @param[in] padding padding sizes
		52	* @param[in] stride convolution stride
		53	* @param[in] bias pointer to bias
		54	* @param[in] bias_shift amount of left-shift for bias
		55	* @param[in] out_shift amount of right-shift for output
		56	* @param[in,out] Im_out pointer to output tensor
		57	* @param[in] dim_im_out output tensor dimension
		58	* @param[in,out] bufferA pointer to buffer space for input
		59	* @param[in,out] bufferB pointer to buffer space for output
		60	* @return The function returns either
		61	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		62	*
		63	* @details
		64	*
		65	* <b>Buffer size:</b>
		66	*
		67	* bufferA size: 2ch_im_indim_kernel*dim_kernel
		68	*
		69	* bufferB size: 0
		70	*
		71	* <b>Input dimension constraints:</b>
		72	*
		73	* ch_im_in is multiple of 4 ( because of the SIMD32 read and swap )
		74	*
		75	* ch_im_out is multipe of 2 ( bacause 2x2 mat_mult kernel )
		76	*
		77	* The im2col converts the Q7 tensor input into Q15 column, which is stored in
		78	* bufferA. There is reordering happenning during this im2col process with
		79	* arm_q7_to_q15_reordered_no_shift. For every four elements, the second and
		80	* third elements are swapped.
		81	*
		82	* The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered does the
		83	* GEMM computation with the reordered columns.
		84	*
		85	* To speed-up the determination of the padding condition, we split the
		86	* computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}.
		87	* This reduces the total number of boundary condition checks and improves
		88	* the data copying performance.
		89	*/
		90
		91	arm_status
		92	arm_convolve_HWC_q7_fast(const q7_t * Im_in,
		93	const uint16_t dim_im_in,
		94	const uint16_t ch_im_in,
		95	const q7_t * wt,
		96	const uint16_t ch_im_out,
		97	const uint16_t dim_kernel,
		98	const uint16_t padding,
		99	const uint16_t stride,
		100	const q7_t * bias,
		101	const uint16_t bias_shift,
		102	const uint16_t out_shift,
		103	q7_t * Im_out,
		104	const uint16_t dim_im_out,
		105	q15_t * bufferA,
		106	q7_t * bufferB)
		107	{
		108
		109	#if defined (ARM_MATH_DSP)
		110	/* Run the following code for Cortex-M4 and Cortex-M7 */
		111
		112	int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
		113
		114	/*
		115	* Here we use bufferA as q15_t internally as computation are done with q15_t level
		116	* im2col are done to output in q15_t format from q7_t input
		117	*/
		118
		119	q15_t *pBuffer = bufferA;
		120	q7_t *pOut = Im_out;
		121
		122	if (ch_im_in % 4 != 0 \|\| ch_im_out % 2 != 0)
		123	{
		124	/* check if the input dimension meets the constraints */
		125	return ARM_MATH_SIZE_MISMATCH;
		126	}
		127
		128	/*
		129	* Here we split the entire matrix into three regions depending on the padding situation
		130	* Top: i_out_y from 0 to padding - 1
		131	* Middle: i_out_y from padding to dim_im_out-padding-1
		132	* Bottom: i_out_y from dim_im_out-padding to dim_im_out-1
		133	*/
		134
		135	/* top part */
		136	for (i_out_y = 0; i_out_y < padding; i_out_y++)
		137	{
		138	for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
		139	{
		140	/* This part implements the im2col function */
		141	for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
		142	{
		143	for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
		144	{
		145	if (i_ker_y < 0 \|\| i_ker_y >= dim_im_in \|\| i_ker_x < 0 \|\| i_ker_x >= dim_im_in)
		146	{
		147	/* arm_fill_q15(0, pBuffer, ch_im_in); */
		148	memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
		149	} else
		150	{
		151	arm_q7_to_q15_reordered_no_shift
		152	((q7_t ) Im_in + (i_ker_y dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
		153	}
		154	pBuffer += ch_im_in;
		155	}
		156	}
		157
		158	if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
		159	{
		160	pOut =
		161	arm_nn_mat_mult_kernel_q7_q15_reordered(wt,
		162	bufferA,
		163	ch_im_out,
		164	ch_im_in
		165	*
		166	dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
		167	/* counter reset */
		168	pBuffer = bufferA;
		169	}
		170	}
		171	}
		172
		173	/* middle part, here we also divide the x into left, mid and right */
		174	for (; i_out_y < dim_im_out - padding; i_out_y++)
		175	{
		176
		177	/* left part */
		178	for (i_out_x = 0; i_out_x < padding; i_out_x++)
		179	{
		180	/* This part implements the im2col function */
		181	for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
		182	{
		183	for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
		184	{
		185	if (i_ker_x < 0 \|\| i_ker_x >= dim_im_in)
		186	{
		187	/* arm_fill_q15(0, pBuffer, ch_im_in); */
		188	memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
		189	} else
		190	{
		191	arm_q7_to_q15_reordered_no_shift
		192	((q7_t ) Im_in + (i_ker_y dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
		193	}
		194	pBuffer += ch_im_in;
		195	}
		196	}
		197
		198	if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
		199	{
		200	pOut =
		201	arm_nn_mat_mult_kernel_q7_q15_reordered(wt,
		202	bufferA,
		203	ch_im_out,
		204	ch_im_in
		205	*
		206	dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
		207	/* counter reset */
		208	pBuffer = bufferA;
		209	}
		210	}
		211
		212	/* mid part */
		213	for (; i_out_x < dim_im_out - padding; i_out_x++)
		214	{
		215	/* This part implements the im2col function */
		216	for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
		217	{
		218	arm_q7_to_q15_reordered_no_shift((q7_t *) Im_in
		219	+
		220	(i_ker_y *
		221	dim_im_in +
		222	i_out_x *
		223	stride - padding) * ch_im_in, pBuffer, ch_im_in * dim_kernel);
		224	pBuffer += ch_im_in * dim_kernel;
		225	}
		226
		227	if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
		228	{
		229	pOut =
		230	arm_nn_mat_mult_kernel_q7_q15_reordered(wt,
		231	bufferA,
		232	ch_im_out,
		233	ch_im_in
		234	*
		235	dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
		236	/* counter reset */
		237	pBuffer = bufferA;
		238	}
		239	}
		240
		241	/* right part */
		242	for (; i_out_x < dim_im_out; i_out_x++)
		243	{
		244	/* This part implements the im2col function */
		245	for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
		246	{
		247	for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
		248	{
		249	if (i_ker_x < 0 \|\| i_ker_x >= dim_im_in)
		250	{
		251	/* arm_fill_q15(0, pBuffer, ch_im_in); */
		252	memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
		253	} else
		254	{
		255	arm_q7_to_q15_reordered_no_shift
		256	((q7_t ) Im_in + (i_ker_y dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
		257	}
		258	pBuffer += ch_im_in;
		259	}
		260	}
		261
		262	if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
		263	{
		264	pOut =
		265	arm_nn_mat_mult_kernel_q7_q15_reordered(wt,
		266	bufferA,
		267	ch_im_out,
		268	ch_im_in
		269	*
		270	dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
		271	/* counter reset */
		272	pBuffer = bufferA;
		273	}
		274	}
		275	}
		276
		277	for (; i_out_y < dim_im_out; i_out_y++)
		278	{
		279	for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
		280	{
		281	/* This part implements the im2col function */
		282	for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
		283	{
		284	for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
		285	{
		286	if (i_ker_y < 0 \|\| i_ker_y >= dim_im_in \|\| i_ker_x < 0 \|\| i_ker_x >= dim_im_in)
		287	{
		288	/* arm_fill_q15(0, pBuffer, ch_im_in); */
		289	memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
		290	} else
		291	{
		292	arm_q7_to_q15_reordered_no_shift
		293	((q7_t ) Im_in + (i_ker_y dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
		294	}
		295	pBuffer += ch_im_in;
		296	}
		297	}
		298
		299	if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
		300	{
		301	pOut =
		302	arm_nn_mat_mult_kernel_q7_q15_reordered(wt,
		303	bufferA,
		304	ch_im_out,
		305	ch_im_in
		306	*
		307	dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
		308	/* counter reset */
		309	pBuffer = bufferA;
		310	}
		311	}
		312	}
		313
		314	/* check if there is left-over for compute */
		315	if (pBuffer != bufferA)
		316	{
		317	const q7_t *pA = wt;
		318	int i;
		319
		320	for (i = 0; i < ch_im_out; i++)
		321	{
		322	q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
		323	q15_t *pB = bufferA;
		324	/* each time it process 4 entries */
		325	uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
		326
		327	while (colCnt)
		328	{
		329
		330	q31_t inA1, inA2;
		331	q31_t inB1, inB2;
		332
		333	pA = (q7_t ) read_and_pad_reordered((void )pA, &inA1, &inA2);
		334
		335	inB1 = *__SIMD32(pB)++;
		336	sum = __SMLAD(inA1, inB1, sum);
		337	inB2 = *__SIMD32(pB)++;
		338	sum = __SMLAD(inA2, inB2, sum);
		339
		340	colCnt--;
		341	}
		342	colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
		343	while (colCnt)
		344	{
		345	q7_t inA1 = *pA++;
		346	q15_t inB1 = *pB++;
		347	sum += inA1 * inB1;
		348	colCnt--;
		349	}
		350	*pOut = (q7_t) __SSAT((sum >> out_shift), 8);
		351	pOut++;
		352
		353	}
		354
		355	}
		356	#else
		357	/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
		358
		359	uint16_t i, j, k, l, m, n;
		360	int conv_out;
		361	signed char in_row, in_col;
		362
		363	if (ch_im_in % 4 != 0 \|\| ch_im_out % 2 != 0)
		364	{
		365	/* check if the input dimension meets the constraints */
		366	return ARM_MATH_SIZE_MISMATCH;
		367	}
		368
		369	for (i = 0; i < ch_im_out; i++)
		370	{
		371	for (j = 0; j < dim_im_out; j++)
		372	{
		373	for (k = 0; k < dim_im_out; k++)
		374	{
		375	conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift);
		376	for (m = 0; m < dim_kernel; m++)
		377	{
		378	for (n = 0; n < dim_kernel; n++)
		379	{
		380	// if-for implementation
		381	in_row = stride * j + m - padding;
		382	in_col = stride * k + n - padding;
		383	if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
		384	{
		385	for (l = 0; l < ch_im_in; l++)
		386	{
		387	conv_out +=
		388	Im_in[(in_row * dim_im_in + in_col) * ch_im_in +
		389	l] * wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel +
		390	n) * ch_im_in + l];
		391	}
		392	}
		393	}
		394	}
		395	Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
		396	}
		397	}
		398	}
		399
		400	#endif /* ARM_MATH_DSP */
		401
		402	/* Return to application */
		403	return ARM_MATH_SUCCESS;
		404	}
		405
		406	/**
		407	* @} end of NNConv group
		408	*/

Subversion Repositories testOled

(root)/trunk/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c – Rev 2