/*
 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_pool_q7_HWC.c
 * Description:  Pooling function implementations
 *
 * $Date:        17. January 2018
 * $Revision:    V.1.0.0
 *
 * Target Processor: Cortex-M cores
 *
 * -------------------------------------------------------------------- */

#include "arm_math.h"
#include "arm_nnfunctions.h"

#if defined (ARM_MATH_DSP)

/**
 * @brief A few utility functions used by pooling functions
 *
 */

/* Scale a q15 accumulator buffer back down to q7: each element is divided
 * by `scale` (truncating toward zero) */
static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale)
{
    int i;

    for (i = 0; i < length; i++)
    {
        target[i] = (q7_t) (buffer[i] / scale);
    }
}
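
/* The division above truncates toward zero. A hypothetical rounding variant
 * (illustrative sketch only, not part of the library; assumes scale > 0)
 * would bias the numerator by half the divisor before dividing:
 */
#if 0
static void buffer_scale_back_q15_to_q7_rounded(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale)
{
    int i;
    const q15_t bias = (q15_t) (scale / 2);

    for (i = 0; i < length; i++)
    {
        /* round half away from zero instead of truncating */
        target[i] = (q7_t) ((buffer[i] + (buffer[i] >= 0 ? bias : -bias)) / scale);
    }
}
#endif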

/* In-place element-wise max: base[i] = max(base[i], target[i]), processing
 * four q7 values per 32-bit word */
static void compare_and_replace_if_larger_q7(q7_t * base,           // base data
                                             q7_t * target,         // compare target
                                             const uint16_t length  // data length
    )
{
    q7_t *pIn = base;
    q7_t *pCom = target;
    union arm_nnword in;
    union arm_nnword com;
    uint16_t cnt = length >> 2;

    while (cnt > 0u)
    {
        in.word = *__SIMD32(pIn);
        com.word = *__SIMD32(pCom)++;

        /* if-based byte-wise max (see the __SSUB8/__SEL sketch below for a
         * branch-free alternative) */
        if (com.bytes[0] > in.bytes[0])
            in.bytes[0] = com.bytes[0];
        if (com.bytes[1] > in.bytes[1])
            in.bytes[1] = com.bytes[1];
        if (com.bytes[2] > in.bytes[2])
            in.bytes[2] = com.bytes[2];
        if (com.bytes[3] > in.bytes[3])
            in.bytes[3] = com.bytes[3];

        *__SIMD32(pIn)++ = in.word;

        cnt--;
    }

    /* handle the 1-3 leftover elements when length is not a multiple of 4 */
    cnt = length & 0x3;
    while (cnt > 0u)
    {
        if (*pCom > *pIn)
        {
            *pIn = *pCom;
        }
        pIn++;
        pCom++;
        cnt--;
    }
}
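
/* Sketch of a branch-free alternative to the four if statements in the loop
 * above, using the DSP extension's GE flags: __SSUB8 performs a signed
 * per-byte subtraction and sets GE[i] when com[i] >= in[i]; __SEL then picks,
 * per byte, its first operand where GE is set. Illustrative only; not used by
 * this file:
 */
#if 0
        (void) __SSUB8(com.word, in.word);       /* GE[i] = (com[i] >= in[i]), signed */
        in.word = __SEL(com.word, in.word);      /* byte-wise max of com and in */
#endif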

/* Widening accumulate: base[i] += target[i], expanding q7 values to q15,
 * four elements per iteration */
static void accumulate_q7_to_q15(q15_t * base, q7_t * target, const uint16_t length)
{
    q15_t *pCnt = base;
    q7_t *pV = target;
    q31_t v1, v2, vo1, vo2;
    uint16_t cnt = length >> 2;
    q31_t in;

    while (cnt > 0u)
    {
        q31_t value = *__SIMD32(pV)++;
        v1 = __SXTB16(__ROR(value, 8));   /* sign-extend bytes 1 and 3 to halfwords */
        v2 = __SXTB16(value);             /* sign-extend bytes 0 and 2 to halfwords */
#ifndef ARM_MATH_BIG_ENDIAN
        /* repack so the halfword order matches the original byte order */
        vo2 = __PKHTB(v1, v2, 16);
        vo1 = __PKHBT(v2, v1, 16);
#else
        vo1 = __PKHTB(v1, v2, 16);
        vo2 = __PKHBT(v2, v1, 16);
#endif

        /* saturating halfword adds into the q15 accumulator */
        in = *__SIMD32(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo1, in);

        in = *__SIMD32(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo2, in);

        cnt--;
    }
    /* handle the 1-3 leftover elements when length is not a multiple of 4 */
    cnt = length & 0x3;
    while (cnt > 0u)
    {
        *pCnt++ += *pV++;
        cnt--;
    }
}
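
/* For reference, the SIMD loop above is equivalent to this portable scalar
 * version (illustrative sketch, not part of the library), except that
 * __QADD16 additionally saturates each halfword sum to the q15 range:
 */
#if 0
static void accumulate_q7_to_q15_ref(q15_t * base, q7_t * target, const uint16_t length)
{
    uint16_t i;

    for (i = 0; i < length; i++)
    {
        base[i] += (q15_t) target[i];   /* widen q7 to q15, then accumulate */
    }
}
#endif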

#endif                          // ARM_MATH_DSP

/**
 *  @ingroup groupNN
 */

/**
 * @addtogroup Pooling
 * @{
 */
/**
 * @brief Q7 max pooling function
 * @param[in,out]  Im_in       pointer to input tensor
 * @param[in]      dim_im_in   input tensor dimension
 * @param[in]      ch_im_in    number of input tensor channels
 * @param[in]      dim_kernel  filter kernel size
 * @param[in]      padding     padding size
 * @param[in]      stride      convolution stride
 * @param[in]      dim_im_out  output tensor dimension
 * @param[in,out]  bufferA     pointer to buffer space for input (not used by this function)
 * @param[in,out]  Im_out      pointer to output tensor
 * @return none.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 0
 *
 * The pooling function is implemented as split x-pooling then
 * y-pooling.
 *
 * This pooling function is input-destructive. Input data is undefined
 * after calling this function.
 *
 */

void
arm_maxpool_q7_HWC(q7_t * Im_in,
                   const uint16_t dim_im_in,
                   const uint16_t ch_im_in,
                   const uint16_t dim_kernel,
                   const uint16_t padding,
                   const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
{

#if defined (ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    int16_t i_x, i_y;

    /* first, do the pooling along the x axis */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel */
            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t *win_start;
            q7_t *win_stop;
            if (i_x * stride - padding < 0)
            {
                win_start = target;
            } else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            } else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over the initial data */
            /* arm_copy_q7(win_start, target, ch_im_in); */
            memmove(target, win_start, ch_im_in);

            /* start the max operation from the second part of the window */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
            }
        }
    }

    /* then, do the pooling along the y axis */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {

        /* for each output row */
        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t *row_start;
        q7_t *row_end;
        /* set the starting row */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        } else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* set the stopping row */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        } else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row */
        /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */
        memmove(target, row_start, dim_im_out * ch_im_in);

        /* move over to the next row */
        row_start += ch_im_in * dim_im_in;

        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
        }
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

    int16_t i_ch_in, i_x, i_y;
    int16_t k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                int max = -129;   /* one below the q7 minimum of -128 */
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
                            {
                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            }
                        }
                    }
                }
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
            }
        }
    }

#endif                          /* ARM_MATH_DSP */

}
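
/* Example call with hypothetical dimensions (illustrative only): 2x2 max
 * pooling, stride 2, no padding, on a 32x32 16-channel HWC image, giving a
 * 16x16x16 output. Note that the input buffer is overwritten, and bufferA is
 * unused (size 0), so NULL may be passed:
 */
#if 0
static void example_maxpool_usage(void)
{
    static q7_t img[32 * 32 * 16];   /* input tensor, HWC layout; destroyed by the call */
    static q7_t out[16 * 16 * 16];   /* output tensor */

    arm_maxpool_q7_HWC(img, 32, 16, 2, 0, 2, 16, NULL, out);
}
#endif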

/**
 * @brief Q7 average pooling function
 * @param[in,out]  Im_in       pointer to input tensor
 * @param[in]      dim_im_in   input tensor dimension
 * @param[in]      ch_im_in    number of input tensor channels
 * @param[in]      dim_kernel  filter kernel size
 * @param[in]      padding     padding size
 * @param[in]      stride      convolution stride
 * @param[in]      dim_im_out  output tensor dimension
 * @param[in,out]  bufferA     pointer to buffer space for input
 * @param[in,out]  Im_out      pointer to output tensor
 * @return none.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*dim_im_out*ch_im_in
 *
 * The pooling function is implemented as split x-pooling then
 * y-pooling.
 *
 * This pooling function is input-destructive. Input data is undefined
 * after calling this function.
 *
 */

void
arm_avepool_q7_HWC(q7_t * Im_in,
                   const uint16_t dim_im_in,
                   const uint16_t ch_im_in,
                   const uint16_t dim_kernel,
                   const uint16_t padding,
                   const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
{

#if defined (ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    q15_t *buffer = (q15_t *) bufferA;
    int16_t i_x, i_y;
    int16_t count = 0;

    /* first, do the pooling along the x axis */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel */
            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t *win_start;
            q7_t *win_stop;
            if (i_x * stride - padding < 0)
            {
                win_start = target;
            } else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            } else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over the initial data, widening q7 to q15 */
            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
            count = 1;

            /* start the accumulation from the second part of the window */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
                count++;
            }
            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
        }
    }

    /* then, do the pooling along the y axis */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {
        /* for each output row */
        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t *row_start;
        q7_t *row_end;
        /* set the starting row */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        } else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* set the stopping row */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        } else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row, widening q7 to q15 */
        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
        count = 1;

        /* move over to the next row */
        row_start += ch_im_in * dim_im_in;

        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
            count++;
        }
        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

    int16_t i_ch_in, i_x, i_y;
    int16_t k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                int sum = 0;
                int count = 0;
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            count++;
                        }
                    }
                }
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count;
            }
        }
    }

#endif                          /* ARM_MATH_DSP */

}
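
/* Example call with hypothetical dimensions (illustrative only): 2x2 average
 * pooling, stride 2, no padding, on a 32x32 16-channel HWC image. On
 * DSP-capable cores bufferA must provide 2*dim_im_out*ch_im_in bytes of
 * scratch (a q15 working row):
 */
#if 0
static void example_avepool_usage(void)
{
    static q7_t img[32 * 32 * 16];       /* input tensor, HWC layout; destroyed by the call */
    static q7_t out[16 * 16 * 16];       /* output tensor */
    static q7_t scratch[2 * 16 * 16];    /* bufferA: 2 * dim_im_out * ch_im_in bytes */

    arm_avepool_q7_HWC(img, 32, 16, 2, 0, 2, 16, scratch, out);
}
#endif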

/**
 * @} end of Pooling group
 */