WebSVN – AFRtranscoder – Blame – /trunk/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c

Rev	Author	Line No.	Line
2	mjames	1	/*
		2	* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
		3	*
		4	* SPDX-License-Identifier: Apache-2.0
		5	*
		6	* Licensed under the Apache License, Version 2.0 (the License); you may
		7	* not use this file except in compliance with the License.
		8	* You may obtain a copy of the License at
		9	*
		10	* www.apache.org/licenses/LICENSE-2.0
		11	*
		12	* Unless required by applicable law or agreed to in writing, software
		13	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		14	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		15	* See the License for the specific language governing permissions and
		16	* limitations under the License.
		17	*/
		18
		19	/* ----------------------------------------------------------------------
		20	* Project: CMSIS NN Library
		21	* Title: arm_depthwise_separable_conv_HWC_q7.c
		22	* Description: Q7 depthwise separable convolution function
		23	*
		24	* $Date: 17. January 2018
		25	* $Revision: V.1.0.0
		26	*
		27	* Target Processor: Cortex-M cores
		28	*
		29	* -------------------------------------------------------------------- */
		30
		31	#include "arm_math.h"
		32	#include "arm_nnfunctions.h"
		33
		34	/**
		35	* @ingroup groupNN
		36	*/
		37
		38	/**
		39	* @addtogroup NNConv
		40	* @{
		41	*/
		42
		43	/**
		44	* @brief Q7 depthwise separable convolution function
		45	* @param[in] Im_in pointer to input tensor
		46	* @param[in] dim_im_in input tensor dimension
		47	* @param[in] ch_im_in number of input tensor channels
		48	* @param[in] wt pointer to kernel weights
		49	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		50	* @param[in] dim_kernel filter kernel size
		51	* @param[in] padding padding sizes
		52	* @param[in] stride convolution stride
		53	* @param[in] bias pointer to bias
		54	* @param[in] bias_shift amount of left-shift for bias
		55	* @param[in] out_shift amount of right-shift for output
		56	* @param[in,out] Im_out pointer to output tensor
		57	* @param[in] dim_im_out output tensor dimension
		58	* @param[in,out] bufferA pointer to buffer space for input
		59	* @param[in,out] bufferB pointer to buffer space for output
		60	* @return The function returns either
		61	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		62	*
		63	* @details
		64	*
		65	* <b>Buffer size:</b>
		66	*
		67	* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
		68	*
		69	* bufferB size: 0
		70	*
		71	* <b>Input dimension constraints:</b>
		72	*
		73	* ch_im_in equals ch_im_out
		74	*
		75	* Implementation:
		76	* There are 3 nested loop here:
		77	* Inner loop: calculate each output value with MAC instruction over an accumulator
		78	* Mid loop: loop over different output channel
		79	* Outer loop: loop over different output (x, y)
		80	*/
		81
		82	arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
		83	const uint16_t dim_im_in,
		84	const uint16_t ch_im_in,
		85	const q7_t * wt,
		86	const uint16_t ch_im_out,
		87	const uint16_t dim_kernel,
		88	const uint16_t padding,
		89	const uint16_t stride,
		90	const q7_t * bias,
		91	const uint16_t bias_shift,
		92	const uint16_t out_shift,
		93	q7_t * Im_out,
		94	const uint16_t dim_im_out,
		95	q15_t * bufferA,
		96	q7_t * bufferB)
		97	{
		98
		99	#if defined (ARM_MATH_DSP)
		100	/* Run the following code for Cortex-M4 and Cortex-M7 */
		101
		102	int16_t i_out_y, i_out_x;
		103	int16_t i_ker_y, i_ker_x;
		104	q7_t colBuffer = (q7_t ) bufferA;
		105	q7_t *pBuffer = colBuffer;
		106	const q7_t *pBias = bias;
		107	q7_t *pOut = Im_out;
		108	uint16_t rowCnt;
		109	uint16_t row_shift;
		110
		111	/* do some checking here, basically ch_im_in == ch_im_out */
		112	if (ch_im_in != ch_im_out)
		113	{
		114	return ARM_MATH_SIZE_MISMATCH;
		115	}
		116
		117	for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
		118	{
		119	for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
		120	{
		121	/* we first do im2col here */
		122	for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
		123	{
		124	for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
		125	{
		126	if (i_ker_y < 0 \|\| i_ker_y >= dim_im_in \|\| i_ker_x < 0 \|\| i_ker_x >= dim_im_in)
		127	{
		128	/* arm_fill_q7(0, pBuffer, ch_im_in); */
		129	memset(pBuffer, 0, ch_im_in);
		130	} else
		131	{
		132	/* arm_copy_q7((q7_t ) Im_in + (i_ker_y dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); */
		133	memcpy(pBuffer, (q7_t ) Im_in + (i_ker_y dim_im_in + i_ker_x) * ch_im_in, ch_im_in);
		134	}
		135	pBuffer += ch_im_in;
		136	}
		137	}
		138
		139	/* we will do the computation here for each channel */
		140	rowCnt = ch_im_out >> 2;
		141	row_shift = 0;
		142	pBias = bias;
		143
		144	while (rowCnt)
		145	{
		146	q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
		147	q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
		148	q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
		149	q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
		150
		151	uint16_t colCnt = (dim_kernel * dim_kernel) >> 1;
		152	q7_t *pB = colBuffer + row_shift;
		153	const q7_t *pA = wt + row_shift;
		154	row_shift += 4;
		155
		156	#ifdef USE_INTRINSIC
		157
		158	#ifndef ARM_MATH_BIG_ENDIAN
		159
		160	while (colCnt)
		161	{
		162	q31_t inA1, inA2, inB1, inB2, opA, opB;
		163
		164	inB1 = *__SIMD32(pB);
		165	pB += ch_im_in;
		166	opB = *__SIMD32(pB);
		167	pB += ch_im_in;
		168	inB2 = __PKHTB(opB, inB1, 16);
		169	inB1 = __PKHBT(inB1, opB, 16);
		170	inA1 = *__SIMD32(pA);
		171	pA += ch_im_in;
		172	opB = *__SIMD32(pA);
		173	pA += ch_im_in;
		174	inA2 = __PKHTB(opB, inA1, 16);
		175	inA1 = __PKHBT(inA1, opB, 16);
		176	opA = __SXTB16(inA1);
		177	opB = __SXTB16(inB1);
		178	sum = __SMLAD(opA, opB, sum);
		179	opA = __SXTB16(__ROR(inA1, 8));
		180	opB = __SXTB16(__ROR(inB1, 8));
		181	sum2 = __SMLAD(opA, opB, sum2);
		182	opA = __SXTB16(inA2);
		183	opB = __SXTB16(inB2);
		184	sum3 = __SMLAD(opA, opB, sum3);
		185	opA = __SXTB16(__ROR(inA2, 8));
		186	opB = __SXTB16(__ROR(inB2, 8));
		187	sum4 = __SMLAD(opA, opB, sum4);
		188	colCnt--;
		189	}
		190	#else
		191
		192	while (colCnt)
		193	{
		194	q31_t inA1, inA2, inB1, inB2, opA, opB;
		195
		196	inB1 = *__SIMD32(pB);
		197	pB += ch_im_in;
		198	opB = *__SIMD32(pB);
		199	pB += ch_im_in;
		200	inB2 = __PKHBT(opB, inB1, 16);
		201	inB1 = __PKHTB(inB1, opB, 16);
		202	inA1 = *__SIMD32(pA);
		203	pA += ch_im_in;
		204	opB = *__SIMD32(pA);
		205	pA += ch_im_in;
		206	inA2 = __PKHBT(opB, inA1, 16);
		207	inA1 = __PKHTB(inA1, opB, 16);
		208	opA = __SXTB16(inA1);
		209	opB = __SXTB16(inB1);
		210	sum2 = __SMLAD(opA, opB, sum2);
		211	opA = __SXTB16(__ROR(inA1, 8));
		212	opB = __SXTB16(__ROR(inB1, 8));
		213	sum = __SMLAD(opA, opB, sum);
		214	opA = __SXTB16(inA2);
		215	opB = __SXTB16(inB2);
		216	sum4 = __SMLAD(opA, opB, sum4);
		217	opA = __SXTB16(__ROR(inA2, 8));
		218	opB = __SXTB16(__ROR(inB2, 8));
		219	sum3 = __SMLAD(opA, opB, sum3);
		220	colCnt--;
		221	}
		222
		223	#endif /* ARM_MATH_BIG_ENDIAN */
		224
		225	#else
		226
		227	#ifndef ARM_MATH_BIG_ENDIAN
		228	/*
		229	* r0 r1 r2 r3 r4 r5
		230	* inA1, inA2, inB1, inB2, opA, opB
		231	*/
		232
		233	asm volatile ("COL_LOOP_%=:\n"
		234	"ldr.w r2, [%[pB], #0]\n"
		235	"add.w %[pB], %[pB], %[ch_im_in]\n"
		236	"ldr.w r5, [%[pB], #0]\n"
		237	"add.w %[pB], %[pB], %[ch_im_in]\n"
		238	"pkhtb r3, r5, r2, ASR #16\n"
		239	"pkhbt r2, r2, r5, LSL #16\n"
		240	"ldr.w r0, [%[pA], #0]\n"
		241	"add.w %[pA], %[pA], %[ch_im_in]\n"
		242	"ldr.w r5, [%[pA], #0]\n"
		243	"add.w %[pA], %[pA], %[ch_im_in]\n"
		244	"pkhtb r1, r5, r0, ASR #16\n"
		245	"pkhbt r0, r0, r5, LSL #16\n"
		246	"sxtb16 r4, r0\n"
		247	"sxtb16 r5, r2\n"
		248	"smlad %[sum], r4, r5, %[sum]\n"
		249	"mov.w r4, r0, ror #8\n"
		250	"mov.w r5, r2, ror #8\n"
		251	"sxtb16 r4, r4\n"
		252	"sxtb16 r5, r5\n"
		253	"smlad %[sum2], r4, r5, %[sum2]\n"
		254	"sxtb16 r4, r1\n"
		255	"sxtb16 r5, r3\n"
		256	"smlad %[sum3], r4, r5, %[sum3]\n"
		257	"mov.w r4, r1, ror #8\n"
		258	"mov.w r5, r3, ror #8\n"
		259	"sxtb16 r4, r4\n"
		260	"sxtb16 r5, r5\n"
		261	"smlad %[sum4], r4, r5, %[sum4]\n"
		262	"subs %[colCnt], #1\n"
		263	"bne COL_LOOP_%=\n":[sum]
		264	"+r"(sum),[sum2] "+r"(sum2),
		265	[sum3] "+r"(sum3),
		266	[sum4] "+r"(sum4),[pB] "+r"(pB),
		267	[pA] "+r"(pA):[colCnt]
		268	"r"(colCnt),[ch_im_in] "r"(ch_im_in):"r0", "r1", "r2", "r3", "r4", "r5");
		269	#else
		270	/*
		271	* r0 r1 r2 r3 r4 r5
		272	* inA1, inA2, inB1, inB2, opA, opB
		273	*/
		274	asm volatile ("COL_LOOP_%=:\n"
		275	"ldr.w r2, [%[pB], #0]\n"
		276	"add.w %[pB], %[pB], %[ch_im_in]\n"
		277	"ldr.w r5, [%[pB], #0]\n"
		278	"add.w %[pB], %[pB], %[ch_im_in]\n"
		279	"pkhbt r3, r5, r2, LSL #16\n"
		280	"pkhtb r2, r2, r5, ASR #16\n"
		281	"ldr.w r0, [%[pA], #0]\n"
		282	"add.w %[pA], %[pA], %[ch_im_in]\n"
		283	"ldr.w r5, [%[pA], #0]\n"
		284	"add.w %[pA], %[pA], %[ch_im_in]\n"
		285	"pkhbt r1, r5, r0, LSL #16\n"
		286	"pkhtb r0, r0, r5, ASR #16\n"
		287	"sxtb16 r4, r0\n"
		288	"sxtb16 r5, r2\n"
		289	"smlad %[sum2], r4, r5, %[sum2]\n"
		290	"mov.w r4, r0, ror #8\n"
		291	"mov.w r5, r2, ror #8\n"
		292	"sxtb16 r4, r4\n"
		293	"sxtb16 r5, r5\n"
		294	"smlad %[sum], r4, r5, %[sum]\n"
		295	"sxtb16 r4, r1\n"
		296	"sxtb16 r5, r3\n"
		297	"smlad %[sum4], r4, r5, %[sum4]\n"
		298	"mov.w r4, r1, ror #8\n"
		299	"mov.w r5, r3, ror #8\n"
		300	"sxtb16 r4, r4\n"
		301	"sxtb16 r5, r5\n"
		302	"smlad %[sum3], r4, r5, %[sum3]\n"
		303	"subs %[colCnt], #1\n"
		304	"bne COL_LOOP_%=\n":[sum]
		305	"+r"(sum),[sum2] "+r"(sum2),
		306	[sum3] "+r"(sum3),
		307	[sum4] "+r"(sum4),[pB] "+r"(pB),
		308	[pA] "+r"(pA):[colCnt]
		309	"r"(colCnt),[ch_im_in] "r"(ch_im_in):"r0", "r1", "r2", "r3", "r4", "r5");
		310
		311	#endif /* ARM_MATH_BIG_ENDIAN */
		312
		313	#endif /* USE_INTRINSIC */
		314
		315	colCnt = (dim_kernel * dim_kernel) & 0x1;
		316	while (colCnt)
		317	{
		318	union arm_nnword inA, inB;
		319	inA.word = *__SIMD32(pA);
		320	pA += ch_im_in;
		321	inB.word = *__SIMD32(pB);
		322	pB += ch_im_in;
		323	sum += inA.bytes[0] * inB.bytes[0];
		324	sum2 += inA.bytes[1] * inB.bytes[1];
		325	sum3 += inA.bytes[2] * inB.bytes[2];
		326	sum4 += inA.bytes[3] * inB.bytes[3];
		327	colCnt--;
		328	}
		329
		330	*pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
		331	*pOut++ = (q7_t) __SSAT((sum2 >> out_shift), 8);
		332	*pOut++ = (q7_t) __SSAT((sum3 >> out_shift), 8);
		333	*pOut++ = (q7_t) __SSAT((sum4 >> out_shift), 8);
		334
		335	rowCnt--;
		336	}
		337
		338	rowCnt = ch_im_out & 0x3;
		339	while (rowCnt)
		340	{
		341	q7_t *pB = colBuffer + row_shift;
		342	const q7_t *pA = wt + row_shift;
		343	q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
		344	uint16_t colCnt = (dim_kernel * dim_kernel);
		345
		346	row_shift += 1;
		347
		348	while (colCnt)
		349	{
		350	q7_t A1 = *pA;
		351	q7_t B1 = *pB;
		352	pA += ch_im_in;
		353	pB += ch_im_in;
		354	sum += A1 * B1;
		355
		356	colCnt--;
		357	}
		358	*pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
		359	rowCnt--;
		360	}
		361
		362	/* clear counter and pointers */
		363	pBuffer = colBuffer;
		364	}
		365	}
		366
		367	#else
		368	/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
		369	int i_out_y, i_out_x, i_ch_out, i_ker_x, i_ker_y;
		370	int conv_out;
		371
		372	/* do some checking here, basically ch_im_in == ch_im_out */
		373	if (ch_im_in != ch_im_out)
		374	{
		375	return ARM_MATH_SIZE_MISMATCH;
		376	}
		377
		378	for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
		379	{
		380	for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
		381	{
		382	for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
		383	{
		384	// for each output
		385	conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
		386	for (i_ker_y = 0; i_ker_y < dim_kernel; i_ker_y++)
		387	{
		388	for (i_ker_x = 0; i_ker_x < dim_kernel; i_ker_x++)
		389	{
		390	int in_row = stride * i_out_y + i_ker_y - padding;
		391	int in_col = stride * i_out_x + i_ker_x - padding;
		392	if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
		393	{
		394	conv_out +=
		395	Im_in[(in_row *
		396	dim_im_in +
		397	in_col) *
		398	ch_im_in +
		399	i_ch_out] * wt[(i_ker_y * dim_kernel + i_ker_x) * ch_im_out + i_ch_out];
		400	}
		401	}
		402	}
		403	Im_out[(i_out_y * dim_im_out +
		404	i_out_x) * ch_im_out + i_ch_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
		405	}
		406	}
		407	}
		408
		409	#endif /* ARM_MATH_DSP */
		410
		411	/* Return to application */
		412	return ARM_MATH_SUCCESS;
		413
		414	}
		415
		416	/**
		417	* @} end of NNConv group
		418	*/

Subversion Repositories AFRtranscoder

(root)/trunk/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c – Rev 2