WebSVN – dashGPS – Blame – /branches/dashGPS-bmp/Drivers/CMSIS/NN/Include/arm_nnfunctions.h

Rev	Author	Line No.	Line
2	mjames	1	/*
		2	* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
		3	*
		4	* SPDX-License-Identifier: Apache-2.0
		5	*
		6	* Licensed under the Apache License, Version 2.0 (the License); you may
		7	* not use this file except in compliance with the License.
		8	* You may obtain a copy of the License at
		9	*
		10	* www.apache.org/licenses/LICENSE-2.0
		11	*
		12	* Unless required by applicable law or agreed to in writing, software
		13	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
		14	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		15	* See the License for the specific language governing permissions and
		16	* limitations under the License.
		17	*/
		18
		19	/* ----------------------------------------------------------------------
		20	* Project: CMSIS NN Library
		21	* Title: arm_nnfunctions.h
		22	* Description: Public header file for CMSIS NN Library
		23	*
		24	* $Date: 13. July 2018
		25	* $Revision: V.1.0.0
		26	*
		27	* Target Processor: Cortex-M cores
		28	* -------------------------------------------------------------------- */
		29
		30	/**
		31	\mainpage CMSIS NN Software Library
		32	*
		33	* Introduction
		34	* ------------
		35	*
		36	* This user manual describes the CMSIS NN software library,
		37	* a collection of efficient neural network kernels developed to maximize the
		38	* performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
		39	*
		40	* The library is divided into a number of functions each covering a specific category:
		41	* - Neural Network Convolution Functions
		42	* - Neural Network Activation Functions
		43	* - Fully-connected Layer Functions
		44	* - Neural Network Pooling Functions
		45	* - Softmax Functions
		46	* - Neural Network Support Functions
		47	*
		48	* The library has separate functions for operating on different weight and activation data
		49	* types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The descriptions of the
		50	* kernels are included in the function description. The implementation details are also
		51	* described in this paper [1].
		52	*
		53	* Block Diagram
		54	* --------
		55	* \image html CMSIS-NN-OVERVIEW.PNG
		56	*
		57	* Examples
		58	* --------
		59	*
		60	* The library ships with a number of examples which demonstrate how to use the library functions.
		61	*
		62	* Pre-processor Macros
		63	* ------------
		64	*
		65	* Each library project has different pre-processor macros.
		66	*
		67	* - ARM_MATH_DSP:
		68	*
		69	* Define macro ARM_MATH_DSP if the silicon supports DSP instructions.
		70	*
		71	* - ARM_MATH_BIG_ENDIAN:
		72	*
		73	* Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. By default the library builds for little endian targets.
		74	*
		75	* - ARM_NN_TRUNCATE:
		76	*
		77	* Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
		78	*
		79	* Copyright Notice
		80	* ------------
		81	*
		82	* Copyright (C) 2010-2018 Arm Limited. All rights reserved.
		83	*
		84	* [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
		85	*/
		86
		87	/**
		88	* @defgroup groupNN Neural Network Functions
		89	* These functions perform basic operations for neural network layers.
		90	*/
		91
		92	#ifndef _ARM_NNFUNCTIONS_H
		93	#define _ARM_NNFUNCTIONS_H
		94
		95	#include "arm_nnsupportfunctions.h"
		96	#include "arm_nn_tables.h"
		97
		98	#define USE_INTRINSIC
		99
		100	//#define ARM_NN_TRUNCATE /* Selects the rounding mode: floor when defined, round to the nearest int otherwise */
		101
		102	#ifdef __cplusplus
		103	extern "C"
		104	{
		105	#endif
		106
		107	/**
		108	* @defgroup NNConv Neural Network Convolution Functions
		109	*
		110	* Perform convolution layer
		111	*
		112	* The convolution is implemented in 2 steps: im2col and GEMM
		113	*
		114	* im2col is a process of converting each patch of image data into
		115	* a column. After im2col, the convolution is computed as matrix-matrix
		116	* multiplication.
		117	*
		118	* To reduce the memory footprint, the im2col is performed partially.
		119	* In each iteration, only a few columns (i.e., patches) are generated and
		120	* computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
		121	*
		122	*/
		123
		124	/**
		125	* @brief Basic Q7 convolution function
		126	* @param[in] Im_in pointer to input tensor
		127	* @param[in] dim_im_in input tensor dimension
		128	* @param[in] ch_im_in number of input tensor channels
		129	* @param[in] wt pointer to kernel weights
		130	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		131	* @param[in] dim_kernel filter kernel size
		132	* @param[in] padding padding sizes
		133	* @param[in] stride convolution stride
		134	* @param[in] bias pointer to bias
		135	* @param[in] bias_shift amount of left-shift for bias
		136	* @param[in] out_shift amount of right-shift for output
		137	* @param[in,out] Im_out pointer to output tensor
		138	* @param[in] dim_im_out output tensor dimension
		139	* @param[in,out] bufferA pointer to buffer space for input
		140	* @param[in,out] bufferB pointer to buffer space for output
		141	* @return The function returns <code>ARM_MATH_SUCCESS</code>
		142	*
		143	*/
		144
		145	arm_status arm_convolve_HWC_q7_basic(const q7_t * Im_in,
		146	const uint16_t dim_im_in,
		147	const uint16_t ch_im_in,
		148	const q7_t * wt,
		149	const uint16_t ch_im_out,
		150	const uint16_t dim_kernel,
		151	const uint16_t padding,
		152	const uint16_t stride,
		153	const q7_t * bias,
		154	const uint16_t bias_shift,
		155	const uint16_t out_shift,
		156	q7_t * Im_out,
		157	const uint16_t dim_im_out,
		158	q15_t * bufferA,
		159	q7_t * bufferB);
		160
		161	/**
		162	* @brief Basic Q7 convolution function (non-square shape)
		163	* @param[in] Im_in pointer to input tensor
		164	* @param[in] dim_im_in_x input tensor dimension x
		165	* @param[in] dim_im_in_y input tensor dimension y
		166	* @param[in] ch_im_in number of input tensor channels
		167	* @param[in] wt pointer to kernel weights
		168	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		169	* @param[in] dim_kernel_x filter kernel size x
		170	* @param[in] dim_kernel_y filter kernel size y
		171	* @param[in] padding_x padding size x
		172	* @param[in] padding_y padding size y
		173	* @param[in] stride_x convolution stride x
		174	* @param[in] stride_y convolution stride y
		175	* @param[in] bias pointer to bias
		176	* @param[in] bias_shift amount of left-shift for bias
		177	* @param[in] out_shift amount of right-shift for output
		178	* @param[in,out] Im_out pointer to output tensor
		179	* @param[in] dim_im_out_x output tensor dimension x
		180	* @param[in] dim_im_out_y output tensor dimension y
		181	* @param[in,out] bufferA pointer to buffer space for input
		182	* @param[in,out] bufferB pointer to buffer space for output
		183	* @return The function returns <code>ARM_MATH_SUCCESS</code>
		184	*/
		185
		186	arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
		187	const uint16_t dim_im_in_x,
		188	const uint16_t dim_im_in_y,
		189	const uint16_t ch_im_in,
		190	const q7_t * wt,
		191	const uint16_t ch_im_out,
		192	const uint16_t dim_kernel_x,
		193	const uint16_t dim_kernel_y,
		194	const uint16_t padding_x,
		195	const uint16_t padding_y,
		196	const uint16_t stride_x,
		197	const uint16_t stride_y,
		198	const q7_t * bias,
		199	const uint16_t bias_shift,
		200	const uint16_t out_shift,
		201	q7_t * Im_out,
		202	const uint16_t dim_im_out_x,
		203	const uint16_t dim_im_out_y,
		204	q15_t * bufferA,
		205	q7_t * bufferB);
		206
		207	/**
		208	* @brief Basic Q15 convolution function
		209	* @param[in] Im_in pointer to input tensor
		210	* @param[in] dim_im_in input tensor dimension
		211	* @param[in] ch_im_in number of input tensor channels
		212	* @param[in] wt pointer to kernel weights
		213	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		214	* @param[in] dim_kernel filter kernel size
		215	* @param[in] padding padding sizes
		216	* @param[in] stride convolution stride
		217	* @param[in] bias pointer to bias
		218	* @param[in] bias_shift amount of left-shift for bias
		219	* @param[in] out_shift amount of right-shift for output
		220	* @param[in,out] Im_out pointer to output tensor
		221	* @param[in] dim_im_out output tensor dimension
		222	* @param[in,out] bufferA pointer to buffer space for input
		223	* @param[in,out] bufferB pointer to buffer space for output
		224	* @return The function returns <code>ARM_MATH_SUCCESS</code>
		225	*
		226	*/
		227
		228	arm_status arm_convolve_HWC_q15_basic(const q15_t * Im_in,
		229	const uint16_t dim_im_in,
		230	const uint16_t ch_im_in,
		231	const q15_t * wt,
		232	const uint16_t ch_im_out,
		233	const uint16_t dim_kernel,
		234	const uint16_t padding,
		235	const uint16_t stride,
		236	const q15_t * bias,
		237	const uint16_t bias_shift,
		238	const uint16_t out_shift,
		239	q15_t * Im_out,
		240	const uint16_t dim_im_out,
		241	q15_t * bufferA,
		242	q7_t * bufferB);
		243
		244	/**
		245	* @brief Fast Q7 convolution function
		246	* @param[in] Im_in pointer to input tensor
		247	* @param[in] dim_im_in input tensor dimension
		248	* @param[in] ch_im_in number of input tensor channels
		249	* @param[in] wt pointer to kernel weights
		250	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		251	* @param[in] dim_kernel filter kernel size
		252	* @param[in] padding padding sizes
		253	* @param[in] stride convolution stride
		254	* @param[in] bias pointer to bias
		255	* @param[in] bias_shift amount of left-shift for bias
		256	* @param[in] out_shift amount of right-shift for output
		257	* @param[in,out] Im_out pointer to output tensor
		258	* @param[in] dim_im_out output tensor dimension
		259	* @param[in,out] bufferA pointer to buffer space for input
		260	* @param[in,out] bufferB pointer to buffer space for output
		261	* @return The function returns either
		262	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		263	*
		264	* This function is the version with full list of optimization tricks, but with
		265	* some constraints:
		266	* ch_im_in is multiple of 4
		267	* ch_im_out is multiple of 2
		268	*/
		269
		270	arm_status arm_convolve_HWC_q7_fast(const q7_t * Im_in,
		271	const uint16_t dim_im_in,
		272	const uint16_t ch_im_in,
		273	const q7_t * wt,
		274	const uint16_t ch_im_out,
		275	const uint16_t dim_kernel,
		276	const uint16_t padding,
		277	const uint16_t stride,
		278	const q7_t * bias,
		279	const uint16_t bias_shift,
		280	const uint16_t out_shift,
		281	q7_t * Im_out,
		282	const uint16_t dim_im_out,
		283	q15_t * bufferA,
		284	q7_t * bufferB);
		285
		286	/**
		287	* @brief Fast Q7 convolution function (non-square shape)
		288	* @param[in] Im_in pointer to input tensor
		289	* @param[in] dim_im_in_x input tensor dimension x
		290	* @param[in] dim_im_in_y input tensor dimension y
		291	* @param[in] ch_im_in number of input tensor channels
		292	* @param[in] wt pointer to kernel weights
		293	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		294	* @param[in] dim_kernel_x filter kernel size x
		295	* @param[in] dim_kernel_y filter kernel size y
		296	* @param[in] padding_x padding size x
		297	* @param[in] padding_y padding size y
		298	* @param[in] stride_x convolution stride x
		299	* @param[in] stride_y convolution stride y
		300	* @param[in] bias pointer to bias
		301	* @param[in] bias_shift amount of left-shift for bias
		302	* @param[in] out_shift amount of right-shift for output
		303	* @param[in,out] Im_out pointer to output tensor
		304	* @param[in] dim_im_out_x output tensor dimension x
		305	* @param[in] dim_im_out_y output tensor dimension y
		306	* @param[in,out] bufferA pointer to buffer space for input
		307	* @param[in,out] bufferB pointer to buffer space for output
		308	* @return The function returns either
		309	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		310	*
		311	* This function is the version with full list of optimization tricks, but with
		312	* some constraints:
		313	* ch_im_in is multiple of 4
		314	* ch_im_out is multiple of 2
		315	*/
		316
		317	arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
		318	const uint16_t dim_im_in_x,
		319	const uint16_t dim_im_in_y,
		320	const uint16_t ch_im_in,
		321	const q7_t * wt,
		322	const uint16_t ch_im_out,
		323	const uint16_t dim_kernel_x,
		324	const uint16_t dim_kernel_y,
		325	const uint16_t padding_x,
		326	const uint16_t padding_y,
		327	const uint16_t stride_x,
		328	const uint16_t stride_y,
		329	const q7_t * bias,
		330	const uint16_t bias_shift,
		331	const uint16_t out_shift,
		332	q7_t * Im_out,
		333	const uint16_t dim_im_out_x,
		334	const uint16_t dim_im_out_y,
		335	q15_t * bufferA,
		336	q7_t * bufferB);
		337
		338	/**
		339	* @brief Fast Q7 version of 1x1 convolution (non-square shape)
		340	* @param[in] Im_in pointer to input tensor
		341	* @param[in] dim_im_in_x input tensor dimension x
		342	* @param[in] dim_im_in_y input tensor dimension y
		343	* @param[in] ch_im_in number of input tensor channels
		344	* @param[in] wt pointer to kernel weights
		345	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		346	* @param[in] dim_kernel_x filter kernel size x
		347	* @param[in] dim_kernel_y filter kernel size y
		348	* @param[in] padding_x padding size x
		349	* @param[in] padding_y padding size y
		350	* @param[in] stride_x convolution stride x
		351	* @param[in] stride_y convolution stride y
		352	* @param[in] bias pointer to bias
		353	* @param[in] bias_shift amount of left-shift for bias
		354	* @param[in] out_shift amount of right-shift for output
		355	* @param[in,out] Im_out pointer to output tensor
		356	* @param[in] dim_im_out_x output tensor dimension x
		357	* @param[in] dim_im_out_y output tensor dimension y
		358	* @param[in,out] bufferA pointer to buffer space for input
		359	* @param[in,out] bufferB pointer to buffer space for output
		360	* @return The function returns either
		361	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		362	*
		363	* This function implements convolution with 1x1 kernel size (i.e., dim_kernel_x=1
		364	* and dim_kernel_y=1). It can be used for
		365	* second half of MobileNets after depthwise separable convolution.
		366	*
		367	* This function is the version with full list of optimization tricks, but with
		368	* some constraints:
		369	* ch_im_in is multiple of 4
		370	* ch_im_out is multiple of 2
		371	*/
		372	arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
		373	const uint16_t dim_im_in_x,
		374	const uint16_t dim_im_in_y,
		375	const uint16_t ch_im_in,
		376	const q7_t * wt,
		377	const uint16_t ch_im_out,
		378	const uint16_t dim_kernel_x,
		379	const uint16_t dim_kernel_y,
		380	const uint16_t padding_x,
		381	const uint16_t padding_y,
		382	const uint16_t stride_x,
		383	const uint16_t stride_y,
		384	const q7_t * bias,
		385	const uint16_t bias_shift,
		386	const uint16_t out_shift,
		387	q7_t * Im_out,
		388	const uint16_t dim_im_out_x,
		389	const uint16_t dim_im_out_y,
		390	q15_t * bufferA,
		391	q7_t * bufferB);
		392
		393	/**
		394	* @brief Q7 version of convolution for RGB image
		395	* @param[in] Im_in pointer to input tensor
		396	* @param[in] dim_im_in input tensor dimension
		397	* @param[in] ch_im_in number of input tensor channels
		398	* @param[in] wt pointer to kernel weights
		399	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		400	* @param[in] dim_kernel filter kernel size
		401	* @param[in] padding padding sizes
		402	* @param[in] stride convolution stride
		403	* @param[in] bias pointer to bias
		404	* @param[in] bias_shift amount of left-shift for bias
		405	* @param[in] out_shift amount of right-shift for output
		406	* @param[in,out] Im_out pointer to output tensor
		407	* @param[in] dim_im_out output tensor dimension
		408	* @param[in,out] bufferA pointer to buffer space for input
		409	* @param[in,out] bufferB pointer to buffer space for output
		410	* @return The function returns either
		411	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		412	*
		413	* This kernel is written exclusively for convolution with ch_im_in
		414	* equals 3. This applies on the first layer of CNNs which has input
		415	* image with RGB format.
		416	*/
		417
		418	arm_status arm_convolve_HWC_q7_RGB(const q7_t * Im_in,
		419	const uint16_t dim_im_in,
		420	const uint16_t ch_im_in,
		421	const q7_t * wt,
		422	const uint16_t ch_im_out,
		423	const uint16_t dim_kernel,
		424	const uint16_t padding,
		425	const uint16_t stride,
		426	const q7_t * bias,
		427	const uint16_t bias_shift,
		428	const uint16_t out_shift,
		429	q7_t * Im_out,
		430	const uint16_t dim_im_out,
		431	q15_t * bufferA,
		432	q7_t * bufferB);
		433
		434	/**
		435	* @brief Fast Q15 convolution function
		436	* @param[in] Im_in pointer to input tensor
		437	* @param[in] dim_im_in input tensor dimension
		438	* @param[in] ch_im_in number of input tensor channels
		439	* @param[in] wt pointer to kernel weights
		440	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		441	* @param[in] dim_kernel filter kernel size
		442	* @param[in] padding padding sizes
		443	* @param[in] stride convolution stride
		444	* @param[in] bias pointer to bias
		445	* @param[in] bias_shift amount of left-shift for bias
		446	* @param[in] out_shift amount of right-shift for output
		447	* @param[in,out] Im_out pointer to output tensor
		448	* @param[in] dim_im_out output tensor dimension
		449	* @param[in,out] bufferA pointer to buffer space for input
		450	* @param[in,out] bufferB pointer to buffer space for output
		451	* @return The function returns either
		452	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		453	*
		454	* This function is the version with full list of optimization tricks, but with
		455	* some constraints:
		456	* ch_im_in is multiple of 2
		457	* ch_im_out is multiple of 2
		458	*/
		459
		460	arm_status arm_convolve_HWC_q15_fast(const q15_t * Im_in,
		461	const uint16_t dim_im_in,
		462	const uint16_t ch_im_in,
		463	const q15_t * wt,
		464	const uint16_t ch_im_out,
		465	const uint16_t dim_kernel,
		466	const uint16_t padding,
		467	const uint16_t stride,
		468	const q15_t * bias,
		469	const uint16_t bias_shift,
		470	const uint16_t out_shift,
		471	q15_t * Im_out,
		472	const uint16_t dim_im_out,
		473	q15_t * bufferA,
		474	q7_t * bufferB);
		475
		476	/**
		477	* @brief Fast Q15 convolution function (non-square shape)
		478	* @param[in] Im_in pointer to input tensor
		479	* @param[in] dim_im_in_x input tensor dimension x
		480	* @param[in] dim_im_in_y input tensor dimension y
		481	* @param[in] ch_im_in number of input tensor channels
		482	* @param[in] wt pointer to kernel weights
		483	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		484	* @param[in] dim_kernel_x filter kernel size x
		485	* @param[in] dim_kernel_y filter kernel size y
		486	* @param[in] padding_x padding size x
		487	* @param[in] padding_y padding size y
		488	* @param[in] stride_x convolution stride x
		489	* @param[in] stride_y convolution stride y
		490	* @param[in] bias pointer to bias
		491	* @param[in] bias_shift amount of left-shift for bias
		492	* @param[in] out_shift amount of right-shift for output
		493	* @param[in,out] Im_out pointer to output tensor
		494	* @param[in] dim_im_out_x output tensor dimension x
		495	* @param[in] dim_im_out_y output tensor dimension y
		496	* @param[in,out] bufferA pointer to buffer space for input
		497	* @param[in,out] bufferB pointer to buffer space for output
		498	* @return The function returns either
		499	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		500	*
		501	* @details
		502	*
		503	* <b>Buffer size:</b>
		504	*
		505	* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
		506	*
		507	* bufferB size: 0
		508	*
		509	* <b>Input dimension constraints:</b>
		510	*
		511	* ch_im_in is multiple of 2
		512	*
		513	* ch_im_out is multiple of 2
		514	*
		515	*/
		516
		517	arm_status
		518	arm_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in,
		519	const uint16_t dim_im_in_x,
		520	const uint16_t dim_im_in_y,
		521	const uint16_t ch_im_in,
		522	const q15_t * wt,
		523	const uint16_t ch_im_out,
		524	const uint16_t dim_kernel_x,
		525	const uint16_t dim_kernel_y,
		526	const uint16_t padding_x,
		527	const uint16_t padding_y,
		528	const uint16_t stride_x,
		529	const uint16_t stride_y,
		530	const q15_t * bias,
		531	const uint16_t bias_shift,
		532	const uint16_t out_shift,
		533	q15_t * Im_out,
		534	const uint16_t dim_im_out_x,
		535	const uint16_t dim_im_out_y,
		536	q15_t * bufferA,
		537	q7_t * bufferB);
		538
		539	/**
		540	* @brief Q7 depthwise separable convolution function
		541	* @param[in] Im_in pointer to input tensor
		542	* @param[in] dim_im_in input tensor dimension
		543	* @param[in] ch_im_in number of input tensor channels
		544	* @param[in] wt pointer to kernel weights
		545	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		546	* @param[in] dim_kernel filter kernel size
		547	* @param[in] padding padding sizes
		548	* @param[in] stride convolution stride
		549	* @param[in] bias pointer to bias
		550	* @param[in] bias_shift amount of left-shift for bias
		551	* @param[in] out_shift amount of right-shift for output
		552	* @param[in,out] Im_out pointer to output tensor
		553	* @param[in] dim_im_out output tensor dimension
		554	* @param[in,out] bufferA pointer to buffer space for input
		555	* @param[in,out] bufferB pointer to buffer space for output
		556	* @return The function returns either
		557	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		558	*
		559	* This function is the version with full list of optimization tricks, but with
		560	* some constraints:
		561	* ch_im_in is multiple of 2
		562	* ch_im_out is multiple of 2
		563	*/
		564
		565	arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
		566	const uint16_t dim_im_in,
		567	const uint16_t ch_im_in,
		568	const q7_t * wt,
		569	const uint16_t ch_im_out,
		570	const uint16_t dim_kernel,
		571	const uint16_t padding,
		572	const uint16_t stride,
		573	const q7_t * bias,
		574	const uint16_t bias_shift,
		575	const uint16_t out_shift,
		576	q7_t * Im_out,
		577	const uint16_t dim_im_out,
		578	q15_t * bufferA,
		579	q7_t * bufferB);
		580
		581	/**
		582	* @brief Q7 depthwise separable convolution function (non-square shape)
		583	* @param[in] Im_in pointer to input tensor
		584	* @param[in] dim_im_in_x input tensor dimension x
		585	* @param[in] dim_im_in_y input tensor dimension y
		586	* @param[in] ch_im_in number of input tensor channels
		587	* @param[in] wt pointer to kernel weights
		588	* @param[in] ch_im_out number of filters, i.e., output tensor channels
		589	* @param[in] dim_kernel_x filter kernel size x
		590	* @param[in] dim_kernel_y filter kernel size y
		591	* @param[in] padding_x padding sizes x
		592	* @param[in] padding_y padding sizes y
		593	* @param[in] stride_x convolution stride x
		594	* @param[in] stride_y convolution stride y
		595	* @param[in] bias pointer to bias
		596	* @param[in] bias_shift amount of left-shift for bias
		597	* @param[in] out_shift amount of right-shift for output
		598	* @param[in,out] Im_out pointer to output tensor
		599	* @param[in] dim_im_out_x output tensor dimension x
		600	* @param[in] dim_im_out_y output tensor dimension y
		601	* @param[in,out] bufferA pointer to buffer space for input
		602	* @param[in,out] bufferB pointer to buffer space for output
		603	* @return The function returns either
		604	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
		605	*
		606	* This function is the version with full list of optimization tricks, but with
		607	* some constraints:
		608	* ch_im_in is multiple of 2
		609	* ch_im_out is multiple of 2
		610	*/
		611	arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in,
		612	const uint16_t dim_im_in_x,
		613	const uint16_t dim_im_in_y,
		614	const uint16_t ch_im_in,
		615	const q7_t * wt,
		616	const uint16_t ch_im_out,
		617	const uint16_t dim_kernel_x,
		618	const uint16_t dim_kernel_y,
		619	const uint16_t padding_x,
		620	const uint16_t padding_y,
		621	const uint16_t stride_x,
		622	const uint16_t stride_y,
		623	const q7_t * bias,
		624	const uint16_t bias_shift,
		625	const uint16_t out_shift,
		626	q7_t * Im_out,
		627	const uint16_t dim_im_out_x,
		628	const uint16_t dim_im_out_y,
		629	q15_t * bufferA,
		630	q7_t * bufferB);
		631
		632
		633	/**
		634	* @defgroup FC Fully-connected Layer Functions
		635	*
		636	* Perform fully-connected layer
		637	*
		638	* Fully-connected layer is basically a matrix-vector multiplication
		639	* with bias. The matrix is the weights and the input/output vectors
		640	* are the activation values. Supported {weight, activation} precisions
		641	* include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
		642	*
		643	* Here we have two types of kernel functions. The basic function
		644	* implements the function using regular GEMV approach. The opt functions
		645	* operate with weights in interleaved formats.
		646	*
		647	*/
		648
		649	/**
		650	* @brief Q7 basic fully-connected layer function
		651	* @param[in] pV pointer to input vector
		652	* @param[in] pM pointer to matrix weights
		653	* @param[in] dim_vec length of the vector
		654	* @param[in] num_of_rows number of rows in weight matrix
		655	* @param[in] bias_shift amount of left-shift for bias
		656	* @param[in] out_shift amount of right-shift for output
		657	* @param[in] bias pointer to bias
		658	* @param[in,out] pOut pointer to output vector
		659	* @param[in,out] vec_buffer pointer to buffer space for input
		660	* @return The function returns <code>ARM_MATH_SUCCESS</code>
		661	*
		662	*/
		663
		664	arm_status arm_fully_connected_q7(const q7_t * pV,
		665	const q7_t * pM,
		666	const uint16_t dim_vec,
		667	const uint16_t num_of_rows,
		668	const uint16_t bias_shift,
		669	const uint16_t out_shift,
		670	const q7_t * bias,
		671	q7_t * pOut,
		672	q15_t * vec_buffer);
		673
		674	/**
		675	* @brief Q7 opt fully-connected layer function
		676	* @param[in] pV pointer to input vector
		677	* @param[in] pM pointer to matrix weights
		678	* @param[in] dim_vec length of the vector
		679	* @param[in] num_of_rows number of rows in weight matrix
		680	* @param[in] bias_shift amount of left-shift for bias
		681	* @param[in] out_shift amount of right-shift for output
		682	* @param[in] bias pointer to bias
		683	* @param[in,out] pOut pointer to output vector
		684	* @param[in,out] vec_buffer pointer to buffer space for input
		685	* @return The function returns <code>ARM_MATH_SUCCESS</code>
		686	*
		687	*/
		688
		689	arm_status arm_fully_connected_q7_opt(const q7_t * pV,
		690	const q7_t * pM,
		691	const uint16_t dim_vec,
		692	const uint16_t num_of_rows,
		693	const uint16_t bias_shift,
		694	const uint16_t out_shift,
		695	const q7_t * bias,
		696	q7_t * pOut,
		697	q15_t * vec_buffer);
		698
		699	/**
		700	* @brief Q15 basic fully-connected layer function
		701	* @param[in] pV pointer to input vector
		702	* @param[in] pM pointer to matrix weights
		703	* @param[in] dim_vec length of the vector
		704	* @param[in] num_of_rows number of rows in weight matrix
		705	* @param[in] bias_shift amount of left-shift for bias
		706	* @param[in] out_shift amount of right-shift for output
		707	* @param[in] bias pointer to bias
		708	* @param[in,out] pOut pointer to output vector
		709	* @param[in,out] vec_buffer pointer to buffer space for input
		710	* @return The function returns <code>ARM_MATH_SUCCESS</code>
		711	*
		712	*/
		713
		714	arm_status arm_fully_connected_q15(const q15_t * pV,
		715	const q15_t * pM,
		716	const uint16_t dim_vec,
		717	const uint16_t num_of_rows,
		718	const uint16_t bias_shift,
		719	const uint16_t out_shift,
		720	const q15_t * bias,
		721	q15_t * pOut,
		722	q15_t * vec_buffer);
		723
		724	/**
		725	* @brief Q15 opt fully-connected layer function
		726	* @param[in] pV pointer to input vector
		727	* @param[in] pM pointer to matrix weights
		728	* @param[in] dim_vec length of the vector
		729	* @param[in] num_of_rows number of rows in weight matrix
		730	* @param[in] bias_shift amount of left-shift for bias
		731	* @param[in] out_shift amount of right-shift for output
		732	* @param[in] bias pointer to bias
		733	* @param[in,out] pOut pointer to output vector
		734	* @param[in,out] vec_buffer pointer to buffer space for input
		735	* @return The function returns <code>ARM_MATH_SUCCESS</code>
		736	*
		737	*/
		738
		739	arm_status arm_fully_connected_q15_opt(const q15_t * pV,
		740	const q15_t * pM,
		741	const uint16_t dim_vec,
		742	const uint16_t num_of_rows,
		743	const uint16_t bias_shift,
		744	const uint16_t out_shift,
		745	const q15_t * bias,
		746	q15_t * pOut,
		747	q15_t * vec_buffer);
		748
		749	/**
		750	* @brief Mixed Q15-Q7 fully-connected layer function
		751	* @param[in] pV pointer to input vector
		752	* @param[in] pM pointer to matrix weights
		753	* @param[in] dim_vec length of the vector
		754	* @param[in] num_of_rows number of rows in weight matrix
		755	* @param[in] bias_shift amount of left-shift for bias
		756	* @param[in] out_shift amount of right-shift for output
		757	* @param[in] bias pointer to bias
		758	* @param[in,out] pOut pointer to output vector
		759	* @param[in,out] vec_buffer pointer to buffer space for input
		760	* @return The function returns <code>ARM_MATH_SUCCESS</code>
		761	*
		762	*/
		763
		764	arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t * pV,
		765	const q7_t * pM,
		766	const uint16_t dim_vec,
		767	const uint16_t num_of_rows,
		768	const uint16_t bias_shift,
		769	const uint16_t out_shift,
		770	const q7_t * bias,
		771	q15_t * pOut,
		772	q15_t * vec_buffer);
		773
		774	/**
		775	* @brief Mixed Q15-Q7 opt fully-connected layer function
		776	* @param[in] pV pointer to input vector
		777	* @param[in] pM pointer to matrix weights
		778	* @param[in] dim_vec length of the vector
		779	* @param[in] num_of_rows number of rows in weight matrix
		780	* @param[in] bias_shift amount of left-shift for bias
		781	* @param[in] out_shift amount of right-shift for output
		782	* @param[in] bias pointer to bias
		783	* @param[in,out] pOut pointer to output vector
		784	* @param[in,out] vec_buffer pointer to buffer space for input
		785	* @return The function returns <code>ARM_MATH_SUCCESS</code>
		786	*
		787	*/
		788
		789	arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
		790	const q7_t * pM,
		791	const uint16_t dim_vec,
		792	const uint16_t num_of_rows,
		793	const uint16_t bias_shift,
		794	const uint16_t out_shift,
		795	const q7_t * bias,
		796	q15_t * pOut,
		797	q15_t * vec_buffer);
		798
		799	/**
		800	* @brief Matrix-Multiplication Kernels for Convolution
		801	*
		802	* These functions are used within convolution layer functions for
		803	* matrix multiplication.
		804	*
		805	* The implementation is similar to CMSIS-DSP arm_mat_mult functions
		806	* with one Q7 and one Q15 operands. The Q15 operand is the im2col
		807	* output which is always with 2 columns.
		808	*
		809	*/
		810
		811	/**
		812	* @brief Matrix-multiplication function for convolution
		813	* @param[in] pA pointer to operand A
		814	* @param[in] pInBuffer pointer to operand B, always consists of 2 vectors
		815	* @param[in] ch_im_out numRow of A
		816	* @param[in] numCol_A numCol of A
		817	* @param[in] bias_shift amount of left-shift for bias
		818	* @param[in] out_shift amount of right-shift for output
		819	* @param[in] bias the bias
		820	* @param[in,out] pOut pointer to output
		821	* @return The function returns the incremented output pointer
		822	*/
		823
		824	q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
		825	const q15_t * pInBuffer,
		826	const uint16_t ch_im_out,
		827	const uint16_t numCol_A,
		828	const uint16_t bias_shift,
		829	const uint16_t out_shift,
		830	const q7_t * bias,
		831	q7_t * pOut);
		832
		833	/**
		834	* @brief Matrix-multiplication function for convolution with reordered columns
		835	* @param[in] pA pointer to operand A
		836	* @param[in] pInBuffer pointer to operand B, always consists of 2 vectors
		837	* @param[in] ch_im_out number of rows of A
		838	* @param[in] numCol_A number of columns of A
		839	* @param[in] bias_shift amount of left-shift for bias
		840	* @param[in] out_shift amount of right-shift for output
		841	* @param[in] bias pointer to bias
		842	* @param[in,out] pOut pointer to output
		843	* @return The function returns the incremented output pointer
		844	*/
		845
		846	q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA,
		847	const q15_t * pInBuffer,
		848	const uint16_t ch_im_out,
		849	const uint16_t numCol_A,
		850	const uint16_t bias_shift,
		851	const uint16_t out_shift,
		852	const q7_t * bias,
		853	q7_t * pOut);
		854
		855	#ifdef __cplusplus
		856	}
		857	#endif
		858
		859	/*
		860	* Other functions
		861	* These layers are typically not timing critical
		862	* Basic implementation is supported here
		863	*/
		864
		865	#ifdef __cplusplus
		866	extern "C"
		867	{
		868	#endif
		869
		870	/**
		871	* @defgroup Acti Neural Network Activation Functions
		872	*
		873	* Perform activation layers, including ReLU (Rectified Linear Unit),
		874	* sigmoid and tanh
		875	*
		876	*/
		877
		878	/**
		879	* @brief Q7 RELU function
		880	* @param[in,out] data pointer to input
		881	* @param[in] size number of elements
		882	* @return none.
		883	*/
		884
		885	void arm_relu_q7(q7_t * data, uint16_t size);
		886
		887	/**
		888	* @brief Q15 RELU function
		889	* @param[in,out] data pointer to input
		890	* @param[in] size number of elements
		891	* @return none.
		892	*/
		893
		894	void arm_relu_q15(q15_t * data, uint16_t size);
		895
		896	/**
		897	* @brief Q7 neural network activation function using direct table look-up
		898	* @param[in,out] data pointer to input
		899	* @param[in] size number of elements
		900	* @param[in] int_width bit-width of the integer part, assumed to be smaller than 3
		901	* @param[in] type type of activation functions
		902	* @return none.
		903	*/
		904
		905	void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
		906	arm_nn_activation_type type);
		907
		908	/**
		909	* @brief Q15 neural network activation function using direct table look-up
		910	* @param[in,out] data pointer to input
		911	* @param[in] size number of elements
		912	* @param[in] int_width bit-width of the integer part, assumed to be smaller than 3
		913	* @param[in] type type of activation functions
		914	* @return none.
		915	*/
		916
		917	void arm_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width,
		918	arm_nn_activation_type type);
		919
		920	/**
		921	* @defgroup Pooling Neural Network Pooling Functions
		922	*
		923	* Perform pooling functions, including max pooling and average pooling
		924	*
		925	*/
		926
		927	/**
		928	* @brief Q7 max pooling function
		929	* @param[in] Im_in pointer to input tensor
		930	* @param[in] dim_im_in input tensor dimension
		931	* @param[in] ch_im_in number of input tensor channels
		932	* @param[in] dim_kernel filter kernel size
		933	* @param[in] padding padding sizes
		934	* @param[in] stride convolution stride
		935	* @param[in] dim_im_out output tensor dimension
		936	* @param[in,out] bufferA pointer to buffer space for input
		937	* @param[in,out] Im_out pointer to output tensor
		938	* @return none.
		939	*
		940	*/
		941
		942	void arm_maxpool_q7_HWC(q7_t * Im_in,
		943	const uint16_t dim_im_in,
		944	const uint16_t ch_im_in,
		945	const uint16_t dim_kernel,
		946	const uint16_t padding,
		947	const uint16_t stride,
		948	const uint16_t dim_im_out,
		949	q7_t * bufferA,
		950	q7_t * Im_out);
		951
		952	/**
		953	* @brief Q7 average pooling function
		954	* @param[in] Im_in pointer to input tensor
		955	* @param[in] dim_im_in input tensor dimension
		956	* @param[in] ch_im_in number of input tensor channels
		957	* @param[in] dim_kernel filter kernel size
		958	* @param[in] padding padding sizes
		959	* @param[in] stride convolution stride
		960	* @param[in] dim_im_out output tensor dimension
		961	* @param[in,out] bufferA pointer to buffer space for input
		962	* @param[in,out] Im_out pointer to output tensor
		963	* @return none.
		964	*
		965	*/
		966
		967	void arm_avepool_q7_HWC(q7_t * Im_in,
		968	const uint16_t dim_im_in,
		969	const uint16_t ch_im_in,
		970	const uint16_t dim_kernel,
		971	const uint16_t padding,
		972	const uint16_t stride,
		973	const uint16_t dim_im_out,
		974	q7_t * bufferA,
		975	q7_t * Im_out);
		976
		977	/**
		978	* @defgroup Softmax Softmax Functions
		979	*
		980	* EXP(2) based softmax function
		981	*
		982	*/
		983
		984	/**
		985	* @brief Q7 softmax function
		986	* @param[in] vec_in pointer to input vector
		987	* @param[in] dim_vec input vector dimension
		988	* @param[out] p_out pointer to output vector
		989	* @return none.
		990	*
		991	*/
		992
		993	void arm_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out);
		994
		995	/**
		996	* @brief Q15 softmax function
		997	* @param[in] vec_in pointer to input vector
		998	* @param[in] dim_vec input vector dimension
		999	* @param[out] p_out pointer to output vector
		1000	* @return none.
		1001	*
		1002	*/
		1003
		1004	void arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
		1005
		1006	#ifdef __cplusplus
		1007	}
		1008	#endif
		1009
		1010	#endif

Subversion Repositories dashGPS

(root)/branches/dashGPS-bmp/Drivers/CMSIS/NN/Include/arm_nnfunctions.h – Rev 2