Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
2 | mjames | 1 | /* |
2 | * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. |
||
3 | * |
||
4 | * SPDX-License-Identifier: Apache-2.0 |
||
5 | * |
||
6 | * Licensed under the Apache License, Version 2.0 (the License); you may |
||
7 | * not use this file except in compliance with the License. |
||
8 | * You may obtain a copy of the License at |
||
9 | * |
||
10 | * www.apache.org/licenses/LICENSE-2.0 |
||
11 | * |
||
12 | * Unless required by applicable law or agreed to in writing, software |
||
13 | * distributed under the License is distributed on an AS IS BASIS, WITHOUT |
||
14 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||
15 | * See the License for the specific language governing permissions and |
||
16 | * limitations under the License. |
||
17 | */ |
||
18 | |||
19 | /* ---------------------------------------------------------------------- |
||
20 | * Project: CMSIS NN Library |
||
21 | * Title: arm_pool_q7_HWC.c |
||
22 | * Description: Pooling function implementations |
||
23 | * |
||
24 | * $Date: 17. January 2018 |
||
25 | * $Revision: V.1.0.0 |
||
26 | * |
||
27 | * Target Processor: Cortex-M cores |
||
28 | * |
||
29 | * -------------------------------------------------------------------- */ |
||
30 | |||
31 | #include "arm_math.h" |
||
32 | #include "arm_nnfunctions.h" |
||
33 | |||
34 | #if defined (ARM_MATH_DSP) |
||
35 | |||
36 | /** |
||
37 | * @brief A few utility functions used by pooling functions |
||
38 | * |
||
39 | * |
||
40 | */ |
||
41 | |||
42 | static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale) |
||
43 | { |
||
44 | int i; |
||
45 | |||
46 | for (i = 0; i < length; i++) |
||
47 | { |
||
48 | target[i] = (q7_t) (buffer[i] / scale); |
||
49 | } |
||
50 | } |
||
51 | |||
52 | static void compare_and_replace_if_larger_q7(q7_t * base, // base data |
||
53 | q7_t * target, // compare target |
||
54 | const uint16_t length // data length |
||
55 | ) |
||
56 | { |
||
57 | q7_t *pIn = base; |
||
58 | q7_t *pCom = target; |
||
59 | union arm_nnword in; |
||
60 | union arm_nnword com; |
||
61 | uint16_t cnt = length >> 2; |
||
62 | |||
63 | while (cnt > 0u) |
||
64 | { |
||
65 | in.word = *__SIMD32(pIn); |
||
66 | com.word = *__SIMD32(pCom)++; |
||
67 | |||
68 | // if version |
||
69 | if (com.bytes[0] > in.bytes[0]) |
||
70 | in.bytes[0] = com.bytes[0]; |
||
71 | if (com.bytes[1] > in.bytes[1]) |
||
72 | in.bytes[1] = com.bytes[1]; |
||
73 | if (com.bytes[2] > in.bytes[2]) |
||
74 | in.bytes[2] = com.bytes[2]; |
||
75 | if (com.bytes[3] > in.bytes[3]) |
||
76 | in.bytes[3] = com.bytes[3]; |
||
77 | |||
78 | *__SIMD32(pIn)++ = in.word; |
||
79 | |||
80 | cnt--; |
||
81 | } |
||
82 | } |
||
83 | |||
/* Widen-and-accumulate: element-wise adds 'length' q7 values from 'target'
 * into the q15 accumulator 'base'. The SIMD path processes 4 elements per
 * iteration with q15 saturating adds (__QADD16); the scalar tail handles
 * the remaining 0-3 elements with a plain (non-saturating) add. */
static void accumulate_q7_to_q15(q15_t * base, q7_t * target, const uint16_t length)
{
    q15_t    *pCnt = base;
    q7_t     *pV = target;
    q31_t     v1, v2, vo1, vo2;
    uint16_t  cnt = length >> 2;    /* number of full 4-element groups */
    q31_t     in;

    while (cnt > 0u)
    {
        q31_t     value = *__SIMD32(pV)++;  /* load 4 packed q7 bytes */
        /* __SXTB16 sign-extends bytes 0 and 2 of its operand into two
         * q15 halfwords; rotating by 8 first selects bytes 1 and 3. */
        v1 = __SXTB16(__ROR(value, 8));
        v2 = __SXTB16(value);
#ifndef ARM_MATH_BIG_ENDIAN

        /* repack the interleaved halfwords so vo1 carries elements 0,1
         * and vo2 carries elements 2,3 in memory order */
        vo2 = __PKHTB(v1, v2, 16);
        vo1 = __PKHBT(v2, v1, 16);

#else

        /* big-endian byte order swaps which pairing lands first */
        vo1 = __PKHTB(v1, v2, 16);
        vo2 = __PKHBT(v2, v1, 16);

#endif

        /* saturating q15 add, two accumulator elements per word store */
        in = *__SIMD32(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo1, in);

        in = *__SIMD32(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo2, in);

        cnt--;
    }
    /* tail: remaining 0-3 elements, scalar plain add */
    cnt = length & 0x3;
    while (cnt > 0u)
    {
        *pCnt++ += *pV++;
        cnt--;
    }
}
||
124 | |||
125 | #endif // ARM_MATH_DSP |
||
126 | |||
127 | /** |
||
128 | * @ingroup groupNN |
||
129 | */ |
||
130 | |||
131 | /** |
||
132 | * @addtogroup Pooling |
||
133 | * @{ |
||
134 | */ |
||
135 | |||
136 | /** |
||
137 | * @brief Q7 max pooling function |
||
138 | * @param[in, out] Im_in pointer to input tensor |
||
139 | * @param[in] dim_im_in input tensor dimension |
||
140 | * @param[in] ch_im_in number of input tensor channels |
||
141 | * @param[in] dim_kernel filter kernel size |
||
142 | * @param[in] padding padding sizes |
||
143 | * @param[in] stride convolution stride |
||
144 | * @param[in] dim_im_out output tensor dimension |
||
145 | * @param[in,out] bufferA pointer to buffer space for input |
||
146 | * @param[in,out] Im_out pointer to output tensor |
||
147 | * @return none. |
||
148 | * |
||
149 | * @details |
||
150 | * |
||
151 | * <b>Buffer size:</b> |
||
152 | * |
||
153 | * bufferA size: 0 |
||
154 | * |
||
155 | * The pooling function is implemented as split x-pooling then |
||
156 | * y-pooling. |
||
157 | * |
||
158 | * This pooling function is input-destructive. Input data is undefined |
||
159 | * after calling this function. |
||
160 | * |
||
161 | */ |
||
162 | |||
void
arm_maxpool_q7_HWC(q7_t * Im_in,
                   const uint16_t dim_im_in,
                   const uint16_t ch_im_in,
                   const uint16_t dim_kernel,
                   const uint16_t padding,
                   const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
{

#if defined (ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    /* DSP path: separable max pooling. Pass 1 max-pools along x in place
     * (overwriting Im_in - this is why the function is input-destructive),
     * pass 2 max-pools the x-pooled rows along y into Im_out. bufferA is
     * unused on this path (documented bufferA size: 0). */

    int16_t   i_x, i_y;

    /* first does the pooling along x axis */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel: result is written back into Im_in at
             * output-pixel position (i_y, i_x), HWC layout */
            q7_t     *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t     *win_start;
            q7_t     *win_stop;
            /* NOTE: i_x (int16_t) and stride/padding (uint16_t) promote to
             * signed int here, so the < 0 padding test behaves as intended */
            if (i_x * stride - padding < 0)
            {
                /* left edge: clip window start; 'target' is the first valid
                 * pixel of this window because i_x <= i_x*stride-padding+... */
                win_start = target;
            } else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                /* right edge: clip window end to the end of the row */
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            } else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over initial data */
            /* arm_copy_q7(win_start, target, ch_im_in); */
            /* memmove because target and win_start may overlap (in-place) */
            memmove(target, win_start, ch_im_in);

            /* start the max operation from the second part */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
            }
        }
    }

    /* then does the pooling along y axis */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {

        /* for each output row: operate on the x-pooled data still held in
         * Im_in, writing the final result row into Im_out */
        q7_t     *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t     *row_start;
        q7_t     *row_end;
        /* setting the starting row (clipped at the top padding edge) */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        } else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* setting the stopping row (clipped at the bottom edge) */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        } else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row; only the first dim_im_out pixels of each
         * input row hold valid x-pooled data */
        /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */
        memmove(target, row_start, dim_im_out * ch_im_in);

        /* move over to next row (rows are dim_im_in * ch_im_in bytes apart) */
        row_start += ch_im_in * dim_im_in;

        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
        }
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    /* Straightforward per-channel sliding-window max; padded (out-of-range)
     * positions are simply skipped by the bounds test below. */

    int16_t   i_ch_in, i_x, i_y;
    int16_t   k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                /* -129 is below the q7 minimum (-128), so any in-range
                 * sample replaces it */
                int       max = -129;
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
                            {
                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            }
                        }
                    }
                }
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
            }
        }
    }

#endif /* ARM_MATH_DSP */

}
||
288 | |||
289 | /** |
||
290 | * @brief Q7 average pooling function |
||
291 | * @param[in,out] Im_in pointer to input tensor |
||
292 | * @param[in] dim_im_in input tensor dimension |
||
293 | * @param[in] ch_im_in number of input tensor channels |
||
294 | * @param[in] dim_kernel filter kernel size |
||
295 | * @param[in] padding padding sizes |
||
296 | * @param[in] stride convolution stride |
||
297 | * @param[in] dim_im_out output tensor dimension |
||
298 | * @param[in,out] bufferA pointer to buffer space for input |
||
299 | * @param[in,out] Im_out pointer to output tensor |
||
300 | * @return none. |
||
301 | * |
||
302 | * @details |
||
303 | * |
||
304 | * <b>Buffer size:</b> |
||
305 | * |
||
306 | * bufferA size: 2*dim_im_out*ch_im_in |
||
307 | * |
||
308 | * The pooling function is implemented as split x-pooling then |
||
309 | * y-pooling. |
||
310 | * |
||
311 | * This pooling function is input-destructive. Input data is undefined |
||
312 | * after calling this function. |
||
313 | * |
||
314 | */ |
||
315 | |||
void
arm_avepool_q7_HWC(q7_t * Im_in,
                   const uint16_t dim_im_in,
                   const uint16_t ch_im_in,
                   const uint16_t dim_kernel,
                   const uint16_t padding,
                   const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
{

#if defined (ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    /* DSP path: separable average pooling. Pass 1 averages along x in place
     * (overwriting Im_in - input-destructive), pass 2 averages the x-pooled
     * rows along y into Im_out. bufferA provides the q15 accumulator
     * (documented size: 2*dim_im_out*ch_im_in bytes). */

    q15_t    *buffer = (q15_t *) bufferA;
    int16_t   i_x, i_y;
    /* number of samples accumulated for the current window; used as the
     * divisor, so edge windows average only their in-range samples */
    int16_t   count = 0;

    /* first does the pooling along x axis */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel: result written back into Im_in in place */
            q7_t     *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t     *win_start;
            q7_t     *win_stop;
            /* i_x/stride/padding promote to signed int, so the < 0 test works */
            if (i_x * stride - padding < 0)
            {
                /* left edge: clip the window start to the first valid pixel */
                win_start = target;
            } else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                /* right edge: clip the window end to the end of the row */
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            } else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over initial data, widened to q15 */
            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
            count = 1;

            /* accumulate the remaining columns of the window */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
                count++;
            }
            /* divide by the sample count and narrow back to q7 */
            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
        }
    }

    /* then does the pooling along y axis */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {
        /* for each output row: consume the x-pooled data in Im_in, write the
         * final result row into Im_out */
        q7_t     *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t     *row_start;
        q7_t     *row_end;
        /* setting the starting row (clipped at the top padding edge) */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        } else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* setting the stopping row (clipped at the bottom edge) */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        } else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row, widened to q15; only the first dim_im_out
         * pixels of each input row hold valid x-pooled data */
        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
        count = 1;

        /* move over to next row (rows are dim_im_in * ch_im_in bytes apart) */
        row_start += ch_im_in * dim_im_in;

        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
            count++;
        }
        /* divide by the sample count and narrow back to q7 */
        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    /* Straightforward per-channel sliding-window average; out-of-range
     * (padded) positions are skipped and excluded from the divisor. */

    int16_t   i_ch_in, i_x, i_y;
    int16_t   k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                int       sum = 0;
                int       count = 0;
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            count++;
                        }
                    }
                }
                /* truncating integer division over the in-range sample count */
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count;
            }
        }
    }

#endif /* ARM_MATH_DSP */

}
||
445 | |||
446 | /** |
||
447 | * @} end of Pooling group |
||
448 | */ |