/*
 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_pool_q7_HWC.c
 * Description:  Pooling function implementations
 *
 * $Date:        17. January 2018
 * $Revision:    V.1.0.0
 *
 * Target Processor: Cortex-M cores
 *
 * -------------------------------------------------------------------- */

#include "arm_math.h"
#include "arm_nnfunctions.h"

#if defined (ARM_MATH_DSP)

/**
 * @brief A few utility functions used by pooling functions
 *
 */

/* Scale a q15 accumulator buffer back down to q7: each element is divided
 * by `scale` (truncating toward zero) */
static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale)
{
    int i;

    for (i = 0; i < length; i++)
    {
        target[i] = (q7_t) (buffer[i] / scale);
    }
}
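
/* The division above truncates toward zero. A hypothetical rounding variant
 * (illustrative sketch only, not part of the library; assumes scale > 0)
 * would bias the numerator by half the divisor before dividing:
 */
#if 0
static void buffer_scale_back_q15_to_q7_rounded(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale)
{
    int i;
    const q15_t bias = (q15_t) (scale / 2);

    for (i = 0; i < length; i++)
    {
        /* round half away from zero instead of truncating */
        target[i] = (q7_t) ((buffer[i] + (buffer[i] >= 0 ? bias : -bias)) / scale);
    }
}
#endif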

/* In-place element-wise max: base[i] = max(base[i], target[i]), processing
 * four q7 values per 32-bit word */
static void compare_and_replace_if_larger_q7(q7_t * base,           // base data
                                             q7_t * target,         // compare target
                                             const uint16_t length  // data length
    )
{
    q7_t *pIn = base;
    q7_t *pCom = target;
    union arm_nnword in;
    union arm_nnword com;
    uint16_t cnt = length >> 2;

    while (cnt > 0u)
    {
        in.word = *__SIMD32(pIn);
        com.word = *__SIMD32(pCom)++;

        /* if-based byte-wise max (see the __SSUB8/__SEL sketch below for a
         * branch-free alternative) */
        if (com.bytes[0] > in.bytes[0])
            in.bytes[0] = com.bytes[0];
        if (com.bytes[1] > in.bytes[1])
            in.bytes[1] = com.bytes[1];
        if (com.bytes[2] > in.bytes[2])
            in.bytes[2] = com.bytes[2];
        if (com.bytes[3] > in.bytes[3])
            in.bytes[3] = com.bytes[3];

        *__SIMD32(pIn)++ = in.word;

        cnt--;
    }

    /* handle the 1-3 leftover elements when length is not a multiple of 4 */
    cnt = length & 0x3;
    while (cnt > 0u)
    {
        if (*pCom > *pIn)
        {
            *pIn = *pCom;
        }
        pIn++;
        pCom++;
        cnt--;
    }
}
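
/* Sketch of a branch-free alternative to the four if statements in the loop
 * above, using the DSP extension's GE flags: __SSUB8 performs a signed
 * per-byte subtraction and sets GE[i] when com[i] >= in[i]; __SEL then picks,
 * per byte, its first operand where GE is set. Illustrative only; not used by
 * this file:
 */
#if 0
        (void) __SSUB8(com.word, in.word);       /* GE[i] = (com[i] >= in[i]), signed */
        in.word = __SEL(com.word, in.word);      /* byte-wise max of com and in */
#endif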

/* Widening accumulate: base[i] += target[i], expanding q7 values to q15,
 * four elements per iteration */
static void accumulate_q7_to_q15(q15_t * base, q7_t * target, const uint16_t length)
{
    q15_t *pCnt = base;
    q7_t *pV = target;
    q31_t v1, v2, vo1, vo2;
    uint16_t cnt = length >> 2;
    q31_t in;

    while (cnt > 0u)
    {
        q31_t value = *__SIMD32(pV)++;
        v1 = __SXTB16(__ROR(value, 8));   /* sign-extend bytes 1 and 3 to halfwords */
        v2 = __SXTB16(value);             /* sign-extend bytes 0 and 2 to halfwords */
#ifndef ARM_MATH_BIG_ENDIAN
        /* repack so the halfword order matches the original byte order */
        vo2 = __PKHTB(v1, v2, 16);
        vo1 = __PKHBT(v2, v1, 16);
#else
        vo1 = __PKHTB(v1, v2, 16);
        vo2 = __PKHBT(v2, v1, 16);
#endif

        /* saturating halfword adds into the q15 accumulator */
        in = *__SIMD32(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo1, in);

        in = *__SIMD32(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo2, in);

        cnt--;
    }
    /* handle the 1-3 leftover elements when length is not a multiple of 4 */
    cnt = length & 0x3;
    while (cnt > 0u)
    {
        *pCnt++ += *pV++;
        cnt--;
    }
}
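
/* For reference, the SIMD loop above is equivalent to this portable scalar
 * version (illustrative sketch, not part of the library), except that
 * __QADD16 additionally saturates each halfword sum to the q15 range:
 */
#if 0
static void accumulate_q7_to_q15_ref(q15_t * base, q7_t * target, const uint16_t length)
{
    uint16_t i;

    for (i = 0; i < length; i++)
    {
        base[i] += (q15_t) target[i];   /* widen q7 to q15, then accumulate */
    }
}
#endif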

#endif                          // ARM_MATH_DSP

/**
 *  @ingroup groupNN
 */

/**
 * @addtogroup Pooling
 * @{
 */
/**
 * @brief Q7 max pooling function
 * @param[in,out]  Im_in       pointer to input tensor
 * @param[in]      dim_im_in   input tensor dimension
 * @param[in]      ch_im_in    number of input tensor channels
 * @param[in]      dim_kernel  filter kernel size
 * @param[in]      padding     padding size
 * @param[in]      stride      convolution stride
 * @param[in]      dim_im_out  output tensor dimension
 * @param[in,out]  bufferA     pointer to buffer space for input (not used by this function)
 * @param[in,out]  Im_out      pointer to output tensor
 * @return none.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 0
 *
 * The pooling function is implemented as split x-pooling then
 * y-pooling.
 *
 * This pooling function is input-destructive. Input data is undefined
 * after calling this function.
 *
 */

void
arm_maxpool_q7_HWC(q7_t * Im_in,
                   const uint16_t dim_im_in,
                   const uint16_t ch_im_in,
                   const uint16_t dim_kernel,
                   const uint16_t padding,
                   const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
{

#if defined (ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    int16_t i_x, i_y;

    /* first, do the pooling along the x axis */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel */
            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t *win_start;
            q7_t *win_stop;
            if (i_x * stride - padding < 0)
            {
                win_start = target;
            } else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            } else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over the initial data */
            /* arm_copy_q7(win_start, target, ch_im_in); */
            memmove(target, win_start, ch_im_in);

            /* start the max operation from the second part of the window */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
            }
        }
    }

    /* then, do the pooling along the y axis */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {

        /* for each output row */
        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t *row_start;
        q7_t *row_end;
        /* set the starting row */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        } else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* set the stopping row */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        } else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row */
        /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */
        memmove(target, row_start, dim_im_out * ch_im_in);

        /* move over to the next row */
        row_start += ch_im_in * dim_im_in;

        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
        }
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

    int16_t i_ch_in, i_x, i_y;
    int16_t k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                int max = -129;   /* one below the q7 minimum of -128 */
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
                            {
                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            }
                        }
                    }
                }
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
            }
        }
    }

#endif                          /* ARM_MATH_DSP */

}
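
/* Example call with hypothetical dimensions (illustrative only): 2x2 max
 * pooling, stride 2, no padding, on a 32x32 16-channel HWC image, giving a
 * 16x16x16 output. Note that the input buffer is overwritten, and bufferA is
 * unused (size 0), so NULL may be passed:
 */
#if 0
static void example_maxpool_usage(void)
{
    static q7_t img[32 * 32 * 16];   /* input tensor, HWC layout; destroyed by the call */
    static q7_t out[16 * 16 * 16];   /* output tensor */

    arm_maxpool_q7_HWC(img, 32, 16, 2, 0, 2, 16, NULL, out);
}
#endif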

/**
 * @brief Q7 average pooling function
 * @param[in,out]  Im_in       pointer to input tensor
 * @param[in]      dim_im_in   input tensor dimension
 * @param[in]      ch_im_in    number of input tensor channels
 * @param[in]      dim_kernel  filter kernel size
 * @param[in]      padding     padding size
 * @param[in]      stride      convolution stride
 * @param[in]      dim_im_out  output tensor dimension
 * @param[in,out]  bufferA     pointer to buffer space for input
 * @param[in,out]  Im_out      pointer to output tensor
 * @return none.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*dim_im_out*ch_im_in
 *
 * The pooling function is implemented as split x-pooling then
 * y-pooling.
 *
 * This pooling function is input-destructive. Input data is undefined
 * after calling this function.
 *
 */

void
arm_avepool_q7_HWC(q7_t * Im_in,
                   const uint16_t dim_im_in,
                   const uint16_t ch_im_in,
                   const uint16_t dim_kernel,
                   const uint16_t padding,
                   const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
{

#if defined (ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    q15_t *buffer = (q15_t *) bufferA;
    int16_t i_x, i_y;
    int16_t count = 0;

    /* first, do the pooling along the x axis */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel */
            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t *win_start;
            q7_t *win_stop;
            if (i_x * stride - padding < 0)
            {
                win_start = target;
            } else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            } else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over the initial data, widening q7 to q15 */
            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
            count = 1;

            /* start the accumulation from the second part of the window */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
                count++;
            }
            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
        }
    }

    /* then, do the pooling along the y axis */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {
        /* for each output row */
        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t *row_start;
        q7_t *row_end;
        /* set the starting row */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        } else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* set the stopping row */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        } else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row, widening q7 to q15 */
        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
        count = 1;

        /* move over to the next row */
        row_start += ch_im_in * dim_im_in;

        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
            count++;
        }
        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

    int16_t i_ch_in, i_x, i_y;
    int16_t k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                int sum = 0;
                int count = 0;
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            count++;
                        }
                    }
                }
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count;
            }
        }
    }

#endif                          /* ARM_MATH_DSP */

}
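
/* Example call with hypothetical dimensions (illustrative only): 2x2 average
 * pooling, stride 2, no padding, on a 32x32 16-channel HWC image. On
 * DSP-capable cores bufferA must provide 2*dim_im_out*ch_im_in bytes of
 * scratch (a q15 working row):
 */
#if 0
static void example_avepool_usage(void)
{
    static q7_t img[32 * 32 * 16];       /* input tensor, HWC layout; destroyed by the call */
    static q7_t out[16 * 16 * 16];       /* output tensor */
    static q7_t scratch[2 * 16 * 16];    /* bufferA: 2 * dim_im_out * ch_im_in bytes */

    arm_avepool_q7_HWC(img, 32, 16, 2, 0, 2, 16, scratch, out);
}
#endif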

/**
 * @} end of Pooling group
 */