Subversion Repositories testOled

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 mjames 1
/*
2
 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
3
 *
4
 * SPDX-License-Identifier: Apache-2.0
5
 *
6
 * Licensed under the Apache License, Version 2.0 (the License); you may
7
 * not use this file except in compliance with the License.
8
 * You may obtain a copy of the License at
9
 *
10
 * www.apache.org/licenses/LICENSE-2.0
11
 *
12
 * Unless required by applicable law or agreed to in writing, software
13
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
 * See the License for the specific language governing permissions and
16
 * limitations under the License.
17
 */
18
 
19
/* ----------------------------------------------------------------------
20
 * Project:      CMSIS NN Library
21
 * Title:        arm_nn_mult_q15.c
22
 * Description:  Q15 vector multiplication with variable output shifts
23
 *
24
 * $Date:        13. July 2018
25
 * $Revision:    V.1.0.0
26
 *
27
 * Target Processor:  Cortex-M cores
28
 *
29
 * -------------------------------------------------------------------- */
30
 
31
#include "arm_nnfunctions.h"
32
 
33
/**    
34
 * @ingroup groupSupport    
35
 */
36
 
37
/**
38
 * @addtogroup NNBasicMath
39
 * @{
40
 */
41
 
42
 
43
/**
44
 * @brief           Q7 vector multiplication with variable output shifts
45
 * @param[in]       *pSrcA        pointer to the first input vector
46
 * @param[in]       *pSrcB        pointer to the second input vector
47
 * @param[out]      *pDst         pointer to the output vector
48
 * @param[in]       out_shift     amount of right-shift for output
49
 * @param[in]       blockSize     number of samples in each vector
50
 * @return none.
51
 *
52
 * <b>Scaling and Overflow Behavior:</b>
53
 * \par
54
 * The function uses saturating arithmetic.
55
 * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
56
 */
57
 
58
void arm_nn_mult_q15(
59
  q15_t * pSrcA,
60
  q15_t * pSrcB,
61
  q15_t * pDst,
62
  const uint16_t out_shift,
63
  uint32_t blockSize)
64
{
65
  uint32_t blkCnt;                               /* loop counters */
66
 
67
#if defined (ARM_MATH_DSP)
68
 
69
/* Run the below code for Cortex-M4 and Cortex-M3 */
70
  q31_t inA1, inA2, inB1, inB2;                  /* temporary input variables */
71
  q15_t out1, out2, out3, out4;                  /* temporary output variables */
72
  q31_t mul1, mul2, mul3, mul4;                  /* temporary variables */
73
 
74
  /* loop Unrolling */
75
  blkCnt = blockSize >> 2U;
76
 
77
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
78
   ** a second loop below computes the remaining 1 to 3 samples. */
79
  while (blkCnt > 0U)
80
  {
81
    /* read two samples at a time from sourceA */
82
    inA1 = *__SIMD32(pSrcA)++;
83
    /* read two samples at a time from sourceB */
84
    inB1 = *__SIMD32(pSrcB)++;
85
    /* read two samples at a time from sourceA */
86
    inA2 = *__SIMD32(pSrcA)++;
87
    /* read two samples at a time from sourceB */
88
    inB2 = *__SIMD32(pSrcB)++;
89
 
90
    /* multiply mul = sourceA * sourceB */
91
    mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
92
    mul2 = (q31_t) ((q15_t) inA1 * (q15_t) inB1);
93
    mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
94
    mul4 = (q31_t) ((q15_t) inA2 * (q15_t) inB2);
95
 
96
    /* saturate result to 16 bit */
97
    out1 = (q15_t) __SSAT((mul1 + NN_ROUND(out_shift)) >> out_shift, 16);
98
    out2 = (q15_t) __SSAT((mul2 + NN_ROUND(out_shift)) >> out_shift, 16);
99
    out3 = (q15_t) __SSAT((mul3 + NN_ROUND(out_shift)) >> out_shift, 16);
100
    out4 = (q15_t) __SSAT((mul4 + NN_ROUND(out_shift)) >> out_shift, 16);
101
 
102
    /* store the result */
103
#ifndef ARM_MATH_BIG_ENDIAN
104
 
105
    *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
106
    *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
107
 
108
#else
109
 
110
    *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
111
    *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
112
 
113
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
114
 
115
    /* Decrement the blockSize loop counter */
116
    blkCnt--;
117
  }
118
 
119
  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
120
   ** No loop unrolling is used. */
121
  blkCnt = blockSize % 0x4U;
122
 
123
#else
124
 
125
  /* Run the below code for Cortex-M0 */
126
 
127
  /* Initialize blkCnt with number of samples */
128
  blkCnt = blockSize;
129
 
130
#endif /* #if defined (ARM_MATH_DSP) */
131
 
132
 
133
  while (blkCnt > 0U)
134
  {
135
    /* C = A * B */
136
    /* Multiply the inputs and store the result in the destination buffer */
137
    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 16);
138
 
139
    /* Decrement the blockSize loop counter */
140
    blkCnt--;
141
  }
142
}
143
 
144
/**
145
 * @} end of NNBasicMath group
146
 */
147