Subversion Repositories dashGPS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 mjames 1
/* ----------------------------------------------------------------------
2
 * Project:      CMSIS DSP Library
3
 * Title:        arm_mat_mult_q15.c
4
 * Description:  Q15 matrix multiplication
5
 *
6
 * $Date:        27. January 2017
7
 * $Revision:    V.1.5.1
8
 *
9
 * Target Processor: Cortex-M cores
10
 * -------------------------------------------------------------------- */
11
/*
12
 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
13
 *
14
 * SPDX-License-Identifier: Apache-2.0
15
 *
16
 * Licensed under the Apache License, Version 2.0 (the License); you may
17
 * not use this file except in compliance with the License.
18
 * You may obtain a copy of the License at
19
 *
20
 * www.apache.org/licenses/LICENSE-2.0
21
 *
22
 * Unless required by applicable law or agreed to in writing, software
23
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25
 * See the License for the specific language governing permissions and
26
 * limitations under the License.
27
 */
28
 
29
#include "arm_math.h"
30
 
31
/**
32
 * @ingroup groupMatrix
33
 */
34
 
35
/**
36
 * @addtogroup MatrixMult
37
 * @{
38
 */
39
 
40
 
41
/**
42
 * @brief Q15 matrix multiplication
43
 * @param[in]       *pSrcA points to the first input matrix structure
44
 * @param[in]       *pSrcB points to the second input matrix structure
45
 * @param[out]      *pDst points to output matrix structure
46
 * @param[in]       *pState points to the array for storing intermediate results (Unused)
47
 * @return          The function returns either
48
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
49
 *
50
 * @details
51
 * <b>Scaling and Overflow Behavior:</b>
52
 *
53
 * \par
54
 * The function is implemented using a 64-bit internal accumulator. The inputs to the
55
 * multiplications are in 1.15 format and multiplications yield a 2.30 result.
56
 * The 2.30 intermediate
57
 * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
58
 * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
59
 * truncated to 34.15 format by discarding the low 15 bits and then saturated to
60
 * 1.15 format.
61
 *
62
 * \par
63
 * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
64
 *
65
 */
66
 
67
arm_status arm_mat_mult_q15(
68
  const arm_matrix_instance_q15 * pSrcA,
69
  const arm_matrix_instance_q15 * pSrcB,
70
  arm_matrix_instance_q15 * pDst,
71
  q15_t * pState)
72
{
73
  q63_t sum;                                     /* accumulator */
74
 
75
#if defined (ARM_MATH_DSP)
76
 
77
  /* Run the below code for Cortex-M4 and Cortex-M3 */
78
 
79
  q15_t *pSrcBT = pState;                        /* input data matrix pointer for transpose */
80
  q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
81
  q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
82
  q15_t *px;                                     /* Temporary output data matrix pointer */
83
  uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
84
  uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
85
  uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
86
  uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix A    */
87
  uint16_t col, i = 0U, row = numRowsB, colCnt;  /* loop counters */
88
  arm_status status;                             /* status of matrix multiplication */
89
 
90
#ifndef UNALIGNED_SUPPORT_DISABLE
91
 
92
  q31_t in;                                      /* Temporary variable to hold the input value */
93
  q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2;
94
 
95
#else
96
 
97
  q15_t in;                                      /* Temporary variable to hold the input value */
98
  q15_t inA1, inB1, inA2, inB2;
99
 
100
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
101
 
102
#ifdef ARM_MATH_MATRIX_CHECK
103
  /* Check for matrix mismatch condition */
104
  if ((pSrcA->numCols != pSrcB->numRows) ||
105
     (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
106
  {
107
    /* Set status as ARM_MATH_SIZE_MISMATCH */
108
    status = ARM_MATH_SIZE_MISMATCH;
109
  }
110
  else
111
#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
112
  {
113
    /* Matrix transpose */
114
    do
115
    {
116
      /* Apply loop unrolling and exchange the columns with row elements */
117
      col = numColsB >> 2;
118
 
119
      /* The pointer px is set to starting address of the column being processed */
120
      px = pSrcBT + i;
121
 
122
      /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
123
       ** a second loop below computes the remaining 1 to 3 samples. */
124
      while (col > 0U)
125
      {
126
#ifndef UNALIGNED_SUPPORT_DISABLE
127
 
128
        /* Read two elements from the row */
129
        in = *__SIMD32(pInB)++;
130
 
131
        /* Unpack and store one element in the destination */
132
#ifndef ARM_MATH_BIG_ENDIAN
133
 
134
        *px = (q15_t) in;
135
 
136
#else
137
 
138
        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
139
 
140
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
141
 
142
        /* Update the pointer px to point to the next row of the transposed matrix */
143
        px += numRowsB;
144
 
145
        /* Unpack and store the second element in the destination */
146
#ifndef ARM_MATH_BIG_ENDIAN
147
 
148
        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
149
 
150
#else
151
 
152
        *px = (q15_t) in;
153
 
154
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
155
 
156
        /* Update the pointer px to point to the next row of the transposed matrix */
157
        px += numRowsB;
158
 
159
        /* Read two elements from the row */
160
        in = *__SIMD32(pInB)++;
161
 
162
        /* Unpack and store one element in the destination */
163
#ifndef ARM_MATH_BIG_ENDIAN
164
 
165
        *px = (q15_t) in;
166
 
167
#else
168
 
169
        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
170
 
171
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
172
 
173
        /* Update the pointer px to point to the next row of the transposed matrix */
174
        px += numRowsB;
175
 
176
        /* Unpack and store the second element in the destination */
177
 
178
#ifndef ARM_MATH_BIG_ENDIAN
179
 
180
        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
181
 
182
#else
183
 
184
        *px = (q15_t) in;
185
 
186
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
187
 
188
        /* Update the pointer px to point to the next row of the transposed matrix */
189
        px += numRowsB;
190
 
191
#else
192
 
193
        /* Read one element from the row */
194
        in = *pInB++;
195
 
196
        /* Store one element in the destination */
197
        *px = in;
198
 
199
        /* Update the pointer px to point to the next row of the transposed matrix */
200
        px += numRowsB;
201
 
202
        /* Read one element from the row */
203
        in = *pInB++;
204
 
205
        /* Store one element in the destination */
206
        *px = in;
207
 
208
        /* Update the pointer px to point to the next row of the transposed matrix */
209
        px += numRowsB;
210
 
211
        /* Read one element from the row */
212
        in = *pInB++;
213
 
214
        /* Store one element in the destination */
215
        *px = in;
216
 
217
        /* Update the pointer px to point to the next row of the transposed matrix */
218
        px += numRowsB;
219
 
220
        /* Read one element from the row */
221
        in = *pInB++;
222
 
223
        /* Store one element in the destination */
224
        *px = in;
225
 
226
        /* Update the pointer px to point to the next row of the transposed matrix */
227
        px += numRowsB;
228
 
229
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
230
 
231
       /* Decrement the column loop counter */
232
        col--;
233
      }
234
 
235
      /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
236
       ** No loop unrolling is used. */
237
      col = numColsB % 0x4U;
238
 
239
      while (col > 0U)
240
      {
241
        /* Read and store the input element in the destination */
242
        *px = *pInB++;
243
 
244
        /* Update the pointer px to point to the next row of the transposed matrix */
245
        px += numRowsB;
246
 
247
        /* Decrement the column loop counter */
248
        col--;
249
      }
250
 
251
      i++;
252
 
253
      /* Decrement the row loop counter */
254
      row--;
255
 
256
    } while (row > 0U);
257
 
258
    /* Reset the variables for the usage in the following multiplication process */
259
    row = numRowsA;
260
    i = 0U;
261
    px = pDst->pData;
262
 
263
    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
264
    /* row loop */
265
    do
266
    {
267
      /* For every row wise process, the column loop counter is to be initiated */
268
      col = numColsB;
269
 
270
      /* For every row wise process, the pIn2 pointer is set
271
       ** to the starting address of the transposed pSrcB data */
272
      pInB = pSrcBT;
273
 
274
      /* column loop */
275
      do
276
      {
277
        /* Set the variable sum, that acts as accumulator, to zero */
278
        sum = 0;
279
 
280
        /* Apply loop unrolling and compute 2 MACs simultaneously. */
281
        colCnt = numColsA >> 2;
282
 
283
        /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
284
        pInA = pSrcA->pData + i;
285
 
286
 
287
        /* matrix multiplication */
288
        while (colCnt > 0U)
289
        {
290
          /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
291
#ifndef UNALIGNED_SUPPORT_DISABLE
292
 
293
          /* read real and imag values from pSrcA and pSrcB buffer */
294
          pSourceA1 = *__SIMD32(pInA)++;
295
          pSourceB1 = *__SIMD32(pInB)++;
296
 
297
          pSourceA2 = *__SIMD32(pInA)++;
298
          pSourceB2 = *__SIMD32(pInB)++;
299
 
300
          /* Multiply and Accumlates */
301
          sum = __SMLALD(pSourceA1, pSourceB1, sum);
302
          sum = __SMLALD(pSourceA2, pSourceB2, sum);
303
 
304
#else
305
          /* read real and imag values from pSrcA and pSrcB buffer */
306
          inA1 = *pInA++;
307
          inB1 = *pInB++;
308
          inA2 = *pInA++;
309
          /* Multiply and Accumlates */
310
          sum += inA1 * inB1;
311
          inB2 = *pInB++;
312
 
313
          inA1 = *pInA++;
314
          inB1 = *pInB++;
315
          /* Multiply and Accumlates */
316
          sum += inA2 * inB2;
317
          inA2 = *pInA++;
318
          inB2 = *pInB++;
319
 
320
          /* Multiply and Accumlates */
321
          sum += inA1 * inB1;
322
          sum += inA2 * inB2;
323
 
324
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
325
 
326
          /* Decrement the loop counter */
327
          colCnt--;
328
        }
329
 
330
        /* process remaining column samples */
331
        colCnt = numColsA & 3U;
332
 
333
        while (colCnt > 0U)
334
        {
335
          /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
336
          sum += *pInA++ * *pInB++;
337
 
338
          /* Decrement the loop counter */
339
          colCnt--;
340
        }
341
 
342
        /* Saturate and store the result in the destination buffer */
343
        *px = (q15_t) (__SSAT((sum >> 15), 16));
344
        px++;
345
 
346
        /* Decrement the column loop counter */
347
        col--;
348
 
349
      } while (col > 0U);
350
 
351
      i = i + numColsA;
352
 
353
      /* Decrement the row loop counter */
354
      row--;
355
 
356
    } while (row > 0U);
357
 
358
#else
359
 
360
  /* Run the below code for Cortex-M0 */
361
 
362
  q15_t *pIn1 = pSrcA->pData;                    /* input data matrix pointer A */
363
  q15_t *pIn2 = pSrcB->pData;                    /* input data matrix pointer B */
364
  q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
365
  q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
366
  q15_t *pOut = pDst->pData;                     /* output data matrix pointer */
367
  q15_t *px;                                     /* Temporary output data matrix pointer */
368
  uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
369
  uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
370
  uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
371
  uint16_t col, i = 0U, row = numRowsA, colCnt;  /* loop counters */
372
  arm_status status;                             /* status of matrix multiplication */
373
 
374
#ifdef ARM_MATH_MATRIX_CHECK
375
 
376
  /* Check for matrix mismatch condition */
377
  if ((pSrcA->numCols != pSrcB->numRows) ||
378
     (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
379
  {
380
    /* Set status as ARM_MATH_SIZE_MISMATCH */
381
    status = ARM_MATH_SIZE_MISMATCH;
382
  }
383
  else
384
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
385
 
386
  {
387
    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
388
    /* row loop */
389
    do
390
    {
391
      /* Output pointer is set to starting address of the row being processed */
392
      px = pOut + i;
393
 
394
      /* For every row wise process, the column loop counter is to be initiated */
395
      col = numColsB;
396
 
397
      /* For every row wise process, the pIn2 pointer is set
398
       ** to the starting address of the pSrcB data */
399
      pIn2 = pSrcB->pData;
400
 
401
      /* column loop */
402
      do
403
      {
404
        /* Set the variable sum, that acts as accumulator, to zero */
405
        sum = 0;
406
 
407
        /* Initiate the pointer pIn1 to point to the starting address of pSrcA */
408
        pIn1 = pInA;
409
 
410
        /* Matrix A columns number of MAC operations are to be performed */
411
        colCnt = numColsA;
412
 
413
        /* matrix multiplication */
414
        while (colCnt > 0U)
415
        {
416
          /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
417
          /* Perform the multiply-accumulates */
418
          sum += (q31_t) * pIn1++ * *pIn2;
419
          pIn2 += numColsB;
420
 
421
          /* Decrement the loop counter */
422
          colCnt--;
423
        }
424
 
425
        /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
426
        /* Saturate and store the result in the destination buffer */
427
        *px++ = (q15_t) __SSAT((sum >> 15), 16);
428
 
429
        /* Decrement the column loop counter */
430
        col--;
431
 
432
        /* Update the pointer pIn2 to point to the  starting address of the next column */
433
        pIn2 = pInB + (numColsB - col);
434
 
435
      } while (col > 0U);
436
 
437
      /* Update the pointer pSrcA to point to the  starting address of the next row */
438
      i = i + numColsB;
439
      pInA = pInA + numColsA;
440
 
441
      /* Decrement the row loop counter */
442
      row--;
443
 
444
    } while (row > 0U);
445
 
446
#endif /* #if defined (ARM_MATH_DSP) */
447
    /* set status as ARM_MATH_SUCCESS */
448
    status = ARM_MATH_SUCCESS;
449
  }
450
 
451
  /* Return to application */
452
  return (status);
453
}
454
 
455
/**
456
 * @} end of MatrixMult group
457
 */