Subversion Repositories DashDisplay

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
56 mjames 1
/* ----------------------------------------------------------------------
2
 * Project:      CMSIS DSP Library
3
 * Title:        arm_mat_mult_fast_q15.c
4
 * Description:  Q15 matrix multiplication (fast variant)
5
 *
6
 * $Date:        27. January 2017
7
 * $Revision:    V.1.5.1
8
 *
9
 * Target Processor: Cortex-M cores
10
 * -------------------------------------------------------------------- */
11
/*
12
 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
13
 *
14
 * SPDX-License-Identifier: Apache-2.0
15
 *
16
 * Licensed under the Apache License, Version 2.0 (the License); you may
17
 * not use this file except in compliance with the License.
18
 * You may obtain a copy of the License at
19
 *
20
 * www.apache.org/licenses/LICENSE-2.0
21
 *
22
 * Unless required by applicable law or agreed to in writing, software
23
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25
 * See the License for the specific language governing permissions and
26
 * limitations under the License.
27
 */
28
 
29
#include "arm_math.h"
30
 
31
/**
32
 * @ingroup groupMatrix
33
 */
34
 
35
/**
36
 * @addtogroup MatrixMult
37
 * @{
38
 */
39
 
40
 
41
/**
42
 * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
43
 * @param[in]       *pSrcA points to the first input matrix structure
44
 * @param[in]       *pSrcB points to the second input matrix structure
45
 * @param[out]      *pDst points to output matrix structure
46
 * @param[in]       *pState points to the array for storing intermediate results
47
 * @return          The function returns either
48
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
49
 *
50
 * @details
51
 * <b>Scaling and Overflow Behavior:</b>
52
 *
53
 * \par
54
 * The difference between the function arm_mat_mult_q15() and this fast variant is that
55
 * the fast variant use a 32-bit rather than a 64-bit accumulator.
56
 * The result of each 1.15 x 1.15 multiplication is truncated to
57
 * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
58
 * format. Finally, the accumulator is saturated and converted to a 1.15 result.
59
 *
60
 * \par
61
 * The fast version has the same overflow behavior as the standard version but provides
62
 * less precision since it discards the low 16 bits of each multiplication result.
63
 * In order to avoid overflows completely the input signals must be scaled down.
64
 * Scale down one of the input matrices by log2(numColsA) bits to
65
 * avoid overflows, as a total of numColsA additions are computed internally for each
66
 * output element.
67
 *
68
 * \par
69
 * See <code>arm_mat_mult_q15()</code> for a slower implementation of this function
70
 * which uses 64-bit accumulation to provide higher precision.
71
 */
72
 
73
arm_status arm_mat_mult_fast_q15(
74
  const arm_matrix_instance_q15 * pSrcA,
75
  const arm_matrix_instance_q15 * pSrcB,
76
  arm_matrix_instance_q15 * pDst,
77
  q15_t * pState)
78
{
79
  q31_t sum;                                     /* accumulator */
80
  q15_t *pSrcBT = pState;                        /* input data matrix pointer for transpose */
81
  q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
82
  q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
83
  q15_t *px;                                     /* Temporary output data matrix pointer */
84
  uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
85
  uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
86
  uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
87
  uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix A    */
88
  uint32_t col, i = 0U, row = numRowsB, colCnt;  /* loop counters */
89
  arm_status status;                             /* status of matrix multiplication */
90
 
91
#ifndef UNALIGNED_SUPPORT_DISABLE
92
 
93
  q31_t in;                                      /* Temporary variable to hold the input value */
94
  q31_t inA1, inA2, inB1, inB2;
95
  q31_t sum2, sum3, sum4;
96
  q15_t *pInA2, *pInB2, *px2;
97
  uint32_t j = 0;
98
 
99
#else
100
 
101
  q15_t in;                                      /* Temporary variable to hold the input value */
102
  q15_t inA1, inA2, inB1, inB2;
103
 
104
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
105
 
106
#ifdef ARM_MATH_MATRIX_CHECK
107
  /* Check for matrix mismatch condition */
108
  if ((pSrcA->numCols != pSrcB->numRows) ||
109
     (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
110
  {
111
    /* Set status as ARM_MATH_SIZE_MISMATCH */
112
    status = ARM_MATH_SIZE_MISMATCH;
113
  }
114
  else
115
#endif
116
  {
117
    /* Matrix transpose */
118
    do
119
    {
120
      /* Apply loop unrolling and exchange the columns with row elements */
121
      col = numColsB >> 2;
122
 
123
      /* The pointer px is set to starting address of the column being processed */
124
      px = pSrcBT + i;
125
 
126
      /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
127
       ** a second loop below computes the remaining 1 to 3 samples. */
128
      while (col > 0U)
129
      {
130
#ifndef UNALIGNED_SUPPORT_DISABLE
131
        /* Read two elements from the row */
132
        in = *__SIMD32(pInB)++;
133
 
134
        /* Unpack and store one element in the destination */
135
#ifndef ARM_MATH_BIG_ENDIAN
136
 
137
        *px = (q15_t) in;
138
 
139
#else
140
 
141
        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
142
 
143
#endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
144
 
145
        /* Update the pointer px to point to the next row of the transposed matrix */
146
        px += numRowsB;
147
 
148
        /* Unpack and store the second element in the destination */
149
#ifndef ARM_MATH_BIG_ENDIAN
150
 
151
        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
152
 
153
#else
154
 
155
        *px = (q15_t) in;
156
 
157
#endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
158
 
159
        /* Update the pointer px to point to the next row of the transposed matrix */
160
        px += numRowsB;
161
 
162
        /* Read two elements from the row */
163
        in = *__SIMD32(pInB)++;
164
 
165
        /* Unpack and store one element in the destination */
166
#ifndef ARM_MATH_BIG_ENDIAN
167
 
168
        *px = (q15_t) in;
169
 
170
#else
171
 
172
        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
173
 
174
#endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
175
 
176
        /* Update the pointer px to point to the next row of the transposed matrix */
177
        px += numRowsB;
178
 
179
        /* Unpack and store the second element in the destination */
180
 
181
#ifndef ARM_MATH_BIG_ENDIAN
182
 
183
        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
184
 
185
#else
186
 
187
        *px = (q15_t) in;
188
 
189
#endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
190
 
191
#else
192
 
193
        /* Read one element from the row */
194
        in = *pInB++;
195
 
196
        /* Store one element in the destination */
197
        *px = in;
198
 
199
        /* Update the pointer px to point to the next row of the transposed matrix */
200
        px += numRowsB;
201
 
202
        /* Read one element from the row */
203
        in = *pInB++;
204
 
205
        /* Store one element in the destination */
206
        *px = in;
207
 
208
        /* Update the pointer px to point to the next row of the transposed matrix */
209
        px += numRowsB;
210
 
211
        /* Read one element from the row */
212
        in = *pInB++;
213
 
214
        /* Store one element in the destination */
215
        *px = in;
216
 
217
        /* Update the pointer px to point to the next row of the transposed matrix */
218
        px += numRowsB;
219
 
220
        /* Read one element from the row */
221
        in = *pInB++;
222
 
223
        /* Store one element in the destination */
224
        *px = in;
225
 
226
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
227
 
228
        /* Update the pointer px to point to the next row of the transposed matrix */
229
        px += numRowsB;
230
 
231
        /* Decrement the column loop counter */
232
        col--;
233
      }
234
 
235
      /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
236
       ** No loop unrolling is used. */
237
      col = numColsB % 0x4U;
238
 
239
      while (col > 0U)
240
      {
241
        /* Read and store the input element in the destination */
242
        *px = *pInB++;
243
 
244
        /* Update the pointer px to point to the next row of the transposed matrix */
245
        px += numRowsB;
246
 
247
        /* Decrement the column loop counter */
248
        col--;
249
      }
250
 
251
      i++;
252
 
253
      /* Decrement the row loop counter */
254
      row--;
255
 
256
    } while (row > 0U);
257
 
258
    /* Reset the variables for the usage in the following multiplication process */
259
    row = numRowsA;
260
    i = 0U;
261
    px = pDst->pData;
262
 
263
#ifndef UNALIGNED_SUPPORT_DISABLE
264
    /* Process two rows from matrix A at a time and output two rows at a time */
265
    row = row >> 1;
266
    px2 = px + numColsB;
267
#endif
268
 
269
    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
270
    /* row loop */
271
    while (row > 0U)
272
    {
273
      /* For every row wise process, the column loop counter is to be initiated */
274
      col = numColsB;
275
 
276
      /* For every row wise process, the pIn2 pointer is set
277
       ** to the starting address of the transposed pSrcB data */
278
      pInB = pSrcBT;
279
 
280
#ifndef UNALIGNED_SUPPORT_DISABLE
281
      /* Process two (transposed) columns from matrix B at a time */
282
      col = col >> 1;
283
      j = 0;
284
#endif
285
 
286
      /* column loop */
287
      while (col > 0U)
288
      {
289
        /* Set the variable sum, that acts as accumulator, to zero */
290
        sum = 0;
291
 
292
        /* Initiate the pointer pInA to point to the starting address of the column being processed */
293
        pInA = pSrcA->pData + i;
294
 
295
#ifndef UNALIGNED_SUPPORT_DISABLE
296
        sum2 = 0;
297
        sum3 = 0;
298
        sum4 = 0;
299
        pInB  = pSrcBT + j;
300
        pInA2 = pInA + numColsA;
301
        pInB2 = pInB + numRowsB;
302
 
303
        /* Read in two elements at once - alows dual MAC instruction */
304
        colCnt = numColsA >> 1;
305
#else
306
        colCnt = numColsA >> 2;
307
#endif
308
 
309
        /* matrix multiplication */
310
        while (colCnt > 0U)
311
        {
312
          /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
313
#ifndef UNALIGNED_SUPPORT_DISABLE
314
 
315
          inA1 = *__SIMD32(pInA)++;
316
          inB1 = *__SIMD32(pInB)++;
317
          inA2 = *__SIMD32(pInA2)++;
318
          inB2 = *__SIMD32(pInB2)++;
319
 
320
          sum  = __SMLAD(inA1, inB1, sum);
321
          sum2 = __SMLAD(inA1, inB2, sum2);
322
          sum3 = __SMLAD(inA2, inB1, sum3);
323
          sum4 = __SMLAD(inA2, inB2, sum4);
324
 
325
#else
326
 
327
          inA1 = *pInA;
328
          inB1 = *pInB;
329
          sum += inA1 * inB1;
330
 
331
          inA2 = pInA[1];
332
          inB2 = pInB[1];
333
          sum += inA2 * inB2;
334
 
335
          inA1 = pInA[2];
336
          inB1 = pInB[2];
337
          sum += inA1 * inB1;
338
 
339
          inA2 = pInA[3];
340
          inB2 = pInB[3];
341
          sum += inA2 * inB2;
342
 
343
          pInA += 4;
344
          pInB += 4;
345
 
346
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
347
 
348
          /* Decrement the loop counter */
349
          colCnt--;
350
        }
351
 
352
        /* process odd column samples */
353
#ifndef UNALIGNED_SUPPORT_DISABLE
354
        if (numColsA & 1U) {
355
          inA1 = *pInA++;
356
          inB1 = *pInB++;
357
          inA2 = *pInA2++;
358
          inB2 = *pInB2++;
359
          sum  += inA1 * inB1;
360
          sum2 += inA1 * inB2;
361
          sum3 += inA2 * inB1;
362
          sum4 += inA2 * inB2;
363
        }
364
#else
365
        colCnt = numColsA % 0x4U;
366
 
367
        while (colCnt > 0U)
368
        {
369
          /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
370
          sum += (q31_t) (*pInA++) * (*pInB++);
371
 
372
          colCnt--;
373
        }
374
#endif
375
 
376
        /* Saturate and store the result in the destination buffer */
377
        *px++  = (q15_t) (sum >> 15);
378
 
379
#ifndef UNALIGNED_SUPPORT_DISABLE
380
        *px++  = (q15_t) (sum2 >> 15);
381
        *px2++ = (q15_t) (sum3 >> 15);
382
        *px2++ = (q15_t) (sum4 >> 15);
383
        j += numRowsB * 2;
384
#endif
385
 
386
        /* Decrement the column loop counter */
387
        col--;
388
 
389
      }
390
 
391
      i = i + numColsA;
392
 
393
#ifndef UNALIGNED_SUPPORT_DISABLE
394
      i = i + numColsA;
395
      px = px2 + (numColsB & 1U);
396
      px2 = px + numColsB;
397
#endif
398
 
399
      /* Decrement the row loop counter */
400
      row--;
401
 
402
    }
403
 
404
    /* Compute any remaining odd row/column below */
405
 
406
#ifndef UNALIGNED_SUPPORT_DISABLE
407
 
408
    /* Compute remaining output column */
409
    if (numColsB & 1U) {
410
 
411
      /* Avoid redundant computation of last element */
412
      row = numRowsA & (~0x1);
413
 
414
      /* Point to remaining unfilled column in output matrix */
415
      px = pDst->pData+numColsB-1;
416
      pInA = pSrcA->pData;
417
 
418
      /* row loop */
419
      while (row > 0)
420
      {
421
 
422
        /* point to last column in matrix B */
423
        pInB  = pSrcBT + numRowsB*(numColsB-1);
424
 
425
        /* Set the variable sum, that acts as accumulator, to zero */
426
        sum  = 0;
427
 
428
        /* Compute 4 columns at once */
429
        colCnt = numColsA >> 2;
430
 
431
        /* matrix multiplication */
432
        while (colCnt > 0U)
433
        {
434
          inA1 = *__SIMD32(pInA)++;
435
          inA2 = *__SIMD32(pInA)++;
436
          inB1 = *__SIMD32(pInB)++;
437
          inB2 = *__SIMD32(pInB)++;
438
 
439
          sum  = __SMLAD(inA1, inB1, sum);
440
          sum  = __SMLAD(inA2, inB2, sum);
441
 
442
          /* Decrement the loop counter */
443
          colCnt--;
444
        }
445
 
446
        colCnt = numColsA & 3U;
447
        while (colCnt > 0U) {
448
          sum += (q31_t) (*pInA++) * (*pInB++);
449
          colCnt--;
450
        }
451
 
452
        /* Store the result in the destination buffer */
453
        *px  = (q15_t) (sum  >> 15);
454
        px += numColsB;
455
 
456
        /* Decrement the row loop counter */
457
        row--;
458
      }
459
    }
460
 
461
    /* Compute remaining output row */
462
    if (numRowsA & 1U) {
463
 
464
      /* point to last row in output matrix */
465
      px = pDst->pData+(numColsB)*(numRowsA-1);
466
 
467
      pInB  = pSrcBT;
468
      col = numColsB;
469
      i = 0U;
470
 
471
      /* col loop */
472
      while (col > 0)
473
      {
474
 
475
        /* point to last row in matrix A */
476
        pInA = pSrcA->pData + (numRowsA-1)*numColsA;
477
 
478
        /* Set the variable sum, that acts as accumulator, to zero */
479
        sum  = 0;
480
 
481
        /* Compute 4 columns at once */
482
        colCnt = numColsA >> 2;
483
 
484
        /* matrix multiplication */
485
        while (colCnt > 0U)
486
        {
487
          inA1 = *__SIMD32(pInA)++;
488
          inA2 = *__SIMD32(pInA)++;
489
          inB1 = *__SIMD32(pInB)++;
490
          inB2 = *__SIMD32(pInB)++;
491
 
492
          sum  = __SMLAD(inA1, inB1, sum);
493
          sum  = __SMLAD(inA2, inB2, sum);
494
 
495
          /* Decrement the loop counter */
496
          colCnt--;
497
        }
498
 
499
        colCnt = numColsA & 3U;
500
        while (colCnt > 0U) {
501
          sum += (q31_t) (*pInA++) * (*pInB++);
502
          colCnt--;
503
        }
504
 
505
        /* Store the result in the destination buffer */
506
        *px++  = (q15_t) (sum  >> 15);
507
 
508
        /* Decrement the col loop counter */
509
        col--;
510
      }
511
    }
512
 
513
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
514
 
515
    /* set status as ARM_MATH_SUCCESS */
516
    status = ARM_MATH_SUCCESS;
517
  }
518
 
519
  /* Return to application */
520
  return (status);
521
}
522
 
523
/**
524
 * @} end of MatrixMult group
525
 */