Subversion Repositories dashGPS

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 mjames 1
/* ----------------------------------------------------------------------
2
 * Project:      CMSIS DSP Library
3
 * Title:        arm_cmplx_mat_mult_q15.c
4
 * Description:  Q15 complex matrix multiplication
5
 *
6
 * $Date:        27. January 2017
7
 * $Revision:    V.1.5.1
8
 *
9
 * Target Processor: Cortex-M cores
10
 * -------------------------------------------------------------------- */
11
/*
12
 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
13
 *
14
 * SPDX-License-Identifier: Apache-2.0
15
 *
16
 * Licensed under the Apache License, Version 2.0 (the License); you may
17
 * not use this file except in compliance with the License.
18
 * You may obtain a copy of the License at
19
 *
20
 * www.apache.org/licenses/LICENSE-2.0
21
 *
22
 * Unless required by applicable law or agreed to in writing, software
23
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25
 * See the License for the specific language governing permissions and
26
 * limitations under the License.
27
 */
28
 
29
#include "arm_math.h"
30
 
31
/**
32
 * @ingroup groupMatrix
33
 */
34
 
35
/**
36
 * @addtogroup CmplxMatrixMult
37
 * @{
38
 */
39
 
40
 
41
/**
42
 * @brief Q15 Complex matrix multiplication
43
 * @param[in]       *pSrcA points to the first input complex matrix structure
44
 * @param[in]       *pSrcB points to the second input complex matrix structure
45
 * @param[out]      *pDst points to output complex matrix structure
46
 * @param[in]           *pScratch points to the array for storing intermediate results
47
 * @return              The function returns either
48
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
49
 *
50
 * \par Conditions for optimum performance
51
 *  Input, output and state buffers should be aligned by 32-bit
52
 *
53
 * \par Restrictions
54
 *  If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
55
 *      In this case input, output, scratch buffers should be aligned by 32-bit
56
 *
57
 * @details
58
 * <b>Scaling and Overflow Behavior:</b>
59
 *
60
 * \par
61
 * The function is implemented using a 64-bit internal accumulator. The inputs to the
62
 * multiplications are in 1.15 format and multiplications yield a 2.30 result.
63
 * The 2.30 intermediate
64
 * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
65
 * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
66
 * truncated to 34.15 format by discarding the low 15 bits and then saturated to
67
 * 1.15 format.
68
 *
69
 * \par
70
 * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function.
71
 *
72
 */
73
 
74
 
75
 
76
 
77
arm_status arm_mat_cmplx_mult_q15(
78
  const arm_matrix_instance_q15 * pSrcA,
79
  const arm_matrix_instance_q15 * pSrcB,
80
  arm_matrix_instance_q15 * pDst,
81
  q15_t * pScratch)
82
{
83
  /* accumulator */
84
  q15_t *pSrcBT = pScratch;                      /* input data matrix pointer for transpose */
85
  q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
86
  q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
87
  q15_t *px;                                     /* Temporary output data matrix pointer */
88
  uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
89
  uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
90
  uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
91
  uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix A    */
92
  uint16_t col, i = 0U, row = numRowsB, colCnt;  /* loop counters */
93
  arm_status status;                             /* status of matrix multiplication */
94
  q63_t sumReal, sumImag;
95
 
96
#ifdef UNALIGNED_SUPPORT_DISABLE
97
  q15_t in;                                      /* Temporary variable to hold the input value */
98
  q15_t a, b, c, d;
99
#else
100
  q31_t in;                                      /* Temporary variable to hold the input value */
101
  q31_t prod1, prod2;
102
  q31_t pSourceA, pSourceB;
103
#endif
104
 
105
#ifdef ARM_MATH_MATRIX_CHECK
106
  /* Check for matrix mismatch condition */
107
  if ((pSrcA->numCols != pSrcB->numRows) ||
108
     (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
109
  {
110
    /* Set status as ARM_MATH_SIZE_MISMATCH */
111
    status = ARM_MATH_SIZE_MISMATCH;
112
  }
113
  else
114
#endif
115
  {
116
    /* Matrix transpose */
117
    do
118
    {
119
      /* Apply loop unrolling and exchange the columns with row elements */
120
      col = numColsB >> 2;
121
 
122
      /* The pointer px is set to starting address of the column being processed */
123
      px = pSrcBT + i;
124
 
125
      /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
126
       ** a second loop below computes the remaining 1 to 3 samples. */
127
      while (col > 0U)
128
      {
129
#ifdef UNALIGNED_SUPPORT_DISABLE
130
        /* Read two elements from the row */
131
        in = *pInB++;
132
        *px = in;
133
        in = *pInB++;
134
        px[1] = in;
135
 
136
        /* Update the pointer px to point to the next row of the transposed matrix */
137
        px += numRowsB * 2;
138
 
139
        /* Read two elements from the row */
140
        in = *pInB++;
141
        *px = in;
142
        in = *pInB++;
143
        px[1] = in;
144
 
145
        /* Update the pointer px to point to the next row of the transposed matrix */
146
        px += numRowsB * 2;
147
 
148
        /* Read two elements from the row */
149
        in = *pInB++;
150
        *px = in;
151
        in = *pInB++;
152
        px[1] = in;
153
 
154
        /* Update the pointer px to point to the next row of the transposed matrix */
155
        px += numRowsB * 2;
156
 
157
        /* Read two elements from the row */
158
        in = *pInB++;
159
        *px = in;
160
        in = *pInB++;
161
        px[1] = in;
162
 
163
        /* Update the pointer px to point to the next row of the transposed matrix */
164
        px += numRowsB * 2;
165
 
166
        /* Decrement the column loop counter */
167
        col--;
168
      }
169
 
170
      /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
171
       ** No loop unrolling is used. */
172
      col = numColsB % 0x4U;
173
 
174
      while (col > 0U)
175
      {
176
        /* Read two elements from the row */
177
        in = *pInB++;
178
        *px = in;
179
        in = *pInB++;
180
        px[1] = in;
181
#else
182
 
183
        /* Read two elements from the row */
184
        in = *__SIMD32(pInB)++;
185
 
186
        *__SIMD32(px) = in;
187
 
188
        /* Update the pointer px to point to the next row of the transposed matrix */
189
        px += numRowsB * 2;
190
 
191
 
192
        /* Read two elements from the row */
193
        in = *__SIMD32(pInB)++;
194
 
195
        *__SIMD32(px) = in;
196
 
197
        /* Update the pointer px to point to the next row of the transposed matrix */
198
        px += numRowsB * 2;
199
 
200
        /* Read two elements from the row */
201
        in = *__SIMD32(pInB)++;
202
 
203
        *__SIMD32(px) = in;
204
 
205
        /* Update the pointer px to point to the next row of the transposed matrix */
206
        px += numRowsB * 2;
207
 
208
        /* Read two elements from the row */
209
        in = *__SIMD32(pInB)++;
210
 
211
        *__SIMD32(px) = in;
212
 
213
        /* Update the pointer px to point to the next row of the transposed matrix */
214
        px += numRowsB * 2;
215
 
216
        /* Decrement the column loop counter */
217
        col--;
218
      }
219
 
220
      /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
221
       ** No loop unrolling is used. */
222
      col = numColsB % 0x4U;
223
 
224
      while (col > 0U)
225
      {
226
        /* Read two elements from the row */
227
        in = *__SIMD32(pInB)++;
228
 
229
        *__SIMD32(px) = in;
230
#endif
231
 
232
        /* Update the pointer px to point to the next row of the transposed matrix */
233
        px += numRowsB * 2;
234
 
235
        /* Decrement the column loop counter */
236
        col--;
237
      }
238
 
239
      i = i + 2U;
240
 
241
      /* Decrement the row loop counter */
242
      row--;
243
 
244
    } while (row > 0U);
245
 
246
    /* Reset the variables for the usage in the following multiplication process */
247
    row = numRowsA;
248
    i = 0U;
249
    px = pDst->pData;
250
 
251
    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
252
    /* row loop */
253
    do
254
    {
255
      /* For every row wise process, the column loop counter is to be initiated */
256
      col = numColsB;
257
 
258
      /* For every row wise process, the pIn2 pointer is set
259
       ** to the starting address of the transposed pSrcB data */
260
      pInB = pSrcBT;
261
 
262
      /* column loop */
263
      do
264
      {
265
        /* Set the variable sum, that acts as accumulator, to zero */
266
        sumReal = 0;
267
        sumImag = 0;
268
 
269
        /* Apply loop unrolling and compute 2 MACs simultaneously. */
270
        colCnt = numColsA >> 1;
271
 
272
        /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
273
        pInA = pSrcA->pData + i * 2;
274
 
275
 
276
        /* matrix multiplication */
277
        while (colCnt > 0U)
278
        {
279
          /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
280
 
281
#ifdef UNALIGNED_SUPPORT_DISABLE
282
 
283
          /* read real and imag values from pSrcA buffer */
284
          a = *pInA;
285
          b = *(pInA + 1U);
286
          /* read real and imag values from pSrcB buffer */
287
          c = *pInB;
288
          d = *(pInB + 1U);
289
 
290
          /* Multiply and Accumlates */
291
          sumReal += (q31_t) a *c;
292
          sumImag += (q31_t) a *d;
293
          sumReal -= (q31_t) b *d;
294
          sumImag += (q31_t) b *c;
295
 
296
          /* read next real and imag values from pSrcA buffer */
297
          a = *(pInA + 2U);
298
          b = *(pInA + 3U);
299
          /* read next real and imag values from pSrcB buffer */
300
          c = *(pInB + 2U);
301
          d = *(pInB + 3U);
302
 
303
          /* update pointer */
304
          pInA += 4U;
305
 
306
          /* Multiply and Accumlates */
307
          sumReal += (q31_t) a *c;
308
          sumImag += (q31_t) a *d;
309
          sumReal -= (q31_t) b *d;
310
          sumImag += (q31_t) b *c;
311
          /* update pointer */
312
          pInB += 4U;
313
#else
314
          /* read real and imag values from pSrcA and pSrcB buffer */
315
          pSourceA = *__SIMD32(pInA)++;
316
          pSourceB = *__SIMD32(pInB)++;
317
 
318
          /* Multiply and Accumlates */
319
#ifdef ARM_MATH_BIG_ENDIAN
320
          prod1 = -__SMUSD(pSourceA, pSourceB);
321
#else
322
          prod1 = __SMUSD(pSourceA, pSourceB);
323
#endif
324
          prod2 = __SMUADX(pSourceA, pSourceB);
325
          sumReal += (q63_t) prod1;
326
          sumImag += (q63_t) prod2;
327
 
328
          /* read real and imag values from pSrcA and pSrcB buffer */
329
          pSourceA = *__SIMD32(pInA)++;
330
          pSourceB = *__SIMD32(pInB)++;
331
 
332
          /* Multiply and Accumlates */
333
#ifdef ARM_MATH_BIG_ENDIAN
334
          prod1 = -__SMUSD(pSourceA, pSourceB);
335
#else
336
          prod1 = __SMUSD(pSourceA, pSourceB);
337
#endif
338
          prod2 = __SMUADX(pSourceA, pSourceB);
339
          sumReal += (q63_t) prod1;
340
          sumImag += (q63_t) prod2;
341
 
342
#endif /*      #ifdef UNALIGNED_SUPPORT_DISABLE */
343
 
344
          /* Decrement the loop counter */
345
          colCnt--;
346
        }
347
 
348
        /* process odd column samples */
349
        if ((numColsA & 0x1U) > 0U)
350
        {
351
          /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
352
 
353
#ifdef UNALIGNED_SUPPORT_DISABLE
354
 
355
          /* read real and imag values from pSrcA and pSrcB buffer */
356
          a = *pInA++;
357
          b = *pInA++;
358
          c = *pInB++;
359
          d = *pInB++;
360
 
361
          /* Multiply and Accumlates */
362
          sumReal += (q31_t) a *c;
363
          sumImag += (q31_t) a *d;
364
          sumReal -= (q31_t) b *d;
365
          sumImag += (q31_t) b *c;
366
 
367
#else
368
          /* read real and imag values from pSrcA and pSrcB buffer */
369
          pSourceA = *__SIMD32(pInA)++;
370
          pSourceB = *__SIMD32(pInB)++;
371
 
372
          /* Multiply and Accumlates */
373
#ifdef ARM_MATH_BIG_ENDIAN
374
          prod1 = -__SMUSD(pSourceA, pSourceB);
375
#else
376
          prod1 = __SMUSD(pSourceA, pSourceB);
377
#endif
378
          prod2 = __SMUADX(pSourceA, pSourceB);
379
          sumReal += (q63_t) prod1;
380
          sumImag += (q63_t) prod2;
381
 
382
#endif /*      #ifdef UNALIGNED_SUPPORT_DISABLE */
383
 
384
        }
385
 
386
        /* Saturate and store the result in the destination buffer */
387
 
388
        *px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
389
        *px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
390
 
391
        /* Decrement the column loop counter */
392
        col--;
393
 
394
      } while (col > 0U);
395
 
396
      i = i + numColsA;
397
 
398
      /* Decrement the row loop counter */
399
      row--;
400
 
401
    } while (row > 0U);
402
 
403
    /* set status as ARM_MATH_SUCCESS */
404
    status = ARM_MATH_SUCCESS;
405
  }
406
 
407
  /* Return to application */
408
  return (status);
409
}
410
 
411
/**
412
 * @} end of MatrixMult group
413
 */