Subversion Repositories dashGPS

Rev

Rev 2 | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------
  2.  * Project:      CMSIS DSP Library
  3.  * Title:        arm_cmplx_mat_mult_q15.c
  4.  * Description:  Q15 complex matrix multiplication
  5.  *
  6.  * $Date:        27. January 2017
  7.  * $Revision:    V.1.5.1
  8.  *
  9.  * Target Processor: Cortex-M cores
  10.  * -------------------------------------------------------------------- */
  11. /*
  12.  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13.  *
  14.  * SPDX-License-Identifier: Apache-2.0
  15.  *
  16.  * Licensed under the Apache License, Version 2.0 (the License); you may
  17.  * not use this file except in compliance with the License.
  18.  * You may obtain a copy of the License at
  19.  *
  20.  * www.apache.org/licenses/LICENSE-2.0
  21.  *
  22.  * Unless required by applicable law or agreed to in writing, software
  23.  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25.  * See the License for the specific language governing permissions and
  26.  * limitations under the License.
  27.  */
  28.  
  29. #include "arm_math.h"
  30.  
  31. /**
  32.  * @ingroup groupMatrix
  33.  */
  34.  
  35. /**
  36.  * @addtogroup CmplxMatrixMult
  37.  * @{
  38.  */
  39.  
  40.  
  41. /**
  42.  * @brief Q15 Complex matrix multiplication
  43.  * @param[in]       *pSrcA points to the first input complex matrix structure
  44.  * @param[in]       *pSrcB points to the second input complex matrix structure
  45.  * @param[out]      *pDst points to output complex matrix structure
  46.  * @param[in]           *pScratch points to the array for storing intermediate results
  47.  * @return              The function returns either
  48.  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  49.  *
  50.  * \par Conditions for optimum performance
  51.  *  Input, output and state buffers should be aligned by 32-bit
  52.  *
  53.  * \par Restrictions
  54.  *  If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
  55.  *      In this case input, output, scratch buffers should be aligned by 32-bit
  56.  *
  57.  * @details
  58.  * <b>Scaling and Overflow Behavior:</b>
  59.  *
  60.  * \par
  61.  * The function is implemented using a 64-bit internal accumulator. The inputs to the
  62.  * multiplications are in 1.15 format and multiplications yield a 2.30 result.
  63.  * The 2.30 intermediate
  64.  * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
  65.  * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
  66.  * truncated to 34.15 format by discarding the low 15 bits and then saturated to
  67.  * 1.15 format.
  68.  *
  69.  * \par
  70.  * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function.
  71.  *
  72.  */
  73.  
  74.  
  75.  
  76.  
  77. arm_status arm_mat_cmplx_mult_q15(
  78.   const arm_matrix_instance_q15 * pSrcA,
  79.   const arm_matrix_instance_q15 * pSrcB,
  80.   arm_matrix_instance_q15 * pDst,
  81.   q15_t * pScratch)
  82. {
  83.   /* accumulator */
  84.   q15_t *pSrcBT = pScratch;                      /* input data matrix pointer for transpose */
  85.   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
  86.   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
  87.   q15_t *px;                                     /* Temporary output data matrix pointer */
  88.   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  89.   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  90.   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  91.   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix A    */
  92.   uint16_t col, i = 0U, row = numRowsB, colCnt;  /* loop counters */
  93.   arm_status status;                             /* status of matrix multiplication */
  94.   q63_t sumReal, sumImag;
  95.  
  96. #ifdef UNALIGNED_SUPPORT_DISABLE
  97.   q15_t in;                                      /* Temporary variable to hold the input value */
  98.   q15_t a, b, c, d;
  99. #else
  100.   q31_t in;                                      /* Temporary variable to hold the input value */
  101.   q31_t prod1, prod2;
  102.   q31_t pSourceA, pSourceB;
  103. #endif
  104.  
  105. #ifdef ARM_MATH_MATRIX_CHECK
  106.   /* Check for matrix mismatch condition */
  107.   if ((pSrcA->numCols != pSrcB->numRows) ||
  108.      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  109.   {
  110.     /* Set status as ARM_MATH_SIZE_MISMATCH */
  111.     status = ARM_MATH_SIZE_MISMATCH;
  112.   }
  113.   else
  114. #endif
  115.   {
  116.     /* Matrix transpose */
  117.     do
  118.     {
  119.       /* Apply loop unrolling and exchange the columns with row elements */
  120.       col = numColsB >> 2;
  121.  
  122.       /* The pointer px is set to starting address of the column being processed */
  123.       px = pSrcBT + i;
  124.  
  125.       /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
  126.        ** a second loop below computes the remaining 1 to 3 samples. */
  127.       while (col > 0U)
  128.       {
  129. #ifdef UNALIGNED_SUPPORT_DISABLE
  130.         /* Read two elements from the row */
  131.         in = *pInB++;
  132.         *px = in;
  133.         in = *pInB++;
  134.         px[1] = in;
  135.  
  136.         /* Update the pointer px to point to the next row of the transposed matrix */
  137.         px += numRowsB * 2;
  138.  
  139.         /* Read two elements from the row */
  140.         in = *pInB++;
  141.         *px = in;
  142.         in = *pInB++;
  143.         px[1] = in;
  144.  
  145.         /* Update the pointer px to point to the next row of the transposed matrix */
  146.         px += numRowsB * 2;
  147.  
  148.         /* Read two elements from the row */
  149.         in = *pInB++;
  150.         *px = in;
  151.         in = *pInB++;
  152.         px[1] = in;
  153.  
  154.         /* Update the pointer px to point to the next row of the transposed matrix */
  155.         px += numRowsB * 2;
  156.  
  157.         /* Read two elements from the row */
  158.         in = *pInB++;
  159.         *px = in;
  160.         in = *pInB++;
  161.         px[1] = in;
  162.  
  163.         /* Update the pointer px to point to the next row of the transposed matrix */
  164.         px += numRowsB * 2;
  165.  
  166.         /* Decrement the column loop counter */
  167.         col--;
  168.       }
  169.  
  170.       /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
  171.        ** No loop unrolling is used. */
  172.       col = numColsB % 0x4U;
  173.  
  174.       while (col > 0U)
  175.       {
  176.         /* Read two elements from the row */
  177.         in = *pInB++;
  178.         *px = in;
  179.         in = *pInB++;
  180.         px[1] = in;
  181. #else
  182.  
  183.         /* Read two elements from the row */
  184.         in = *__SIMD32(pInB)++;
  185.  
  186.         *__SIMD32(px) = in;
  187.  
  188.         /* Update the pointer px to point to the next row of the transposed matrix */
  189.         px += numRowsB * 2;
  190.  
  191.  
  192.         /* Read two elements from the row */
  193.         in = *__SIMD32(pInB)++;
  194.  
  195.         *__SIMD32(px) = in;
  196.  
  197.         /* Update the pointer px to point to the next row of the transposed matrix */
  198.         px += numRowsB * 2;
  199.  
  200.         /* Read two elements from the row */
  201.         in = *__SIMD32(pInB)++;
  202.  
  203.         *__SIMD32(px) = in;
  204.  
  205.         /* Update the pointer px to point to the next row of the transposed matrix */
  206.         px += numRowsB * 2;
  207.  
  208.         /* Read two elements from the row */
  209.         in = *__SIMD32(pInB)++;
  210.  
  211.         *__SIMD32(px) = in;
  212.  
  213.         /* Update the pointer px to point to the next row of the transposed matrix */
  214.         px += numRowsB * 2;
  215.  
  216.         /* Decrement the column loop counter */
  217.         col--;
  218.       }
  219.  
  220.       /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
  221.        ** No loop unrolling is used. */
  222.       col = numColsB % 0x4U;
  223.  
  224.       while (col > 0U)
  225.       {
  226.         /* Read two elements from the row */
  227.         in = *__SIMD32(pInB)++;
  228.  
  229.         *__SIMD32(px) = in;
  230. #endif
  231.  
  232.         /* Update the pointer px to point to the next row of the transposed matrix */
  233.         px += numRowsB * 2;
  234.  
  235.         /* Decrement the column loop counter */
  236.         col--;
  237.       }
  238.  
  239.       i = i + 2U;
  240.  
  241.       /* Decrement the row loop counter */
  242.       row--;
  243.  
  244.     } while (row > 0U);
  245.  
  246.     /* Reset the variables for the usage in the following multiplication process */
  247.     row = numRowsA;
  248.     i = 0U;
  249.     px = pDst->pData;
  250.  
  251.     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
  252.     /* row loop */
  253.     do
  254.     {
  255.       /* For every row wise process, the column loop counter is to be initiated */
  256.       col = numColsB;
  257.  
  258.       /* For every row wise process, the pIn2 pointer is set
  259.        ** to the starting address of the transposed pSrcB data */
  260.       pInB = pSrcBT;
  261.  
  262.       /* column loop */
  263.       do
  264.       {
  265.         /* Set the variable sum, that acts as accumulator, to zero */
  266.         sumReal = 0;
  267.         sumImag = 0;
  268.  
  269.         /* Apply loop unrolling and compute 2 MACs simultaneously. */
  270.         colCnt = numColsA >> 1;
  271.  
  272.         /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
  273.         pInA = pSrcA->pData + i * 2;
  274.  
  275.  
  276.         /* matrix multiplication */
  277.         while (colCnt > 0U)
  278.         {
  279.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  280.  
  281. #ifdef UNALIGNED_SUPPORT_DISABLE
  282.  
  283.           /* read real and imag values from pSrcA buffer */
  284.           a = *pInA;
  285.           b = *(pInA + 1U);
  286.           /* read real and imag values from pSrcB buffer */
  287.           c = *pInB;
  288.           d = *(pInB + 1U);
  289.  
  290.           /* Multiply and Accumlates */
  291.           sumReal += (q31_t) a *c;
  292.           sumImag += (q31_t) a *d;
  293.           sumReal -= (q31_t) b *d;
  294.           sumImag += (q31_t) b *c;
  295.  
  296.           /* read next real and imag values from pSrcA buffer */
  297.           a = *(pInA + 2U);
  298.           b = *(pInA + 3U);
  299.           /* read next real and imag values from pSrcB buffer */
  300.           c = *(pInB + 2U);
  301.           d = *(pInB + 3U);
  302.  
  303.           /* update pointer */
  304.           pInA += 4U;
  305.  
  306.           /* Multiply and Accumlates */
  307.           sumReal += (q31_t) a *c;
  308.           sumImag += (q31_t) a *d;
  309.           sumReal -= (q31_t) b *d;
  310.           sumImag += (q31_t) b *c;
  311.           /* update pointer */
  312.           pInB += 4U;
  313. #else
  314.           /* read real and imag values from pSrcA and pSrcB buffer */
  315.           pSourceA = *__SIMD32(pInA)++;
  316.           pSourceB = *__SIMD32(pInB)++;
  317.  
  318.           /* Multiply and Accumlates */
  319. #ifdef ARM_MATH_BIG_ENDIAN
  320.           prod1 = -__SMUSD(pSourceA, pSourceB);
  321. #else
  322.           prod1 = __SMUSD(pSourceA, pSourceB);
  323. #endif
  324.           prod2 = __SMUADX(pSourceA, pSourceB);
  325.           sumReal += (q63_t) prod1;
  326.           sumImag += (q63_t) prod2;
  327.  
  328.           /* read real and imag values from pSrcA and pSrcB buffer */
  329.           pSourceA = *__SIMD32(pInA)++;
  330.           pSourceB = *__SIMD32(pInB)++;
  331.  
  332.           /* Multiply and Accumlates */
  333. #ifdef ARM_MATH_BIG_ENDIAN
  334.           prod1 = -__SMUSD(pSourceA, pSourceB);
  335. #else
  336.           prod1 = __SMUSD(pSourceA, pSourceB);
  337. #endif
  338.           prod2 = __SMUADX(pSourceA, pSourceB);
  339.           sumReal += (q63_t) prod1;
  340.           sumImag += (q63_t) prod2;
  341.  
  342. #endif /*      #ifdef UNALIGNED_SUPPORT_DISABLE */
  343.  
  344.           /* Decrement the loop counter */
  345.           colCnt--;
  346.         }
  347.  
  348.         /* process odd column samples */
  349.         if ((numColsA & 0x1U) > 0U)
  350.         {
  351.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  352.  
  353. #ifdef UNALIGNED_SUPPORT_DISABLE
  354.  
  355.           /* read real and imag values from pSrcA and pSrcB buffer */
  356.           a = *pInA++;
  357.           b = *pInA++;
  358.           c = *pInB++;
  359.           d = *pInB++;
  360.  
  361.           /* Multiply and Accumlates */
  362.           sumReal += (q31_t) a *c;
  363.           sumImag += (q31_t) a *d;
  364.           sumReal -= (q31_t) b *d;
  365.           sumImag += (q31_t) b *c;
  366.  
  367. #else
  368.           /* read real and imag values from pSrcA and pSrcB buffer */
  369.           pSourceA = *__SIMD32(pInA)++;
  370.           pSourceB = *__SIMD32(pInB)++;
  371.  
  372.           /* Multiply and Accumlates */
  373. #ifdef ARM_MATH_BIG_ENDIAN
  374.           prod1 = -__SMUSD(pSourceA, pSourceB);
  375. #else
  376.           prod1 = __SMUSD(pSourceA, pSourceB);
  377. #endif
  378.           prod2 = __SMUADX(pSourceA, pSourceB);
  379.           sumReal += (q63_t) prod1;
  380.           sumImag += (q63_t) prod2;
  381.  
  382. #endif /*      #ifdef UNALIGNED_SUPPORT_DISABLE */
  383.  
  384.         }
  385.  
  386.         /* Saturate and store the result in the destination buffer */
  387.  
  388.         *px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
  389.         *px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
  390.  
  391.         /* Decrement the column loop counter */
  392.         col--;
  393.  
  394.       } while (col > 0U);
  395.  
  396.       i = i + numColsA;
  397.  
  398.       /* Decrement the row loop counter */
  399.       row--;
  400.  
  401.     } while (row > 0U);
  402.  
  403.     /* set status as ARM_MATH_SUCCESS */
  404.     status = ARM_MATH_SUCCESS;
  405.   }
  406.  
  407.   /* Return to application */
  408.   return (status);
  409. }
  410.  
  411. /**
  412.  * @} end of MatrixMult group
  413.  */
  414.