Subversion Repositories dashGPS

Rev

Rev 2 | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------
  2.  * Project:      CMSIS DSP Library
  3.  * Title:        arm_mat_mult_q15.c
  4.  * Description:  Q15 matrix multiplication
  5.  *
  6.  * $Date:        27. January 2017
  7.  * $Revision:    V.1.5.1
  8.  *
  9.  * Target Processor: Cortex-M cores
  10.  * -------------------------------------------------------------------- */
  11. /*
  12.  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13.  *
  14.  * SPDX-License-Identifier: Apache-2.0
  15.  *
  16.  * Licensed under the Apache License, Version 2.0 (the License); you may
  17.  * not use this file except in compliance with the License.
  18.  * You may obtain a copy of the License at
  19.  *
  20.  * www.apache.org/licenses/LICENSE-2.0
  21.  *
  22.  * Unless required by applicable law or agreed to in writing, software
  23.  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25.  * See the License for the specific language governing permissions and
  26.  * limitations under the License.
  27.  */
  28.  
  29. #include "arm_math.h"
  30.  
  31. /**
  32.  * @ingroup groupMatrix
  33.  */
  34.  
  35. /**
  36.  * @addtogroup MatrixMult
  37.  * @{
  38.  */
  39.  
  40.  
  41. /**
  42.  * @brief Q15 matrix multiplication
  43.  * @param[in]       *pSrcA points to the first input matrix structure
  44.  * @param[in]       *pSrcB points to the second input matrix structure
  45.  * @param[out]      *pDst points to output matrix structure
  46.  * @param[in]       *pState points to a scratch array: with ARM_MATH_DSP defined it receives the transpose of pSrcB (numRowsB * numColsB elements are written); otherwise it is not accessed
  47.  * @return          <code>ARM_MATH_SIZE_MISMATCH</code> when the dimension check (compiled only if ARM_MATH_MATRIX_CHECK is defined) fails,
  48.  * otherwise <code>ARM_MATH_SUCCESS</code>.
  49.  *
  50.  * @details
  51.  * <b>Scaling and Overflow Behavior:</b>
  52.  *
  53.  * \par
  54.  * The function is implemented using a 64-bit internal accumulator. The inputs to the
  55.  * multiplications are in 1.15 format and multiplications yield a 2.30 result. The 2.30
  56.  * intermediate results are accumulated in a 64-bit accumulator in 34.30 format. This
  57.  * approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is
  58.  * then truncated to 34.15 format by discarding the low 15 bits and then saturated to
  59.  * 1.15 format.
  60.  * NOTE(review): the row/column loops are do-while loops with uint16_t counters, so zero-sized dimensions look unsupported - confirm with callers.
  61.  *
  62.  * \par
  63.  * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
  64.  *
  65.  */
  66.  
  67. arm_status arm_mat_mult_q15(
  68.   const arm_matrix_instance_q15 * pSrcA,
  69.   const arm_matrix_instance_q15 * pSrcB,
  70.   arm_matrix_instance_q15 * pDst,
  71.   q15_t * pState)
  72. {
  73.   q63_t sum;                                     /* accumulator for one output element */
  74.  
  75. #if defined (ARM_MATH_DSP)
  76.  
  77.   /* Path compiled when ARM_MATH_DSP is defined (original note: Cortex-M4 and Cortex-M3): pSrcB is first transposed into pState, then row-by-row dot products are taken */
  78.  
  79.   q15_t *pSrcBT = pState;                        /* destination of the transposed pSrcB data (caller-supplied pState) */
  80.   q15_t *pInA = pSrcA->pData;                    /* read pointer into matrix A */
  81.   q15_t *pInB = pSrcB->pData;                    /* read pointer into matrix B, later into its transpose */
  82.   q15_t *px;                                     /* write pointer: transpose buffer first, then pDst data */
  83.   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  84.   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  85.   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  86.   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix B    */
  87.   uint16_t col, i = 0U, row = numRowsB, colCnt;  /* loop counters; i is also used as an element offset */
  88.   arm_status status;                             /* status of matrix multiplication */
  89.  
  90. #ifndef UNALIGNED_SUPPORT_DISABLE
  91.  
  92.   q31_t in;                                      /* Temporary variable holding one __SIMD32 read (two elements) */
  93.   q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2;  /* operands of __SMLALD, each loaded by one __SIMD32 read */
  94.  
  95. #else
  96.  
  97.   q15_t in;                                      /* Temporary variable to hold a single input element */
  98.   q15_t inA1, inB1, inA2, inB2;                  /* individual operands of the unrolled multiply-accumulates */
  99.  
  100. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  101.  
  102. #ifdef ARM_MATH_MATRIX_CHECK
  103.   /* Dimension check: A columns must equal B rows, and pDst must be (A rows) x (B columns) */
  104.   if ((pSrcA->numCols != pSrcB->numRows) ||
  105.      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  106.   {
  107.     /* Set status as ARM_MATH_SIZE_MISMATCH; no data is read or written in this case */
  108.     status = ARM_MATH_SIZE_MISMATCH;
  109.   }
  110.   else
  111. #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
  112.   {
  113.     /* Transpose pSrcB into pSrcBT; each pass of this loop copies one row of pSrcB */
  114.     do
  115.     {
  116.       /* Number of 4-element groups in the current row of pSrcB */
  117.       col = numColsB >> 2;
  118.  
  119.       /* px starts at element i of the transpose buffer, i.e. the column that receives row i of pSrcB */
  120.       px = pSrcBT + i;
  121.  
  122.       /* Unrolled part: 4 elements of the row are copied per iteration.
  123.        ** The tail loop below copies the remaining 0 to 3 elements. */
  124.       while (col > 0U)
  125.       {
  126. #ifndef UNALIGNED_SUPPORT_DISABLE
  127.  
  128.         /* Read two elements from the row */
  129.         in = *__SIMD32(pInB)++;
  130.  
  131.         /* Unpack and store the first element (low halfword if little-endian, high halfword if big-endian) */
  132. #ifndef ARM_MATH_BIG_ENDIAN
  133.  
  134.         *px = (q15_t) in;
  135.  
  136. #else
  137.  
  138.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  139.  
  140. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  141.  
  142.         /* Update the pointer px to point to the next row of the transposed matrix */
  143.         px += numRowsB;
  144.  
  145.         /* Unpack and store the second element (the other halfword of the same read) */
  146. #ifndef ARM_MATH_BIG_ENDIAN
  147.  
  148.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  149.  
  150. #else
  151.  
  152.         *px = (q15_t) in;
  153.  
  154. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  155.  
  156.         /* Update the pointer px to point to the next row of the transposed matrix */
  157.         px += numRowsB;
  158.  
  159.         /* Read the next two elements from the row */
  160.         in = *__SIMD32(pInB)++;
  161.  
  162.         /* Unpack and store the third element of the group */
  163. #ifndef ARM_MATH_BIG_ENDIAN
  164.  
  165.         *px = (q15_t) in;
  166.  
  167. #else
  168.  
  169.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  170.  
  171. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  172.  
  173.         /* Update the pointer px to point to the next row of the transposed matrix */
  174.         px += numRowsB;
  175.  
  176.         /* Unpack and store the fourth element of the group */
  177.  
  178. #ifndef ARM_MATH_BIG_ENDIAN
  179.  
  180.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  181.  
  182. #else
  183.  
  184.         *px = (q15_t) in;
  185.  
  186. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  187.  
  188.         /* Update the pointer px to point to the next row of the transposed matrix */
  189.         px += numRowsB;
  190.  
  191. #else
  192.  
  193.         /* Element-by-element variant: read one element from the row */
  194.         in = *pInB++;
  195.  
  196.         /* Store one element in the destination */
  197.         *px = in;
  198.  
  199.         /* Update the pointer px to point to the next row of the transposed matrix */
  200.         px += numRowsB;
  201.  
  202.         /* Read one element from the row */
  203.         in = *pInB++;
  204.  
  205.         /* Store one element in the destination */
  206.         *px = in;
  207.  
  208.         /* Update the pointer px to point to the next row of the transposed matrix */
  209.         px += numRowsB;
  210.  
  211.         /* Read one element from the row */
  212.         in = *pInB++;
  213.  
  214.         /* Store one element in the destination */
  215.         *px = in;
  216.  
  217.         /* Update the pointer px to point to the next row of the transposed matrix */
  218.         px += numRowsB;
  219.  
  220.         /* Read one element from the row */
  221.         in = *pInB++;
  222.  
  223.         /* Store one element in the destination */
  224.         *px = in;
  225.  
  226.         /* Update the pointer px to point to the next row of the transposed matrix */
  227.         px += numRowsB;
  228.  
  229. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  230.  
  231.        /* Decrement the column loop counter */
  232.         col--;
  233.       }
  234.  
  235.       /* If the number of columns of pSrcB is not a multiple of 4, copy the remaining elements here.
  236.        ** No loop unrolling is used. */
  237.       col = numColsB % 0x4U;
  238.  
  239.       while (col > 0U)
  240.       {
  241.         /* Read and store the input element in the destination */
  242.         *px = *pInB++;
  243.  
  244.         /* Update the pointer px to point to the next row of the transposed matrix */
  245.         px += numRowsB;
  246.  
  247.         /* Decrement the column loop counter */
  248.         col--;
  249.       }
  250.  
  251.       i++;                                         /* next row of pSrcB goes to the next column of the transpose */
  252.  
  253.       /* Decrement the row loop counter */
  254.       row--;
  255.  
  256.     } while (row > 0U);
  257.  
  258.     /* Reset the variables for the multiplication: i becomes the element offset of the current row of A, px the output write pointer */
  259.     row = numRowsA;
  260.     i = 0U;
  261.     px = pDst->pData;
  262.  
  263.     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB (read as rows of the transpose) */
  264.     /* row loop */
  265.     do
  266.     {
  267.       /* For every row wise process, the column loop counter is to be initiated */
  268.       col = numColsB;
  269.  
  270.       /* For every row wise process, the pInB pointer is set
  271.        ** to the starting address of the transposed pSrcB data */
  272.       pInB = pSrcBT;
  273.  
  274.       /* column loop */
  275.       do
  276.       {
  277.         /* Set the variable sum, that acts as accumulator, to zero */
  278.         sum = 0;
  279.  
  280.         /* Unrolled loop count: each iteration below consumes 4 elements of A and 4 of the transposed B */
  281.         colCnt = numColsA >> 2;
  282.  
  283.         /* Point pInA at the start of the row of A being processed (offset i) */
  284.         pInA = pSrcA->pData + i;
  285.  
  286.  
  287.         /* matrix multiplication */
  288.         while (colCnt > 0U)
  289.         {
  290.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  291. #ifndef UNALIGNED_SUPPORT_DISABLE
  292.  
  293.           /* read four consecutive elements each from the A row and the transposed B row, two per __SIMD32 read */
  294.           pSourceA1 = *__SIMD32(pInA)++;
  295.           pSourceB1 = *__SIMD32(pInB)++;
  296.  
  297.           pSourceA2 = *__SIMD32(pInA)++;
  298.           pSourceB2 = *__SIMD32(pInB)++;
  299.  
  300.           /* Multiply-accumulate into the 64-bit sum */
  301.           sum = __SMLALD(pSourceA1, pSourceB1, sum);
  302.           sum = __SMLALD(pSourceA2, pSourceB2, sum);
  303.  
  304. #else
  305.           /* read the operands one element at a time; loads are interleaved with the multiply-accumulates */
  306.           inA1 = *pInA++;
  307.           inB1 = *pInB++;
  308.           inA2 = *pInA++;
  309.           /* Multiply-accumulate the first pair */
  310.           sum += inA1 * inB1;
  311.           inB2 = *pInB++;
  312.  
  313.           inA1 = *pInA++;
  314.           inB1 = *pInB++;
  315.           /* Multiply-accumulate the second pair */
  316.           sum += inA2 * inB2;
  317.           inA2 = *pInA++;
  318.           inB2 = *pInB++;
  319.  
  320.           /* Multiply-accumulate the third and fourth pairs */
  321.           sum += inA1 * inB1;
  322.           sum += inA2 * inB2;
  323.  
  324. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  325.  
  326.           /* Decrement the loop counter */
  327.           colCnt--;
  328.         }
  329.  
  330.         /* process the remaining 0 to 3 elements of the row */
  331.         colCnt = numColsA & 3U;
  332.  
  333.         while (colCnt > 0U)
  334.         {
  335.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  336.           sum += *pInA++ * *pInB++;
  337.  
  338.           /* Decrement the loop counter */
  339.           colCnt--;
  340.         }
  341.  
  342.         /* Drop the low 15 bits, saturate to 16 bits and store the result in the destination buffer */
  343.         *px = (q15_t) (__SSAT((sum >> 15), 16));
  344.         px++;
  345.  
  346.         /* Decrement the column loop counter */
  347.         col--;
  348.  
  349.       } while (col > 0U);
  350.  
  351.       i = i + numColsA;                            /* element offset of the next row of A */
  352.  
  353.       /* Decrement the row loop counter */
  354.       row--;
  355.  
  356.     } while (row > 0U);
  357.  
  358. #else
  359.  
  360.   /* Path compiled when ARM_MATH_DSP is not defined (original note: Cortex-M0); pState is not referenced here */
  361.  
  362.   q15_t *pIn1 = pSrcA->pData;                    /* running read pointer within the current row of A */
  363.   q15_t *pIn2 = pSrcB->pData;                    /* running read pointer within the current column of B */
  364.   q15_t *pInA = pSrcA->pData;                    /* start of the current row of A */
  365.   q15_t *pInB = pSrcB->pData;                    /* start of the B data */
  366.   q15_t *pOut = pDst->pData;                     /* output data matrix pointer */
  367.   q15_t *px;                                     /* Temporary output data matrix pointer */
  368.   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  369.   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  370.   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  371.   uint16_t col, i = 0U, row = numRowsA, colCnt;  /* loop counters; i is the element offset of the current output row */
  372.   arm_status status;                             /* status of matrix multiplication */
  373.  
  374. #ifdef ARM_MATH_MATRIX_CHECK
  375.  
  376.   /* Dimension check: A columns must equal B rows, and pDst must be (A rows) x (B columns) */
  377.   if ((pSrcA->numCols != pSrcB->numRows) ||
  378.      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  379.   {
  380.     /* Set status as ARM_MATH_SIZE_MISMATCH; no data is read or written in this case */
  381.     status = ARM_MATH_SIZE_MISMATCH;
  382.   }
  383.   else
  384. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  385.  
  386.   {
  387.     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
  388.     /* row loop */
  389.     do
  390.     {
  391.       /* Output pointer is set to starting address of the row being processed */
  392.       px = pOut + i;
  393.  
  394.       /* For every row wise process, the column loop counter is to be initiated */
  395.       col = numColsB;
  396.  
  397.       /* For every row wise process, the pIn2 pointer is set
  398.        ** to the starting address of the pSrcB data */
  399.       pIn2 = pSrcB->pData;
  400.  
  401.       /* column loop */
  402.       do
  403.       {
  404.         /* Set the variable sum, that acts as accumulator, to zero */
  405.         sum = 0;
  406.  
  407.         /* Point pIn1 at the start of the current row of pSrcA */
  408.         pIn1 = pInA;
  409.  
  410.         /* Matrix A columns number of MAC operations are to be performed */
  411.         colCnt = numColsA;
  412.  
  413.         /* matrix multiplication */
  414.         while (colCnt > 0U)
  415.         {
  416.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  417.           /* Multiply-accumulate; pIn2 then steps numColsB elements to the same column in the next row of B */
  418.           sum += (q31_t) * pIn1++ * *pIn2;
  419.           pIn2 += numColsB;
  420.  
  421.           /* Decrement the loop counter */
  422.           colCnt--;
  423.         }
  424.  
  425.         /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
  426.         /* The shift drops the low 15 bits; __SSAT saturates to 16 bits */
  427.         *px++ = (q15_t) __SSAT((sum >> 15), 16);
  428.  
  429.         /* Decrement the column loop counter */
  430.         col--;
  431.  
  432.         /* Update the pointer pIn2 to point to the  starting address of the next column */
  433.         pIn2 = pInB + (numColsB - col);
  434.  
  435.       } while (col > 0U);
  436.  
  437.       /* Advance the output offset by one output row and pInA to the starting address of the next row of A */
  438.       i = i + numColsB;
  439.       pInA = pInA + numColsA;
  440.  
  441.       /* Decrement the row loop counter */
  442.       row--;
  443.  
  444.     } while (row > 0U);
  445.  
  446. #endif /* #if defined (ARM_MATH_DSP) */
  447.     /* Shared tail of both paths: set status as ARM_MATH_SUCCESS; the brace below closes the block opened in whichever path was compiled */
  448.     status = ARM_MATH_SUCCESS;
  449.   }
  450.  
  451.   /* Return to application */
  452.   return (status);
  453. }
  454.  
  455. /**
  456.  * @} end of MatrixMult group
  457.  */
  458.