Subversion Repositories DashDisplay

Rev

Rev 2 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------    
  2. * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
  3. *    
  4. * $Date:        19. March 2015
  5. * $Revision:    V.1.4.5
  6. *    
  7. * Project:          CMSIS DSP Library    
  8. * Title:            arm_mat_mult_q15.c    
  9. *    
  10. * Description:   Q15 matrix multiplication.    
  11. *    
  12. * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13. *  
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. *   - Redistributions of source code must retain the above copyright
  18. *     notice, this list of conditions and the following disclaimer.
  19. *   - Redistributions in binary form must reproduce the above copyright
  20. *     notice, this list of conditions and the following disclaimer in
  21. *     the documentation and/or other materials provided with the
  22. *     distribution.
  23. *   - Neither the name of ARM LIMITED nor the names of its contributors
  24. *     may be used to endorse or promote products derived from this
  25. *     software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.    
  39. * -------------------------------------------------------------------- */
  40.  
  41. #include "arm_math.h"
  42.  
  43. /**    
  44.  * @ingroup groupMatrix    
  45.  */
  46.  
  47. /**    
  48.  * @addtogroup MatrixMult    
  49.  * @{    
  50.  */
  51.  
  52.  
  53. /**    
  54.  * @brief Q15 matrix multiplication: pDst = pSrcA * pSrcB    
  55.  * @param[in]       *pSrcA points to the first input matrix structure    
  56.  * @param[in]       *pSrcB points to the second input matrix structure    
  57.  * @param[out]      *pDst points to output matrix structure    
  58.  * @param[in,out]   *pState scratch array. Unless ARM_MATH_CM0_FAMILY is defined it receives the transpose of pSrcB (numRows * numCols of pSrcB q15_t values are written); when ARM_MATH_CM0_FAMILY is defined it is not used.  
  59.  * @return              The function returns either    
  60.  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.    
  61.  * The size check is compiled in only when ARM_MATH_MATRIX_CHECK is defined; otherwise the result is always <code>ARM_MATH_SUCCESS</code>.    
  62.  * @details    
  63.  * <b>Scaling and Overflow Behavior:</b>    
  64.  *    
  65.  * \par    
  66.  * The function is implemented using a 64-bit internal accumulator. The inputs to the    
  67.  * multiplications are in 1.15 format and multiplications yield a 2.30 result.    
  68.  * The 2.30 intermediate    
  69.  * results are accumulated in a 64-bit accumulator in 34.30 format. This approach    
  70.  * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then    
  71.  * truncated to 34.15 format by discarding the low 15 bits and then saturated to    
  72.  * 1.15 format.    
  73.  *    
  74.  * \par    
  75.  * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.    
  76.  *    
  77.  */
  78.  
  79. arm_status arm_mat_mult_q15(
  80.   const arm_matrix_instance_q15 * pSrcA,
  81.   const arm_matrix_instance_q15 * pSrcB,
  82.   arm_matrix_instance_q15 * pDst,
  83.   q15_t * pState CMSIS_UNUSED)
  84. {
  85.   q63_t sum;                                     /* 64-bit accumulator for the dot product of one output element */
  86.  
  87. #ifndef ARM_MATH_CM0_FAMILY
  88.  
  89.   /* Cortex-M4 / Cortex-M3 path: transpose B into pState first, then form each dot product over two contiguous runs */
  90.  
  91.   q15_t *pSrcBT = pState;                        /* scratch buffer that receives the transpose of B */
  92.   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
  93.   q15_t *pInB = pSrcB->pData;                    /* read cursor: B during the transpose, transposed B during the multiply */
  94.   q15_t *px;                                     /* write cursor: into pSrcBT during the transpose, into pDst->pData during the multiply */
  95.   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  96.   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  97.   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  98.   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix B    */
  99.   uint16_t col, i = 0u, row = numRowsB, colCnt;  /* loop counters; row starts at numRowsB for the transpose pass */
  100.   arm_status status;                             /* status of matrix multiplication */
  101.  
  102. #ifndef UNALIGNED_SUPPORT_DISABLE
  103.  
  104.   q31_t in;                                      /* Temporary: two adjacent q15 elements of B read as one 32-bit word */
  105.   q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2;  /* packed pairs of q15 elements read from A and from transposed B */
  106.  
  107. #else
  108.  
  109.   q15_t in;                                      /* Temporary variable to hold a single input element */
  110.   q15_t inA1, inB1, inA2, inB2;                  /* single elements read from A and from transposed B */
  111.  
  112. #endif  /*      #ifndef UNALIGNED_SUPPORT_DISABLE       */
  113.  
  114. #ifdef ARM_MATH_MATRIX_CHECK
  115.   /* Check for matrix mismatch condition */
  116.   if((pSrcA->numCols != pSrcB->numRows) ||
  117.      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  118.   {
  119.     /* Set status as ARM_MATH_SIZE_MISMATCH */
  120.     status = ARM_MATH_SIZE_MISMATCH;
  121.   }
  122.   else
  123. #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
  124.   {
  125.     /* Matrix transpose: element (r, c) of B is stored at pSrcBT[c * numRowsB + r]; one pass of this loop handles one row of B */
  126.     do
  127.     {
  128.       /* Number of unrolled iterations; each one moves 4 consecutive elements of the current row of B */
  129.       col = numColsB >> 2;
  130.  
  131.       /* px starts at offset i of pSrcBT: row i of B becomes column i of the transpose */
  132.       px = pSrcBT + i;
  133.  
  134.       /* First part of the processing with loop unrolling.  Copy 4 elements at a time.        
  135.        ** A second loop below copies the remaining 0 to 3 elements. */
  136.       while(col > 0u)
  137.       {
  138. #ifndef UNALIGNED_SUPPORT_DISABLE
  139.         /* NOTE(review): the 32-bit reads through a q15_t pointer below presumably rely on unaligned-access support, given the UNALIGNED_SUPPORT_DISABLE fallback - confirm */
  140.         /* Read two adjacent elements of the row as one 32-bit word */
  141.         in = *__SIMD32(pInB)++;
  142.  
  143.         /* Unpack and store the first element: low halfword on little-endian, high halfword on big-endian */
  144. #ifndef ARM_MATH_BIG_ENDIAN
  145.  
  146.         *px = (q15_t) in;
  147.  
  148. #else
  149.  
  150.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  151.  
  152. #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
  153.  
  154.         /* Advance px by numRowsB, i.e. to the same column position in the next row of the transposed matrix */
  155.         px += numRowsB;
  156.  
  157.         /* Unpack and store the second element (the other halfword of the same word) */
  158. #ifndef ARM_MATH_BIG_ENDIAN
  159.  
  160.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  161.  
  162. #else
  163.  
  164.         *px = (q15_t) in;
  165.  
  166. #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
  167.  
  168.         /* Update the pointer px to point to the next row of the transposed matrix */
  169.         px += numRowsB;
  170.  
  171.         /* Read the next two adjacent elements of the row as one 32-bit word */
  172.         in = *__SIMD32(pInB)++;
  173.  
  174.         /* Unpack and store the third element of this iteration */
  175. #ifndef ARM_MATH_BIG_ENDIAN
  176.  
  177.         *px = (q15_t) in;
  178.  
  179. #else
  180.  
  181.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  182.  
  183. #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
  184.  
  185.         /* Update the pointer px to point to the next row of the transposed matrix */
  186.         px += numRowsB;
  187.  
  188.         /* Unpack and store the fourth element of this iteration */
  189.  
  190. #ifndef ARM_MATH_BIG_ENDIAN
  191.  
  192.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  193.  
  194. #else
  195.  
  196.         *px = (q15_t) in;
  197.  
  198. #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
  199.  
  200.         /* Update the pointer px to point to the next row of the transposed matrix */
  201.         px += numRowsB;
  202.  
  203. #else
  204.  
  205.         /* Element-wise variant of the same 4-element copy: read one element from the row */
  206.         in = *pInB++;
  207.  
  208.         /* Store one element in the destination */
  209.         *px = in;
  210.  
  211.         /* Update the pointer px to point to the next row of the transposed matrix */
  212.         px += numRowsB;
  213.  
  214.         /* Read one element from the row */
  215.         in = *pInB++;
  216.  
  217.         /* Store one element in the destination */
  218.         *px = in;
  219.  
  220.         /* Update the pointer px to point to the next row of the transposed matrix */
  221.         px += numRowsB;
  222.  
  223.         /* Read one element from the row */
  224.         in = *pInB++;
  225.  
  226.         /* Store one element in the destination */
  227.         *px = in;
  228.  
  229.         /* Update the pointer px to point to the next row of the transposed matrix */
  230.         px += numRowsB;
  231.  
  232.         /* Read one element from the row */
  233.         in = *pInB++;
  234.  
  235.         /* Store one element in the destination */
  236.         *px = in;
  237.  
  238.         /* Update the pointer px to point to the next row of the transposed matrix */
  239.         px += numRowsB;
  240.  
  241. #endif  /*      #ifndef UNALIGNED_SUPPORT_DISABLE       */
  242.  
  243.        /* Decrement the unrolled-iteration counter */
  244.         col--;
  245.       }
  246.  
  247.       /* If the number of columns of pSrcB is not a multiple of 4, copy the remaining elements of the row here.        
  248.        ** No loop unrolling is used. */
  249.       col = numColsB % 0x4u;
  250.  
  251.       while(col > 0u)
  252.       {
  253.         /* Read and store the input element in the destination */
  254.         *px = *pInB++;
  255.  
  256.         /* Update the pointer px to point to the next row of the transposed matrix */
  257.         px += numRowsB;
  258.  
  259.         /* Decrement the column loop counter */
  260.         col--;
  261.       }
  262.  
  263.       i++;  /* the next row of B goes into the next column of the transpose */
  264.  
  265.       /* Decrement the row loop counter */
  266.       row--;
  267.  
  268.     } while(row > 0u);
  269.  
  270.     /* Re-purpose the variables for the multiply: row counts rows of A, i is the element offset of the current row of A, px walks the output */
  271.     row = numRowsA;
  272.     i = 0u;
  273.     px = pDst->pData;
  274.  
  275.     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB (read as a row of the transpose) */
  276.     /* row loop */
  277.     do
  278.     {
  279.       /* For every row wise process, the column loop counter is to be initiated */
  280.       col = numColsB;
  281.  
  282.       /* For every row of A, the pInB pointer is rewound        
  283.        ** to the starting address of the transposed pSrcB data */
  284.       pInB = pSrcBT;
  285.       /* pInB is not rewound inside the column loop: each dot product advances it by numColsA elements, which is the start of the next transposed row only if numColsA == numRowsB (checked only under ARM_MATH_MATRIX_CHECK) */
  286.       /* column loop */
  287.       do
  288.       {
  289.         /* Set the variable sum, that acts as accumulator, to zero */
  290.         sum = 0;
  291.  
  292.         /* Number of unrolled iterations; each one consumes 4 elements from each operand */
  293.         colCnt = numColsA >> 2;
  294.  
  295.         /* Point pInA at the start of the row of A being processed (i is that row's element offset) */
  296.         pInA = pSrcA->pData + i;
  297.  
  298.  
  299.         /* matrix multiplication */
  300.         while(colCnt > 0u)
  301.         {
  302.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  303. #ifndef UNALIGNED_SUPPORT_DISABLE
  304.  
  305.           /* read two packed pairs of q15 elements from the row of A and from the transposed B */
  306.           pSourceA1 = *__SIMD32(pInA)++;
  307.           pSourceB1 = *__SIMD32(pInB)++;
  308.  
  309.           pSourceA2 = *__SIMD32(pInA)++;
  310.           pSourceB2 = *__SIMD32(pInB)++;
  311.  
  312.           /* Multiply and accumulate both packed pairs into sum */
  313.           sum = __SMLALD(pSourceA1, pSourceB1, sum);
  314.           sum = __SMLALD(pSourceA2, pSourceB2, sum);
  315.  
  316. #else
  317.           /* read single elements from the row of A and from the transposed B; the reads are interleaved with the MACs */
  318.           inA1 = *pInA++;
  319.           inB1 = *pInB++;
  320.           inA2 = *pInA++;
  321.           /* Multiply and accumulate */
  322.           sum += inA1 * inB1;
  323.           inB2 = *pInB++;
  324.  
  325.           inA1 = *pInA++;
  326.           inB1 = *pInB++;
  327.           /* Multiply and accumulate */
  328.           sum += inA2 * inB2;
  329.           inA2 = *pInA++;
  330.           inB2 = *pInB++;
  331.  
  332.           /* Multiply and accumulate the last two products of this iteration */
  333.           sum += inA1 * inB1;
  334.           sum += inA2 * inB2;
  335.  
  336. #endif  /*      #ifndef UNALIGNED_SUPPORT_DISABLE       */
  337.  
  338.           /* Decrement the loop counter */
  339.           colCnt--;
  340.         }
  341.  
  342.         /* process the remaining (numColsA & 3) elements one at a time */
  343.         colCnt = numColsA & 3u;
  344.  
  345.         while(colCnt > 0u)
  346.         {
  347.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  348.           sum += *pInA++ * *pInB++;
  349.  
  350.           /* Decrement the loop counter */
  351.           colCnt--;
  352.         }
  353.  
  354.         /* Discard the low 15 bits, saturate to 16 bits and store the result in the destination buffer */
  355.         *px = (q15_t) (__SSAT((sum >> 15), 16));
  356.         px++;
  357.  
  358.         /* Decrement the column loop counter */
  359.         col--;
  360.  
  361.       } while(col > 0u);
  362.  
  363.       i = i + numColsA;  /* element offset of the next row of A */
  364.  
  365.       /* Decrement the row loop counter */
  366.       row--;
  367.  
  368.     } while(row > 0u);
  369.  
  370. #else
  371.  
  372.   /* Cortex-M0 path: plain triple loop; B is read column-wise with a stride of numColsB and pState is not referenced */
  373.  
  374.   q15_t *pIn1 = pSrcA->pData;                    /* read cursor within the current row of A */
  375.   q15_t *pIn2 = pSrcB->pData;                    /* read cursor walking down a column of B */
  376.   q15_t *pInA = pSrcA->pData;                    /* start of the current row of A */
  377.   q15_t *pInB = pSrcB->pData;                    /* base of B's data; only read, never advanced */
  378.   q15_t *pOut = pDst->pData;                     /* output data matrix pointer */
  379.   q15_t *px;                                     /* Temporary output data matrix pointer */
  380.   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  381.   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  382.   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  383.   uint16_t col, i = 0u, row = numRowsA, colCnt;  /* loop counters; i is the element offset of the current output row */
  384.   arm_status status;                             /* status of matrix multiplication */
  385.  
  386. #ifdef ARM_MATH_MATRIX_CHECK
  387.  
  388.   /* Check for matrix mismatch condition */
  389.   if((pSrcA->numCols != pSrcB->numRows) ||
  390.      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  391.   {
  392.     /* Set status as ARM_MATH_SIZE_MISMATCH */
  393.     status = ARM_MATH_SIZE_MISMATCH;
  394.   }
  395.   else
  396. #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
  397.  
  398.   {
  399.     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
  400.     /* row loop */
  401.     do
  402.     {
  403.       /* Output pointer is set to starting address of the output row being processed */
  404.       px = pOut + i;
  405.  
  406.       /* For every row wise process, the column loop counter is to be initiated */
  407.       col = numColsB;
  408.  
  409.       /* For every row wise process, the pIn2 pointer is set          
  410.        ** to the starting address of the pSrcB data */
  411.       pIn2 = pSrcB->pData;
  412.  
  413.       /* column loop */
  414.       do
  415.       {
  416.         /* Set the variable sum, that acts as accumulator, to zero */
  417.         sum = 0;
  418.  
  419.         /* Rewind pIn1 to the start of the current row of A */
  420.         pIn1 = pInA;
  421.  
  422.         /* Matrix A columns number of MAC operations are to be performed */
  423.         colCnt = numColsA;
  424.  
  425.         /* matrix multiplication */
  426.         while(colCnt > 0u)
  427.         {
  428.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  429.           /* Perform the multiply-accumulate: next element of the row of A times the current element of the column of B */
  430.           sum += (q31_t) * pIn1++ * *pIn2;
  431.           pIn2 += numColsB;  /* step down to the next row within the same column of B */
  432.  
  433.           /* Decrement the loop counter */
  434.           colCnt--;
  435.         }
  436.  
  437.         /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
  438.         /* The low 15 bits are discarded before saturating to 16 bits */
  439.         *px++ = (q15_t) __SSAT((sum >> 15), 16);
  440.  
  441.         /* Decrement the column loop counter */
  442.         col--;
  443.  
  444.         /* Point pIn2 at the top of the next column of B (column index numColsB - col) */
  445.         pIn2 = pInB + (numColsB - col);
  446.  
  447.       } while(col > 0u);
  448.  
  449.       /* Advance to the next row: i becomes the offset of the next output row, pInA the start of the next row of A */
  450.       i = i + numColsB;
  451.       pInA = pInA + numColsA;
  452.  
  453.       /* Decrement the row loop counter */
  454.       row--;
  455.  
  456.     } while(row > 0u);
  457.  
  458. #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  459.     /* Shared tail of both paths, still inside the block opened under either branch of the #ifndef: set status as ARM_MATH_SUCCESS */
  460.     status = ARM_MATH_SUCCESS;
  461.   }
  462.  
  463.   /* Return to application */
  464.   return (status);
  465. }
  466.  
  467. /**        
  468.  * @} end of MatrixMult group        
  469.  */
  470.