Subversion Repositories DashDisplay

Rev

Rev 2 | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------    
  2. * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
  3. *    
  4. * $Date:        19. March 2015
  5. * $Revision:    V.1.4.5
  6. *    
  7. * Project:          CMSIS DSP Library    
  8. * Title:            arm_mat_mult_fast_q15.c    
  9. *    
  10. * Description:   Q15 matrix multiplication (fast variant)    
  11. *    
  12. * Target Processor: Cortex-M4/Cortex-M3
  13. *  
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. *   - Redistributions of source code must retain the above copyright
  18. *     notice, this list of conditions and the following disclaimer.
  19. *   - Redistributions in binary form must reproduce the above copyright
  20. *     notice, this list of conditions and the following disclaimer in
  21. *     the documentation and/or other materials provided with the
  22. *     distribution.
  23. *   - Neither the name of ARM LIMITED nor the names of its contributors
  24. *     may be used to endorse or promote products derived from this
  25. *     software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.    
  39. * -------------------------------------------------------------------- */
  40.  
  41. #include "arm_math.h"
  42.  
  43. /**    
  44.  * @ingroup groupMatrix    
  45.  */
  46.  
  47. /**    
  48.  * @addtogroup MatrixMult    
  49.  * @{    
  50.  */
  51.  
  52.  
  53. /**    
  54.  * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4    
  55.  * @param[in]       *pSrcA points to the first input matrix structure    
  56.  * @param[in]       *pSrcB points to the second input matrix structure    
  57.  * @param[out]      *pDst points to output matrix structure    
  58.  * @param[in]           *pState points to the array for storing intermediate results    
  59.  * @return              The function returns either    
  60.  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.    
  61.  *    
  62.  * @details    
  63.  * <b>Scaling and Overflow Behavior:</b>    
  64.  *    
  65.  * \par    
  66.  * The difference between the function arm_mat_mult_q15() and this fast variant is that    
  67.  * the fast variant use a 32-bit rather than a 64-bit accumulator.    
  68.  * The result of each 1.15 x 1.15 multiplication is truncated to        
  69.  * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30        
  70.  * format. Finally, the accumulator is saturated and converted to a 1.15 result.        
  71.  *        
  72.  * \par        
  73.  * The fast version has the same overflow behavior as the standard version but provides        
  74.  * less precision since it discards the low 16 bits of each multiplication result.        
  75.  * In order to avoid overflows completely the input signals must be scaled down.        
  76.  * Scale down one of the input matrices by log2(numColsA) bits to        
  77.  * avoid overflows, as a total of numColsA additions are computed internally for each        
  78.  * output element.        
  79.  *        
  80.  * \par    
  81.  * See <code>arm_mat_mult_q15()</code> for a slower implementation of this function    
  82.  * which uses 64-bit accumulation to provide higher precision.    
  83.  */
  84.  
  85. arm_status arm_mat_mult_fast_q15(
  86.   const arm_matrix_instance_q15 * pSrcA,
  87.   const arm_matrix_instance_q15 * pSrcB,
  88.   arm_matrix_instance_q15 * pDst,
  89.   q15_t * pState)
  90. {
  91.   q31_t sum;                                     /* accumulator */
  92.   q15_t *pSrcBT = pState;                        /* input data matrix pointer for transpose */
  93.   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
  94.   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
  95.   q15_t *px;                                     /* Temporary output data matrix pointer */
  96.   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  97.   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  98.   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  99.   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix A    */
  100.   uint16_t col, i = 0u, row = numRowsB, colCnt;  /* loop counters */
  101.   arm_status status;                             /* status of matrix multiplication */
  102.  
  103. #ifndef UNALIGNED_SUPPORT_DISABLE
  104.  
  105.   q31_t in;                                      /* Temporary variable to hold the input value */
  106.   q31_t inA1, inA2, inB1, inB2;
  107.  
  108. #else
  109.  
  110.   q15_t in;                                      /* Temporary variable to hold the input value */
  111.   q15_t inA1, inA2, inB1, inB2;
  112.  
  113. #endif  /*      #ifndef UNALIGNED_SUPPORT_DISABLE       */
  114.  
  115. #ifdef ARM_MATH_MATRIX_CHECK
  116.   /* Check for matrix mismatch condition */
  117.   if((pSrcA->numCols != pSrcB->numRows) ||
  118.      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  119.   {
  120.     /* Set status as ARM_MATH_SIZE_MISMATCH */
  121.     status = ARM_MATH_SIZE_MISMATCH;
  122.   }
  123.   else
  124. #endif
  125.   {
  126.     /* Matrix transpose */
  127.     do
  128.     {
  129.       /* Apply loop unrolling and exchange the columns with row elements */
  130.       col = numColsB >> 2;
  131.  
  132.       /* The pointer px is set to starting address of the column being processed */
  133.       px = pSrcBT + i;
  134.  
  135.       /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.        
  136.        ** a second loop below computes the remaining 1 to 3 samples. */
  137.       while(col > 0u)
  138.       {
  139. #ifndef UNALIGNED_SUPPORT_DISABLE
  140.         /* Read two elements from the row */
  141.         in = *__SIMD32(pInB)++;
  142.  
  143.         /* Unpack and store one element in the destination */
  144. #ifndef ARM_MATH_BIG_ENDIAN
  145.  
  146.         *px = (q15_t) in;
  147.  
  148. #else
  149.  
  150.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  151.  
  152. #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
  153.  
  154.         /* Update the pointer px to point to the next row of the transposed matrix */
  155.         px += numRowsB;
  156.  
  157.         /* Unpack and store the second element in the destination */
  158. #ifndef ARM_MATH_BIG_ENDIAN
  159.  
  160.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  161.  
  162. #else
  163.  
  164.         *px = (q15_t) in;
  165.  
  166. #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
  167.  
  168.         /* Update the pointer px to point to the next row of the transposed matrix */
  169.         px += numRowsB;
  170.  
  171.         /* Read two elements from the row */
  172.         in = *__SIMD32(pInB)++;
  173.  
  174.         /* Unpack and store one element in the destination */
  175. #ifndef ARM_MATH_BIG_ENDIAN
  176.  
  177.         *px = (q15_t) in;
  178.  
  179. #else
  180.  
  181.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  182.  
  183. #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
  184.  
  185.         /* Update the pointer px to point to the next row of the transposed matrix */
  186.         px += numRowsB;
  187.  
  188.         /* Unpack and store the second element in the destination */
  189.  
  190. #ifndef ARM_MATH_BIG_ENDIAN
  191.  
  192.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  193.  
  194. #else
  195.  
  196.         *px = (q15_t) in;
  197.  
  198. #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
  199.  
  200. #else
  201.  
  202.         /* Read one element from the row */
  203.         in = *pInB++;
  204.  
  205.         /* Store one element in the destination */
  206.         *px = in;
  207.  
  208.         /* Update the pointer px to point to the next row of the transposed matrix */
  209.         px += numRowsB;
  210.  
  211.         /* Read one element from the row */
  212.         in = *pInB++;
  213.  
  214.         /* Store one element in the destination */
  215.         *px = in;
  216.  
  217.         /* Update the pointer px to point to the next row of the transposed matrix */
  218.         px += numRowsB;
  219.  
  220.         /* Read one element from the row */
  221.         in = *pInB++;
  222.  
  223.         /* Store one element in the destination */
  224.         *px = in;
  225.  
  226.         /* Update the pointer px to point to the next row of the transposed matrix */
  227.         px += numRowsB;
  228.  
  229.         /* Read one element from the row */
  230.         in = *pInB++;
  231.  
  232.         /* Store one element in the destination */
  233.         *px = in;
  234.  
  235. #endif  /*      #ifndef UNALIGNED_SUPPORT_DISABLE       */
  236.        
  237.                 /* Update the pointer px to point to the next row of the transposed matrix */
  238.         px += numRowsB;
  239.  
  240.         /* Decrement the column loop counter */
  241.         col--;
  242.       }
  243.  
  244.       /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.        
  245.        ** No loop unrolling is used. */
  246.       col = numColsB % 0x4u;
  247.  
  248.       while(col > 0u)
  249.       {
  250.         /* Read and store the input element in the destination */
  251.         *px = *pInB++;
  252.  
  253.         /* Update the pointer px to point to the next row of the transposed matrix */
  254.         px += numRowsB;
  255.  
  256.         /* Decrement the column loop counter */
  257.         col--;
  258.       }
  259.  
  260.       i++;
  261.  
  262.       /* Decrement the row loop counter */
  263.       row--;
  264.  
  265.     } while(row > 0u);
  266.  
  267.     /* Reset the variables for the usage in the following multiplication process */
  268.     row = numRowsA;
  269.     i = 0u;
  270.     px = pDst->pData;
  271.  
  272.     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
  273.     /* row loop */
  274.     do
  275.     {
  276.       /* For every row wise process, the column loop counter is to be initiated */
  277.       col = numColsB;
  278.  
  279.       /* For every row wise process, the pIn2 pointer is set        
  280.        ** to the starting address of the transposed pSrcB data */
  281.       pInB = pSrcBT;
  282.  
  283.       /* column loop */
  284.       do
  285.       {
  286.         /* Set the variable sum, that acts as accumulator, to zero */
  287.         sum = 0;
  288.  
  289.         /* Apply loop unrolling and compute 2 MACs simultaneously. */
  290.         colCnt = numColsA >> 2;
  291.  
  292.         /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
  293.         pInA = pSrcA->pData + i;
  294.  
  295.         /* matrix multiplication */
  296.         while(colCnt > 0u)
  297.         {
  298.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  299. #ifndef UNALIGNED_SUPPORT_DISABLE
  300.  
  301.           inA1 = *__SIMD32(pInA)++;
  302.           inB1 = *__SIMD32(pInB)++;
  303.           inA2 = *__SIMD32(pInA)++;
  304.           inB2 = *__SIMD32(pInB)++;
  305.  
  306.           sum = __SMLAD(inA1, inB1, sum);
  307.           sum = __SMLAD(inA2, inB2, sum);
  308.  
  309. #else
  310.  
  311.           inA1 = *pInA++;
  312.           inB1 = *pInB++;
  313.           inA2 = *pInA++;
  314.           sum += inA1 * inB1;
  315.           inB2 = *pInB++;
  316.  
  317.           inA1 = *pInA++;
  318.           inB1 = *pInB++;
  319.           sum += inA2 * inB2;
  320.           inA2 = *pInA++;
  321.           inB2 = *pInB++;
  322.  
  323.           sum += inA1 * inB1;
  324.           sum += inA2 * inB2;
  325.  
  326. #endif  /*      #ifndef UNALIGNED_SUPPORT_DISABLE       */
  327.  
  328.           /* Decrement the loop counter */
  329.           colCnt--;
  330.         }
  331.  
  332.         /* process odd column samples */
  333.         colCnt = numColsA % 0x4u;
  334.  
  335.         while(colCnt > 0u)
  336.         {
  337.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  338.           sum += (q31_t) (*pInA++) * (*pInB++);
  339.  
  340.           colCnt--;
  341.         }
  342.  
  343.         /* Saturate and store the result in the destination buffer */
  344.         *px = (q15_t) (sum >> 15);
  345.         px++;
  346.  
  347.         /* Decrement the column loop counter */
  348.         col--;
  349.  
  350.       } while(col > 0u);
  351.  
  352.       i = i + numColsA;
  353.  
  354.       /* Decrement the row loop counter */
  355.       row--;
  356.  
  357.     } while(row > 0u);
  358.  
  359.     /* set status as ARM_MATH_SUCCESS */
  360.     status = ARM_MATH_SUCCESS;
  361.   }
  362.  
  363.   /* Return to application */
  364.   return (status);
  365. }
  366.  
  367. /**        
  368.  * @} end of MatrixMult group        
  369.  */
  370.