Subversion Repositories AFRtranscoder

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------
  2.  * Project:      CMSIS DSP Library
  3.  * Title:        arm_mat_mult_fast_q15.c
  4.  * Description:  Q15 matrix multiplication (fast variant)
  5.  *
  6.  * $Date:        27. January 2017
  7.  * $Revision:    V.1.5.1
  8.  *
  9.  * Target Processor: Cortex-M cores
  10.  * -------------------------------------------------------------------- */
  11. /*
  12.  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13.  *
  14.  * SPDX-License-Identifier: Apache-2.0
  15.  *
  16.  * Licensed under the Apache License, Version 2.0 (the License); you may
  17.  * not use this file except in compliance with the License.
  18.  * You may obtain a copy of the License at
  19.  *
  20.  * www.apache.org/licenses/LICENSE-2.0
  21.  *
  22.  * Unless required by applicable law or agreed to in writing, software
  23.  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25.  * See the License for the specific language governing permissions and
  26.  * limitations under the License.
  27.  */
  28.  
  29. #include "arm_math.h"
  30.  
  31. /**
  32.  * @ingroup groupMatrix
  33.  */
  34.  
  35. /**
  36.  * @addtogroup MatrixMult
  37.  * @{
  38.  */
  39.  
  40.  
  41. /**
  42.  * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
  43.  * @param[in]       *pSrcA points to the first input matrix structure
  44.  * @param[in]       *pSrcB points to the second input matrix structure
  45.  * @param[out]      *pDst points to output matrix structure
  46.  * @param[in]       *pState points to the array for storing intermediate results
  47.  * @return          The function returns either
  48.  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  49.  *
  50.  * @details
  51.  * <b>Scaling and Overflow Behavior:</b>
  52.  *
  53.  * \par
  54.  * The difference between the function arm_mat_mult_q15() and this fast variant is that
  55.  * the fast variant use a 32-bit rather than a 64-bit accumulator.
  56.  * The result of each 1.15 x 1.15 multiplication is truncated to
  57.  * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
  58.  * format. Finally, the accumulator is saturated and converted to a 1.15 result.
  59.  *
  60.  * \par
  61.  * The fast version has the same overflow behavior as the standard version but provides
  62.  * less precision since it discards the low 16 bits of each multiplication result.
  63.  * In order to avoid overflows completely the input signals must be scaled down.
  64.  * Scale down one of the input matrices by log2(numColsA) bits to
  65.  * avoid overflows, as a total of numColsA additions are computed internally for each
  66.  * output element.
  67.  *
  68.  * \par
  69.  * See <code>arm_mat_mult_q15()</code> for a slower implementation of this function
  70.  * which uses 64-bit accumulation to provide higher precision.
  71.  */
  72.  
  73. arm_status arm_mat_mult_fast_q15(
  74.   const arm_matrix_instance_q15 * pSrcA,
  75.   const arm_matrix_instance_q15 * pSrcB,
  76.   arm_matrix_instance_q15 * pDst,
  77.   q15_t * pState)
  78. {
  79.   q31_t sum;                                     /* accumulator */
  80.   q15_t *pSrcBT = pState;                        /* input data matrix pointer for transpose */
  81.   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
  82.   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
  83.   q15_t *px;                                     /* Temporary output data matrix pointer */
  84.   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  85.   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  86.   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  87.   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix A    */
  88.   uint32_t col, i = 0U, row = numRowsB, colCnt;  /* loop counters */
  89.   arm_status status;                             /* status of matrix multiplication */
  90.  
  91. #ifndef UNALIGNED_SUPPORT_DISABLE
  92.  
  93.   q31_t in;                                      /* Temporary variable to hold the input value */
  94.   q31_t inA1, inA2, inB1, inB2;
  95.   q31_t sum2, sum3, sum4;
  96.   q15_t *pInA2, *pInB2, *px2;
  97.   uint32_t j = 0;
  98.  
  99. #else
  100.  
  101.   q15_t in;                                      /* Temporary variable to hold the input value */
  102.   q15_t inA1, inA2, inB1, inB2;
  103.  
  104. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  105.  
  106. #ifdef ARM_MATH_MATRIX_CHECK
  107.   /* Check for matrix mismatch condition */
  108.   if ((pSrcA->numCols != pSrcB->numRows) ||
  109.      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  110.   {
  111.     /* Set status as ARM_MATH_SIZE_MISMATCH */
  112.     status = ARM_MATH_SIZE_MISMATCH;
  113.   }
  114.   else
  115. #endif
  116.   {
  117.     /* Matrix transpose */
  118.     do
  119.     {
  120.       /* Apply loop unrolling and exchange the columns with row elements */
  121.       col = numColsB >> 2;
  122.  
  123.       /* The pointer px is set to starting address of the column being processed */
  124.       px = pSrcBT + i;
  125.  
  126.       /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
  127.        ** a second loop below computes the remaining 1 to 3 samples. */
  128.       while (col > 0U)
  129.       {
  130. #ifndef UNALIGNED_SUPPORT_DISABLE
  131.         /* Read two elements from the row */
  132.         in = *__SIMD32(pInB)++;
  133.  
  134.         /* Unpack and store one element in the destination */
  135. #ifndef ARM_MATH_BIG_ENDIAN
  136.  
  137.         *px = (q15_t) in;
  138.  
  139. #else
  140.  
  141.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  142.  
  143. #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
  144.  
  145.         /* Update the pointer px to point to the next row of the transposed matrix */
  146.         px += numRowsB;
  147.  
  148.         /* Unpack and store the second element in the destination */
  149. #ifndef ARM_MATH_BIG_ENDIAN
  150.  
  151.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  152.  
  153. #else
  154.  
  155.         *px = (q15_t) in;
  156.  
  157. #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
  158.  
  159.         /* Update the pointer px to point to the next row of the transposed matrix */
  160.         px += numRowsB;
  161.  
  162.         /* Read two elements from the row */
  163.         in = *__SIMD32(pInB)++;
  164.  
  165.         /* Unpack and store one element in the destination */
  166. #ifndef ARM_MATH_BIG_ENDIAN
  167.  
  168.         *px = (q15_t) in;
  169.  
  170. #else
  171.  
  172.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  173.  
  174. #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
  175.  
  176.         /* Update the pointer px to point to the next row of the transposed matrix */
  177.         px += numRowsB;
  178.  
  179.         /* Unpack and store the second element in the destination */
  180.  
  181. #ifndef ARM_MATH_BIG_ENDIAN
  182.  
  183.         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  184.  
  185. #else
  186.  
  187.         *px = (q15_t) in;
  188.  
  189. #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
  190.  
  191. #else
  192.  
  193.         /* Read one element from the row */
  194.         in = *pInB++;
  195.  
  196.         /* Store one element in the destination */
  197.         *px = in;
  198.  
  199.         /* Update the pointer px to point to the next row of the transposed matrix */
  200.         px += numRowsB;
  201.  
  202.         /* Read one element from the row */
  203.         in = *pInB++;
  204.  
  205.         /* Store one element in the destination */
  206.         *px = in;
  207.  
  208.         /* Update the pointer px to point to the next row of the transposed matrix */
  209.         px += numRowsB;
  210.  
  211.         /* Read one element from the row */
  212.         in = *pInB++;
  213.  
  214.         /* Store one element in the destination */
  215.         *px = in;
  216.  
  217.         /* Update the pointer px to point to the next row of the transposed matrix */
  218.         px += numRowsB;
  219.  
  220.         /* Read one element from the row */
  221.         in = *pInB++;
  222.  
  223.         /* Store one element in the destination */
  224.         *px = in;
  225.  
  226. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  227.  
  228.         /* Update the pointer px to point to the next row of the transposed matrix */
  229.         px += numRowsB;
  230.  
  231.         /* Decrement the column loop counter */
  232.         col--;
  233.       }
  234.  
  235.       /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
  236.        ** No loop unrolling is used. */
  237.       col = numColsB % 0x4U;
  238.  
  239.       while (col > 0U)
  240.       {
  241.         /* Read and store the input element in the destination */
  242.         *px = *pInB++;
  243.  
  244.         /* Update the pointer px to point to the next row of the transposed matrix */
  245.         px += numRowsB;
  246.  
  247.         /* Decrement the column loop counter */
  248.         col--;
  249.       }
  250.  
  251.       i++;
  252.  
  253.       /* Decrement the row loop counter */
  254.       row--;
  255.  
  256.     } while (row > 0U);
  257.  
  258.     /* Reset the variables for the usage in the following multiplication process */
  259.     row = numRowsA;
  260.     i = 0U;
  261.     px = pDst->pData;
  262.  
  263. #ifndef UNALIGNED_SUPPORT_DISABLE
  264.     /* Process two rows from matrix A at a time and output two rows at a time */
  265.     row = row >> 1;
  266.     px2 = px + numColsB;
  267. #endif
  268.  
  269.     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
  270.     /* row loop */
  271.     while (row > 0U)
  272.     {
  273.       /* For every row wise process, the column loop counter is to be initiated */
  274.       col = numColsB;
  275.  
  276.       /* For every row wise process, the pIn2 pointer is set
  277.        ** to the starting address of the transposed pSrcB data */
  278.       pInB = pSrcBT;
  279.  
  280. #ifndef UNALIGNED_SUPPORT_DISABLE
  281.       /* Process two (transposed) columns from matrix B at a time */
  282.       col = col >> 1;
  283.       j = 0;
  284. #endif
  285.  
  286.       /* column loop */
  287.       while (col > 0U)
  288.       {
  289.         /* Set the variable sum, that acts as accumulator, to zero */
  290.         sum = 0;
  291.  
  292.         /* Initiate the pointer pInA to point to the starting address of the column being processed */
  293.         pInA = pSrcA->pData + i;
  294.  
  295. #ifndef UNALIGNED_SUPPORT_DISABLE
  296.         sum2 = 0;
  297.         sum3 = 0;
  298.         sum4 = 0;
  299.         pInB  = pSrcBT + j;
  300.         pInA2 = pInA + numColsA;
  301.         pInB2 = pInB + numRowsB;
  302.  
  303.         /* Read in two elements at once - alows dual MAC instruction */
  304.         colCnt = numColsA >> 1;
  305. #else
  306.         colCnt = numColsA >> 2;
  307. #endif
  308.  
  309.         /* matrix multiplication */
  310.         while (colCnt > 0U)
  311.         {
  312.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  313. #ifndef UNALIGNED_SUPPORT_DISABLE
  314.  
  315.           inA1 = *__SIMD32(pInA)++;
  316.           inB1 = *__SIMD32(pInB)++;
  317.           inA2 = *__SIMD32(pInA2)++;
  318.           inB2 = *__SIMD32(pInB2)++;
  319.  
  320.           sum  = __SMLAD(inA1, inB1, sum);
  321.           sum2 = __SMLAD(inA1, inB2, sum2);
  322.           sum3 = __SMLAD(inA2, inB1, sum3);
  323.           sum4 = __SMLAD(inA2, inB2, sum4);
  324.  
  325. #else
  326.  
  327.           inA1 = *pInA;
  328.           inB1 = *pInB;
  329.           sum += inA1 * inB1;
  330.  
  331.           inA2 = pInA[1];
  332.           inB2 = pInB[1];
  333.           sum += inA2 * inB2;
  334.  
  335.           inA1 = pInA[2];
  336.           inB1 = pInB[2];
  337.           sum += inA1 * inB1;
  338.  
  339.           inA2 = pInA[3];
  340.           inB2 = pInB[3];
  341.           sum += inA2 * inB2;
  342.  
  343.           pInA += 4;
  344.           pInB += 4;
  345.  
  346. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  347.  
  348.           /* Decrement the loop counter */
  349.           colCnt--;
  350.         }
  351.  
  352.         /* process odd column samples */
  353. #ifndef UNALIGNED_SUPPORT_DISABLE
  354.         if (numColsA & 1U) {
  355.           inA1 = *pInA++;
  356.           inB1 = *pInB++;
  357.           inA2 = *pInA2++;
  358.           inB2 = *pInB2++;
  359.           sum  += inA1 * inB1;
  360.           sum2 += inA1 * inB2;
  361.           sum3 += inA2 * inB1;
  362.           sum4 += inA2 * inB2;
  363.         }
  364. #else
  365.         colCnt = numColsA % 0x4U;
  366.  
  367.         while (colCnt > 0U)
  368.         {
  369.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  370.           sum += (q31_t) (*pInA++) * (*pInB++);
  371.  
  372.           colCnt--;
  373.         }
  374. #endif
  375.  
  376.         /* Saturate and store the result in the destination buffer */
  377.         *px++  = (q15_t) (sum >> 15);
  378.  
  379. #ifndef UNALIGNED_SUPPORT_DISABLE
  380.         *px++  = (q15_t) (sum2 >> 15);
  381.         *px2++ = (q15_t) (sum3 >> 15);
  382.         *px2++ = (q15_t) (sum4 >> 15);
  383.         j += numRowsB * 2;
  384. #endif
  385.  
  386.         /* Decrement the column loop counter */
  387.         col--;
  388.  
  389.       }
  390.  
  391.       i = i + numColsA;
  392.  
  393. #ifndef UNALIGNED_SUPPORT_DISABLE
  394.       i = i + numColsA;
  395.       px = px2 + (numColsB & 1U);
  396.       px2 = px + numColsB;
  397. #endif
  398.  
  399.       /* Decrement the row loop counter */
  400.       row--;
  401.  
  402.     }
  403.  
  404.     /* Compute any remaining odd row/column below */
  405.  
  406. #ifndef UNALIGNED_SUPPORT_DISABLE
  407.  
  408.     /* Compute remaining output column */
  409.     if (numColsB & 1U) {
  410.  
  411.       /* Avoid redundant computation of last element */
  412.       row = numRowsA & (~0x1);
  413.  
  414.       /* Point to remaining unfilled column in output matrix */
  415.       px = pDst->pData+numColsB-1;
  416.       pInA = pSrcA->pData;
  417.  
  418.       /* row loop */
  419.       while (row > 0)
  420.       {
  421.  
  422.         /* point to last column in matrix B */
  423.         pInB  = pSrcBT + numRowsB*(numColsB-1);
  424.  
  425.         /* Set the variable sum, that acts as accumulator, to zero */
  426.         sum  = 0;
  427.  
  428.         /* Compute 4 columns at once */
  429.         colCnt = numColsA >> 2;
  430.  
  431.         /* matrix multiplication */
  432.         while (colCnt > 0U)
  433.         {
  434.           inA1 = *__SIMD32(pInA)++;
  435.           inA2 = *__SIMD32(pInA)++;
  436.           inB1 = *__SIMD32(pInB)++;
  437.           inB2 = *__SIMD32(pInB)++;
  438.  
  439.           sum  = __SMLAD(inA1, inB1, sum);
  440.           sum  = __SMLAD(inA2, inB2, sum);
  441.  
  442.           /* Decrement the loop counter */
  443.           colCnt--;
  444.         }
  445.  
  446.         colCnt = numColsA & 3U;
  447.         while (colCnt > 0U) {
  448.           sum += (q31_t) (*pInA++) * (*pInB++);
  449.           colCnt--;
  450.         }
  451.  
  452.         /* Store the result in the destination buffer */
  453.         *px  = (q15_t) (sum  >> 15);
  454.         px += numColsB;
  455.  
  456.         /* Decrement the row loop counter */
  457.         row--;
  458.       }
  459.     }
  460.  
  461.     /* Compute remaining output row */
  462.     if (numRowsA & 1U) {
  463.  
  464.       /* point to last row in output matrix */
  465.       px = pDst->pData+(numColsB)*(numRowsA-1);
  466.  
  467.       pInB  = pSrcBT;
  468.       col = numColsB;
  469.       i = 0U;
  470.  
  471.       /* col loop */
  472.       while (col > 0)
  473.       {
  474.  
  475.         /* point to last row in matrix A */
  476.         pInA = pSrcA->pData + (numRowsA-1)*numColsA;
  477.  
  478.         /* Set the variable sum, that acts as accumulator, to zero */
  479.         sum  = 0;
  480.  
  481.         /* Compute 4 columns at once */
  482.         colCnt = numColsA >> 2;
  483.  
  484.         /* matrix multiplication */
  485.         while (colCnt > 0U)
  486.         {
  487.           inA1 = *__SIMD32(pInA)++;
  488.           inA2 = *__SIMD32(pInA)++;
  489.           inB1 = *__SIMD32(pInB)++;
  490.           inB2 = *__SIMD32(pInB)++;
  491.  
  492.           sum  = __SMLAD(inA1, inB1, sum);
  493.           sum  = __SMLAD(inA2, inB2, sum);
  494.  
  495.           /* Decrement the loop counter */
  496.           colCnt--;
  497.         }
  498.  
  499.         colCnt = numColsA & 3U;
  500.         while (colCnt > 0U) {
  501.           sum += (q31_t) (*pInA++) * (*pInB++);
  502.           colCnt--;
  503.         }
  504.  
  505.         /* Store the result in the destination buffer */
  506.         *px++  = (q15_t) (sum  >> 15);
  507.  
  508.         /* Decrement the col loop counter */
  509.         col--;
  510.       }
  511.     }
  512.  
  513. #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
  514.  
  515.     /* set status as ARM_MATH_SUCCESS */
  516.     status = ARM_MATH_SUCCESS;
  517.   }
  518.  
  519.   /* Return to application */
  520.   return (status);
  521. }
  522.  
  523. /**
  524.  * @} end of MatrixMult group
  525.  */
  526.