Subversion Repositories LedShow

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------      
  2. * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
  3. *      
  4. * $Date:        19. March 2015
  5. * $Revision:    V.1.4.5
  6. *      
  7. * Project:      CMSIS DSP Library
  8. * Title:            arm_cmplx_mat_mult_q15.c      
  9. *      
  10. * Description:   Q15 complex matrix multiplication.      
  11. *      
  12. * Target Processor:          Cortex-M4/Cortex-M3/Cortex-M0
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. *   - Redistributions of source code must retain the above copyright
  18. *     notice, this list of conditions and the following disclaimer.
  19. *   - Redistributions in binary form must reproduce the above copyright
  20. *     notice, this list of conditions and the following disclaimer in
  21. *     the documentation and/or other materials provided with the
  22. *     distribution.
  23. *   - Neither the name of ARM LIMITED nor the names of its contributors
  24. *     may be used to endorse or promote products derived from this
  25. *     software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30. * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31. * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37. * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38. * POSSIBILITY OF SUCH DAMAGE.  
  39. * -------------------------------------------------------------------- */
  40. #include "arm_math.h"
  41.  
  42. /**      
  43.  * @ingroup groupMatrix      
  44.  */
  45.  
  46. /**      
  47.  * @addtogroup CmplxMatrixMult      
  48.  * @{      
  49.  */
  50.  
  51.  
  52. /**      
  53.  * @brief Q15 Complex matrix multiplication      
  54.  * @param[in]       *pSrcA points to the first input complex matrix structure      
  55.  * @param[in]       *pSrcB points to the second input complex matrix structure      
  56.  * @param[out]      *pDst points to output complex matrix structure      
  57.  * @param[in]           *pScratch points to the array for storing intermediate results    
  58.  * @return              The function returns either      
  59.  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.        
  60.  *  
  61.  * \par Conditions for optimum performance  
  62.  *  Input, output and state buffers should be aligned by 32-bit  
  63.  *  
  64.  * \par Restrictions  
  65.  *  If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE  
  66.  *      In this case input, output, scratch buffers should be aligned by 32-bit  
  67.  *  
  68.  * @details      
  69.  * <b>Scaling and Overflow Behavior:</b>      
  70.  *      
  71.  * \par      
  72.  * The function is implemented using a 64-bit internal accumulator. The inputs to the      
  73.  * multiplications are in 1.15 format and multiplications yield a 2.30 result.      
  74.  * The 2.30 intermediate      
  75.  * results are accumulated in a 64-bit accumulator in 34.30 format. This approach      
  76.  * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then      
  77.  * truncated to 34.15 format by discarding the low 15 bits and then saturated to      
  78.  * 1.15 format.      
  79.  *      
  80.  * \par      
  81.  * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function.      
  82.  *      
  83.  */
  84.  
  85.  
  86.  
  87.  
  88. arm_status arm_mat_cmplx_mult_q15(
  89.   const arm_matrix_instance_q15 * pSrcA,
  90.   const arm_matrix_instance_q15 * pSrcB,
  91.   arm_matrix_instance_q15 * pDst,
  92.   q15_t * pScratch)
  93. {
  94.   /* accumulator */
  95.   q15_t *pSrcBT = pScratch;                      /* input data matrix pointer for transpose */
  96.   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
  97.   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
  98.   q15_t *px;                                     /* Temporary output data matrix pointer */
  99.   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  100.   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  101.   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  102.   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix A    */
  103.   uint16_t col, i = 0u, row = numRowsB, colCnt;  /* loop counters */
  104.   arm_status status;                             /* status of matrix multiplication */
  105.   q63_t sumReal, sumImag;
  106.  
  107. #ifdef UNALIGNED_SUPPORT_DISABLE
  108.   q15_t in;                                      /* Temporary variable to hold the input value */
  109.   q15_t a, b, c, d;
  110. #else
  111.   q31_t in;                                      /* Temporary variable to hold the input value */
  112.   q31_t prod1, prod2;
  113.   q31_t pSourceA, pSourceB;
  114. #endif
  115.  
  116. #ifdef ARM_MATH_MATRIX_CHECK
  117.   /* Check for matrix mismatch condition */
  118.   if((pSrcA->numCols != pSrcB->numRows) ||
  119.      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  120.   {
  121.     /* Set status as ARM_MATH_SIZE_MISMATCH */
  122.     status = ARM_MATH_SIZE_MISMATCH;
  123.   }
  124.   else
  125. #endif
  126.   {
  127.     /* Matrix transpose */
  128.     do
  129.     {
  130.       /* Apply loop unrolling and exchange the columns with row elements */
  131.       col = numColsB >> 2;
  132.  
  133.       /* The pointer px is set to starting address of the column being processed */
  134.       px = pSrcBT + i;
  135.  
  136.       /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.      
  137.        ** a second loop below computes the remaining 1 to 3 samples. */
  138.       while(col > 0u)
  139.       {
  140. #ifdef UNALIGNED_SUPPORT_DISABLE
  141.         /* Read two elements from the row */
  142.         in = *pInB++;
  143.         *px = in;
  144.         in = *pInB++;
  145.         px[1] = in;
  146.  
  147.         /* Update the pointer px to point to the next row of the transposed matrix */
  148.         px += numRowsB * 2;
  149.  
  150.         /* Read two elements from the row */
  151.         in = *pInB++;
  152.         *px = in;
  153.         in = *pInB++;
  154.         px[1] = in;
  155.  
  156.         /* Update the pointer px to point to the next row of the transposed matrix */
  157.         px += numRowsB * 2;
  158.  
  159.         /* Read two elements from the row */
  160.         in = *pInB++;
  161.         *px = in;
  162.         in = *pInB++;
  163.         px[1] = in;
  164.  
  165.         /* Update the pointer px to point to the next row of the transposed matrix */
  166.         px += numRowsB * 2;
  167.  
  168.         /* Read two elements from the row */
  169.         in = *pInB++;
  170.         *px = in;
  171.         in = *pInB++;
  172.         px[1] = in;
  173.  
  174.         /* Update the pointer px to point to the next row of the transposed matrix */
  175.         px += numRowsB * 2;
  176.  
  177.         /* Decrement the column loop counter */
  178.         col--;
  179.       }
  180.  
  181.       /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.      
  182.        ** No loop unrolling is used. */
  183.       col = numColsB % 0x4u;
  184.  
  185.       while(col > 0u)
  186.       {
  187.         /* Read two elements from the row */
  188.         in = *pInB++;
  189.         *px = in;
  190.         in = *pInB++;
  191.         px[1] = in;
  192. #else
  193.  
  194.         /* Read two elements from the row */
  195.         in = *__SIMD32(pInB)++;
  196.  
  197.         *__SIMD32(px) = in;
  198.  
  199.         /* Update the pointer px to point to the next row of the transposed matrix */
  200.         px += numRowsB * 2;
  201.  
  202.  
  203.         /* Read two elements from the row */
  204.         in = *__SIMD32(pInB)++;
  205.  
  206.         *__SIMD32(px) = in;
  207.  
  208.         /* Update the pointer px to point to the next row of the transposed matrix */
  209.         px += numRowsB * 2;
  210.  
  211.         /* Read two elements from the row */
  212.         in = *__SIMD32(pInB)++;
  213.  
  214.         *__SIMD32(px) = in;
  215.  
  216.         /* Update the pointer px to point to the next row of the transposed matrix */
  217.         px += numRowsB * 2;
  218.  
  219.         /* Read two elements from the row */
  220.         in = *__SIMD32(pInB)++;
  221.  
  222.         *__SIMD32(px) = in;
  223.  
  224.         /* Update the pointer px to point to the next row of the transposed matrix */
  225.         px += numRowsB * 2;
  226.  
  227.         /* Decrement the column loop counter */
  228.         col--;
  229.       }
  230.  
  231.       /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.      
  232.        ** No loop unrolling is used. */
  233.       col = numColsB % 0x4u;
  234.  
  235.       while(col > 0u)
  236.       {
  237.         /* Read two elements from the row */
  238.         in = *__SIMD32(pInB)++;
  239.  
  240.         *__SIMD32(px) = in;
  241. #endif
  242.  
  243.         /* Update the pointer px to point to the next row of the transposed matrix */
  244.         px += numRowsB * 2;
  245.  
  246.         /* Decrement the column loop counter */
  247.         col--;
  248.       }
  249.  
  250.       i = i + 2u;
  251.  
  252.       /* Decrement the row loop counter */
  253.       row--;
  254.  
  255.     } while(row > 0u);
  256.  
  257.     /* Reset the variables for the usage in the following multiplication process */
  258.     row = numRowsA;
  259.     i = 0u;
  260.     px = pDst->pData;
  261.  
  262.     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
  263.     /* row loop */
  264.     do
  265.     {
  266.       /* For every row wise process, the column loop counter is to be initiated */
  267.       col = numColsB;
  268.  
  269.       /* For every row wise process, the pIn2 pointer is set      
  270.        ** to the starting address of the transposed pSrcB data */
  271.       pInB = pSrcBT;
  272.  
  273.       /* column loop */
  274.       do
  275.       {
  276.         /* Set the variable sum, that acts as accumulator, to zero */
  277.         sumReal = 0;
  278.         sumImag = 0;
  279.  
  280.         /* Apply loop unrolling and compute 2 MACs simultaneously. */
  281.         colCnt = numColsA >> 1;
  282.  
  283.         /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
  284.         pInA = pSrcA->pData + i * 2;
  285.  
  286.  
  287.         /* matrix multiplication */
  288.         while(colCnt > 0u)
  289.         {
  290.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  291.  
  292. #ifdef UNALIGNED_SUPPORT_DISABLE
  293.  
  294.           /* read real and imag values from pSrcA buffer */
  295.           a = *pInA;
  296.           b = *(pInA + 1u);
  297.           /* read real and imag values from pSrcB buffer */
  298.           c = *pInB;
  299.           d = *(pInB + 1u);
  300.  
  301.           /* Multiply and Accumlates */
  302.           sumReal += (q31_t) a *c;
  303.           sumImag += (q31_t) a *d;
  304.           sumReal -= (q31_t) b *d;
  305.           sumImag += (q31_t) b *c;
  306.  
  307.           /* read next real and imag values from pSrcA buffer */
  308.           a = *(pInA + 2u);
  309.           b = *(pInA + 3u);
  310.           /* read next real and imag values from pSrcB buffer */
  311.           c = *(pInB + 2u);
  312.           d = *(pInB + 3u);
  313.  
  314.           /* update pointer */
  315.           pInA += 4u;
  316.  
  317.           /* Multiply and Accumlates */
  318.           sumReal += (q31_t) a *c;
  319.           sumImag += (q31_t) a *d;
  320.           sumReal -= (q31_t) b *d;
  321.           sumImag += (q31_t) b *c;
  322.           /* update pointer */
  323.           pInB += 4u;
  324. #else
  325.           /* read real and imag values from pSrcA and pSrcB buffer */
  326.           pSourceA = *__SIMD32(pInA)++;
  327.           pSourceB = *__SIMD32(pInB)++;
  328.  
  329.           /* Multiply and Accumlates */
  330. #ifdef ARM_MATH_BIG_ENDIAN
  331.           prod1 = -__SMUSD(pSourceA, pSourceB);
  332. #else
  333.           prod1 = __SMUSD(pSourceA, pSourceB);
  334. #endif
  335.           prod2 = __SMUADX(pSourceA, pSourceB);
  336.           sumReal += (q63_t) prod1;
  337.           sumImag += (q63_t) prod2;
  338.  
  339.           /* read real and imag values from pSrcA and pSrcB buffer */
  340.           pSourceA = *__SIMD32(pInA)++;
  341.           pSourceB = *__SIMD32(pInB)++;
  342.  
  343.           /* Multiply and Accumlates */
  344. #ifdef ARM_MATH_BIG_ENDIAN
  345.           prod1 = -__SMUSD(pSourceA, pSourceB);
  346. #else
  347.           prod1 = __SMUSD(pSourceA, pSourceB);
  348. #endif
  349.           prod2 = __SMUADX(pSourceA, pSourceB);
  350.           sumReal += (q63_t) prod1;
  351.           sumImag += (q63_t) prod2;
  352.  
  353. #endif /*      #ifdef UNALIGNED_SUPPORT_DISABLE */
  354.  
  355.           /* Decrement the loop counter */
  356.           colCnt--;
  357.         }
  358.  
  359.         /* process odd column samples */
  360.         if((numColsA & 0x1u) > 0u)
  361.         {
  362.           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  363.  
  364. #ifdef UNALIGNED_SUPPORT_DISABLE
  365.  
  366.           /* read real and imag values from pSrcA and pSrcB buffer */
  367.           a = *pInA++;
  368.           b = *pInA++;
  369.           c = *pInB++;
  370.           d = *pInB++;
  371.  
  372.           /* Multiply and Accumlates */
  373.           sumReal += (q31_t) a *c;
  374.           sumImag += (q31_t) a *d;
  375.           sumReal -= (q31_t) b *d;
  376.           sumImag += (q31_t) b *c;
  377.  
  378. #else
  379.           /* read real and imag values from pSrcA and pSrcB buffer */
  380.           pSourceA = *__SIMD32(pInA)++;
  381.           pSourceB = *__SIMD32(pInB)++;
  382.  
  383.           /* Multiply and Accumlates */
  384. #ifdef ARM_MATH_BIG_ENDIAN
  385.           prod1 = -__SMUSD(pSourceA, pSourceB);
  386. #else
  387.           prod1 = __SMUSD(pSourceA, pSourceB);
  388. #endif
  389.           prod2 = __SMUADX(pSourceA, pSourceB);
  390.           sumReal += (q63_t) prod1;
  391.           sumImag += (q63_t) prod2;
  392.  
  393. #endif /*      #ifdef UNALIGNED_SUPPORT_DISABLE */
  394.  
  395.         }
  396.  
  397.         /* Saturate and store the result in the destination buffer */
  398.  
  399.         *px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
  400.         *px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
  401.  
  402.         /* Decrement the column loop counter */
  403.         col--;
  404.  
  405.       } while(col > 0u);
  406.  
  407.       i = i + numColsA;
  408.  
  409.       /* Decrement the row loop counter */
  410.       row--;
  411.  
  412.     } while(row > 0u);
  413.  
  414.     /* set status as ARM_MATH_SUCCESS */
  415.     status = ARM_MATH_SUCCESS;
  416.   }
  417.  
  418.   /* Return to application */
  419.   return (status);
  420. }
  421.  
  422. /**      
  423.  * @} end of MatrixMult group      
  424.  */
  425.