/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_conv_q15.c
 * Description:  Convolution of Q15 sequences
 *
 * $Date:        27. January 2017
 * $Revision:    V.1.5.1
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "arm_math.h"

/**
 * @ingroup groupFilters
 */

/**
 * @addtogroup Conv
 * @{
 */

/**
 * @brief Convolution of Q15 sequences.
 * @param[in] *pSrcA points to the first input sequence.
 * @param[in] srcALen length of the first input sequence.
 * @param[in] *pSrcB points to the second input sequence.
 * @param[in] srcBLen length of the second input sequence.
 * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
 * @return none.
 *
 * @details
 * <b>Scaling and Overflow Behavior:</b>
 *
 * \par
 * The function is implemented using a 64-bit internal accumulator.
 * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
 * This approach provides 33 guard bits and there is no risk of overflow.
 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
 *
 * \par
 * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
 *
 * \par
 * Refer to <code>arm_conv_opt_q15()</code> for a faster implementation of this function using scratch buffers.
 *
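 * \par Example
 * A minimal usage sketch; the buffer names and lengths below are illustrative
 * only and are not part of the library:
 * @code
 * #include "arm_math.h"
 *
 * #define SRCA_LEN  128
 * #define SRCB_LEN   32
 *
 * q15_t srcA[SRCA_LEN];                     // first input sequence, 1.15 format
 * q15_t srcB[SRCB_LEN];                     // second input sequence, 1.15 format
 * q15_t result[SRCA_LEN + SRCB_LEN - 1];    // output needs srcALen + srcBLen - 1 samples
 *
 * void example_conv_q15(void)
 * {
 *   arm_conv_q15(srcA, SRCA_LEN, srcB, SRCB_LEN, result);
 * }
 * @endcode
 *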
 */

void arm_conv_q15(
  q15_t * pSrcA,
  uint32_t srcALen,
  q15_t * pSrcB,
  uint32_t srcBLen,
  q15_t * pDst)
{

#if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)

  /* Run the below code for Cortex-M7, Cortex-M4 and Cortex-M3 */

  q15_t *pIn1;                                   /* inputA pointer */
  q15_t *pIn2;                                   /* inputB pointer */
  q15_t *pOut = pDst;                            /* output pointer */
  q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulators */
  q15_t *px;                                     /* Intermediate inputA pointer */
  q15_t *py;                                     /* Intermediate inputB pointer */
  q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
  q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
  uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* Loop counters */

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA. */
  /* So srcBLen is always treated as the shorter (or equal) of the two lengths. */
  if (srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcA;

    /* Initialization of inputB pointer */
    pIn2 = pSrcB;
  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcB;

    /* Initialization of inputB pointer */
    pIn2 = pSrcA;

    /* Swap the lengths so that srcBLen holds the shorter of the two */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;
  }

  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ... + x[n-N+1] * y[N-1] */
  /* The function is internally
   * divided into three stages according to the number of multiplications that have to
   * take place between the inputA samples and the inputB samples. In the first stage of the
   * algorithm, the number of multiplications increases by one for every iteration.
   * In the second stage of the algorithm, srcBLen multiplications are done per output sample.
   * In the third stage of the algorithm, the number of multiplications decreases by one
   * for every iteration. */

  /* The algorithm is implemented in three stages.
     The loop counters of the first two stages are initialized here. */
  blockSize1 = srcBLen - 1U;
  blockSize2 = srcALen - (srcBLen - 1U);
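  /* Note: together with blockSize3 = srcBLen - 1 (set below), the three stages
   * produce (srcBLen - 1) + (srcALen - srcBLen + 1) + (srcBLen - 1)
   * = srcALen + srcBLen - 1 output samples, i.e. the full convolution length. */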

  /* --------------------------
   * Initializations of stage1
   * -------------------------*/

  /* sum = x[0] * y[0]
   * sum = x[0] * y[1] + x[1] * y[0]
   * ....
   * sum = x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] +...+ x[srcBLen - 1] * y[0]
   */

  /* In this stage the number of MAC operations increases by 1 for every iteration.
     The count variable holds the number of MAC operations performed */
  count = 1U;

  /* Working pointer of inputA */
  px = pIn1;

  /* Working pointer of inputB */
  py = pIn2;


  /* ------------------------
   * Stage1 process
   * ----------------------*/

  /* For loop unrolling by 4, this stage is divided into two parts. */
  /* The first part computes the outputs needing fewer than 4 MAC operations */
  /* The second part computes the outputs needing 4 or more MAC operations */

  /* The first part of the stage starts here */
  while ((count < 4U) && (blockSize1 > 0U))
  {
    /* Accumulator is made zero for every iteration */
    sum = 0;

    /* Loop over the number of MAC operations between
     * inputA samples and inputB samples */
    k = count;

    while (k > 0U)
    {
      /* Perform the multiply-accumulates */
      sum = __SMLALD(*px++, *py--, sum);

      /* Decrement the loop counter */
      k--;
    }

    /* Store the result from the accumulator into the destination buffer. */
    *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));

    /* Update the inputA and inputB pointers for the next MAC calculation */
    py = pIn2 + count;
    px = pIn1;

    /* Increment the MAC count */
    count++;

    /* Decrement the loop counter */
    blockSize1--;
  }

  /* The second part of the stage starts here */
  /* The internal loop, over count, is unrolled by 4 */
  /* To read two inputB samples at a time with a single SIMD load,
   * py is decremented by 1 so that each load covers the current
   * coefficient and the one just below it */
  py = py - 1;

  while (blockSize1 > 0U)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0;

    /* Apply loop unrolling and compute 4 MACs simultaneously. */
    k = count >> 2U;

    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
    while (k > 0U)
    {
      /* Perform the multiply-accumulates */
      /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
      sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
      /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
      sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);

      /* Decrement the loop counter */
      k--;
    }

    /* For the next MAC operations, the pointer py is used without SIMD.
     * So, py is incremented by 1 */
    py = py + 1U;

    /* If count is not a multiple of 4, compute any remaining MACs here.
     ** No loop unrolling is used. */
    k = count % 0x4U;

    while (k > 0U)
    {
      /* Perform the multiply-accumulates */
      sum = __SMLALD(*px++, *py--, sum);

      /* Decrement the loop counter */
      k--;
    }

    /* Store the result from the accumulator into the destination buffer. */
    *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));

    /* Update the inputA and inputB pointers for the next MAC calculation */
    py = pIn2 + (count - 1U);
    px = pIn1;

    /* Increment the MAC count */
    count++;

    /* Decrement the loop counter */
    blockSize1--;
  }

  /* --------------------------
   * Initializations of stage2
   * ------------------------*/

  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
   * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
   * ....
   * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
   */

  /* Working pointer of inputA */
  px = pIn1;

  /* Working pointer of inputB */
  pSrc2 = pIn2 + (srcBLen - 1U);
  py = pSrc2;
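  /* py now points to y[srcBLen-1], the last inputB sample;
   * each stage2 output reads the coefficients backwards from here. */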

  /* count is the index by which the pointer pIn1 is to be incremented */
  count = 0U;


  /* --------------------
   * Stage2 process
   * -------------------*/

  /* Stage2 performs srcBLen MAC operations per output sample.
   * So, to unroll the loop over blockSize2,
   * srcBLen should be greater than or equal to 4 */
  if (srcBLen >= 4U)
  {
    /* Loop unrolling over blockSize2, by 4 */
    blkCnt = blockSize2 >> 2U;

    while (blkCnt > 0U)
    {
      py = py - 1U;

      /* Set all accumulators to zero */
      acc0 = 0;
      acc1 = 0;
      acc2 = 0;
      acc3 = 0;

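      /* acc0..acc3 accumulate four consecutive output samples, so the
       * unrolled loop reads overlapping input pairs, e.g. {x[0],x[1]}
       * into x0 and {x[1],x[2]} into x1. */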
      /* read x[0], x[1] samples */
      x0 = *__SIMD32(px);
      /* read x[1], x[2] samples */
      x1 = _SIMD32_OFFSET(px+1);
      px += 2U;


      /* Apply loop unrolling and compute 4 MACs simultaneously. */
      k = srcBLen >> 2U;

      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      do
      {
        /* Read the last two inputB samples using SIMD:
         * y[srcBLen - 1] and y[srcBLen - 2] */
        c0 = *__SIMD32(py)--;

        /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
        acc0 = __SMLALDX(x0, c0, acc0);

        /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
        acc1 = __SMLALDX(x1, c0, acc1);

        /* Read x[2], x[3] */
        x2 = *__SIMD32(px);

        /* Read x[3], x[4] */
        x3 = _SIMD32_OFFSET(px+1);

        /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
        acc2 = __SMLALDX(x2, c0, acc2);

        /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
        acc3 = __SMLALDX(x3, c0, acc3);

        /* Read y[srcBLen - 3] and y[srcBLen - 4] */
        c0 = *__SIMD32(py)--;

        /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
        acc0 = __SMLALDX(x2, c0, acc0);

        /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
        acc1 = __SMLALDX(x3, c0, acc1);

        /* Read x[4], x[5] */
        x0 = _SIMD32_OFFSET(px+2);

        /* Read x[5], x[6] */
        x1 = _SIMD32_OFFSET(px+3);
        px += 4U;

        /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
        acc2 = __SMLALDX(x0, c0, acc2);

        /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
        acc3 = __SMLALDX(x1, c0, acc3);

      } while (--k);

      /* For the next MAC operations, SIMD is not used.
       * So, the 16-bit pointer of inputB, py, is updated */

      /* If srcBLen is not a multiple of 4, compute any remaining MACs here.
       ** No loop unrolling is used. */
      k = srcBLen % 0x4U;

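      /* k is 0..3 here; each of the branches below handles one leftover
       * count while keeping the four accumulators in step. */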
      if (k == 1U)
      {
        /* Read y[srcBLen - 5] */
        c0 = *(py+1);

#ifdef ARM_MATH_BIG_ENDIAN

        c0 = c0 << 16U;

#else

        c0 = c0 & 0x0000FFFF;

#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
        /* Read x[7] */
        x3 = *__SIMD32(px);
        px++;

        /* Perform the multiply-accumulates */
        acc0 = __SMLALD(x0, c0, acc0);
        acc1 = __SMLALD(x1, c0, acc1);
        acc2 = __SMLALDX(x1, c0, acc2);
        acc3 = __SMLALDX(x3, c0, acc3);
      }

      if (k == 2U)
      {
        /* Read y[srcBLen - 5], y[srcBLen - 6] */
        c0 = _SIMD32_OFFSET(py);

        /* Read x[7], x[8] */
        x3 = *__SIMD32(px);

        /* Read x[9] */
        x2 = _SIMD32_OFFSET(px+1);
        px += 2U;

        /* Perform the multiply-accumulates */
        acc0 = __SMLALDX(x0, c0, acc0);
        acc1 = __SMLALDX(x1, c0, acc1);
        acc2 = __SMLALDX(x3, c0, acc2);
        acc3 = __SMLALDX(x2, c0, acc3);
      }

      if (k == 3U)
      {
        /* Read y[srcBLen - 5], y[srcBLen - 6] */
        c0 = _SIMD32_OFFSET(py);

        /* Read x[7], x[8] */
        x3 = *__SIMD32(px);

        /* Read x[9] */
        x2 = _SIMD32_OFFSET(px+1);

        /* Perform the multiply-accumulates */
        acc0 = __SMLALDX(x0, c0, acc0);
        acc1 = __SMLALDX(x1, c0, acc1);
        acc2 = __SMLALDX(x3, c0, acc2);
        acc3 = __SMLALDX(x2, c0, acc3);

        c0 = *(py-1);

#ifdef ARM_MATH_BIG_ENDIAN

        c0 = c0 << 16U;
#else

        c0 = c0 & 0x0000FFFF;
#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
        /* Read x[10] */
        x3 = _SIMD32_OFFSET(px+2);
        px += 3U;

        /* Perform the multiply-accumulates */
        acc0 = __SMLALDX(x1, c0, acc0);
        acc1 = __SMLALD(x2, c0, acc1);
        acc2 = __SMLALDX(x2, c0, acc2);
        acc3 = __SMLALDX(x3, c0, acc3);
      }


      /* Store the results from the accumulators into the destination buffer. */

#ifndef ARM_MATH_BIG_ENDIAN

      *__SIMD32(pOut)++ =
        __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
      *__SIMD32(pOut)++ =
        __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);

#else

      *__SIMD32(pOut)++ =
        __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
      *__SIMD32(pOut)++ =
        __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);

#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

      /* Increment the pointer pIn1 index, count, by 4 */
      count += 4U;

      /* Update the inputA and inputB pointers for the next MAC calculation */
      px = pIn1 + count;
      py = pSrc2;

      /* Decrement the loop counter */
      blkCnt--;
    }

    /* If blockSize2 is not a multiple of 4, compute any remaining output samples here.
     ** No loop unrolling is used. */
    blkCnt = blockSize2 % 0x4U;

    while (blkCnt > 0U)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0;

      /* Apply loop unrolling and compute 4 MACs simultaneously. */
      k = srcBLen >> 2U;

      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      while (k > 0U)
      {
        /* Perform the multiply-accumulates */
        sum += (q63_t) ((q31_t) *px++ * *py--);
        sum += (q63_t) ((q31_t) *px++ * *py--);
        sum += (q63_t) ((q31_t) *px++ * *py--);
        sum += (q63_t) ((q31_t) *px++ * *py--);

        /* Decrement the loop counter */
        k--;
      }

      /* If srcBLen is not a multiple of 4, compute any remaining MACs here.
       ** No loop unrolling is used. */
      k = srcBLen % 0x4U;

      while (k > 0U)
      {
        /* Perform the multiply-accumulates */
        sum += (q63_t) ((q31_t) *px++ * *py--);

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result from the accumulator into the destination buffer. */
      *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));

      /* Increment the pointer pIn1 index, count, by 1 */
      count++;

      /* Update the inputA and inputB pointers for the next MAC calculation */
      px = pIn1 + count;
      py = pSrc2;

      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    /* If srcBLen is smaller than 4,
     * the blockSize2 loop cannot be unrolled by 4 */
    blkCnt = blockSize2;

    while (blkCnt > 0U)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0;

      /* srcBLen MACs should be performed per output sample */
      k = srcBLen;

      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
        sum += (q63_t) ((q31_t) *px++ * *py--);

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result from the accumulator into the destination buffer. */
      *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));

      /* Increment the MAC count */
      count++;

      /* Update the inputA and inputB pointers for the next MAC calculation */
      px = pIn1 + count;
      py = pSrc2;

      /* Decrement the loop counter */
      blkCnt--;
    }
  }


  /* --------------------------
   * Initializations of stage3
   * -------------------------*/

  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
   * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
   * ....
   * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
   * sum +=  x[srcALen-1] * y[srcBLen-1]
   */

  /* In this stage the number of MAC operations decreases by 1 for every iteration.
     The blockSize3 variable holds the number of MAC operations performed */

  blockSize3 = srcBLen - 1U;

  /* Working pointer of inputA */
  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
  px = pSrc1;

  /* Working pointer of inputB */
  pSrc2 = pIn2 + (srcBLen - 1U);
  pIn2 = pSrc2 - 1U;
  py = pIn2;
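  /* pIn2 is positioned one sample below the last coefficient so that each
   * 32-bit SIMD load in the unrolled loop below reads y[srcBLen-2] and
   * y[srcBLen-1] together. */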

  /* -------------------
   * Stage3 process
   * ------------------*/

  /* For loop unrolling by 4, this stage is divided into two parts. */
  /* The first part computes the outputs needing more than 4 MAC operations */
  /* The second part computes the outputs needing 4 or fewer MAC operations */

  /* The first part of the stage starts here */
  j = blockSize3 >> 2U;

  while ((j > 0U) && (blockSize3 > 0U))
  {
    /* Accumulator is made zero for every iteration */
    sum = 0;

    /* Apply loop unrolling and compute 4 MACs simultaneously. */
    k = blockSize3 >> 2U;

    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
    while (k > 0U)
    {
      /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
       * with y[srcBLen - 1], y[srcBLen - 2] respectively */
      sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
      /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
       * with y[srcBLen - 3], y[srcBLen - 4] respectively */
      sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);

      /* Decrement the loop counter */
      k--;
    }

    /* For the next MAC operations, the pointer py is used without SIMD.
     * So, py is incremented by 1 */
    py = py + 1U;

    /* If blockSize3 is not a multiple of 4, compute any remaining MACs here.
     ** No loop unrolling is used. */
    k = blockSize3 % 0x4U;

    while (k > 0U)
    {
      /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
      sum = __SMLALD(*px++, *py--, sum);

      /* Decrement the loop counter */
      k--;
    }

    /* Store the result from the accumulator into the destination buffer. */
    *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));

    /* Update the inputA and inputB pointers for the next MAC calculation */
    px = ++pSrc1;
    py = pIn2;

    /* Decrement the loop counter */
    blockSize3--;

    j--;
  }

  /* The second part of the stage starts here */
  /* SIMD is not used for the next MAC operations,
   * so pointer py is updated to read only one sample at a time */
  py = py + 1U;

  while (blockSize3 > 0U)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0;

    /* Perform the remaining blockSize3 MAC operations for this output sample */
    k = blockSize3;

    while (k > 0U)
    {
      /* Perform the multiply-accumulates */
      /* sum +=  x[srcALen-1] * y[srcBLen-1] */
      sum = __SMLALD(*px++, *py--, sum);

      /* Decrement the loop counter */
      k--;
    }

    /* Store the result from the accumulator into the destination buffer. */
    *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));

    /* Update the inputA and inputB pointers for the next MAC calculation */
    px = ++pSrc1;
    py = pSrc2;

    /* Decrement the loop counter */
    blockSize3--;
  }

#else

  /* Run the below code for Cortex-M0 (and for any build with UNALIGNED_SUPPORT_DISABLE defined) */

  q15_t *pIn1 = pSrcA;                           /* input pointer */
  q15_t *pIn2 = pSrcB;                           /* coefficient pointer */
  q63_t sum;                                     /* Accumulator */
  uint32_t i, j;                                 /* loop counters */

  /* Loop over the output length, srcALen + srcBLen - 1 samples */
  for (i = 0; i < (srcALen + srcBLen - 1); i++)
  {
    /* Initialize sum with zero to carry out the MAC operations */
    sum = 0;

    /* Loop to perform the MAC operations according to the convolution equation */
    for (j = 0; j <= i; j++)
    {
      /* Check the array limits */
      if (((i - j) < srcBLen) && (j < srcALen))
      {
        /* z[i] += x[j] * y[i-j] */
        sum += (q31_t) pIn1[j] * (pIn2[i - j]);
      }
    }

    /* Store the output in the destination buffer */
    pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U);
  }

#endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */

}

/**
 * @} end of Conv group
 */