Subversion Repositories testOled

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------
  2.  * Project:      CMSIS DSP Library
  3.  * Title:        arm_conv_q7.c
  4.  * Description:  Convolution of Q7 sequences
  5.  *
  6.  * $Date:        27. January 2017
  7.  * $Revision:    V.1.5.1
  8.  *
  9.  * Target Processor: Cortex-M cores
  10.  * -------------------------------------------------------------------- */
  11. /*
  12.  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13.  *
  14.  * SPDX-License-Identifier: Apache-2.0
  15.  *
  16.  * Licensed under the Apache License, Version 2.0 (the License); you may
  17.  * not use this file except in compliance with the License.
  18.  * You may obtain a copy of the License at
  19.  *
  20.  * www.apache.org/licenses/LICENSE-2.0
  21.  *
  22.  * Unless required by applicable law or agreed to in writing, software
  23.  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25.  * See the License for the specific language governing permissions and
  26.  * limitations under the License.
  27.  */
  28.  
  29. #include "arm_math.h"
  30.  
  31. /**
  32.  * @ingroup groupFilters
  33.  */
  34.  
  35. /**
  36.  * @addtogroup Conv
  37.  * @{
  38.  */
  39.  
  40. /**
  41.  * @brief Convolution of Q7 sequences.
  42.  * @param[in] *pSrcA points to the first input sequence.
  43.  * @param[in] srcALen length of the first input sequence.
  44.  * @param[in] *pSrcB points to the second input sequence.
  45.  * @param[in] srcBLen length of the second input sequence.
  46.  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
  47.  * @return none.
  48.  *
  49.  * @details
  50.  * <b>Scaling and Overflow Behavior:</b>
  51.  *
  52.  * \par
  53.  * The function is implemented using a 32-bit internal accumulator.
  54.  * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
  55.  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
  56.  * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
  57.  * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
  58.  *
  59.  * \par
  60.  * Refer the function <code>arm_conv_opt_q7()</code> for a faster implementation of this function.
  61.  *
  62.  */
  63.  
  64. void arm_conv_q7(
  65.   q7_t * pSrcA,
  66.   uint32_t srcALen,
  67.   q7_t * pSrcB,
  68.   uint32_t srcBLen,
  69.   q7_t * pDst)
  70. {
  71.  
  72.  
  73. #if defined (ARM_MATH_DSP)
  74.  
  75.   /* Run the below code for Cortex-M4 and Cortex-M3 */
  76.  
  77.   q7_t *pIn1;                                    /* inputA pointer */
  78.   q7_t *pIn2;                                    /* inputB pointer */
  79.   q7_t *pOut = pDst;                             /* output pointer */
  80.   q7_t *px;                                      /* Intermediate inputA pointer */
  81.   q7_t *py;                                      /* Intermediate inputB pointer */
  82.   q7_t *pSrc1, *pSrc2;                           /* Intermediate pointers */
  83.   q7_t x0, x1, x2, x3, c0, c1;                   /* Temporary variables to hold state and coefficient values */
  84.   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
  85.   q31_t input1, input2;                          /* Temporary input variables */
  86.   q15_t in1, in2;                                /* Temporary input variables */
  87.   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counter */
  88.  
  89.   /* The algorithm implementation is based on the lengths of the inputs. */
  90.   /* srcB is always made to slide across srcA. */
  91.   /* So srcBLen is always considered as shorter or equal to srcALen */
  92.   if (srcALen >= srcBLen)
  93.   {
  94.     /* Initialization of inputA pointer */
  95.     pIn1 = pSrcA;
  96.  
  97.     /* Initialization of inputB pointer */
  98.     pIn2 = pSrcB;
  99.   }
  100.   else
  101.   {
  102.     /* Initialization of inputA pointer */
  103.     pIn1 = pSrcB;
  104.  
  105.     /* Initialization of inputB pointer */
  106.     pIn2 = pSrcA;
  107.  
  108.     /* srcBLen is always considered as shorter or equal to srcALen */
  109.     j = srcBLen;
  110.     srcBLen = srcALen;
  111.     srcALen = j;
  112.   }
  113.  
  114.   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
  115.   /* The function is internally
  116.    * divided into three stages according to the number of multiplications that has to be
  117.    * taken place between inputA samples and inputB samples. In the first stage of the
  118.    * algorithm, the multiplications increase by one for every iteration.
  119.    * In the second stage of the algorithm, srcBLen number of multiplications are done.
  120.    * In the third stage of the algorithm, the multiplications decrease by one
  121.    * for every iteration. */
  122.  
  123.   /* The algorithm is implemented in three stages.
  124.      The loop counters of each stage is initiated here. */
  125.   blockSize1 = srcBLen - 1U;
  126.   blockSize2 = (srcALen - srcBLen) + 1U;
  127.   blockSize3 = blockSize1;
  128.  
  129.   /* --------------------------
  130.    * Initializations of stage1
  131.    * -------------------------*/
  132.  
  133.   /* sum = x[0] * y[0]
  134.    * sum = x[0] * y[1] + x[1] * y[0]
  135.    * ....
  136.    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
  137.    */
  138.  
  139.   /* In this stage the MAC operations are increased by 1 for every iteration.
  140.      The count variable holds the number of MAC operations performed */
  141.   count = 1U;
  142.  
  143.   /* Working pointer of inputA */
  144.   px = pIn1;
  145.  
  146.   /* Working pointer of inputB */
  147.   py = pIn2;
  148.  
  149.  
  150.   /* ------------------------
  151.    * Stage1 process
  152.    * ----------------------*/
  153.  
  154.   /* The first stage starts here */
  155.   while (blockSize1 > 0U)
  156.   {
  157.     /* Accumulator is made zero for every iteration */
  158.     sum = 0;
  159.  
  160.     /* Apply loop unrolling and compute 4 MACs simultaneously. */
  161.     k = count >> 2U;
  162.  
  163.     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
  164.      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  165.     while (k > 0U)
  166.     {
  167.       /* x[0] , x[1] */
  168.       in1 = (q15_t) * px++;
  169.       in2 = (q15_t) * px++;
  170.       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  171.  
  172.       /* y[srcBLen - 1] , y[srcBLen - 2] */
  173.       in1 = (q15_t) * py--;
  174.       in2 = (q15_t) * py--;
  175.       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  176.  
  177.       /* x[0] * y[srcBLen - 1] */
  178.       /* x[1] * y[srcBLen - 2] */
  179.       sum = __SMLAD(input1, input2, sum);
  180.  
  181.       /* x[2] , x[3] */
  182.       in1 = (q15_t) * px++;
  183.       in2 = (q15_t) * px++;
  184.       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  185.  
  186.       /* y[srcBLen - 3] , y[srcBLen - 4] */
  187.       in1 = (q15_t) * py--;
  188.       in2 = (q15_t) * py--;
  189.       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  190.  
  191.       /* x[2] * y[srcBLen - 3] */
  192.       /* x[3] * y[srcBLen - 4] */
  193.       sum = __SMLAD(input1, input2, sum);
  194.  
  195.       /* Decrement the loop counter */
  196.       k--;
  197.     }
  198.  
  199.     /* If the count is not a multiple of 4, compute any remaining MACs here.
  200.      ** No loop unrolling is used. */
  201.     k = count % 0x4U;
  202.  
  203.     while (k > 0U)
  204.     {
  205.       /* Perform the multiply-accumulates */
  206.       sum += ((q15_t) * px++ * *py--);
  207.  
  208.       /* Decrement the loop counter */
  209.       k--;
  210.     }
  211.  
  212.     /* Store the result in the accumulator in the destination buffer. */
  213.     *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
  214.  
  215.     /* Update the inputA and inputB pointers for next MAC calculation */
  216.     py = pIn2 + count;
  217.     px = pIn1;
  218.  
  219.     /* Increment the MAC count */
  220.     count++;
  221.  
  222.     /* Decrement the loop counter */
  223.     blockSize1--;
  224.   }
  225.  
  226.   /* --------------------------
  227.    * Initializations of stage2
  228.    * ------------------------*/
  229.  
  230.   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
  231.    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
  232.    * ....
  233.    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
  234.    */
  235.  
  236.   /* Working pointer of inputA */
  237.   px = pIn1;
  238.  
  239.   /* Working pointer of inputB */
  240.   pSrc2 = pIn2 + (srcBLen - 1U);
  241.   py = pSrc2;
  242.  
  243.   /* count is index by which the pointer pIn1 to be incremented */
  244.   count = 0U;
  245.  
  246.   /* -------------------
  247.    * Stage2 process
  248.    * ------------------*/
  249.  
  250.   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
  251.    * So, to loop unroll over blockSize2,
  252.    * srcBLen should be greater than or equal to 4 */
  253.   if (srcBLen >= 4U)
  254.   {
  255.     /* Loop unroll over blockSize2, by 4 */
  256.     blkCnt = blockSize2 >> 2U;
  257.  
  258.     while (blkCnt > 0U)
  259.     {
  260.       /* Set all accumulators to zero */
  261.       acc0 = 0;
  262.       acc1 = 0;
  263.       acc2 = 0;
  264.       acc3 = 0;
  265.  
  266.       /* read x[0], x[1], x[2] samples */
  267.       x0 = *(px++);
  268.       x1 = *(px++);
  269.       x2 = *(px++);
  270.  
  271.       /* Apply loop unrolling and compute 4 MACs simultaneously. */
  272.       k = srcBLen >> 2U;
  273.  
  274.       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
  275.        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  276.       do
  277.       {
  278.         /* Read y[srcBLen - 1] sample */
  279.         c0 = *(py--);
  280.         /* Read y[srcBLen - 2] sample */
  281.         c1 = *(py--);
  282.  
  283.         /* Read x[3] sample */
  284.         x3 = *(px++);
  285.  
  286.         /* x[0] and x[1] are packed */
  287.         in1 = (q15_t) x0;
  288.         in2 = (q15_t) x1;
  289.  
  290.         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  291.  
  292.         /* y[srcBLen - 1]   and y[srcBLen - 2] are packed */
  293.         in1 = (q15_t) c0;
  294.         in2 = (q15_t) c1;
  295.  
  296.         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  297.  
  298.         /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2]  */
  299.         acc0 = __SMLAD(input1, input2, acc0);
  300.  
  301.         /* x[1] and x[2] are packed */
  302.         in1 = (q15_t) x1;
  303.         in2 = (q15_t) x2;
  304.  
  305.         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  306.  
  307.         /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2]  */
  308.         acc1 = __SMLAD(input1, input2, acc1);
  309.  
  310.         /* x[2] and x[3] are packed */
  311.         in1 = (q15_t) x2;
  312.         in2 = (q15_t) x3;
  313.  
  314.         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  315.  
  316.         /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2]  */
  317.         acc2 = __SMLAD(input1, input2, acc2);
  318.  
  319.         /* Read x[4] sample */
  320.         x0 = *(px++);
  321.  
  322.         /* x[3] and x[4] are packed */
  323.         in1 = (q15_t) x3;
  324.         in2 = (q15_t) x0;
  325.  
  326.         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  327.  
  328.         /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2]  */
  329.         acc3 = __SMLAD(input1, input2, acc3);
  330.  
  331.         /* Read y[srcBLen - 3] sample */
  332.         c0 = *(py--);
  333.         /* Read y[srcBLen - 4] sample */
  334.         c1 = *(py--);
  335.  
  336.         /* Read x[5] sample */
  337.         x1 = *(px++);
  338.  
  339.         /* x[2] and x[3] are packed */
  340.         in1 = (q15_t) x2;
  341.         in2 = (q15_t) x3;
  342.  
  343.         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  344.  
  345.         /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
  346.         in1 = (q15_t) c0;
  347.         in2 = (q15_t) c1;
  348.  
  349.         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  350.  
  351.         /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4]  */
  352.         acc0 = __SMLAD(input1, input2, acc0);
  353.  
  354.         /* x[3] and x[4] are packed */
  355.         in1 = (q15_t) x3;
  356.         in2 = (q15_t) x0;
  357.  
  358.         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  359.  
  360.         /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4]  */
  361.         acc1 = __SMLAD(input1, input2, acc1);
  362.  
  363.         /* x[4] and x[5] are packed */
  364.         in1 = (q15_t) x0;
  365.         in2 = (q15_t) x1;
  366.  
  367.         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  368.  
  369.         /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4]  */
  370.         acc2 = __SMLAD(input1, input2, acc2);
  371.  
  372.         /* Read x[6] sample */
  373.         x2 = *(px++);
  374.  
  375.         /* x[5] and x[6] are packed */
  376.         in1 = (q15_t) x1;
  377.         in2 = (q15_t) x2;
  378.  
  379.         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  380.  
  381.         /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4]  */
  382.         acc3 = __SMLAD(input1, input2, acc3);
  383.  
  384.       } while (--k);
  385.  
  386.       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
  387.        ** No loop unrolling is used. */
  388.       k = srcBLen % 0x4U;
  389.  
  390.       while (k > 0U)
  391.       {
  392.         /* Read y[srcBLen - 5] sample */
  393.         c0 = *(py--);
  394.  
  395.         /* Read x[7] sample */
  396.         x3 = *(px++);
  397.  
  398.         /* Perform the multiply-accumulates */
  399.         /* acc0 +=  x[4] * y[srcBLen - 5] */
  400.         acc0 += ((q15_t) x0 * c0);
  401.         /* acc1 +=  x[5] * y[srcBLen - 5] */
  402.         acc1 += ((q15_t) x1 * c0);
  403.         /* acc2 +=  x[6] * y[srcBLen - 5] */
  404.         acc2 += ((q15_t) x2 * c0);
  405.         /* acc3 +=  x[7] * y[srcBLen - 5] */
  406.         acc3 += ((q15_t) x3 * c0);
  407.  
  408.         /* Reuse the present samples for the next MAC */
  409.         x0 = x1;
  410.         x1 = x2;
  411.         x2 = x3;
  412.  
  413.         /* Decrement the loop counter */
  414.         k--;
  415.       }
  416.  
  417.  
  418.       /* Store the result in the accumulator in the destination buffer. */
  419.       *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
  420.       *pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8));
  421.       *pOut++ = (q7_t) (__SSAT(acc2 >> 7U, 8));
  422.       *pOut++ = (q7_t) (__SSAT(acc3 >> 7U, 8));
  423.  
  424.       /* Increment the pointer pIn1 index, count by 4 */
  425.       count += 4U;
  426.  
  427.       /* Update the inputA and inputB pointers for next MAC calculation */
  428.       px = pIn1 + count;
  429.       py = pSrc2;
  430.  
  431.       /* Decrement the loop counter */
  432.       blkCnt--;
  433.     }
  434.  
  435.     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
  436.      ** No loop unrolling is used. */
  437.     blkCnt = blockSize2 % 0x4U;
  438.  
  439.     while (blkCnt > 0U)
  440.     {
  441.       /* Accumulator is made zero for every iteration */
  442.       sum = 0;
  443.  
  444.       /* Apply loop unrolling and compute 4 MACs simultaneously. */
  445.       k = srcBLen >> 2U;
  446.  
  447.       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
  448.        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  449.       while (k > 0U)
  450.       {
  451.  
  452.         /* Reading two inputs of SrcA buffer and packing */
  453.         in1 = (q15_t) * px++;
  454.         in2 = (q15_t) * px++;
  455.         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  456.  
  457.         /* Reading two inputs of SrcB buffer and packing */
  458.         in1 = (q15_t) * py--;
  459.         in2 = (q15_t) * py--;
  460.         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  461.  
  462.         /* Perform the multiply-accumulates */
  463.         sum = __SMLAD(input1, input2, sum);
  464.  
  465.         /* Reading two inputs of SrcA buffer and packing */
  466.         in1 = (q15_t) * px++;
  467.         in2 = (q15_t) * px++;
  468.         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  469.  
  470.         /* Reading two inputs of SrcB buffer and packing */
  471.         in1 = (q15_t) * py--;
  472.         in2 = (q15_t) * py--;
  473.         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  474.  
  475.         /* Perform the multiply-accumulates */
  476.         sum = __SMLAD(input1, input2, sum);
  477.  
  478.         /* Decrement the loop counter */
  479.         k--;
  480.       }
  481.  
  482.       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
  483.        ** No loop unrolling is used. */
  484.       k = srcBLen % 0x4U;
  485.  
  486.       while (k > 0U)
  487.       {
  488.         /* Perform the multiply-accumulates */
  489.         sum += ((q15_t) * px++ * *py--);
  490.  
  491.         /* Decrement the loop counter */
  492.         k--;
  493.       }
  494.  
  495.       /* Store the result in the accumulator in the destination buffer. */
  496.       *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
  497.  
  498.       /* Increment the pointer pIn1 index, count by 1 */
  499.       count++;
  500.  
  501.       /* Update the inputA and inputB pointers for next MAC calculation */
  502.       px = pIn1 + count;
  503.       py = pSrc2;
  504.  
  505.       /* Decrement the loop counter */
  506.       blkCnt--;
  507.     }
  508.   }
  509.   else
  510.   {
  511.     /* If the srcBLen is not a multiple of 4,
  512.      * the blockSize2 loop cannot be unrolled by 4 */
  513.     blkCnt = blockSize2;
  514.  
  515.     while (blkCnt > 0U)
  516.     {
  517.       /* Accumulator is made zero for every iteration */
  518.       sum = 0;
  519.  
  520.       /* srcBLen number of MACS should be performed */
  521.       k = srcBLen;
  522.  
  523.       while (k > 0U)
  524.       {
  525.         /* Perform the multiply-accumulate */
  526.         sum += ((q15_t) * px++ * *py--);
  527.  
  528.         /* Decrement the loop counter */
  529.         k--;
  530.       }
  531.  
  532.       /* Store the result in the accumulator in the destination buffer. */
  533.       *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
  534.  
  535.       /* Increment the MAC count */
  536.       count++;
  537.  
  538.       /* Update the inputA and inputB pointers for next MAC calculation */
  539.       px = pIn1 + count;
  540.       py = pSrc2;
  541.  
  542.       /* Decrement the loop counter */
  543.       blkCnt--;
  544.     }
  545.   }
  546.  
  547.  
  548.   /* --------------------------
  549.    * Initializations of stage3
  550.    * -------------------------*/
  551.  
  552.   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
  553.    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
  554.    * ....
  555.    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
  556.    * sum +=  x[srcALen-1] * y[srcBLen-1]
  557.    */
  558.  
  559.   /* In this stage the MAC operations are decreased by 1 for every iteration.
  560.      The blockSize3 variable holds the number of MAC operations performed */
  561.  
  562.   /* Working pointer of inputA */
  563.   pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
  564.   px = pSrc1;
  565.  
  566.   /* Working pointer of inputB */
  567.   pSrc2 = pIn2 + (srcBLen - 1U);
  568.   py = pSrc2;
  569.  
  570.   /* -------------------
  571.    * Stage3 process
  572.    * ------------------*/
  573.  
  574.   while (blockSize3 > 0U)
  575.   {
  576.     /* Accumulator is made zero for every iteration */
  577.     sum = 0;
  578.  
  579.     /* Apply loop unrolling and compute 4 MACs simultaneously. */
  580.     k = blockSize3 >> 2U;
  581.  
  582.     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
  583.      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  584.     while (k > 0U)
  585.     {
  586.       /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
  587.       in1 = (q15_t) * px++;
  588.       in2 = (q15_t) * px++;
  589.       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  590.  
  591.       /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
  592.       in1 = (q15_t) * py--;
  593.       in2 = (q15_t) * py--;
  594.       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  595.  
  596.       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
  597.       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
  598.       sum = __SMLAD(input1, input2, sum);
  599.  
  600.       /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
  601.       in1 = (q15_t) * px++;
  602.       in2 = (q15_t) * px++;
  603.       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  604.  
  605.       /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
  606.       in1 = (q15_t) * py--;
  607.       in2 = (q15_t) * py--;
  608.       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
  609.  
  610.       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
  611.       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
  612.       sum = __SMLAD(input1, input2, sum);
  613.  
  614.       /* Decrement the loop counter */
  615.       k--;
  616.     }
  617.  
  618.     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
  619.      ** No loop unrolling is used. */
  620.     k = blockSize3 % 0x4U;
  621.  
  622.     while (k > 0U)
  623.     {
  624.       /* Perform the multiply-accumulates */
  625.       sum += ((q15_t) * px++ * *py--);
  626.  
  627.       /* Decrement the loop counter */
  628.       k--;
  629.     }
  630.  
  631.     /* Store the result in the accumulator in the destination buffer. */
  632.     *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
  633.  
  634.     /* Update the inputA and inputB pointers for next MAC calculation */
  635.     px = ++pSrc1;
  636.     py = pSrc2;
  637.  
  638.     /* Decrement the loop counter */
  639.     blockSize3--;
  640.   }
  641.  
  642. #else
  643.  
  644.   /* Run the below code for Cortex-M0 */
  645.  
  646.   q7_t *pIn1 = pSrcA;                            /* input pointer */
  647.   q7_t *pIn2 = pSrcB;                            /* coefficient pointer */
  648.   q31_t sum;                                     /* Accumulator */
  649.   uint32_t i, j;                                 /* loop counter */
  650.  
  651.   /* Loop to calculate output of convolution for output length number of times */
  652.   for (i = 0; i < (srcALen + srcBLen - 1); i++)
  653.   {
  654.     /* Initialize sum with zero to carry on MAC operations */
  655.     sum = 0;
  656.  
  657.     /* Loop to perform MAC operations according to convolution equation */
  658.     for (j = 0; j <= i; j++)
  659.     {
  660.       /* Check the array limitations */
  661.       if (((i - j) < srcBLen) && (j < srcALen))
  662.       {
  663.         /* z[i] += x[i-j] * y[j] */
  664.         sum += (q15_t) pIn1[j] * (pIn2[i - j]);
  665.       }
  666.     }
  667.  
  668.     /* Store the output in the destination buffer */
  669.     pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U);
  670.   }
  671.  
  672. #endif /*   #if defined (ARM_MATH_DSP)        */
  673.  
  674. }
  675.  
  676. /**
  677.  * @} end of Conv group
  678.  */
  679.