
/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_conv_f32.c
 * Description:  Convolution of floating-point sequences
 *
 * $Date:        27. January 2017
 * $Revision:    V.1.5.1
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "arm_math.h"

/**
 * @ingroup groupFilters
 */

/**
 * @defgroup Conv Convolution
 *
 * Convolution is a mathematical operation on two finite-length vectors that produces a finite-length output vector.
 * Convolution is similar to correlation and is frequently used in filtering and data analysis.
 * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
 * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3.
 *
 * \par Algorithm
 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
 * Then the convolution
 *
 * <pre>
 *                   c[n] = a[n] * b[n]
 * </pre>
 *
 * \par
 * is defined as
 * \image html ConvolutionEquation.gif
 * \par
 * Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.
 * <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and
 * <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
 * The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.
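 *
 * \par
 * For example (illustrative values), convolving <code>a[n] = {1, 2, 3}</code> (<code>srcALen = 3</code>)
 * with <code>b[n] = {4, 5}</code> (<code>srcBLen = 2</code>) yields the length-4 result
 *
 * <pre>
 *                   c[n] = {4, 13, 22, 15}
 * </pre>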
 *
 * \par
 * Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,
 * the signal <code>b[n]</code> slides over <code>a[n]</code>.
 * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
 *
 * \par
 * Note that convolution is a commutative operation:
 *
 * <pre>
 *                   a[n] * b[n] = b[n] * a[n].
 * </pre>
 *
 * \par
 * This means that switching the A and B arguments to the convolution functions has no effect.
 *
 * <b>Fixed-Point Behavior</b>
 *
 * \par
 * Convolution requires summing up a large number of intermediate products.
 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
 * Refer to the function specific documentation below for further details of the particular algorithm used.
 *
 *
 * <b>Fast Versions</b>
 *
 * \par
 * Fast versions are supported for Q31 and Q15.  The fast versions use fewer cycles than the standard Q31 and Q15 convolution functions,
 * but they require the input signals to be scaled down to avoid intermediate overflow.
 *
 *
 * <b>Opt Versions</b>
 *
 * \par
 * Opt versions are supported for Q15 and Q7.  These versions use an internal scratch buffer to achieve better optimisation.
 * They are optimised for cycle count but consume more memory (scratch memory) than the standard Q15 and Q7 versions.
 */

/**
 * @addtogroup Conv
 * @{
 */

/**
 * @brief Convolution of floating-point sequences.
 * @param[in] *pSrcA points to the first input sequence.
 * @param[in] srcALen length of the first input sequence.
 * @param[in] *pSrcB points to the second input sequence.
 * @param[in] srcBLen length of the second input sequence.
 * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
 * @return none.
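 *
 * \par Example
 * A minimal usage sketch (buffer names and sizes are illustrative, not part of the library):
 * <pre>
 *     float32_t srcA[8];                 // first input sequence, srcALen = 8
 *     float32_t srcB[4];                 // second input sequence, srcBLen = 4
 *     float32_t dst[8 + 4 - 1];          // output must hold srcALen + srcBLen - 1 samples
 *     // ... fill srcA and srcB ...
 *     arm_conv_f32(srcA, 8, srcB, 4, dst);
 * </pre>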
 */

void arm_conv_f32(
  float32_t * pSrcA,
  uint32_t srcALen,
  float32_t * pSrcB,
  uint32_t srcBLen,
  float32_t * pDst)
{


#if defined (ARM_MATH_DSP)

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  float32_t *pIn1;                               /* inputA pointer */
  float32_t *pIn2;                               /* inputB pointer */
  float32_t *pOut = pDst;                        /* output pointer */
  float32_t *px;                                 /* Intermediate inputA pointer */
  float32_t *py;                                 /* Intermediate inputB pointer */
  float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */
  float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulator */
  float32_t x0, x1, x2, x3, c0;                  /* Temporary variables to hold state and coefficient values */
  uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counters */

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA. */
  /* So srcBLen is always considered to be shorter than or equal to srcALen. */
  if (srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcA;

    /* Initialization of inputB pointer */
    pIn2 = pSrcB;
  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcB;

    /* Initialization of inputB pointer */
    pIn2 = pSrcA;

    /* srcBLen is always considered to be shorter than or equal to srcALen. */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;
  }

  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N-1] */
  /* The function is internally
   * divided into three stages according to the number of multiplications that take
   * place between inputA samples and inputB samples. In the first stage of the
   * algorithm, the number of multiplications increases by one for every iteration.
   * In the second stage of the algorithm, srcBLen multiplications are done per output.
   * In the third stage of the algorithm, the number of multiplications decreases by one
   * for every iteration. */

  /* The algorithm is implemented in three stages.
     The loop counters for each stage are initialised here. */
  blockSize1 = srcBLen - 1U;
  blockSize2 = srcALen - (srcBLen - 1U);
  blockSize3 = blockSize1;
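
  /* For example (illustrative values): with srcALen = 8 and srcBLen = 4,
   * blockSize1 = 3, blockSize2 = 5 and blockSize3 = 3, i.e. 3 + 5 + 3 = 11
   * output samples, which equals srcALen + srcBLen - 1. */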

  /* --------------------------
   * Initializations of stage1
   * -------------------------*/

  /* sum = x[0] * y[0]
   * sum = x[0] * y[1] + x[1] * y[0]
   * ....
   * sum = x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] +...+ x[srcBLen - 1] * y[0]
   */
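
  /* Stage 1 produces the first blockSize1 = srcBLen - 1 output samples,
   * for which the two sequences overlap only partially. */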

  /* In this stage the MAC operations are increased by 1 for every iteration.
     The count variable holds the number of MAC operations performed */
  count = 1U;

  /* Working pointer of inputA */
  px = pIn1;

  /* Working pointer of inputB */
  py = pIn2;


  /* ------------------------
   * Stage1 process
   * ----------------------*/

  /* The first stage starts here */
  while (blockSize1 > 0U)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.0f;

    /* Apply loop unrolling and compute 4 MACs simultaneously. */
    k = count >> 2U;

    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
    while (k > 0U)
    {
      /* x[0] * y[srcBLen - 1] */
      sum += *px++ * *py--;

      /* x[1] * y[srcBLen - 2] */
      sum += *px++ * *py--;

      /* x[2] * y[srcBLen - 3] */
      sum += *px++ * *py--;

      /* x[3] * y[srcBLen - 4] */
      sum += *px++ * *py--;

      /* Decrement the loop counter */
      k--;
    }

    /* If the count is not a multiple of 4, compute any remaining MACs here.
     ** No loop unrolling is used. */
    k = count % 0x4U;

    while (k > 0U)
    {
      /* Perform the multiply-accumulate */
      sum += *px++ * *py--;

      /* Decrement the loop counter */
      k--;
    }

    /* Store the result in the accumulator in the destination buffer. */
    *pOut++ = sum;

    /* Update the inputA and inputB pointers for next MAC calculation */
    py = pIn2 + count;
    px = pIn1;

    /* Increment the MAC count */
    count++;

    /* Decrement the loop counter */
    blockSize1--;
  }

  /* --------------------------
   * Initializations of stage2
   * ------------------------*/

  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
   * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
   * ....
   * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
   */
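
  /* Stage 2 produces the blockSize2 = srcALen - srcBLen + 1 output samples for which
   * the two sequences overlap completely, i.e. srcBLen MACs per output sample. */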

  /* Working pointer of inputA */
  px = pIn1;

  /* Working pointer of inputB */
  pSrc2 = pIn2 + (srcBLen - 1U);
  py = pSrc2;

  /* count is the index by which the pointer pIn1 is to be incremented */
  count = 0U;

  /* -------------------
   * Stage2 process
   * ------------------*/

  /* Stage2 depends on srcBLen, as in this stage srcBLen MACs are performed per output sample.
   * So, to loop unroll over blockSize2,
   * srcBLen should be greater than or equal to 4 */
  if (srcBLen >= 4U)
  {
    /* Loop unroll over blockSize2, by 4 */
    blkCnt = blockSize2 >> 2U;

    while (blkCnt > 0U)
    {
      /* Set all accumulators to zero */
      acc0 = 0.0f;
      acc1 = 0.0f;
      acc2 = 0.0f;
      acc3 = 0.0f;

      /* read x[0], x[1], x[2] samples */
      x0 = *(px++);
      x1 = *(px++);
      x2 = *(px++);

      /* Apply loop unrolling and compute 4 MACs simultaneously. */
      k = srcBLen >> 2U;

      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      do
      {
        /* Read y[srcBLen - 1] sample */
        c0 = *(py--);

        /* Read x[3] sample */
        x3 = *(px);

        /* Perform the multiply-accumulate */
        /* acc0 +=  x[0] * y[srcBLen - 1] */
        acc0 += x0 * c0;

        /* acc1 +=  x[1] * y[srcBLen - 1] */
        acc1 += x1 * c0;

        /* acc2 +=  x[2] * y[srcBLen - 1] */
        acc2 += x2 * c0;

        /* acc3 +=  x[3] * y[srcBLen - 1] */
        acc3 += x3 * c0;

        /* Read y[srcBLen - 2] sample */
        c0 = *(py--);

        /* Read x[4] sample */
        x0 = *(px + 1U);

        /* Perform the multiply-accumulate */
        /* acc0 +=  x[1] * y[srcBLen - 2] */
        acc0 += x1 * c0;
        /* acc1 +=  x[2] * y[srcBLen - 2] */
        acc1 += x2 * c0;
        /* acc2 +=  x[3] * y[srcBLen - 2] */
        acc2 += x3 * c0;
        /* acc3 +=  x[4] * y[srcBLen - 2] */
        acc3 += x0 * c0;

        /* Read y[srcBLen - 3] sample */
        c0 = *(py--);

        /* Read x[5] sample */
        x1 = *(px + 2U);

        /* Perform the multiply-accumulates */
        /* acc0 +=  x[2] * y[srcBLen - 3] */
        acc0 += x2 * c0;
        /* acc1 +=  x[3] * y[srcBLen - 3] */
        acc1 += x3 * c0;
        /* acc2 +=  x[4] * y[srcBLen - 3] */
        acc2 += x0 * c0;
        /* acc3 +=  x[5] * y[srcBLen - 3] */
        acc3 += x1 * c0;

        /* Read y[srcBLen - 4] sample */
        c0 = *(py--);

        /* Read x[6] sample */
        x2 = *(px + 3U);
        px += 4U;

        /* Perform the multiply-accumulates */
        /* acc0 +=  x[3] * y[srcBLen - 4] */
        acc0 += x3 * c0;
        /* acc1 +=  x[4] * y[srcBLen - 4] */
        acc1 += x0 * c0;
        /* acc2 +=  x[5] * y[srcBLen - 4] */
        acc2 += x1 * c0;
        /* acc3 +=  x[6] * y[srcBLen - 4] */
        acc3 += x2 * c0;


      } while (--k);

      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
       ** No loop unrolling is used. */
      k = srcBLen % 0x4U;

      while (k > 0U)
      {
        /* Read y[srcBLen - 5] sample */
        c0 = *(py--);

        /* Read x[7] sample */
        x3 = *(px++);

        /* Perform the multiply-accumulates */
        /* acc0 +=  x[4] * y[srcBLen - 5] */
        acc0 += x0 * c0;
        /* acc1 +=  x[5] * y[srcBLen - 5] */
        acc1 += x1 * c0;
        /* acc2 +=  x[6] * y[srcBLen - 5] */
        acc2 += x2 * c0;
        /* acc3 +=  x[7] * y[srcBLen - 5] */
        acc3 += x3 * c0;

        /* Reuse the present samples for the next MAC */
        x0 = x1;
        x1 = x2;
        x2 = x3;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut++ = acc0;
      *pOut++ = acc1;
      *pOut++ = acc2;
      *pOut++ = acc3;

      /* Increment the pointer pIn1 index, count by 4 */
      count += 4U;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pSrc2;


      /* Decrement the loop counter */
      blkCnt--;
    }


    /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
     ** No loop unrolling is used. */
    blkCnt = blockSize2 % 0x4U;

    while (blkCnt > 0U)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f;

      /* Apply loop unrolling and compute 4 MACs simultaneously. */
      k = srcBLen >> 2U;

      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      while (k > 0U)
      {
        /* Perform the multiply-accumulates */
        sum += *px++ * *py--;
        sum += *px++ * *py--;
        sum += *px++ * *py--;
        sum += *px++ * *py--;

        /* Decrement the loop counter */
        k--;
      }

      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
       ** No loop unrolling is used. */
      k = srcBLen % 0x4U;

      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
        sum += *px++ * *py--;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut++ = sum;

      /* Increment the MAC count */
      count++;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pSrc2;

      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    /* If srcBLen is smaller than 4,
     * the blockSize2 loop cannot be unrolled by 4 */
    blkCnt = blockSize2;

    while (blkCnt > 0U)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f;

      /* srcBLen number of MACs should be performed */
      k = srcBLen;

      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
        sum += *px++ * *py--;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut++ = sum;

      /* Increment the MAC count */
      count++;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pSrc2;

      /* Decrement the loop counter */
      blkCnt--;
    }
  }


  /* --------------------------
   * Initializations of stage3
   * -------------------------*/

  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
   * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
   * ....
   * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
   * sum +=  x[srcALen-1] * y[srcBLen-1]
   */
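
  /* Stage 3 produces the final blockSize3 = srcBLen - 1 output samples,
   * for which the overlap between the two sequences shrinks again. */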

  /* In this stage the MAC operations are decreased by 1 for every iteration.
     The blockSize3 variable holds the number of MAC operations performed */

  /* Working pointer of inputA */
  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
  px = pSrc1;

  /* Working pointer of inputB */
  pSrc2 = pIn2 + (srcBLen - 1U);
  py = pSrc2;

  /* -------------------
   * Stage3 process
   * ------------------*/

  while (blockSize3 > 0U)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.0f;

    /* Apply loop unrolling and compute 4 MACs simultaneously. */
    k = blockSize3 >> 2U;

    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
    while (k > 0U)
    {
      /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
      sum += *px++ * *py--;

      /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
      sum += *px++ * *py--;

      /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
      sum += *px++ * *py--;

      /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
      sum += *px++ * *py--;

      /* Decrement the loop counter */
      k--;
    }

    /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
     ** No loop unrolling is used. */
    k = blockSize3 % 0x4U;

    while (k > 0U)
    {
      /* Perform the multiply-accumulates */
      /* sum +=  x[srcALen-1] * y[srcBLen-1] */
      sum += *px++ * *py--;

      /* Decrement the loop counter */
      k--;
    }

    /* Store the result in the accumulator in the destination buffer. */
    *pOut++ = sum;

    /* Update the inputA and inputB pointers for next MAC calculation */
    px = ++pSrc1;
    py = pSrc2;

    /* Decrement the loop counter */
    blockSize3--;
  }

#else

  /* Run the below code for Cortex-M0 */

  float32_t *pIn1 = pSrcA;                       /* inputA pointer */
  float32_t *pIn2 = pSrcB;                       /* inputB pointer */
  float32_t sum;                                 /* Accumulator */
  uint32_t i, j;                                 /* loop counters */

  /* Loop to calculate convolution for output length number of times */
  for (i = 0U; i < ((srcALen + srcBLen) - 1U); i++)
  {
    /* Initialize sum with zero to carry out MAC operations */
    sum = 0.0f;

    /* Loop to perform MAC operations according to convolution equation */
    for (j = 0U; j <= i; j++)
    {
      /* Check the array limitations */
      if ((((i - j) < srcBLen) && (j < srcALen)))
      {
        /* z[i] += x[j] * y[i-j] */
        sum += pIn1[j] * pIn2[i - j];
      }
    }
    /* Store the output in the destination buffer */
    pDst[i] = sum;
  }

#endif /*   #if defined (ARM_MATH_DSP)        */

}

/**
 * @} end of Conv group
 */
