Subversion Repositories dashGPS

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------
  2.  * Project:      CMSIS DSP Library
  3.  * Title:        arm_conv_q31.c
  4.  * Description:  Convolution of Q31 sequences
  5.  *
  6.  * $Date:        27. January 2017
  7.  * $Revision:    V.1.5.1
  8.  *
  9.  * Target Processor: Cortex-M cores
  10.  * -------------------------------------------------------------------- */
  11. /*
  12.  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13.  *
  14.  * SPDX-License-Identifier: Apache-2.0
  15.  *
  16.  * Licensed under the Apache License, Version 2.0 (the License); you may
  17.  * not use this file except in compliance with the License.
  18.  * You may obtain a copy of the License at
  19.  *
  20.  * www.apache.org/licenses/LICENSE-2.0
  21.  *
  22.  * Unless required by applicable law or agreed to in writing, software
  23.  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25.  * See the License for the specific language governing permissions and
  26.  * limitations under the License.
  27.  */
  28.  
  29. #include "arm_math.h"
  30.  
  31. /**
  32.  * @ingroup groupFilters
  33.  */
  34.  
  35. /**
  36.  * @addtogroup Conv
  37.  * @{
  38.  */
  39.  
  40. /**
  41.  * @brief Convolution of Q31 sequences.
  42.  * @param[in] *pSrcA points to the first input sequence.
  43.  * @param[in] srcALen length of the first input sequence.
  44.  * @param[in] *pSrcB points to the second input sequence.
  45.  * @param[in] srcBLen length of the second input sequence.
  46.  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
  47.  * @return none.
  48.  *
  49.  * @details
  50.  * <b>Scaling and Overflow Behavior:</b>
  51.  *
  52.  * \par
  53.  * The function is implemented using an internal 64-bit accumulator.
  54.  * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
  55.  * There is no saturation on intermediate additions.
  56.  * Thus, if the accumulator overflows it wraps around and distorts the result.
  57.  * The input signals should be scaled down to avoid intermediate overflows.
  58.  * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
  59.  * as maximum of min(srcALen, srcBLen) number of additions are carried internally.
  60.  * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
  61.  *
  62.  * \par
  63.  * See <code>arm_conv_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
  64.  */
  65.  
  66. void arm_conv_q31(
  67.   q31_t * pSrcA,
  68.   uint32_t srcALen,
  69.   q31_t * pSrcB,
  70.   uint32_t srcBLen,
  71.   q31_t * pDst)
  72. {
  73.  
  74.  
  75. #if defined (ARM_MATH_DSP)
  76.  
  77.   /* Run the below code for Cortex-M4 and Cortex-M3 */
  78.  
  79.   q31_t *pIn1;                                   /* inputA pointer */
  80.   q31_t *pIn2;                                   /* inputB pointer */
  81.   q31_t *pOut = pDst;                            /* output pointer */
  82.   q31_t *px;                                     /* Intermediate inputA pointer  */
  83.   q31_t *py;                                     /* Intermediate inputB pointer  */
  84.   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
  85.   q63_t sum;                                     /* Accumulator */
  86.   q63_t acc0, acc1, acc2;                        /* Accumulator */
  87.   q31_t x0, x1, x2, c0;                          /* Temporary variables to hold state and coefficient values */
  88.   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counter */
  89.  
  90.   /* The algorithm implementation is based on the lengths of the inputs. */
  91.   /* srcB is always made to slide across srcA. */
  92.   /* So srcBLen is always considered as shorter or equal to srcALen */
  93.   if (srcALen >= srcBLen)
  94.   {
  95.     /* Initialization of inputA pointer */
  96.     pIn1 = pSrcA;
  97.  
  98.     /* Initialization of inputB pointer */
  99.     pIn2 = pSrcB;
  100.   }
  101.   else
  102.   {
  103.     /* Initialization of inputA pointer */
  104.     pIn1 = (q31_t *) pSrcB;
  105.  
  106.     /* Initialization of inputB pointer */
  107.     pIn2 = (q31_t *) pSrcA;
  108.  
  109.     /* srcBLen is always considered as shorter or equal to srcALen */
  110.     j = srcBLen;
  111.     srcBLen = srcALen;
  112.     srcALen = j;
  113.   }
  114.  
  115.   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
  116.   /* The function is internally
  117.    * divided into three stages according to the number of multiplications that has to be
  118.    * taken place between inputA samples and inputB samples. In the first stage of the
  119.    * algorithm, the multiplications increase by one for every iteration.
  120.    * In the second stage of the algorithm, srcBLen number of multiplications are done.
  121.    * In the third stage of the algorithm, the multiplications decrease by one
  122.    * for every iteration. */
  123.  
  124.   /* The algorithm is implemented in three stages.
  125.      The loop counters of each stage is initiated here. */
  126.   blockSize1 = srcBLen - 1U;
  127.   blockSize2 = srcALen - (srcBLen - 1U);
  128.   blockSize3 = blockSize1;
  129.  
  130.   /* --------------------------
  131.    * Initializations of stage1
  132.    * -------------------------*/
  133.  
  134.   /* sum = x[0] * y[0]
  135.    * sum = x[0] * y[1] + x[1] * y[0]
  136.    * ....
  137.    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
  138.    */
  139.  
  140.   /* In this stage the MAC operations are increased by 1 for every iteration.
  141.      The count variable holds the number of MAC operations performed */
  142.   count = 1U;
  143.  
  144.   /* Working pointer of inputA */
  145.   px = pIn1;
  146.  
  147.   /* Working pointer of inputB */
  148.   py = pIn2;
  149.  
  150.  
  151.   /* ------------------------
  152.    * Stage1 process
  153.    * ----------------------*/
  154.  
  155.   /* The first stage starts here */
  156.   while (blockSize1 > 0U)
  157.   {
  158.     /* Accumulator is made zero for every iteration */
  159.     sum = 0;
  160.  
  161.     /* Apply loop unrolling and compute 4 MACs simultaneously. */
  162.     k = count >> 2U;
  163.  
  164.     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
  165.      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  166.     while (k > 0U)
  167.     {
  168.       /* x[0] * y[srcBLen - 1] */
  169.       sum += (q63_t) * px++ * (*py--);
  170.       /* x[1] * y[srcBLen - 2] */
  171.       sum += (q63_t) * px++ * (*py--);
  172.       /* x[2] * y[srcBLen - 3] */
  173.       sum += (q63_t) * px++ * (*py--);
  174.       /* x[3] * y[srcBLen - 4] */
  175.       sum += (q63_t) * px++ * (*py--);
  176.  
  177.       /* Decrement the loop counter */
  178.       k--;
  179.     }
  180.  
  181.     /* If the count is not a multiple of 4, compute any remaining MACs here.
  182.      ** No loop unrolling is used. */
  183.     k = count % 0x4U;
  184.  
  185.     while (k > 0U)
  186.     {
  187.       /* Perform the multiply-accumulate */
  188.       sum += (q63_t) * px++ * (*py--);
  189.  
  190.       /* Decrement the loop counter */
  191.       k--;
  192.     }
  193.  
  194.     /* Store the result in the accumulator in the destination buffer. */
  195.     *pOut++ = (q31_t) (sum >> 31);
  196.  
  197.     /* Update the inputA and inputB pointers for next MAC calculation */
  198.     py = pIn2 + count;
  199.     px = pIn1;
  200.  
  201.     /* Increment the MAC count */
  202.     count++;
  203.  
  204.     /* Decrement the loop counter */
  205.     blockSize1--;
  206.   }
  207.  
  208.   /* --------------------------
  209.    * Initializations of stage2
  210.    * ------------------------*/
  211.  
  212.   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
  213.    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
  214.    * ....
  215.    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
  216.    */
  217.  
  218.   /* Working pointer of inputA */
  219.   px = pIn1;
  220.  
  221.   /* Working pointer of inputB */
  222.   pSrc2 = pIn2 + (srcBLen - 1U);
  223.   py = pSrc2;
  224.  
  225.   /* count is index by which the pointer pIn1 to be incremented */
  226.   count = 0U;
  227.  
  228.   /* -------------------
  229.    * Stage2 process
  230.    * ------------------*/
  231.  
  232.   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
  233.    * So, to loop unroll over blockSize2,
  234.    * srcBLen should be greater than or equal to 4 */
  235.   if (srcBLen >= 4U)
  236.   {
  237.     /* Loop unroll by 3 */
  238.     blkCnt = blockSize2 / 3;
  239.  
  240.     while (blkCnt > 0U)
  241.     {
  242.       /* Set all accumulators to zero */
  243.       acc0 = 0;
  244.       acc1 = 0;
  245.       acc2 = 0;
  246.  
  247.       /* read x[0], x[1], x[2] samples */
  248.       x0 = *(px++);
  249.       x1 = *(px++);
  250.  
  251.       /* Apply loop unrolling and compute 3 MACs simultaneously. */
  252.       k = srcBLen / 3;
  253.  
  254.       /* First part of the processing with loop unrolling.  Compute 3 MACs at a time.
  255.        ** a second loop below computes MACs for the remaining 1 to 2 samples. */
  256.       do
  257.       {
  258.         /* Read y[srcBLen - 1] sample */
  259.         c0 = *(py);
  260.  
  261.         /* Read x[3] sample */
  262.         x2 = *(px);
  263.  
  264.         /* Perform the multiply-accumulates */
  265.         /* acc0 +=  x[0] * y[srcBLen - 1] */
  266.         acc0 += ((q63_t) x0 * c0);
  267.         /* acc1 +=  x[1] * y[srcBLen - 1] */
  268.         acc1 += ((q63_t) x1 * c0);
  269.         /* acc2 +=  x[2] * y[srcBLen - 1] */
  270.         acc2 += ((q63_t) x2 * c0);
  271.  
  272.         /* Read y[srcBLen - 2] sample */
  273.         c0 = *(py - 1U);
  274.  
  275.         /* Read x[4] sample */
  276.         x0 = *(px + 1U);
  277.  
  278.         /* Perform the multiply-accumulate */
  279.         /* acc0 +=  x[1] * y[srcBLen - 2] */
  280.         acc0 += ((q63_t) x1 * c0);
  281.         /* acc1 +=  x[2] * y[srcBLen - 2] */
  282.         acc1 += ((q63_t) x2 * c0);
  283.         /* acc2 +=  x[3] * y[srcBLen - 2] */
  284.         acc2 += ((q63_t) x0 * c0);
  285.  
  286.         /* Read y[srcBLen - 3] sample */
  287.         c0 = *(py - 2U);
  288.  
  289.         /* Read x[5] sample */
  290.         x1 = *(px + 2U);
  291.  
  292.         /* Perform the multiply-accumulates */
  293.         /* acc0 +=  x[2] * y[srcBLen - 3] */
  294.         acc0 += ((q63_t) x2 * c0);
  295.         /* acc1 +=  x[3] * y[srcBLen - 2] */
  296.         acc1 += ((q63_t) x0 * c0);
  297.         /* acc2 +=  x[4] * y[srcBLen - 2] */
  298.         acc2 += ((q63_t) x1 * c0);
  299.  
  300.         /* update scratch pointers */
  301.         px += 3U;
  302.         py -= 3U;
  303.  
  304.       } while (--k);
  305.  
  306.       /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
  307.        ** No loop unrolling is used. */
  308.       k = srcBLen - (3 * (srcBLen / 3));
  309.  
  310.       while (k > 0U)
  311.       {
  312.         /* Read y[srcBLen - 5] sample */
  313.         c0 = *(py--);
  314.  
  315.         /* Read x[7] sample */
  316.         x2 = *(px++);
  317.  
  318.         /* Perform the multiply-accumulates */
  319.         /* acc0 +=  x[4] * y[srcBLen - 5] */
  320.         acc0 += ((q63_t) x0 * c0);
  321.         /* acc1 +=  x[5] * y[srcBLen - 5] */
  322.         acc1 += ((q63_t) x1 * c0);
  323.         /* acc2 +=  x[6] * y[srcBLen - 5] */
  324.         acc2 += ((q63_t) x2 * c0);
  325.  
  326.         /* Reuse the present samples for the next MAC */
  327.         x0 = x1;
  328.         x1 = x2;
  329.  
  330.         /* Decrement the loop counter */
  331.         k--;
  332.       }
  333.  
  334.       /* Store the results in the accumulators in the destination buffer. */
  335.       *pOut++ = (q31_t) (acc0 >> 31);
  336.       *pOut++ = (q31_t) (acc1 >> 31);
  337.       *pOut++ = (q31_t) (acc2 >> 31);
  338.  
  339.       /* Increment the pointer pIn1 index, count by 3 */
  340.       count += 3U;
  341.  
  342.       /* Update the inputA and inputB pointers for next MAC calculation */
  343.       px = pIn1 + count;
  344.       py = pSrc2;
  345.  
  346.       /* Decrement the loop counter */
  347.       blkCnt--;
  348.     }
  349.  
  350.     /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.
  351.      ** No loop unrolling is used. */
  352.     blkCnt = blockSize2 - 3 * (blockSize2 / 3);
  353.  
  354.     while (blkCnt > 0U)
  355.     {
  356.       /* Accumulator is made zero for every iteration */
  357.       sum = 0;
  358.  
  359.       /* Apply loop unrolling and compute 4 MACs simultaneously. */
  360.       k = srcBLen >> 2U;
  361.  
  362.       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
  363.        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  364.       while (k > 0U)
  365.       {
  366.         /* Perform the multiply-accumulates */
  367.         sum += (q63_t) * px++ * (*py--);
  368.         sum += (q63_t) * px++ * (*py--);
  369.         sum += (q63_t) * px++ * (*py--);
  370.         sum += (q63_t) * px++ * (*py--);
  371.  
  372.         /* Decrement the loop counter */
  373.         k--;
  374.       }
  375.  
  376.       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
  377.        ** No loop unrolling is used. */
  378.       k = srcBLen % 0x4U;
  379.  
  380.       while (k > 0U)
  381.       {
  382.         /* Perform the multiply-accumulate */
  383.         sum += (q63_t) * px++ * (*py--);
  384.  
  385.         /* Decrement the loop counter */
  386.         k--;
  387.       }
  388.  
  389.       /* Store the result in the accumulator in the destination buffer. */
  390.       *pOut++ = (q31_t) (sum >> 31);
  391.  
  392.       /* Increment the MAC count */
  393.       count++;
  394.  
  395.       /* Update the inputA and inputB pointers for next MAC calculation */
  396.       px = pIn1 + count;
  397.       py = pSrc2;
  398.  
  399.       /* Decrement the loop counter */
  400.       blkCnt--;
  401.     }
  402.   }
  403.   else
  404.   {
  405.     /* If the srcBLen is not a multiple of 4,
  406.      * the blockSize2 loop cannot be unrolled by 4 */
  407.     blkCnt = blockSize2;
  408.  
  409.     while (blkCnt > 0U)
  410.     {
  411.       /* Accumulator is made zero for every iteration */
  412.       sum = 0;
  413.  
  414.       /* srcBLen number of MACS should be performed */
  415.       k = srcBLen;
  416.  
  417.       while (k > 0U)
  418.       {
  419.         /* Perform the multiply-accumulate */
  420.         sum += (q63_t) * px++ * (*py--);
  421.  
  422.         /* Decrement the loop counter */
  423.         k--;
  424.       }
  425.  
  426.       /* Store the result in the accumulator in the destination buffer. */
  427.       *pOut++ = (q31_t) (sum >> 31);
  428.  
  429.       /* Increment the MAC count */
  430.       count++;
  431.  
  432.       /* Update the inputA and inputB pointers for next MAC calculation */
  433.       px = pIn1 + count;
  434.       py = pSrc2;
  435.  
  436.       /* Decrement the loop counter */
  437.       blkCnt--;
  438.     }
  439.   }
  440.  
  441.  
  442.   /* --------------------------
  443.    * Initializations of stage3
  444.    * -------------------------*/
  445.  
  446.   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
  447.    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
  448.    * ....
  449.    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
  450.    * sum +=  x[srcALen-1] * y[srcBLen-1]
  451.    */
  452.  
  453.   /* In this stage the MAC operations are decreased by 1 for every iteration.
  454.      The blockSize3 variable holds the number of MAC operations performed */
  455.  
  456.   /* Working pointer of inputA */
  457.   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
  458.   px = pSrc1;
  459.  
  460.   /* Working pointer of inputB */
  461.   pSrc2 = pIn2 + (srcBLen - 1U);
  462.   py = pSrc2;
  463.  
  464.   /* -------------------
  465.    * Stage3 process
  466.    * ------------------*/
  467.  
  468.   while (blockSize3 > 0U)
  469.   {
  470.     /* Accumulator is made zero for every iteration */
  471.     sum = 0;
  472.  
  473.     /* Apply loop unrolling and compute 4 MACs simultaneously. */
  474.     k = blockSize3 >> 2U;
  475.  
  476.     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
  477.      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
  478.     while (k > 0U)
  479.     {
  480.       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
  481.       sum += (q63_t) * px++ * (*py--);
  482.       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
  483.       sum += (q63_t) * px++ * (*py--);
  484.       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
  485.       sum += (q63_t) * px++ * (*py--);
  486.       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
  487.       sum += (q63_t) * px++ * (*py--);
  488.  
  489.       /* Decrement the loop counter */
  490.       k--;
  491.     }
  492.  
  493.     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
  494.      ** No loop unrolling is used. */
  495.     k = blockSize3 % 0x4U;
  496.  
  497.     while (k > 0U)
  498.     {
  499.       /* Perform the multiply-accumulate */
  500.       sum += (q63_t) * px++ * (*py--);
  501.  
  502.       /* Decrement the loop counter */
  503.       k--;
  504.     }
  505.  
  506.     /* Store the result in the accumulator in the destination buffer. */
  507.     *pOut++ = (q31_t) (sum >> 31);
  508.  
  509.     /* Update the inputA and inputB pointers for next MAC calculation */
  510.     px = ++pSrc1;
  511.     py = pSrc2;
  512.  
  513.     /* Decrement the loop counter */
  514.     blockSize3--;
  515.   }
  516.  
  517. #else
  518.  
  519.   /* Run the below code for Cortex-M0 */
  520.  
  521.   q31_t *pIn1 = pSrcA;                           /* input pointer */
  522.   q31_t *pIn2 = pSrcB;                           /* coefficient pointer */
  523.   q63_t sum;                                     /* Accumulator */
  524.   uint32_t i, j;                                 /* loop counter */
  525.  
  526.   /* Loop to calculate output of convolution for output length number of times */
  527.   for (i = 0; i < (srcALen + srcBLen - 1); i++)
  528.   {
  529.     /* Initialize sum with zero to carry on MAC operations */
  530.     sum = 0;
  531.  
  532.     /* Loop to perform MAC operations according to convolution equation */
  533.     for (j = 0; j <= i; j++)
  534.     {
  535.       /* Check the array limitations */
  536.       if (((i - j) < srcBLen) && (j < srcALen))
  537.       {
  538.         /* z[i] += x[i-j] * y[j] */
  539.         sum += ((q63_t) pIn1[j] * (pIn2[i - j]));
  540.       }
  541.     }
  542.  
  543.     /* Store the output in the destination buffer */
  544.     pDst[i] = (q31_t) (sum >> 31U);
  545.   }
  546.  
  547. #endif /*     #if defined (ARM_MATH_DSP) */
  548.  
  549. }
  550.  
  551. /**
  552.  * @} end of Conv group
  553.  */
  554.