Subversion Repositories AFRtranscoder

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------
  2.  * Project:      CMSIS DSP Library
  3.  * Title:        arm_fir_q31.c
  4.  * Description:  Q31 FIR filter processing function
  5.  *
  6.  * $Date:        27. January 2017
  7.  * $Revision:    V.1.5.1
  8.  *
  9.  * Target Processor: Cortex-M cores
  10.  * -------------------------------------------------------------------- */
  11. /*
  12.  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13.  *
  14.  * SPDX-License-Identifier: Apache-2.0
  15.  *
  16.  * Licensed under the Apache License, Version 2.0 (the License); you may
  17.  * not use this file except in compliance with the License.
  18.  * You may obtain a copy of the License at
  19.  *
  20.  * www.apache.org/licenses/LICENSE-2.0
  21.  *
  22.  * Unless required by applicable law or agreed to in writing, software
  23.  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25.  * See the License for the specific language governing permissions and
  26.  * limitations under the License.
  27.  */
  28.  
  29. #include "arm_math.h"
  30.  
  31. /**
  32.  * @ingroup groupFilters
  33.  */
  34.  
  35. /**
  36.  * @addtogroup FIR
  37.  * @{
  38.  */
  39.  
  40. /** @brief Processing function for the Q31 FIR filter.
  41.  * @param[in] *S points to an instance of the Q31 FIR filter structure; the state buffer it references (S->pState) is updated by this call.
  42.  * @param[in] *pSrc points to the block of input data.
  43.  * @param[out] *pDst points to the block of output data.
  44.  * @param[in] blockSize number of samples to process per call.
  45.  * @return none.
  46.  *
  47.  * @details
  48.  * <b>Scaling and Overflow Behavior:</b>
  49.  * \par
  50.  * The function is implemented using an internal 64-bit accumulator.
  51.  * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
  52.  * Thus, if the accumulator result overflows, it wraps around rather than clipping.
  53.  * In order to avoid overflows completely, the input signal must be scaled down by log2(numTaps) bits.
  54.  * After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and cast to q31_t (1.31 format) to yield the final result; the code applies no explicit saturation at this step.
  55.  *
  56.  * \par
  57.  * Refer to the function <code>arm_fir_fast_q31()</code> for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
  58.  */
  59.  
  60. void arm_fir_q31(
  61.   const arm_fir_instance_q31 * S,
  62.   q31_t * pSrc,
  63.   q31_t * pDst,
  64.   uint32_t blockSize)
  65. {
  66.   q31_t *pState = S->pState;                     /* State pointer: start of the sample window for the next output */
  67.   q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
  68.   q31_t *pStateCurnt;                            /* Points to the slot in the state buffer where the next input sample is written */
  69.  
  70.  
  71. #if defined (ARM_MATH_DSP)
  72.  
  73.   /* Unrolled path, compiled when ARM_MATH_DSP is defined (originally annotated as the Cortex-M4 / Cortex-M3 path) */
  74.  
  75.   q31_t x0, x1, x2;                              /* Temporary variables to hold three consecutive state samples */
  76.   q31_t c0;                                      /* Temporary variable to hold coefficient value */
  77.   q31_t *px;                                     /* Temporary pointer for state */
  78.   q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
  79.   q63_t acc0, acc1, acc2;                        /* Accumulators, one per output of the current group of 3 */
  80.   uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
  81.   uint32_t i, tapCnt, blkCnt, tapCntN3;          /* Loop counters */
  82.  
  83.   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
  84.   /* pStateCurnt points to the location where the new input data should be written; blockSize samples are appended there, so the buffer must hold numTaps + blockSize - 1 samples */
  85.   pStateCurnt = &(S->pState[(numTaps - 1U)]);
  86.  
  87.   /* Apply loop unrolling and compute 3 output values simultaneously.
  88.    * The variables acc0 ... acc2 hold output values that are being computed:
  89.    *
  90.    *    acc0 =  pCoeffs[0] * pState[0] + pCoeffs[1] * pState[1] + ... + pCoeffs[numTaps-1] * pState[numTaps-1]
  91.    *    acc1 =  pCoeffs[0] * pState[1] + pCoeffs[1] * pState[2] + ... + pCoeffs[numTaps-1] * pState[numTaps]
  92.    *    acc2 =  pCoeffs[0] * pState[2] + pCoeffs[1] * pState[3] + ... + pCoeffs[numTaps-1] * pState[numTaps+1]
  93.    *    where pState is the window start for the current group of 3 outputs.
  94.    */
  95.   blkCnt = blockSize / 3;                        /* Number of full groups of 3 outputs */
  96.   blockSize = blockSize - (3 * blkCnt);          /* Outputs left over for the non-unrolled loop (0 to 2) */
  97.  
  98.   tapCnt = numTaps / 3;                          /* Number of full groups of 3 taps */
  99.   tapCntN3 = numTaps - (3 * tapCnt);             /* Taps left over after the groups of 3 (0 to 2) */
  100.  
  101.   /* First part of the processing with loop unrolling.  Compute 3 outputs at a time.
  102.    ** a second loop below computes the remaining 0 to 2 samples. */
  103.   while (blkCnt > 0U)
  104.   {
  105.     /* Copy three new input samples into the state buffer */
  106.     *pStateCurnt++ = *pSrc++;
  107.     *pStateCurnt++ = *pSrc++;
  108.     *pStateCurnt++ = *pSrc++;
  109.  
  110.     /* Set all accumulators to zero */
  111.     acc0 = 0;
  112.     acc1 = 0;
  113.     acc2 = 0;
  114.  
  115.     /* Initialize state pointer */
  116.     px = pState;
  117.  
  118.     /* Initialize coefficient pointer */
  119.     pb = pCoeffs;
  120.  
  121.     /* Preload the first two samples of the window from the state buffer:
  122.      *  pState[0] into x0 and pState[1] into x1 */
  123.     x0 = *(px++);
  124.     x1 = *(px++);
  125.  
  126.     /* Loop unrolling.  Process 3 taps at a time. */
  127.     i = tapCnt;
  128.  
  129.     while (i > 0U)
  130.     {
  131.       /* Read the first coefficient of this group of 3 taps */
  132.       c0 = *pb;
  133.  
  134.       /* Read the next state sample */
  135.       x2 = *(px++);
  136.  
  137.       /* Perform the multiply-accumulates: samples x0, x1, x2 feed acc0, acc1, acc2 */
  138.       acc0 += ((q63_t) x0 * c0);
  139.       acc1 += ((q63_t) x1 * c0);
  140.       acc2 += ((q63_t) x2 * c0);
  141.  
  142.       /* Read the second coefficient and the next state sample (x0 is reused as the newest sample) */
  143.       c0 = *(pb + 1U);
  144.       x0 = *(px++);
  145.  
  146.       /* Perform the multiply-accumulates: samples x1, x2, x0 feed acc0, acc1, acc2 */
  147.       acc0 += ((q63_t) x1 * c0);
  148.       acc1 += ((q63_t) x2 * c0);
  149.       acc2 += ((q63_t) x0 * c0);
  150.  
  151.       /* Read the third coefficient and the next state sample (x1 is reused as the newest sample) */
  152.       c0 = *(pb + 2U);
  153.       x1 = *(px++);
  154.  
  155.       /* update coefficient pointer */
  156.       pb += 3U;
  157.  
  158.       /* Perform the multiply-accumulates: samples x2, x0, x1 feed acc0, acc1, acc2 */
  159.       acc0 += ((q63_t) x2 * c0);
  160.       acc1 += ((q63_t) x0 * c0);
  161.       acc2 += ((q63_t) x1 * c0);
  162.  
  163.       /* Decrement the loop counter */
  164.       i--;
  165.     }
  166.  
  167.     /* If the filter length is not a multiple of 3, compute the remaining filter taps */
  168.  
  169.     i = tapCntN3;
  170.  
  171.     while (i > 0U)
  172.     {
  173.       /* Read coefficients */
  174.       c0 = *(pb++);
  175.  
  176.       /* Fetch 1 state variable */
  177.       x2 = *(px++);
  178.  
  179.       /* Perform the multiply-accumulates */
  180.       acc0 += ((q63_t) x0 * c0);
  181.       acc1 += ((q63_t) x1 * c0);
  182.       acc2 += ((q63_t) x2 * c0);
  183.  
  184.       /* Slide the three-sample window forward by one for the next tap */
  185.       x0 = x1;
  186.       x1 = x2;
  187.  
  188.       /* Decrement the loop counter */
  189.       i--;
  190.     }
  191.  
  192.     /* Advance the state pointer by 3 to process the next group of 3 samples */
  193.     pState = pState + 3;
  194.  
  195.     /* The results in the 3 accumulators are in 2.62 format.  Convert to 1.31 (shift right by 31, then cast to q31_t)
  196.      ** Then store the 3 outputs in the destination buffer. */
  197.     *pDst++ = (q31_t) (acc0 >> 31U);
  198.     *pDst++ = (q31_t) (acc1 >> 31U);
  199.     *pDst++ = (q31_t) (acc2 >> 31U);
  200.  
  201.     /* Decrement the samples loop counter */
  202.     blkCnt--;
  203.   }
  204.  
  205.   /* If the blockSize is not a multiple of 3, compute any remaining output samples here.
  206.    ** No loop unrolling is used. */
  207.  
  208.   while (blockSize > 0U)
  209.   {
  210.     /* Copy one sample at a time into state buffer */
  211.     *pStateCurnt++ = *pSrc++;
  212.  
  213.     /* Set the accumulator to zero */
  214.     acc0 = 0;
  215.  
  216.     /* Initialize state pointer */
  217.     px = pState;
  218.  
  219.     /* Initialize Coefficient pointer */
  220.     pb = (pCoeffs);
  221.  
  222.     i = numTaps;
  223.  
  224.     /* Perform the multiply-accumulates (the body runs before i is tested, so this assumes numTaps >= 1) */
  225.     do
  226.     {
  227.       acc0 += (q63_t) * (px++) * (*(pb++));
  228.       i--;
  229.     } while (i > 0U);
  230.  
  231.     /* The result is in 2.62 format.  Convert to 1.31
  232.      ** Then store the output in the destination buffer. */
  233.     *pDst++ = (q31_t) (acc0 >> 31U);
  234.  
  235.     /* Advance state pointer by 1 for the next sample */
  236.     pState = pState + 1;
  237.  
  238.     /* Decrement the samples loop counter */
  239.     blockSize--;
  240.   }
  241.  
  242.   /* Processing is complete.
  243.    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
  244.    ** This prepares the state buffer for the next function call. */
  245.  
  246.   /* Points to the start of the state buffer */
  247.   pStateCurnt = S->pState;
  248.  
  249.   tapCnt = (numTaps - 1U) >> 2U;                 /* Number of 4-sample copy groups */
  250.  
  251.   /* copy data, 4 samples per iteration */
  252.   while (tapCnt > 0U)
  253.   {
  254.     *pStateCurnt++ = *pState++;
  255.     *pStateCurnt++ = *pState++;
  256.     *pStateCurnt++ = *pState++;
  257.     *pStateCurnt++ = *pState++;
  258.  
  259.     /* Decrement the loop counter */
  260.     tapCnt--;
  261.   }
  262.  
  263.   /* Calculate remaining number of copies */
  264.   tapCnt = (numTaps - 1U) % 0x4U;
  265.  
  266.   /* Copy the remaining q31_t data */
  267.   while (tapCnt > 0U)
  268.   {
  269.     *pStateCurnt++ = *pState++;
  270.  
  271.     /* Decrement the loop counter */
  272.     tapCnt--;
  273.   }
  274.  
  275. #else
  276.  
  277. /* Non-unrolled path, compiled when ARM_MATH_DSP is not defined (originally annotated as the Cortex-M0 path) */
  278.  
  279.   q31_t *px;                                     /* Temporary pointer for state */
  280.   q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
  281.   q63_t acc;                                     /* Accumulator */
  282.   uint32_t numTaps = S->numTaps;                 /* Length of the filter */
  283.   uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
  284.  
  285.   /* S->pState buffer contains previous frame (numTaps - 1) samples */
  286.   /* pStateCurnt points to the location where the new input data should be written */
  287.   pStateCurnt = &(S->pState[(numTaps - 1U)]);
  288.  
  289.   /* Initialize blkCnt with blockSize */
  290.   blkCnt = blockSize;
  291.  
  292.   while (blkCnt > 0U)
  293.   {
  294.     /* Copy one sample at a time into state buffer */
  295.     *pStateCurnt++ = *pSrc++;
  296.  
  297.     /* Set the accumulator to zero */
  298.     acc = 0;
  299.  
  300.     /* Initialize state pointer */
  301.     px = pState;
  302.  
  303.     /* Initialize Coefficient pointer */
  304.     pb = pCoeffs;
  305.  
  306.     i = numTaps;
  307.  
  308.     /* Perform the multiply-accumulates (the body runs before i is tested, so this assumes numTaps >= 1) */
  309.     do
  310.     {
  311.       /* acc =  pCoeffs[0] * pState[0] + pCoeffs[1] * pState[1] + ... + pCoeffs[numTaps-1] * pState[numTaps-1] */
  312.       acc += (q63_t) * px++ * *pb++;
  313.       i--;
  314.     } while (i > 0U);
  315.  
  316.     /* The result is in 2.62 format.  Convert to 1.31
  317.      ** Then store the output in the destination buffer. */
  318.     *pDst++ = (q31_t) (acc >> 31U);
  319.  
  320.     /* Advance state pointer by 1 for the next sample */
  321.     pState = pState + 1;
  322.  
  323.     /* Decrement the samples loop counter */
  324.     blkCnt--;
  325.   }
  326.  
  327.   /* Processing is complete.
  328.    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
  329.    ** This prepares the state buffer for the next function call. */
  330.  
  331.   /* Points to the start of the state buffer */
  332.   pStateCurnt = S->pState;
  333.  
  334.   /* Copy (numTaps - 1) values */
  335.   tapCnt = numTaps - 1U;
  336.  
  337.   /* Copy the data */
  338.   while (tapCnt > 0U)
  339.   {
  340.     *pStateCurnt++ = *pState++;
  341.  
  342.     /* Decrement the loop counter */
  343.     tapCnt--;
  344.   }
  345.  
  346.  
  347. #endif /*  #if defined (ARM_MATH_DSP) */
  348.  
  349. }
  350.  
  351. /**
  352.  * @} end of FIR group
  353.  */
  354.