Subversion Repositories ScreenTimer

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------
  2.  * Project:      CMSIS DSP Library
  3.  * Title:        arm_fir_f32.c
  4.  * Description:  Floating-point FIR filter processing function
  5.  *
  6.  * $Date:        27. January 2017
  7.  * $Revision:    V.1.5.1
  8.  *
  9.  * Target Processor: Cortex-M cores
  10.  * -------------------------------------------------------------------- */
  11. /*
  12.  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13.  *
  14.  * SPDX-License-Identifier: Apache-2.0
  15.  *
  16.  * Licensed under the Apache License, Version 2.0 (the License); you may
  17.  * not use this file except in compliance with the License.
  18.  * You may obtain a copy of the License at
  19.  *
  20.  * www.apache.org/licenses/LICENSE-2.0
  21.  *
  22.  * Unless required by applicable law or agreed to in writing, software
  23.  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25.  * See the License for the specific language governing permissions and
  26.  * limitations under the License.
  27.  */
  28.  
  29. #include "arm_math.h"
  30.  
  31. /**
  32. * @ingroup groupFilters
  33. */
  34.  
  35. /**
  36. * @defgroup FIR Finite Impulse Response (FIR) Filters
  37. *
  38. * This set of functions implements Finite Impulse Response (FIR) filters
  39. * for Q7, Q15, Q31, and floating-point data types.  Fast versions of Q15 and Q31 are also provided.
  40. * The functions operate on blocks of input and output data and each call to the function processes
  41. * <code>blockSize</code> samples through the filter.  <code>pSrc</code> and
  42. * <code>pDst</code> point to input and output arrays containing <code>blockSize</code> values.
  43. *
  44. * \par Algorithm:
  45. * The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations.
  46. * Each filter coefficient <code>b[n]</code> is multiplied by a state variable which equals a previous input sample <code>x[n]</code>.
  47. * <pre>
  48. *    y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
  49. * </pre>
  50. * \par
  51. * \image html FIR.gif "Finite Impulse Response filter"
  52. * \par
  53. * <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
  54. * Coefficients are stored in time reversed order.
  55. * \par
  56. * <pre>
  57. *    {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
  58. * </pre>
  59. * \par
  60. * <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
  61. * Samples in the state buffer are stored oldest first, where <code>x[n]</code> is the first sample of the current input block.
  62. * \par
  63. * <pre>
  64. *    {x[n-numTaps+1], x[n-numTaps+2], ..., x[n-1], x[n], x[n+1], ..., x[n+blockSize-1]}
  65. * </pre>
  66. * \par
  67. * Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code>.
  68. * The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
  69. * to be avoided and yields a significant speed improvement.
  70. * The state variables are updated after each block of data is processed; the coefficients are untouched.
  71. * \par Instance Structure
  72. * The coefficients and state variables for a filter are stored together in an instance data structure.
  73. * A separate instance structure must be defined for each filter.
  74. * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
  75. * There are separate instance structure declarations for each of the 4 supported data types.
  76. *
  77. * \par Initialization Functions
  78. * There is also an associated initialization function for each data type.
  79. * The initialization function performs the following operations:
  80. * - Sets the values of the internal structure fields.
  81. * - Zeros out the values in the state buffer.
  82. * To do this manually without calling the init function, assign the following fields of the instance structure:
  83. * numTaps, pCoeffs, pState. Also set all of the values in pState to zero.
  84. *
  85. * \par
  86. * Use of the initialization function is optional.
  87. * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
  88. * To place an instance structure into a const data section, the instance structure must be manually initialized.
  89. * Set the values in the state buffer to zeros before static initialization.
  90. * The code below statically initializes each of the 4 different data type filter instance structures
  91. * <pre>
  92. *arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};
  93. *arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};
  94. *arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};
  95. *arm_fir_instance_q7 S =  {numTaps, pState, pCoeffs};
  96. * </pre>
  97. *
  98. * where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
  99. * <code>pCoeffs</code> is the address of the coefficient buffer.
  100. *
  101. * \par Fixed-Point Behavior
  102. * Care must be taken when using the fixed-point versions of the FIR filter functions.
  103. * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
  104. * Refer to the function specific documentation below for usage guidelines.
  105. */
  106.  
  107. /**
  108. * @addtogroup FIR
  109. * @{
  110. */
  111.  
  112. /**
  113. * @brief Processing function for the floating-point FIR filter.
  114. * @param[in]  *S points to an instance of the floating-point FIR filter structure.
  115. * @param[in]  *pSrc points to the block of input data.
  116. * @param[out] *pDst points to the block of output data.
  117. * @param[in]  blockSize number of samples to process per call.
  118. * @return     none.
  119. *
  120. */
  121.  
  122. #if defined(ARM_MATH_CM7)
  123.  
  124. void arm_fir_f32( /* ARM_MATH_CM7 build: block FIR computing 8 outputs per pass, multiply-accumulates written inline */
  125. const arm_fir_instance_f32 * S, /* in: filter instance; numTaps, pState and pCoeffs are read from it */
  126. float32_t * pSrc, /* in: blockSize input samples, copied into the state buffer */
  127. float32_t * pDst, /* out: blockSize output samples */
  128. uint32_t blockSize) /* in: number of samples to process in this call */
  129. {
  130.    float32_t *pState = S->pState;                 /* State pointer: oldest sample needed by the output(s) being computed */
  131.    float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
  132.    float32_t *pStateCurnt;                        /* Write position for new input samples in the state buffer */
  133.    float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
  134.    float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;     /* Accumulators, one per output of the current group of 8 */
  135.    float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0;  /* Rotating window of state samples and the current coefficient */
  136.    uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
  137.    uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
  138.    /* NOTE(review): numTaps is assumed to be at least 1; (numTaps - 1U) below wraps for numTaps == 0 */
  139.    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
  140.    /* pStateCurnt points to the location where the new input data should be written */
  141.    pStateCurnt = &(S->pState[(numTaps - 1U)]);
  142.  
  143.    /* Apply loop unrolling and compute 8 output values simultaneously.
  144.     * The variables acc0 ... acc7 hold the output values being computed.
  145.     * With pState pointing at the oldest sample used by the current group:
  146.     *    acc0 =  pCoeffs[0] * pState[0] + pCoeffs[1] * pState[1] + ... + pCoeffs[numTaps-1] * pState[numTaps-1]
  147.     *    acc1 =  pCoeffs[0] * pState[1] + pCoeffs[1] * pState[2] + ... + pCoeffs[numTaps-1] * pState[numTaps]
  148.     *    ...
  149.     *    acc7 =  pCoeffs[0] * pState[7] + pCoeffs[1] * pState[8] + ... + pCoeffs[numTaps-1] * pState[numTaps+6]
  150.     */
  151.    blkCnt = blockSize >> 3;
  152.  
  153.    /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
  154.    ** A second loop below computes the remaining 0 to 7 samples. */
  155.    while (blkCnt > 0U)
  156.    {
  157.       /* Copy the first four of the eight new input samples into the state buffer */
  158.       *pStateCurnt++ = *pSrc++;
  159.       *pStateCurnt++ = *pSrc++;
  160.       *pStateCurnt++ = *pSrc++;
  161.       *pStateCurnt++ = *pSrc++;
  162.  
  163.       /* Set all accumulators to zero */
  164.       acc0 = 0.0f;
  165.       acc1 = 0.0f;
  166.       acc2 = 0.0f;
  167.       acc3 = 0.0f;
  168.       acc4 = 0.0f;
  169.       acc5 = 0.0f;
  170.       acc6 = 0.0f;
  171.       acc7 = 0.0f;
  172.  
  173.       /* Initialize state pointer */
  174.       px = pState;
  175.  
  176.       /* Initialize coeff pointer */
  177.       pb = (pCoeffs);
  178.  
  179.       /* Copy the other four new input samples.  This is separated from
  180.        * the first four to avoid a call to __aeabi_memmove which would be slower
  181.        */
  182.       *pStateCurnt++ = *pSrc++;
  183.       *pStateCurnt++ = *pSrc++;
  184.       *pStateCurnt++ = *pSrc++;
  185.       *pStateCurnt++ = *pSrc++;
  186.  
  187.       /* Preload seven state samples pState[0..6]; the eighth is fetched inside the tap loops */
  188.       x0 = *px++;
  189.       x1 = *px++;
  190.       x2 = *px++;
  191.       x3 = *px++;
  192.       x4 = *px++;
  193.       x5 = *px++;
  194.       x6 = *px++;
  195.  
  196.       /* Loop unrolling.  Process 8 taps at a time. */
  197.       tapCnt = numTaps >> 3U;
  198.  
  199.       /* Loop over the number of taps.  Unroll by a factor of 8.
  200.        ** Repeat until fewer than 8 taps remain. */
  201.       while (tapCnt > 0U)
  202.       {
  203.          /* Tap 0 of this group of 8: read the next coefficient */
  204.          c0 = *(pb++);
  205.  
  206.          /* Read the next state sample into the free window slot */
  207.          x7 = *(px++);
  208.  
  209.          /* acc0 +=  x0 * c0 */
  210.          acc0 += x0 * c0;
  211.  
  212.          /* acc1 +=  x1 * c0 */
  213.          acc1 += x1 * c0;
  214.  
  215.          /* acc2 +=  x2 * c0 */
  216.          acc2 += x2 * c0;
  217.  
  218.          /* acc3 +=  x3 * c0 */
  219.          acc3 += x3 * c0;
  220.  
  221.          /* acc4 +=  x4 * c0 */
  222.          acc4 += x4 * c0;
  223.  
  224.          /* acc5 +=  x5 * c0 */
  225.          acc5 += x5 * c0;
  226.  
  227.          /* acc6 +=  x6 * c0 */
  228.          acc6 += x6 * c0;
  229.  
  230.          /* acc7 +=  x7 * c0 */
  231.          acc7 += x7 * c0;
  232.  
  233.          /* Tap 1: read the next coefficient */
  234.          c0 = *(pb++);
  235.  
  236.          /* Read the next state sample; x0 is no longer needed, so its slot is reused */
  237.          x0 = *(px++);
  238.  
  239.          /* Perform the multiply-accumulates; the window is shifted by one sample */
  240.          acc0 += x1 * c0;
  241.          acc1 += x2 * c0;
  242.          acc2 += x3 * c0;
  243.          acc3 += x4 * c0;
  244.          acc4 += x5 * c0;
  245.          acc5 += x6 * c0;
  246.          acc6 += x7 * c0;
  247.          acc7 += x0 * c0;
  248.  
  249.          /* Tap 2: read the next coefficient */
  250.          c0 = *(pb++);
  251.  
  252.          /* Read the next state sample, reusing the x1 slot */
  253.          x1 = *(px++);
  254.  
  255.          /* Perform the multiply-accumulates */
  256.          acc0 += x2 * c0;
  257.          acc1 += x3 * c0;
  258.          acc2 += x4 * c0;
  259.          acc3 += x5 * c0;
  260.          acc4 += x6 * c0;
  261.          acc5 += x7 * c0;
  262.          acc6 += x0 * c0;
  263.          acc7 += x1 * c0;
  264.  
  265.          /* Tap 3: read the next coefficient */
  266.          c0 = *(pb++);
  267.  
  268.          /* Read the next state sample, reusing the x2 slot */
  269.          x2 = *(px++);
  270.  
  271.          /* Perform the multiply-accumulates */
  272.          acc0 += x3 * c0;
  273.          acc1 += x4 * c0;
  274.          acc2 += x5 * c0;
  275.          acc3 += x6 * c0;
  276.          acc4 += x7 * c0;
  277.          acc5 += x0 * c0;
  278.          acc6 += x1 * c0;
  279.          acc7 += x2 * c0;
  280.  
  281.          /* Tap 4: read the next coefficient */
  282.          c0 = *(pb++);
  283.  
  284.          /* Read the next state sample, reusing the x3 slot */
  285.          x3 = *(px++);
  286.          /* Perform the multiply-accumulates */
  287.          acc0 += x4 * c0;
  288.          acc1 += x5 * c0;
  289.          acc2 += x6 * c0;
  290.          acc3 += x7 * c0;
  291.          acc4 += x0 * c0;
  292.          acc5 += x1 * c0;
  293.          acc6 += x2 * c0;
  294.          acc7 += x3 * c0;
  295.  
  296.          /* Tap 5: read the next coefficient */
  297.          c0 = *(pb++);
  298.  
  299.          /* Read the next state sample, reusing the x4 slot */
  300.          x4 = *(px++);
  301.  
  302.          /* Perform the multiply-accumulates */
  303.          acc0 += x5 * c0;
  304.          acc1 += x6 * c0;
  305.          acc2 += x7 * c0;
  306.          acc3 += x0 * c0;
  307.          acc4 += x1 * c0;
  308.          acc5 += x2 * c0;
  309.          acc6 += x3 * c0;
  310.          acc7 += x4 * c0;
  311.  
  312.          /* Tap 6: read the next coefficient */
  313.          c0 = *(pb++);
  314.  
  315.          /* Read the next state sample, reusing the x5 slot */
  316.          x5 = *(px++);
  317.  
  318.          /* Perform the multiply-accumulates */
  319.          acc0 += x6 * c0;
  320.          acc1 += x7 * c0;
  321.          acc2 += x0 * c0;
  322.          acc3 += x1 * c0;
  323.          acc4 += x2 * c0;
  324.          acc5 += x3 * c0;
  325.          acc6 += x4 * c0;
  326.          acc7 += x5 * c0;
  327.  
  328.          /* Tap 7: read the next coefficient */
  329.          c0 = *(pb++);
  330.  
  331.          /* Read the next state sample, reusing the x6 slot */
  332.          x6 = *(px++);
  333.  
  334.          /* Perform the multiply-accumulates */
  335.          acc0 += x7 * c0;
  336.          acc1 += x0 * c0;
  337.          acc2 += x1 * c0;
  338.          acc3 += x2 * c0;
  339.          acc4 += x3 * c0;
  340.          acc5 += x4 * c0;
  341.          acc6 += x5 * c0;
  342.          acc7 += x6 * c0;
  343.          /* After 8 taps x0..x6 again hold the seven samples needed next, in order */
  344.          tapCnt--;
  345.       }
  346.  
  347.       /* If the filter length is not a multiple of 8, compute the remaining filter taps */
  348.       tapCnt = numTaps % 0x8U;
  349.  
  350.       while (tapCnt > 0U)
  351.       {
  352.          /* Read the next coefficient */
  353.          c0 = *(pb++);
  354.  
  355.          /* Fetch 1 state variable */
  356.          x7 = *(px++);
  357.  
  358.          /* Perform the multiply-accumulates */
  359.          acc0 += x0 * c0;
  360.          acc1 += x1 * c0;
  361.          acc2 += x2 * c0;
  362.          acc3 += x3 * c0;
  363.          acc4 += x4 * c0;
  364.          acc5 += x5 * c0;
  365.          acc6 += x6 * c0;
  366.          acc7 += x7 * c0;
  367.  
  368.          /* Shift the sample window by one for the next tap */
  369.          x0 = x1;
  370.          x1 = x2;
  371.          x2 = x3;
  372.          x3 = x4;
  373.          x4 = x5;
  374.          x5 = x6;
  375.          x6 = x7;
  376.  
  377.          /* Decrement the loop counter */
  378.          tapCnt--;
  379.       }
  380.  
  381.       /* Advance the state pointer by 8 to process the next group of 8 samples */
  382.       pState = pState + 8;
  383.  
  384.       /* Store the results held in the 8 accumulators in the destination buffer. */
  385.       *pDst++ = acc0;
  386.       *pDst++ = acc1;
  387.       *pDst++ = acc2;
  388.       *pDst++ = acc3;
  389.       *pDst++ = acc4;
  390.       *pDst++ = acc5;
  391.       *pDst++ = acc6;
  392.       *pDst++ = acc7;
  393.  
  394.       blkCnt--;
  395.    }
  396.  
  397.    /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
  398.    ** No loop unrolling is used. */
  399.    blkCnt = blockSize % 0x8U;
  400.  
  401.    while (blkCnt > 0U)
  402.    {
  403.       /* Copy one sample at a time into state buffer */
  404.       *pStateCurnt++ = *pSrc++;
  405.  
  406.       /* Set the accumulator to zero */
  407.       acc0 = 0.0f;
  408.  
  409.       /* Initialize state pointer */
  410.       px = pState;
  411.  
  412.       /* Initialize Coefficient pointer */
  413.       pb = (pCoeffs);
  414.  
  415.       i = numTaps;
  416.       /* NOTE(review): the do/while body below runs at least once, so numTaps is assumed to be non-zero */
  417.       /* Perform the multiply-accumulates */
  418.       do
  419.       {
  420.          acc0 += *px++ * *pb++;
  421.          i--;
  422.  
  423.       } while (i > 0U);
  424.  
  425.       /* The result is stored in the destination buffer. */
  426.       *pDst++ = acc0;
  427.  
  428.       /* Advance state pointer by 1 for the next sample */
  429.       pState = pState + 1;
  430.  
  431.       blkCnt--;
  432.    }
  433.  
  434.    /* Processing is complete.
  435.    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
  436.    ** This prepares the state buffer for the next function call. */
  437.  
  438.    /* Points to the start of the state buffer */
  439.    pStateCurnt = S->pState;
  440.    /* Copy four samples per iteration; pState has been advanced past the samples consumed above */
  441.    tapCnt = (numTaps - 1U) >> 2U;
  442.  
  443.    /* copy data */
  444.    while (tapCnt > 0U)
  445.    {
  446.       *pStateCurnt++ = *pState++;
  447.       *pStateCurnt++ = *pState++;
  448.       *pStateCurnt++ = *pState++;
  449.       *pStateCurnt++ = *pState++;
  450.  
  451.       /* Decrement the loop counter */
  452.       tapCnt--;
  453.    }
  454.  
  455.    /* Calculate remaining number of copies */
  456.    tapCnt = (numTaps - 1U) % 0x4U;
  457.  
  458.    /* Copy the remaining float32_t data */
  459.    while (tapCnt > 0U)
  460.    {
  461.       *pStateCurnt++ = *pState++;
  462.  
  463.       /* Decrement the loop counter */
  464.       tapCnt--;
  465.    }
  466. }
  467.  
  468. #elif defined(ARM_MATH_CM0_FAMILY)
  469.  
  470. void arm_fir_f32( /* ARM_MATH_CM0_FAMILY build: block FIR computing one output per pass, no loop unrolling */
  471. const arm_fir_instance_f32 * S, /* in: filter instance; numTaps, pState and pCoeffs are read from it */
  472. float32_t * pSrc, /* in: blockSize input samples, copied into the state buffer */
  473. float32_t * pDst, /* out: blockSize output samples */
  474. uint32_t blockSize) /* in: number of samples to process in this call */
  475. {
  476.    float32_t *pState = S->pState;                 /* State pointer: oldest sample needed by the output being computed */
  477.    float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
  478.    float32_t *pStateCurnt;                        /* Write position for new input samples in the state buffer */
  479.    float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
  480.    uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
  481.    uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
  482.  
  483.    /* Run the below code for Cortex-M0 */
  484.  
  485.    float32_t acc;
  486.    /* NOTE(review): numTaps is assumed to be at least 1; (numTaps - 1U) below wraps for numTaps == 0 */
  487.    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
  488.    /* pStateCurnt points to the location where the new input data should be written */
  489.    pStateCurnt = &(S->pState[(numTaps - 1U)]);
  490.  
  491.    /* Initialize blkCnt with blockSize */
  492.    blkCnt = blockSize;
  493.  
  494.    while (blkCnt > 0U)
  495.    {
  496.       /* Copy one sample at a time into state buffer */
  497.       *pStateCurnt++ = *pSrc++;
  498.  
  499.       /* Set the accumulator to zero */
  500.       acc = 0.0f;
  501.  
  502.       /* Initialize state pointer */
  503.       px = pState;
  504.  
  505.       /* Initialize Coefficient pointer */
  506.       pb = pCoeffs;
  507.  
  508.       i = numTaps;
  509.       /* NOTE(review): the do/while body below runs at least once, so numTaps is assumed to be non-zero */
  510.       /* Perform the multiply-accumulates */
  511.       do
  512.       {
  513.          /* acc =  pCoeffs[0] * pState[0] + pCoeffs[1] * pState[1] + ... + pCoeffs[numTaps-1] * pState[numTaps-1] */
  514.          acc += *px++ * *pb++;
  515.          i--;
  516.  
  517.       } while (i > 0U);
  518.  
  519.       /* The result is stored in the destination buffer. */
  520.       *pDst++ = acc;
  521.  
  522.       /* Advance state pointer by 1 for the next sample */
  523.       pState = pState + 1;
  524.  
  525.       blkCnt--;
  526.    }
  527.  
  528.    /* Processing is complete.
  529.    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
  530.    ** This prepares the state buffer for the next function call. */
  531.  
  532.    /* Points to the start of the state buffer */
  533.    pStateCurnt = S->pState;
  534.  
  535.    /* Copy (numTaps - 1) values */
  536.    tapCnt = numTaps - 1U;
  537.  
  538.    /* Copy data; pState has been advanced by blockSize in the loop above */
  539.    while (tapCnt > 0U)
  540.    {
  541.       *pStateCurnt++ = *pState++;
  542.  
  543.       /* Decrement the loop counter */
  544.       tapCnt--;
  545.    }
  546.  
  547. }
  548.  
  549. #else
  550.  
  551. /* Run the below code for Cortex-M4 and Cortex-M3 */
  552.  
  553. void arm_fir_f32( /* Default build: block FIR computing 8 outputs per pass; products are held in p0..p7 and added after the next loads */
  554. const arm_fir_instance_f32 * S, /* in: filter instance; numTaps, pState and pCoeffs are read from it */
  555. float32_t * pSrc, /* in: blockSize input samples, copied into the state buffer */
  556. float32_t * pDst, /* out: blockSize output samples */
  557. uint32_t blockSize) /* in: number of samples to process in this call */
  558. {
  559.    float32_t *pState = S->pState;                 /* State pointer: oldest sample needed by the output(s) being computed */
  560.    float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
  561.    float32_t *pStateCurnt;                        /* Write position for new input samples in the state buffer */
  562.    float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
  563.    float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;     /* Accumulators, one per output of the current group of 8 */
  564.    float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0;  /* Rotating window of state samples and the current coefficient */
  565.    uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
  566.    uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
  567.    float32_t p0,p1,p2,p3,p4,p5,p6,p7;             /* Temporary product values */
  568.    /* NOTE(review): numTaps is assumed to be at least 1; (numTaps - 1U) below wraps for numTaps == 0 */
  569.    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
  570.    /* pStateCurnt points to the location where the new input data should be written */
  571.    pStateCurnt = &(S->pState[(numTaps - 1U)]);
  572.  
  573.    /* Apply loop unrolling and compute 8 output values simultaneously.
  574.     * The variables acc0 ... acc7 hold the output values being computed.
  575.     * With pState pointing at the oldest sample used by the current group:
  576.     *    acc0 =  pCoeffs[0] * pState[0] + pCoeffs[1] * pState[1] + ... + pCoeffs[numTaps-1] * pState[numTaps-1]
  577.     *    acc1 =  pCoeffs[0] * pState[1] + pCoeffs[1] * pState[2] + ... + pCoeffs[numTaps-1] * pState[numTaps]
  578.     *    ...
  579.     *    acc7 =  pCoeffs[0] * pState[7] + pCoeffs[1] * pState[8] + ... + pCoeffs[numTaps-1] * pState[numTaps+6]
  580.     */
  581.    blkCnt = blockSize >> 3;
  582.  
  583.    /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
  584.    ** A second loop below computes the remaining 0 to 7 samples. */
  585.    while (blkCnt > 0U)
  586.    {
  587.       /* Copy the first four of the eight new input samples into the state buffer */
  588.       *pStateCurnt++ = *pSrc++;
  589.       *pStateCurnt++ = *pSrc++;
  590.       *pStateCurnt++ = *pSrc++;
  591.       *pStateCurnt++ = *pSrc++;
  592.  
  593.       /* Set all accumulators to zero */
  594.       acc0 = 0.0f;
  595.       acc1 = 0.0f;
  596.       acc2 = 0.0f;
  597.       acc3 = 0.0f;
  598.       acc4 = 0.0f;
  599.       acc5 = 0.0f;
  600.       acc6 = 0.0f;
  601.       acc7 = 0.0f;
  602.  
  603.       /* Initialize state pointer */
  604.       px = pState;
  605.  
  606.       /* Initialize coeff pointer */
  607.       pb = (pCoeffs);
  608.  
  609.       /* Copy the other four new input samples.  This is separated from
  610.        * the first four to avoid a call to __aeabi_memmove which would be slower
  611.        */
  612.       *pStateCurnt++ = *pSrc++;
  613.       *pStateCurnt++ = *pSrc++;
  614.       *pStateCurnt++ = *pSrc++;
  615.       *pStateCurnt++ = *pSrc++;
  616.  
  617.       /* Preload seven state samples pState[0..6]; the eighth is fetched inside the tap loops */
  618.       x0 = *px++;
  619.       x1 = *px++;
  620.       x2 = *px++;
  621.       x3 = *px++;
  622.       x4 = *px++;
  623.       x5 = *px++;
  624.       x6 = *px++;
  625.  
  626.       /* Loop unrolling.  Process 8 taps at a time. */
  627.       tapCnt = numTaps >> 3U;
  628.  
  629.       /* Loop over the number of taps.  Unroll by a factor of 8.
  630.        ** Repeat until fewer than 8 taps remain. */
  631.       while (tapCnt > 0U)
  632.       {
  633.          /* Tap 0 of this group of 8: read the next coefficient */
  634.          c0 = *(pb++);
  635.  
  636.          /* Read the next state sample into the free window slot */
  637.          x7 = *(px++);
  638.  
  639.          /* p0 = x0 * c0, added to acc0 after the next loads */
  640.          p0 = x0 * c0;
  641.  
  642.          /* p1 = x1 * c0, added to acc1 after the next loads */
  643.          p1 = x1 * c0;
  644.  
  645.          /* p2 = x2 * c0, added to acc2 after the next loads */
  646.          p2 = x2 * c0;
  647.  
  648.          /* p3 = x3 * c0, added to acc3 after the next loads */
  649.          p3 = x3 * c0;
  650.  
  651.          /* p4 = x4 * c0, added to acc4 after the next loads */
  652.          p4 = x4 * c0;
  653.  
  654.          /* p5 = x5 * c0, added to acc5 after the next loads */
  655.          p5 = x5 * c0;
  656.  
  657.          /* p6 = x6 * c0, added to acc6 after the next loads */
  658.          p6 = x6 * c0;
  659.  
  660.          /* p7 = x7 * c0, added to acc7 after the next loads */
  661.          p7 = x7 * c0;
  662.  
  663.          /* Tap 1: read the next coefficient */
  664.          c0 = *(pb++);
  665.  
  666.          /* Read the next state sample; x0 is no longer needed, so its slot is reused */
  667.          x0 = *(px++);
  668.          /* Accumulate the tap 0 products */
  669.          acc0 += p0;
  670.          acc1 += p1;
  671.          acc2 += p2;
  672.          acc3 += p3;
  673.          acc4 += p4;
  674.          acc5 += p5;
  675.          acc6 += p6;
  676.          acc7 += p7;
  677.  
  678.  
  679.          /* Tap 1 products; the window is shifted by one sample */
  680.          p0 = x1 * c0;
  681.          p1 = x2 * c0;
  682.          p2 = x3 * c0;
  683.          p3 = x4 * c0;
  684.          p4 = x5 * c0;
  685.          p5 = x6 * c0;
  686.          p6 = x7 * c0;
  687.          p7 = x0 * c0;
  688.  
  689.          /* Tap 2: read the next coefficient */
  690.          c0 = *(pb++);
  691.  
  692.          /* Read the next state sample, reusing the x1 slot */
  693.          x1 = *(px++);
  694.          /* Accumulate the tap 1 products */
  695.          acc0 += p0;
  696.          acc1 += p1;
  697.          acc2 += p2;
  698.          acc3 += p3;
  699.          acc4 += p4;
  700.          acc5 += p5;
  701.          acc6 += p6;
  702.          acc7 += p7;
  703.  
  704.          /* Tap 2 products */
  705.          p0 = x2 * c0;
  706.          p1 = x3 * c0;
  707.          p2 = x4 * c0;
  708.          p3 = x5 * c0;
  709.          p4 = x6 * c0;
  710.          p5 = x7 * c0;
  711.          p6 = x0 * c0;
  712.          p7 = x1 * c0;
  713.  
  714.          /* Tap 3: read the next coefficient */
  715.          c0 = *(pb++);
  716.  
  717.          /* Read the next state sample, reusing the x2 slot */
  718.          x2 = *(px++);
  719.          /* Accumulate the tap 2 products */
  720.          acc0 += p0;
  721.          acc1 += p1;
  722.          acc2 += p2;
  723.          acc3 += p3;
  724.          acc4 += p4;
  725.          acc5 += p5;
  726.          acc6 += p6;
  727.          acc7 += p7;
  728.  
  729.          /* Tap 3 products */
  730.          p0 = x3 * c0;
  731.          p1 = x4 * c0;
  732.          p2 = x5 * c0;
  733.          p3 = x6 * c0;
  734.          p4 = x7 * c0;
  735.          p5 = x0 * c0;
  736.          p6 = x1 * c0;
  737.          p7 = x2 * c0;
  738.  
  739.          /* Tap 4: read the next coefficient */
  740.          c0 = *(pb++);
  741.  
  742.          /* Read the next state sample, reusing the x3 slot */
  743.          x3 = *(px++);
  744.          /* Accumulate the tap 3 products */
  745.          acc0 += p0;
  746.          acc1 += p1;
  747.          acc2 += p2;
  748.          acc3 += p3;
  749.          acc4 += p4;
  750.          acc5 += p5;
  751.          acc6 += p6;
  752.          acc7 += p7;
  753.  
  754.          /* Tap 4 products */
  755.          p0 = x4 * c0;
  756.          p1 = x5 * c0;
  757.          p2 = x6 * c0;
  758.          p3 = x7 * c0;
  759.          p4 = x0 * c0;
  760.          p5 = x1 * c0;
  761.          p6 = x2 * c0;
  762.          p7 = x3 * c0;
  763.  
  764.          /* Tap 5: read the next coefficient */
  765.          c0 = *(pb++);
  766.  
  767.          /* Read the next state sample, reusing the x4 slot */
  768.          x4 = *(px++);
  769.          /* Accumulate the tap 4 products */
  770.          acc0 += p0;
  771.          acc1 += p1;
  772.          acc2 += p2;
  773.          acc3 += p3;
  774.          acc4 += p4;
  775.          acc5 += p5;
  776.          acc6 += p6;
  777.          acc7 += p7;
  778.  
  779.          /* Tap 5 products */
  780.          p0 = x5 * c0;
  781.          p1 = x6 * c0;
  782.          p2 = x7 * c0;
  783.          p3 = x0 * c0;
  784.          p4 = x1 * c0;
  785.          p5 = x2 * c0;
  786.          p6 = x3 * c0;
  787.          p7 = x4 * c0;
  788.  
  789.          /* Tap 6: read the next coefficient */
  790.          c0 = *(pb++);
  791.  
  792.          /* Read the next state sample, reusing the x5 slot */
  793.          x5 = *(px++);
  794.          /* Accumulate the tap 5 products */
  795.          acc0 += p0;
  796.          acc1 += p1;
  797.          acc2 += p2;
  798.          acc3 += p3;
  799.          acc4 += p4;
  800.          acc5 += p5;
  801.          acc6 += p6;
  802.          acc7 += p7;
  803.  
  804.          /* Tap 6 products */
  805.          p0 = x6 * c0;
  806.          p1 = x7 * c0;
  807.          p2 = x0 * c0;
  808.          p3 = x1 * c0;
  809.          p4 = x2 * c0;
  810.          p5 = x3 * c0;
  811.          p6 = x4 * c0;
  812.          p7 = x5 * c0;
  813.  
  814.          /* Tap 7: read the next coefficient */
  815.          c0 = *(pb++);
  816.  
  817.          /* Read the next state sample, reusing the x6 slot */
  818.          x6 = *(px++);
  819.          /* Accumulate the tap 6 products */
  820.          acc0 += p0;
  821.          acc1 += p1;
  822.          acc2 += p2;
  823.          acc3 += p3;
  824.          acc4 += p4;
  825.          acc5 += p5;
  826.          acc6 += p6;
  827.          acc7 += p7;
  828.  
  829.          /* Tap 7 products */
  830.          p0 = x7 * c0;
  831.          p1 = x0 * c0;
  832.          p2 = x1 * c0;
  833.          p3 = x2 * c0;
  834.          p4 = x3 * c0;
  835.          p5 = x4 * c0;
  836.          p6 = x5 * c0;
  837.          p7 = x6 * c0;
  838.  
  839.          tapCnt--;
  840.          /* Accumulate the tap 7 products; x0..x6 again hold the seven samples needed next, in order */
  841.          acc0 += p0;
  842.          acc1 += p1;
  843.          acc2 += p2;
  844.          acc3 += p3;
  845.          acc4 += p4;
  846.          acc5 += p5;
  847.          acc6 += p6;
  848.          acc7 += p7;
  849.       }
  850.  
  851.       /* If the filter length is not a multiple of 8, compute the remaining filter taps */
  852.       tapCnt = numTaps % 0x8U;
  853.  
  854.       while (tapCnt > 0U)
  855.       {
  856.          /* Read the next coefficient */
  857.          c0 = *(pb++);
  858.  
  859.          /* Fetch 1 state variable */
  860.          x7 = *(px++);
  861.  
  862.          /* Compute the products for this tap */
  863.          p0 = x0 * c0;
  864.          p1 = x1 * c0;
  865.          p2 = x2 * c0;
  866.          p3 = x3 * c0;
  867.          p4 = x4 * c0;
  868.          p5 = x5 * c0;
  869.          p6 = x6 * c0;
  870.          p7 = x7 * c0;
  871.  
  872.          /* Shift the sample window by one for the next tap */
  873.          x0 = x1;
  874.          x1 = x2;
  875.          x2 = x3;
  876.          x3 = x4;
  877.          x4 = x5;
  878.          x5 = x6;
  879.          x6 = x7;
  880.          /* Accumulate the products computed above */
  881.          acc0 += p0;
  882.          acc1 += p1;
  883.          acc2 += p2;
  884.          acc3 += p3;
  885.          acc4 += p4;
  886.          acc5 += p5;
  887.          acc6 += p6;
  888.          acc7 += p7;
  889.  
  890.          /* Decrement the loop counter */
  891.          tapCnt--;
  892.       }
  893.  
  894.       /* Advance the state pointer by 8 to process the next group of 8 samples */
  895.       pState = pState + 8;
  896.  
  897.       /* Store the results held in the 8 accumulators in the destination buffer. */
  898.       *pDst++ = acc0;
  899.       *pDst++ = acc1;
  900.       *pDst++ = acc2;
  901.       *pDst++ = acc3;
  902.       *pDst++ = acc4;
  903.       *pDst++ = acc5;
  904.       *pDst++ = acc6;
  905.       *pDst++ = acc7;
  906.  
  907.       blkCnt--;
  908.    }
  909.  
  910.    /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
  911.    ** No loop unrolling is used. */
  912.    blkCnt = blockSize % 0x8U;
  913.  
  914.    while (blkCnt > 0U)
  915.    {
  916.       /* Copy one sample at a time into state buffer */
  917.       *pStateCurnt++ = *pSrc++;
  918.  
  919.       /* Set the accumulator to zero */
  920.       acc0 = 0.0f;
  921.  
  922.       /* Initialize state pointer */
  923.       px = pState;
  924.  
  925.       /* Initialize Coefficient pointer */
  926.       pb = (pCoeffs);
  927.  
  928.       i = numTaps;
  929.       /* NOTE(review): the do/while body below runs at least once, so numTaps is assumed to be non-zero */
  930.       /* Perform the multiply-accumulates */
  931.       do
  932.       {
  933.          acc0 += *px++ * *pb++;
  934.          i--;
  935.  
  936.       } while (i > 0U);
  937.  
  938.       /* The result is stored in the destination buffer. */
  939.       *pDst++ = acc0;
  940.  
  941.       /* Advance state pointer by 1 for the next sample */
  942.       pState = pState + 1;
  943.  
  944.       blkCnt--;
  945.    }
  946.  
  947.    /* Processing is complete.
  948.    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
  949.    ** This prepares the state buffer for the next function call. */
  950.  
  951.    /* Points to the start of the state buffer */
  952.    pStateCurnt = S->pState;
  953.    /* Copy four samples per iteration; pState has been advanced past the samples consumed above */
  954.    tapCnt = (numTaps - 1U) >> 2U;
  955.  
  956.    /* copy data */
  957.    while (tapCnt > 0U)
  958.    {
  959.       *pStateCurnt++ = *pState++;
  960.       *pStateCurnt++ = *pState++;
  961.       *pStateCurnt++ = *pState++;
  962.       *pStateCurnt++ = *pState++;
  963.  
  964.       /* Decrement the loop counter */
  965.       tapCnt--;
  966.    }
  967.  
  968.    /* Calculate remaining number of copies */
  969.    tapCnt = (numTaps - 1U) % 0x4U;
  970.  
  971.    /* Copy the remaining float32_t data */
  972.    while (tapCnt > 0U)
  973.    {
  974.       *pStateCurnt++ = *pState++;
  975.  
  976.       /* Decrement the loop counter */
  977.       tapCnt--;
  978.    }
  979. }
  980.  
  981. #endif
  982.  
  983. /**
  984. * @} end of FIR group
  985. */
  986.