Subversion Repositories dashGPS

Rev

Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /* ----------------------------------------------------------------------
  2.  * Project:      CMSIS DSP Library
  3.  * Title:        arm_dct4_q15.c
  4.  * Description:  Processing function of DCT4 & IDCT4 Q15
  5.  *
  6.  * $Date:        27. January 2017
  7.  * $Revision:    V.1.5.1
  8.  *
  9.  * Target Processor: Cortex-M cores
  10.  * -------------------------------------------------------------------- */
  11. /*
  12.  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13.  *
  14.  * SPDX-License-Identifier: Apache-2.0
  15.  *
  16.  * Licensed under the Apache License, Version 2.0 (the License); you may
  17.  * not use this file except in compliance with the License.
  18.  * You may obtain a copy of the License at
  19.  *
  20.  * www.apache.org/licenses/LICENSE-2.0
  21.  *
  22.  * Unless required by applicable law or agreed to in writing, software
  23.  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25.  * See the License for the specific language governing permissions and
  26.  * limitations under the License.
  27.  */
  28.  
  29. #include "arm_math.h"
  30.  
  31. /**
  32.  * @addtogroup DCT4_IDCT4
  33.  * @{
  34.  */
  35.  
  36. /**
  37.  * @brief Processing function for the Q15 DCT4/IDCT4.
  38.  * @param[in]       *S             points to an instance of the Q15 DCT4 structure.
  39.  * @param[in]       *pState        points to state buffer.
  40.  * @param[in,out]   *pInlineBuffer points to the in-place input and output buffer.
  41.  * @return none.
  42.  *
  43.  * \par Input an output formats:
  44.  * Internally inputs are downscaled in the RFFT process function to avoid overflows.
  45.  * Number of bits downscaled, depends on the size of the transform.
  46.  * The input and output formats for different DCT sizes and number of bits to upscale are mentioned in the table below:
  47.  *
  48.  * \image html dct4FormatsQ15Table.gif
  49.  */
  50.  
  51. void arm_dct4_q15(
  52.   const arm_dct4_instance_q15 * S,
  53.   q15_t * pState,
  54.   q15_t * pInlineBuffer)
  55. {
  56.   uint32_t i;                                    /* Loop counter */
  57.   q15_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
  58.   q15_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
  59.   q15_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
  60.   q15_t in;                                      /* Temporary variable */
  61.  
  62.  
  63.   /* DCT4 computation involves DCT2 (which is calculated using RFFT)
  64.    * along with some pre-processing and post-processing.
  65.    * Computational procedure is explained as follows:
  66.    * (a) Pre-processing involves multiplying input with cos factor,
  67.    *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
  68.    *              where,
  69.    *                 r(n) -- output of preprocessing
  70.    *                 u(n) -- input to preprocessing(actual Source buffer)
  71.    * (b) Calculation of DCT2 using FFT is divided into three steps:
  72.    *                  Step1: Re-ordering of even and odd elements of input.
  73.    *                  Step2: Calculating FFT of the re-ordered input.
  74.    *                  Step3: Taking the real part of the product of FFT output and weights.
  75.    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
  76.    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
  77.    *                        where,
  78.    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
  79.    * (d) Multiplying the output with the normalizing factor sqrt(2/N).
  80.    */
  81.  
  82.         /*-------- Pre-processing ------------*/
  83.   /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
  84.   arm_mult_q15(pInlineBuffer, cosFact, pInlineBuffer, S->N);
  85.   arm_shift_q15(pInlineBuffer, 1, pInlineBuffer, S->N);
  86.  
  87.   /* ----------------------------------------------------------------
  88.    * Step1: Re-ordering of even and odd elements as
  89.    *             pState[i] =  pInlineBuffer[2*i] and
  90.    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
  91.    ---------------------------------------------------------------------*/
  92.  
  93.   /* pS1 initialized to pState */
  94.   pS1 = pState;
  95.  
  96.   /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
  97.   pS2 = pState + (S->N - 1U);
  98.  
  99.   /* pbuff initialized to input buffer */
  100.   pbuff = pInlineBuffer;
  101.  
  102.  
  103. #if defined (ARM_MATH_DSP)
  104.  
  105.   /* Run the below code for Cortex-M4 and Cortex-M3 */
  106.  
  107.   /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
  108.   i = (uint32_t) S->Nby2 >> 2U;
  109.  
  110.   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
  111.    ** a second loop below computes the remaining 1 to 3 samples. */
  112.   do
  113.   {
  114.     /* Re-ordering of even and odd elements */
  115.     /* pState[i] =  pInlineBuffer[2*i] */
  116.     *pS1++ = *pbuff++;
  117.     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
  118.     *pS2-- = *pbuff++;
  119.  
  120.     *pS1++ = *pbuff++;
  121.     *pS2-- = *pbuff++;
  122.  
  123.     *pS1++ = *pbuff++;
  124.     *pS2-- = *pbuff++;
  125.  
  126.     *pS1++ = *pbuff++;
  127.     *pS2-- = *pbuff++;
  128.  
  129.     /* Decrement the loop counter */
  130.     i--;
  131.   } while (i > 0U);
  132.  
  133.   /* pbuff initialized to input buffer */
  134.   pbuff = pInlineBuffer;
  135.  
  136.   /* pS1 initialized to pState */
  137.   pS1 = pState;
  138.  
  139.   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
  140.   i = (uint32_t) S->N >> 2U;
  141.  
  142.   /* Processing with loop unrolling 4 times as N is always multiple of 4.
  143.    * Compute 4 outputs at a time */
  144.   do
  145.   {
  146.     /* Writing the re-ordered output back to inplace input buffer */
  147.     *pbuff++ = *pS1++;
  148.     *pbuff++ = *pS1++;
  149.     *pbuff++ = *pS1++;
  150.     *pbuff++ = *pS1++;
  151.  
  152.     /* Decrement the loop counter */
  153.     i--;
  154.   } while (i > 0U);
  155.  
  156.  
  157.   /* ---------------------------------------------------------
  158.    *     Step2: Calculate RFFT for N-point input
  159.    * ---------------------------------------------------------- */
  160.   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
  161.   arm_rfft_q15(S->pRfft, pInlineBuffer, pState);
  162.  
  163.  /*----------------------------------------------------------------------
  164.   *  Step3: Multiply the FFT output with the weights.
  165.   *----------------------------------------------------------------------*/
  166.   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N);
  167.  
  168.   /* The output of complex multiplication is in 3.13 format.
  169.    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
  170.   arm_shift_q15(pState, 2, pState, S->N * 2);
  171.  
  172.   /* ----------- Post-processing ---------- */
  173.   /* DCT-IV can be obtained from DCT-II by the equation,
  174.    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
  175.    *       Hence, Y4(0) = Y2(0)/2  */
  176.   /* Getting only real part from the output and Converting to DCT-IV */
  177.  
  178.   /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
  179.   i = ((uint32_t) S->N - 1U) >> 2U;
  180.  
  181.   /* pbuff initialized to input buffer. */
  182.   pbuff = pInlineBuffer;
  183.  
  184.   /* pS1 initialized to pState */
  185.   pS1 = pState;
  186.  
  187.   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
  188.   in = *pS1++ >> 1U;
  189.   /* input buffer acts as inplace, so output values are stored in the input itself. */
  190.   *pbuff++ = in;
  191.  
  192.   /* pState pointer is incremented twice as the real values are located alternatively in the array */
  193.   pS1++;
  194.  
  195.   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
  196.    ** a second loop below computes the remaining 1 to 3 samples. */
  197.   do
  198.   {
  199.     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
  200.     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
  201.     in = *pS1++ - in;
  202.     *pbuff++ = in;
  203.     /* points to the next real value */
  204.     pS1++;
  205.  
  206.     in = *pS1++ - in;
  207.     *pbuff++ = in;
  208.     pS1++;
  209.  
  210.     in = *pS1++ - in;
  211.     *pbuff++ = in;
  212.     pS1++;
  213.  
  214.     in = *pS1++ - in;
  215.     *pbuff++ = in;
  216.     pS1++;
  217.  
  218.     /* Decrement the loop counter */
  219.     i--;
  220.   } while (i > 0U);
  221.  
  222.   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
  223.    ** No loop unrolling is used. */
  224.   i = ((uint32_t) S->N - 1U) % 0x4U;
  225.  
  226.   while (i > 0U)
  227.   {
  228.     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
  229.     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
  230.     in = *pS1++ - in;
  231.     *pbuff++ = in;
  232.     /* points to the next real value */
  233.     pS1++;
  234.  
  235.     /* Decrement the loop counter */
  236.     i--;
  237.   }
  238.  
  239.  
  240.    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
  241.  
  242.   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
  243.   i = (uint32_t) S->N >> 2U;
  244.  
  245.   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
  246.   pbuff = pInlineBuffer;
  247.  
  248.   /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
  249.   do
  250.   {
  251.     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
  252.     in = *pbuff;
  253.     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
  254.  
  255.     in = *pbuff;
  256.     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
  257.  
  258.     in = *pbuff;
  259.     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
  260.  
  261.     in = *pbuff;
  262.     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
  263.  
  264.     /* Decrement the loop counter */
  265.     i--;
  266.   } while (i > 0U);
  267.  
  268.  
  269. #else
  270.  
  271.   /* Run the below code for Cortex-M0 */
  272.  
  273.   /* Initializing the loop counter to N/2 */
  274.   i = (uint32_t) S->Nby2;
  275.  
  276.   do
  277.   {
  278.     /* Re-ordering of even and odd elements */
  279.     /* pState[i] =  pInlineBuffer[2*i] */
  280.     *pS1++ = *pbuff++;
  281.     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
  282.     *pS2-- = *pbuff++;
  283.  
  284.     /* Decrement the loop counter */
  285.     i--;
  286.   } while (i > 0U);
  287.  
  288.   /* pbuff initialized to input buffer */
  289.   pbuff = pInlineBuffer;
  290.  
  291.   /* pS1 initialized to pState */
  292.   pS1 = pState;
  293.  
  294.   /* Initializing the loop counter */
  295.   i = (uint32_t) S->N;
  296.  
  297.   do
  298.   {
  299.     /* Writing the re-ordered output back to inplace input buffer */
  300.     *pbuff++ = *pS1++;
  301.  
  302.     /* Decrement the loop counter */
  303.     i--;
  304.   } while (i > 0U);
  305.  
  306.  
  307.   /* ---------------------------------------------------------
  308.    *     Step2: Calculate RFFT for N-point input
  309.    * ---------------------------------------------------------- */
  310.   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
  311.   arm_rfft_q15(S->pRfft, pInlineBuffer, pState);
  312.  
  313.  /*----------------------------------------------------------------------
  314.   *  Step3: Multiply the FFT output with the weights.
  315.   *----------------------------------------------------------------------*/
  316.   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N);
  317.  
  318.   /* The output of complex multiplication is in 3.13 format.
  319.    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
  320.   arm_shift_q15(pState, 2, pState, S->N * 2);
  321.  
  322.   /* ----------- Post-processing ---------- */
  323.   /* DCT-IV can be obtained from DCT-II by the equation,
  324.    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
  325.    *       Hence, Y4(0) = Y2(0)/2  */
  326.   /* Getting only real part from the output and Converting to DCT-IV */
  327.  
  328.   /* Initializing the loop counter */
  329.   i = ((uint32_t) S->N - 1U);
  330.  
  331.   /* pbuff initialized to input buffer. */
  332.   pbuff = pInlineBuffer;
  333.  
  334.   /* pS1 initialized to pState */
  335.   pS1 = pState;
  336.  
  337.   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
  338.   in = *pS1++ >> 1U;
  339.   /* input buffer acts as inplace, so output values are stored in the input itself. */
  340.   *pbuff++ = in;
  341.  
  342.   /* pState pointer is incremented twice as the real values are located alternatively in the array */
  343.   pS1++;
  344.  
  345.   do
  346.   {
  347.     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
  348.     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
  349.     in = *pS1++ - in;
  350.     *pbuff++ = in;
  351.     /* points to the next real value */
  352.     pS1++;
  353.  
  354.     /* Decrement the loop counter */
  355.     i--;
  356.   } while (i > 0U);
  357.  
  358.    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
  359.  
  360.   /* Initializing the loop counter */
  361.   i = (uint32_t) S->N;
  362.  
  363.   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
  364.   pbuff = pInlineBuffer;
  365.  
  366.   do
  367.   {
  368.     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
  369.     in = *pbuff;
  370.     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
  371.  
  372.     /* Decrement the loop counter */
  373.     i--;
  374.   } while (i > 0U);
  375.  
  376. #endif /* #if defined (ARM_MATH_DSP) */
  377.  
  378. }
  379.  
  380. /**
  381.    * @} end of DCT4_IDCT4 group
  382.    */
  383.