Subversion Repositories AFRtranscoder

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 mjames 1
/* ----------------------------------------------------------------------
2
 * Project:      CMSIS DSP Library
3
 * Title:        arm_dct4_q31.c
4
 * Description:  Processing function of DCT4 & IDCT4 Q31
5
 *
6
 * $Date:        27. January 2017
7
 * $Revision:    V.1.5.1
8
 *
9
 * Target Processor: Cortex-M cores
10
 * -------------------------------------------------------------------- */
11
/*
12
 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
13
 *
14
 * SPDX-License-Identifier: Apache-2.0
15
 *
16
 * Licensed under the Apache License, Version 2.0 (the License); you may
17
 * not use this file except in compliance with the License.
18
 * You may obtain a copy of the License at
19
 *
20
 * www.apache.org/licenses/LICENSE-2.0
21
 *
22
 * Unless required by applicable law or agreed to in writing, software
23
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25
 * See the License for the specific language governing permissions and
26
 * limitations under the License.
27
 */
28
 
29
#include "arm_math.h"
30
 
31
/**
32
 * @addtogroup DCT4_IDCT4
33
 * @{
34
 */
35
 
36
/**
37
 * @brief Processing function for the Q31 DCT4/IDCT4.
38
 * @param[in]       *S             points to an instance of the Q31 DCT4 structure.
39
 * @param[in]       *pState        points to state buffer.
40
 * @param[in,out]   *pInlineBuffer points to the in-place input and output buffer.
41
 * @return none.
42
 * \par Input and output formats:
43
 * Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process,
44
 * as the conversion from DCT2 to DCT4 involves one subtraction.
45
 * Internally inputs are downscaled in the RFFT process function to avoid overflows.
46
 * Number of bits downscaled, depends on the size of the transform.
47
 * The input and output formats for different DCT sizes and number of bits to upscale are mentioned in the table below:
48
 *
49
 * \image html dct4FormatsQ31Table.gif
50
 */
51
 
52
void arm_dct4_q31(
53
  const arm_dct4_instance_q31 * S,
54
  q31_t * pState,
55
  q31_t * pInlineBuffer)
56
{
57
  uint16_t i;                                    /* Loop counter */
58
  q31_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
59
  q31_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
60
  q31_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
61
  q31_t in;                                      /* Temporary variable */
62
 
63
 
64
  /* DCT4 computation involves DCT2 (which is calculated using RFFT)
65
   * along with some pre-processing and post-processing.
66
   * Computational procedure is explained as follows:
67
   * (a) Pre-processing involves multiplying input with cos factor,
68
   *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
69
   *              where,
70
   *                 r(n) -- output of preprocessing
71
   *                 u(n) -- input to preprocessing(actual Source buffer)
72
   * (b) Calculation of DCT2 using FFT is divided into three steps:
73
   *                  Step1: Re-ordering of even and odd elements of input.
74
   *                  Step2: Calculating FFT of the re-ordered input.
75
   *                  Step3: Taking the real part of the product of FFT output and weights.
76
   * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
77
   *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
78
   *                        where,
79
   *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
80
   * (d) Multiplying the output with the normalizing factor sqrt(2/N).
81
   */
82
 
83
        /*-------- Pre-processing ------------*/
84
  /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
85
  arm_mult_q31(pInlineBuffer, cosFact, pInlineBuffer, S->N);
86
  arm_shift_q31(pInlineBuffer, 1, pInlineBuffer, S->N);
87
 
88
  /* ----------------------------------------------------------------
89
   * Step1: Re-ordering of even and odd elements as
90
   *             pState[i] =  pInlineBuffer[2*i] and
91
   *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
92
   ---------------------------------------------------------------------*/
93
 
94
  /* pS1 initialized to pState */
95
  pS1 = pState;
96
 
97
  /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
98
  pS2 = pState + (S->N - 1U);
99
 
100
  /* pbuff initialized to input buffer */
101
  pbuff = pInlineBuffer;
102
 
103
#if defined (ARM_MATH_DSP)
104
 
105
  /* Run the below code for Cortex-M4 and Cortex-M3 */
106
 
107
  /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
108
  i = S->Nby2 >> 2U;
109
 
110
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
111
   ** a second loop below computes the remaining 1 to 3 samples. */
112
  do
113
  {
114
    /* Re-ordering of even and odd elements */
115
    /* pState[i] =  pInlineBuffer[2*i] */
116
    *pS1++ = *pbuff++;
117
    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
118
    *pS2-- = *pbuff++;
119
 
120
    *pS1++ = *pbuff++;
121
    *pS2-- = *pbuff++;
122
 
123
    *pS1++ = *pbuff++;
124
    *pS2-- = *pbuff++;
125
 
126
    *pS1++ = *pbuff++;
127
    *pS2-- = *pbuff++;
128
 
129
    /* Decrement the loop counter */
130
    i--;
131
  } while (i > 0U);
132
 
133
  /* pbuff initialized to input buffer */
134
  pbuff = pInlineBuffer;
135
 
136
  /* pS1 initialized to pState */
137
  pS1 = pState;
138
 
139
  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
140
  i = S->N >> 2U;
141
 
142
  /* Processing with loop unrolling 4 times as N is always multiple of 4.
143
   * Compute 4 outputs at a time */
144
  do
145
  {
146
    /* Writing the re-ordered output back to inplace input buffer */
147
    *pbuff++ = *pS1++;
148
    *pbuff++ = *pS1++;
149
    *pbuff++ = *pS1++;
150
    *pbuff++ = *pS1++;
151
 
152
    /* Decrement the loop counter */
153
    i--;
154
  } while (i > 0U);
155
 
156
 
157
  /* ---------------------------------------------------------
158
   *     Step2: Calculate RFFT for N-point input
159
   * ---------------------------------------------------------- */
160
  /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
161
  arm_rfft_q31(S->pRfft, pInlineBuffer, pState);
162
 
163
  /*----------------------------------------------------------------------
164
   *  Step3: Multiply the FFT output with the weights.
165
   *----------------------------------------------------------------------*/
166
  arm_cmplx_mult_cmplx_q31(pState, weights, pState, S->N);
167
 
168
  /* The output of complex multiplication is in 3.29 format.
169
   * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
170
  arm_shift_q31(pState, 2, pState, S->N * 2);
171
 
172
  /* ----------- Post-processing ---------- */
173
  /* DCT-IV can be obtained from DCT-II by the equation,
174
   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
175
   *       Hence, Y4(0) = Y2(0)/2  */
176
  /* Getting only real part from the output and Converting to DCT-IV */
177
 
178
  /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
179
  i = (S->N - 1U) >> 2U;
180
 
181
  /* pbuff initialized to input buffer. */
182
  pbuff = pInlineBuffer;
183
 
184
  /* pS1 initialized to pState */
185
  pS1 = pState;
186
 
187
  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
188
  in = *pS1++ >> 1U;
189
  /* input buffer acts as inplace, so output values are stored in the input itself. */
190
  *pbuff++ = in;
191
 
192
  /* pState pointer is incremented twice as the real values are located alternatively in the array */
193
  pS1++;
194
 
195
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
196
   ** a second loop below computes the remaining 1 to 3 samples. */
197
  do
198
  {
199
    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
200
    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
201
    in = *pS1++ - in;
202
    *pbuff++ = in;
203
    /* points to the next real value */
204
    pS1++;
205
 
206
    in = *pS1++ - in;
207
    *pbuff++ = in;
208
    pS1++;
209
 
210
    in = *pS1++ - in;
211
    *pbuff++ = in;
212
    pS1++;
213
 
214
    in = *pS1++ - in;
215
    *pbuff++ = in;
216
    pS1++;
217
 
218
    /* Decrement the loop counter */
219
    i--;
220
  } while (i > 0U);
221
 
222
  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
223
   ** No loop unrolling is used. */
224
  i = (S->N - 1U) % 0x4U;
225
 
226
  while (i > 0U)
227
  {
228
    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
229
    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
230
    in = *pS1++ - in;
231
    *pbuff++ = in;
232
    /* points to the next real value */
233
    pS1++;
234
 
235
    /* Decrement the loop counter */
236
    i--;
237
  }
238
 
239
 
240
        /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
241
 
242
  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
243
  i = S->N >> 2U;
244
 
245
  /* pbuff initialized to the pInlineBuffer(now contains the output values) */
246
  pbuff = pInlineBuffer;
247
 
248
  /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
249
  do
250
  {
251
    /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
252
    in = *pbuff;
253
    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
254
 
255
    in = *pbuff;
256
    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
257
 
258
    in = *pbuff;
259
    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
260
 
261
    in = *pbuff;
262
    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
263
 
264
    /* Decrement the loop counter */
265
    i--;
266
  } while (i > 0U);
267
 
268
 
269
#else
270
 
271
  /* Run the below code for Cortex-M0 */
272
 
273
  /* Initializing the loop counter to N/2 */
274
  i = S->Nby2;
275
 
276
  do
277
  {
278
    /* Re-ordering of even and odd elements */
279
    /* pState[i] =  pInlineBuffer[2*i] */
280
    *pS1++ = *pbuff++;
281
    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
282
    *pS2-- = *pbuff++;
283
 
284
    /* Decrement the loop counter */
285
    i--;
286
  } while (i > 0U);
287
 
288
  /* pbuff initialized to input buffer */
289
  pbuff = pInlineBuffer;
290
 
291
  /* pS1 initialized to pState */
292
  pS1 = pState;
293
 
294
  /* Initializing the loop counter */
295
  i = S->N;
296
 
297
  do
298
  {
299
    /* Writing the re-ordered output back to inplace input buffer */
300
    *pbuff++ = *pS1++;
301
 
302
    /* Decrement the loop counter */
303
    i--;
304
  } while (i > 0U);
305
 
306
 
307
  /* ---------------------------------------------------------
308
   *     Step2: Calculate RFFT for N-point input
309
   * ---------------------------------------------------------- */
310
  /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
311
  arm_rfft_q31(S->pRfft, pInlineBuffer, pState);
312
 
313
  /*----------------------------------------------------------------------
314
   *  Step3: Multiply the FFT output with the weights.
315
   *----------------------------------------------------------------------*/
316
  arm_cmplx_mult_cmplx_q31(pState, weights, pState, S->N);
317
 
318
  /* The output of complex multiplication is in 3.29 format.
319
   * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
320
  arm_shift_q31(pState, 2, pState, S->N * 2);
321
 
322
  /* ----------- Post-processing ---------- */
323
  /* DCT-IV can be obtained from DCT-II by the equation,
324
   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
325
   *       Hence, Y4(0) = Y2(0)/2  */
326
  /* Getting only real part from the output and Converting to DCT-IV */
327
 
328
  /* pbuff initialized to input buffer. */
329
  pbuff = pInlineBuffer;
330
 
331
  /* pS1 initialized to pState */
332
  pS1 = pState;
333
 
334
  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
335
  in = *pS1++ >> 1U;
336
  /* input buffer acts as inplace, so output values are stored in the input itself. */
337
  *pbuff++ = in;
338
 
339
  /* pState pointer is incremented twice as the real values are located alternatively in the array */
340
  pS1++;
341
 
342
  /* Initializing the loop counter */
343
  i = (S->N - 1U);
344
 
345
  while (i > 0U)
346
  {
347
    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
348
    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
349
    in = *pS1++ - in;
350
    *pbuff++ = in;
351
    /* points to the next real value */
352
    pS1++;
353
 
354
    /* Decrement the loop counter */
355
    i--;
356
  }
357
 
358
 
359
        /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
360
 
361
  /* Initializing the loop counter */
362
  i = S->N;
363
 
364
  /* pbuff initialized to the pInlineBuffer(now contains the output values) */
365
  pbuff = pInlineBuffer;
366
 
367
  do
368
  {
369
    /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
370
    in = *pbuff;
371
    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
372
 
373
    /* Decrement the loop counter */
374
    i--;
375
  } while (i > 0U);
376
 
377
#endif /* #if defined (ARM_MATH_DSP) */
378
 
379
}
380
 
381
/**
382
   * @} end of DCT4_IDCT4 group
383
   */