RetroArch
xdsp.h
Go to the documentation of this file.
1 /*-========================================================================-_
2  | - XDSP - |
3  | Copyright (c) Microsoft Corporation. All rights reserved. |
4  |~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
5  |PROJECT: XDSP MODEL: Unmanaged User-mode |
6  |VERSION: 1.2 EXCEPT: No Exceptions |
7  |CLASS: N / A MINREQ: WinXP, Xbox360 |
8  |BASE: N / A DIALECT: MSC++ 14.00 |
9  |>------------------------------------------------------------------------<|
10  | DUTY: DSP functions with CPU extension specific optimizations |
11  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
12  NOTES:
13  1. Definition of terms:
14  DSP: Digital Signal Processing.
15  FFT: Fast Fourier Transform.
16  Frame: A block of samples, one per channel,
17  to be played simultaneously.
18 
19  2. All buffer parameters must be 16-byte aligned.
20 
21  3. All FFT functions support only FLOAT32 audio. */
22 
23 #pragma once
24 //--------------<D-E-F-I-N-I-T-I-O-N-S>-------------------------------------//
25 #include <windef.h> // general windows types
26 #include <math.h> // trigonometric functions
27 #if defined(_XBOX) // SIMD intrinsics
28  #include <ppcintrinsics.h>
29 #else
30  #include <emmintrin.h>
31 #endif
32 
33 
34 //--------------<M-A-C-R-O-S>-----------------------------------------------//
35 // assertion
36 #if !defined(DSPASSERT)
37  #if DBG
38  #define DSPASSERT(exp) if (!(exp)) { OutputDebugStringA("XDSP ASSERT: " #exp ", {" __FUNCTION__ "}\n"); __debugbreak(); }
39  #else
40  #define DSPASSERT(exp) __assume(exp)
41  #endif
42 #endif
43 
44 // true if n is a power of 2
45 #if !defined(ISPOWEROF2)
46  #define ISPOWEROF2(n) ( ((n)&((n)-1)) == 0 && (n) != 0 )
47 #endif
48 
49 
50 //--------------<H-E-L-P-E-R-S>---------------------------------------------//
51 namespace XDSP {
52 #pragma warning(push)
53 #pragma warning(disable: 4328 4640) // disable "indirection alignment of formal parameter", "construction of local static object is not thread-safe" compile warnings
54 
55 
56 // Helper functions, used by the FFT functions.
57 // The application need not call them directly.
58 
59  // primitive types
60  typedef __m128 XVECTOR;
61  typedef XVECTOR& XVECTORREF;
62  typedef const XVECTOR& XVECTORREFC;
63 
64 
65  // Parallel multiplication of four complex numbers, assuming
66  // real and imaginary values are stored in separate vectors.
67  __forceinline void vmulComplex (__out XVECTORREF rResult, __out XVECTORREF iResult, __in XVECTORREFC r1, __in XVECTORREFC i1, __in XVECTORREFC r2, __in XVECTORREFC i2)
68  {
69  // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
70  XVECTOR vi1i2 = _mm_mul_ps(i1, i2);
71  XVECTOR vr1r2 = _mm_mul_ps(r1, r2);
72  XVECTOR vr1i2 = _mm_mul_ps(r1, i2);
73  XVECTOR vr2i1 = _mm_mul_ps(r2, i1);
74  rResult = _mm_sub_ps(vr1r2, vi1i2); // real: (r1*r2 - i1*i2)
75  iResult = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1)
76  }
77  __forceinline void vmulComplex (__inout XVECTORREF r1, __inout XVECTORREF i1, __in XVECTORREFC r2, __in XVECTORREFC i2)
78  {
79  // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
80  XVECTOR vi1i2 = _mm_mul_ps(i1, i2);
81  XVECTOR vr1r2 = _mm_mul_ps(r1, r2);
82  XVECTOR vr1i2 = _mm_mul_ps(r1, i2);
83  XVECTOR vr2i1 = _mm_mul_ps(r2, i1);
84  r1 = _mm_sub_ps(vr1r2, vi1i2); // real: (r1*r2 - i1*i2)
85  i1 = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1)
86  }
87 
88 
89  // Radix-4 decimation-in-time FFT butterfly.
90  // This version assumes that all four elements of the butterfly are
91  // adjacent in a single vector.
92  //
93  // Compute the product of the complex input vector and the
94  // 4-element DFT matrix:
95  // | 1 1 1 1 | | (r1X,i1X) |
96  // | 1 -j -1 j | | (r1Y,i1Y) |
97  // | 1 -1 1 -1 | | (r1Z,i1Z) |
98  // | 1 j -1 -j | | (r1W,i1W) |
99  //
100  // This matrix can be decomposed into two simpler ones to reduce the
101  // number of additions needed. The decomposed matrices look like this:
102  // | 1 0 1 0 | | 1 0 1 0 |
103  // | 0 1 0 -j | | 1 0 -1 0 |
104  // | 1 0 -1 0 | | 0 1 0 1 |
105  // | 0 1 0 j | | 0 1 0 -1 |
106  //
107  // Combine as follows:
108  // | 1 0 1 0 | | (r1X,i1X) | | (r1X + r1Z, i1X + i1Z) |
109  // Temp = | 1 0 -1 0 | * | (r1Y,i1Y) | = | (r1X - r1Z, i1X - i1Z) |
110  // | 0 1 0 1 | | (r1Z,i1Z) | | (r1Y + r1W, i1Y + i1W) |
111  // | 0 1 0 -1 | | (r1W,i1W) | | (r1Y - r1W, i1Y - i1W) |
112  //
113  // | 1 0 1 0 | | (rTempX,iTempX) | | (rTempX + rTempZ, iTempX + iTempZ) |
114  // Result = | 0 1 0 -j | * | (rTempY,iTempY) | = | (rTempY + iTempW, iTempY - rTempW) |
115  // | 1 0 -1 0 | | (rTempZ,iTempZ) | | (rTempX - rTempZ, iTempX - iTempZ) |
116  // | 0 1 0 j | | (rTempW,iTempW) | | (rTempY - iTempW, iTempY + rTempW) |
117  __forceinline void ButterflyDIT4_1 (__inout XVECTORREF r1, __inout XVECTORREF i1)
118  {
119  // sign constants for radix-4 butterflies
120  const static XVECTOR vDFT4SignBits1 = { 0.0f, -0.0f, 0.0f, -0.0f };
121  const static XVECTOR vDFT4SignBits2 = { 0.0f, 0.0f, -0.0f, -0.0f };
122  const static XVECTOR vDFT4SignBits3 = { 0.0f, -0.0f, -0.0f, 0.0f };
123 
124 
125  // calculating Temp
126  XVECTOR rTemp = _mm_add_ps( _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(1, 1, 0, 0)), // [r1X| r1X|r1Y| r1Y] +
127  _mm_xor_ps(_mm_shuffle_ps(r1, r1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [r1Z|-r1Z|r1W|-r1W]
128  XVECTOR iTemp = _mm_add_ps( _mm_shuffle_ps(i1, i1, _MM_SHUFFLE(1, 1, 0, 0)), // [i1X| i1X|i1Y| i1Y] +
129  _mm_xor_ps(_mm_shuffle_ps(i1, i1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [i1Z|-i1Z|i1W|-i1W]
130 
131  // calculating Result
132  XVECTOR rZrWiZiW = _mm_shuffle_ps(rTemp, iTemp, _MM_SHUFFLE(3, 2, 3, 2)); // [rTempZ|rTempW|iTempZ|iTempW]
133  XVECTOR rZiWrZiW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(3, 0, 3, 0)); // [rTempZ|iTempW|rTempZ|iTempW]
134  XVECTOR iZrWiZrW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(1, 2, 1, 2)); // [rTempZ|iTempW|rTempZ|iTempW]
135  r1 = _mm_add_ps( _mm_shuffle_ps(rTemp, rTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [rTempX| rTempY| rTempX| rTempY] +
136  _mm_xor_ps(rZiWrZiW, vDFT4SignBits2) ); // [rTempZ| iTempW|-rTempZ|-iTempW]
137  i1 = _mm_add_ps( _mm_shuffle_ps(iTemp, iTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [iTempX| iTempY| iTempX| iTempY] +
138  _mm_xor_ps(iZrWiZrW, vDFT4SignBits3) ); // [iTempZ|-rTempW|-iTempZ| rTempW]
139  }
140 
141  // Radix-4 decimation-in-time FFT butterfly.
142  // This version assumes that elements of the butterfly are
143  // in different vectors, so that each vector in the input
144  // contains elements from four different butterflies.
145  // The four separate butterflies are processed in parallel.
146  //
147  // The calculations here are the same as the ones in the single-vector
148  // radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W)
149  // they are done in parallel on sixteen independent complex values.
150  // There is no interdependence between the vector elements:
151  // | 1 0 1 0 | | (rIn0,iIn0) | | (rIn0 + rIn2, iIn0 + iIn2) |
152  // | 1 0 -1 0 | * | (rIn1,iIn1) | = Temp = | (rIn0 - rIn2, iIn0 - iIn2) |
153  // | 0 1 0 1 | | (rIn2,iIn2) | | (rIn1 + rIn3, iIn1 + iIn3) |
154  // | 0 1 0 -1 | | (rIn3,iIn3) | | (rIn1 - rIn3, iIn1 - iIn3) |
155  //
156  // | 1 0 1 0 | | (rTemp0,iTemp0) | | (rTemp0 + rTemp2, iTemp0 + iTemp2) |
157  // Result = | 0 1 0 -j | * | (rTemp1,iTemp1) | = | (rTemp1 + iTemp3, iTemp1 - rTemp3) |
158  // | 1 0 -1 0 | | (rTemp2,iTemp2) | | (rTemp0 - rTemp2, iTemp0 - iTemp2) |
159  // | 0 1 0 j | | (rTemp3,iTemp3) | | (rTemp1 - iTemp3, iTemp1 + rTemp3) |
160  __forceinline void ButterflyDIT4_4 (__inout XVECTORREF r0,
161  __inout XVECTORREF r1,
162  __inout XVECTORREF r2,
163  __inout XVECTORREF r3,
164  __inout XVECTORREF i0,
165  __inout XVECTORREF i1,
166  __inout XVECTORREF i2,
167  __inout XVECTORREF i3,
168  __in_ecount(uStride*4) const XVECTOR* __restrict pUnityTableReal,
169  __in_ecount(uStride*4) const XVECTOR* __restrict pUnityTableImaginary,
170  const UINT32 uStride, const BOOL fLast)
171  {
172  DSPASSERT(pUnityTableReal != NULL);
173  DSPASSERT(pUnityTableImaginary != NULL);
174  DSPASSERT((UINT_PTR)pUnityTableReal % 16 == 0);
175  DSPASSERT((UINT_PTR)pUnityTableImaginary % 16 == 0);
176  DSPASSERT(ISPOWEROF2(uStride));
177 
178  XVECTOR rTemp0, rTemp1, rTemp2, rTemp3, rTemp4, rTemp5, rTemp6, rTemp7;
179  XVECTOR iTemp0, iTemp1, iTemp2, iTemp3, iTemp4, iTemp5, iTemp6, iTemp7;
180 
181 
182  // calculating Temp
183  rTemp0 = _mm_add_ps(r0, r2); iTemp0 = _mm_add_ps(i0, i2);
184  rTemp2 = _mm_add_ps(r1, r3); iTemp2 = _mm_add_ps(i1, i3);
185  rTemp1 = _mm_sub_ps(r0, r2); iTemp1 = _mm_sub_ps(i0, i2);
186  rTemp3 = _mm_sub_ps(r1, r3); iTemp3 = _mm_sub_ps(i1, i3);
187  rTemp4 = _mm_add_ps(rTemp0, rTemp2); iTemp4 = _mm_add_ps(iTemp0, iTemp2);
188  rTemp5 = _mm_add_ps(rTemp1, iTemp3); iTemp5 = _mm_sub_ps(iTemp1, rTemp3);
189  rTemp6 = _mm_sub_ps(rTemp0, rTemp2); iTemp6 = _mm_sub_ps(iTemp0, iTemp2);
190  rTemp7 = _mm_sub_ps(rTemp1, iTemp3); iTemp7 = _mm_add_ps(iTemp1, rTemp3);
191 
192  // calculating Result
193  // vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial
194  vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride], pUnityTableImaginary[uStride]);
195  vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride*2], pUnityTableImaginary[uStride*2]);
196  vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride*3], pUnityTableImaginary[uStride*3]);
197  if (fLast) {
198  ButterflyDIT4_1(rTemp4, iTemp4);
199  ButterflyDIT4_1(rTemp5, iTemp5);
200  ButterflyDIT4_1(rTemp6, iTemp6);
201  ButterflyDIT4_1(rTemp7, iTemp7);
202  }
203 
204 
205  r0 = rTemp4; i0 = iTemp4;
206  r1 = rTemp5; i1 = iTemp5;
207  r2 = rTemp6; i2 = iTemp6;
208  r3 = rTemp7; i3 = iTemp7;
209  }
210 
211 //--------------<F-U-N-C-T-I-O-N-S>-----------------------------------------//
212 
214  // DESCRIPTION:
215  // 4-sample FFT.
216  //
217  // PARAMETERS:
218  // pReal - [inout] real components, must have at least uCount elements
219  // pImaginary - [inout] imaginary components, must have at least uCount elements
220  // uCount - [in] number of FFT iterations
221  //
222  // RETURN VALUE:
223  // void
225  __forceinline void FFT4 (__inout_ecount(uCount) XVECTOR* __restrict pReal, __inout_ecount(uCount) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
226  {
227  DSPASSERT(pReal != NULL);
228  DSPASSERT(pImaginary != NULL);
229  DSPASSERT((UINT_PTR)pReal % 16 == 0);
230  DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
231  DSPASSERT(ISPOWEROF2(uCount));
232 
233  for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
234  ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]);
235  }
236  }
237 
238 
239 
241  // DESCRIPTION:
242  // 8-sample FFT.
243  //
244  // PARAMETERS:
245  // pReal - [inout] real components, must have at least uCount*2 elements
246  // pImaginary - [inout] imaginary components, must have at least uCount*2 elements
247  // uCount - [in] number of FFT iterations
248  //
249  // RETURN VALUE:
250  // void
252  __forceinline void FFT8 (__inout_ecount(uCount*2) XVECTOR* __restrict pReal, __inout_ecount(uCount*2) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
253  {
254  DSPASSERT(pReal != NULL);
255  DSPASSERT(pImaginary != NULL);
256  DSPASSERT((UINT_PTR)pReal % 16 == 0);
257  DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
258  DSPASSERT(ISPOWEROF2(uCount));
259 
260  static XVECTOR wr1 = { 1.0f, 0.70710677f, 0.0f, -0.70710677f };
261  static XVECTOR wi1 = { 0.0f, -0.70710677f, -1.0f, -0.70710677f };
262  static XVECTOR wr2 = { -1.0f, -0.70710677f, 0.0f, 0.70710677f };
263  static XVECTOR wi2 = { 0.0f, 0.70710677f, 1.0f, 0.70710677f };
264 
265 
266  for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
267  XVECTOR* __restrict pR = pReal + uIndex*2;
268  XVECTOR* __restrict pI = pImaginary + uIndex*2;
269 
270  XVECTOR oddsR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(3, 1, 3, 1));
271  XVECTOR evensR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(2, 0, 2, 0));
272  XVECTOR oddsI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(3, 1, 3, 1));
273  XVECTOR evensI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(2, 0, 2, 0));
274  ButterflyDIT4_1(oddsR, oddsI);
275  ButterflyDIT4_1(evensR, evensI);
276 
277  XVECTOR r, i;
278  vmulComplex(r, i, oddsR, oddsI, wr1, wi1);
279  pR[0] = _mm_add_ps(evensR, r);
280  pI[0] = _mm_add_ps(evensI, i);
281 
282  vmulComplex(r, i, oddsR, oddsI, wr2, wi2);
283  pR[1] = _mm_add_ps(evensR, r);
284  pI[1] = _mm_add_ps(evensI, i);
285  }
286  }
287 
288 
289 
291  // DESCRIPTION:
292  // 16-sample FFT.
293  //
294  // PARAMETERS:
295  // pReal - [inout] real components, must have at least uCount*4 elements
296  // pImaginary - [inout] imaginary components, must have at least uCount*4 elements
297  // uCount - [in] number of FFT iterations
298  //
299  // RETURN VALUE:
300  // void
302  __forceinline void FFT16 (__inout_ecount(uCount*4) XVECTOR* __restrict pReal, __inout_ecount(uCount*4) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
303  {
304  DSPASSERT(pReal != NULL);
305  DSPASSERT(pImaginary != NULL);
306  DSPASSERT((UINT_PTR)pReal % 16 == 0);
307  DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
308  DSPASSERT(ISPOWEROF2(uCount));
309 
310  XVECTOR aUnityTableReal[4] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.92387950f, 0.70710677f, 0.38268343f, 1.0f, 0.70710677f, -4.3711388e-008f, -0.70710677f, 1.0f, 0.38268343f, -0.70710677f, -0.92387950f };
311  XVECTOR aUnityTableImaginary[4] = { -0.0f, -0.0f, -0.0f, -0.0f, -0.0f, -0.38268343f, -0.70710677f, -0.92387950f, -0.0f, -0.70710677f, -1.0f, -0.70710677f, -0.0f, -0.92387950f, -0.70710677f, 0.38268343f };
312 
313 
314  for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
315  ButterflyDIT4_4(pReal[uIndex*4],
316  pReal[uIndex*4 + 1],
317  pReal[uIndex*4 + 2],
318  pReal[uIndex*4 + 3],
319  pImaginary[uIndex*4],
320  pImaginary[uIndex*4 + 1],
321  pImaginary[uIndex*4 + 2],
322  pImaginary[uIndex*4 + 3],
323  aUnityTableReal,
324  aUnityTableImaginary,
325  1, TRUE);
326  }
327  }
328 
329 
330 
332  // DESCRIPTION:
333  // 2^N-sample FFT.
334  //
335  // REMARKS:
336  // For FFTs length 16 and below, call FFT16(), FFT8(), or FFT4().
337  //
338  // PARAMETERS:
339  // pReal - [inout] real components, must have at least (uLength*uCount)/4 elements
340  // pImaginary - [inout] imaginary components, must have at least (uLength*uCount)/4 elements
341  // pUnityTable - [in] unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable()
342  // uLength - [in] FFT length in samples, must be a power of 2 > 16
343  // uCount - [in] number of FFT iterations
344  //
345  // RETURN VALUE:
346  // void
348  inline void FFT (__inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pReal, __inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pImaginary, __in_ecount(uLength*uCount) const XVECTOR* __restrict pUnityTable, const UINT32 uLength, const UINT32 uCount=1)
349  {
350  DSPASSERT(pReal != NULL);
351  DSPASSERT(pImaginary != NULL);
352  DSPASSERT(pUnityTable != NULL);
353  DSPASSERT((UINT_PTR)pReal % 16 == 0);
354  DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
355  DSPASSERT((UINT_PTR)pUnityTable % 16 == 0);
356  DSPASSERT(uLength > 16);
357  DSPASSERT(ISPOWEROF2(uLength));
358  DSPASSERT(ISPOWEROF2(uCount));
359 
360  const XVECTOR* __restrict pUnityTableReal = pUnityTable;
361  const XVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength>>2);
362  const UINT32 uTotal = uCount * uLength;
363  const UINT32 uTotal_vectors = uTotal >> 2;
364  const UINT32 uStage_vectors = uLength >> 2;
365  const UINT32 uStage_vectors_mask = uStage_vectors - 1;
366  const UINT32 uStride = uLength >> 4; // stride between butterfly elements
367  const UINT32 uStrideMask = uStride - 1;
368  const UINT32 uStride2 = uStride * 2;
369  const UINT32 uStride3 = uStride * 3;
370  const UINT32 uStrideInvMask = ~uStrideMask;
371 
372 
373  for (UINT32 uIndex=0; uIndex<(uTotal_vectors>>2); ++uIndex) {
374  const UINT32 n = ((uIndex & uStrideInvMask) << 2) + (uIndex & uStrideMask);
375  ButterflyDIT4_4(pReal[n],
376  pReal[n + uStride],
377  pReal[n + uStride2],
378  pReal[n + uStride3],
379  pImaginary[n ],
380  pImaginary[n + uStride],
381  pImaginary[n + uStride2],
382  pImaginary[n + uStride3],
383  pUnityTableReal + (n & uStage_vectors_mask),
384  pUnityTableImaginary + (n & uStage_vectors_mask),
385  uStride, FALSE);
386  }
387 
388 
389  if (uLength > 16*4) {
390  FFT(pReal, pImaginary, pUnityTable+(uLength>>1), uLength>>2, uCount*4);
391  } else if (uLength == 16*4) {
392  FFT16(pReal, pImaginary, uCount*4);
393  } else if (uLength == 8*4) {
394  FFT8(pReal, pImaginary, uCount*4);
395  } else if (uLength == 4*4) {
396  FFT4(pReal, pImaginary, uCount*4);
397  }
398  }
399 
400 //--------------------------------------------------------------------------//
402  // DESCRIPTION:
403  // Initializes unity roots lookup table used by FFT functions.
404  // Once initialized, the table need not be initialized again unless a
405  // different FFT length is desired.
406  //
407  // REMARKS:
408  // The unity tables of FFT length 16 and below are hard coded into the
409  // respective FFT functions and so need not be initialized.
410  //
411  // PARAMETERS:
412  // pUnityTable - [out] unity table, receives unity roots lookup table, must have at least uLength elements
413  // uLength - [in] FFT length in frames, must be a power of 2 > 16
414  //
415  // RETURN VALUE:
416  // void
418 inline void FFTInitializeUnityTable (__out_ecount(uLength) XVECTOR* __restrict pUnityTable, UINT32 uLength)
419 {
420  DSPASSERT(pUnityTable != NULL);
421  DSPASSERT(uLength > 16);
422  DSPASSERT(ISPOWEROF2(uLength));
423 
424  FLOAT32* __restrict pfUnityTable = (FLOAT32* __restrict)pUnityTable;
425 
426 
427  // initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16... > 16
428  do {
429  FLOAT32 flStep = 6.283185307f / uLength; // 2PI / FFT length
430  uLength >>= 2;
431 
432  // pUnityTable[0 to uLength*4-1] contains real components for current FFT length
433  // pUnityTable[uLength*4 to uLength*8-1] contains imaginary components for current FFT length
434  for (UINT32 i=0; i<4; ++i) {
435  for (UINT32 j=0; j<uLength; ++j) {
436  UINT32 uIndex = (i*uLength) + j;
437  pfUnityTable[uIndex] = cosf(FLOAT32(i)*FLOAT32(j)*flStep); // real component
438  pfUnityTable[uIndex + uLength*4] = -sinf(FLOAT32(i)*FLOAT32(j)*flStep); // imaginary component
439  }
440  }
441  pfUnityTable += uLength*8;
442  } while (uLength > 16);
443 }
444 
445 
447  // DESCRIPTION:
448  // The FFT functions generate output in bit reversed order.
449  // Use this function to re-arrange them into order of increasing frequency.
450  //
451  // REMARKS:
452  //
453  // PARAMETERS:
454  // pOutput - [out] output buffer, receives samples in order of increasing frequency, cannot overlap pInput, must have at least (1<<uLog2Length)/4 elements
455  // pInput - [in] input buffer, samples in bit reversed order as generated by FFT functions, cannot overlap pOutput, must have at least (1<<uLog2Length)/4 elements
456  // uLog2Length - [in] LOG (base 2) of FFT length in samples, must be >= 2
457  //
458  // RETURN VALUE:
459  // void
461 inline void FFTUnswizzle (__out_ecount((1<<uLog2Length)/4) XVECTOR* __restrict pOutput, __in_ecount((1<<uLog2Length)/4) const XVECTOR* __restrict pInput, const UINT32 uLog2Length)
462 {
463  DSPASSERT(pOutput != NULL);
464  DSPASSERT(pInput != NULL);
465  DSPASSERT(uLog2Length >= 2);
466 
467  FLOAT32* __restrict pfOutput = (FLOAT32* __restrict)pOutput;
468  const FLOAT32* __restrict pfInput = (const FLOAT32* __restrict)pInput;
469  const UINT32 uLength = UINT32(1 << uLog2Length);
470 
471 
472  if ((uLog2Length & 0x1) == 0) {
473  // even powers of two
474  for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
475  UINT32 n = uIndex;
476  n = ( (n & 0xcccccccc) >> 2 ) | ( (n & 0x33333333) << 2 );
477  n = ( (n & 0xf0f0f0f0) >> 4 ) | ( (n & 0x0f0f0f0f) << 4 );
478  n = ( (n & 0xff00ff00) >> 8 ) | ( (n & 0x00ff00ff) << 8 );
479  n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
480  n >>= (32 - uLog2Length);
481  pfOutput[n] = pfInput[uIndex];
482  }
483  } else {
484  // odd powers of two
485  for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
486  UINT32 n = (uIndex>>3);
487  n = ( (n & 0xcccccccc) >> 2 ) | ( (n & 0x33333333) << 2 );
488  n = ( (n & 0xf0f0f0f0) >> 4 ) | ( (n & 0x0f0f0f0f) << 4 );
489  n = ( (n & 0xff00ff00) >> 8 ) | ( (n & 0x00ff00ff) << 8 );
490  n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
491  n >>= (32 - (uLog2Length-3));
492  n |= ((uIndex & 0x7) << (uLog2Length - 3));
493  pfOutput[n] = pfInput[uIndex];
494  }
495  }
496 }
497 
498 
500  // DESCRIPTION:
501  // Convert complex components to polar form.
502  //
503  // PARAMETERS:
504  // pOutput - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements
505  // pInputReal - [in] input buffer (real components), must have at least uLength/4 elements
506  // pInputImaginary - [in] input buffer (imaginary components), must have at least uLength/4 elements
507  // uLength - [in] FFT length in samples, must be a power of 2 >= 4
508  //
509  // RETURN VALUE:
510  // void
512 inline void FFTPolar (__out_ecount(uLength/4) XVECTOR* __restrict pOutput, __in_ecount(uLength/4) const XVECTOR* __restrict pInputReal, __in_ecount(uLength/4) const XVECTOR* __restrict pInputImaginary, const UINT32 uLength)
513 {
514  DSPASSERT(pOutput != NULL);
515  DSPASSERT(pInputReal != NULL);
516  DSPASSERT(pInputImaginary != NULL);
517  DSPASSERT(uLength >= 4);
518  DSPASSERT(ISPOWEROF2(uLength));
519 
520  FLOAT32 flOneOverLength = 1.0f / uLength;
521 
522 
523  // result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2
524  XVECTOR vOneOverLength = _mm_set_ps1(flOneOverLength);
525 
526  for (UINT32 uIndex=0; uIndex<(uLength>>2); ++uIndex) {
527  XVECTOR vReal = _mm_mul_ps(pInputReal[uIndex], vOneOverLength);
528  XVECTOR vImaginary = _mm_mul_ps(pInputImaginary[uIndex], vOneOverLength);
529  XVECTOR vRR = _mm_mul_ps(vReal, vReal);
530  XVECTOR vII = _mm_mul_ps(vImaginary, vImaginary);
531  XVECTOR vRRplusII = _mm_add_ps(vRR, vII);
532  XVECTOR vTotal = _mm_sqrt_ps(vRRplusII);
533  pOutput[uIndex] = _mm_add_ps(vTotal, vTotal);
534  }
535 }
536 
537 
538 
539 
540 
541 //--------------------------------------------------------------------------//
543  // DESCRIPTION:
544  // Deinterleaves audio samples such that all samples corresponding to
545 
546  //
547  // REMARKS:
548  // For example, audio of the form [LRLRLR] becomes [LLLRRR].
549  //
550  // PARAMETERS:
551  // pOutput - [out] output buffer, receives samples in deinterleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements
552  // pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements
553  // uChannelCount - [in] number of channels, must be > 1
554  // uFrameCount - [in] number of frames of valid data, must be > 0
555  //
556  // RETURN VALUE:
557  // void
559 inline void Deinterleave (__out_ecount((uChannelCount*uFrameCount)/4) XVECTOR* __restrict pOutput, __in_ecount((uChannelCount*uFrameCount)/4) const XVECTOR* __restrict pInput, const UINT32 uChannelCount, const UINT32 uFrameCount)
560 {
561  DSPASSERT(pOutput != NULL);
562  DSPASSERT(pInput != NULL);
563  DSPASSERT(uChannelCount > 1);
564  DSPASSERT(uFrameCount > 0);
565 
566  FLOAT32* __restrict pfOutput = (FLOAT32* __restrict)pOutput;
567  const FLOAT32* __restrict pfInput = (const FLOAT32* __restrict)pInput;
568 
569 
570  for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
571  for (UINT32 uFrame=0; uFrame<uFrameCount; ++uFrame) {
572  pfOutput[uChannel * uFrameCount + uFrame] = pfInput[uFrame * uChannelCount + uChannel];
573  }
574  }
575 }
576 
577 
579  // DESCRIPTION:
580  // Interleaves audio samples such that all samples corresponding to
581 
582  //
583  // REMARKS:
584  // For example, audio of the form [LLLRRR] becomes [LRLRLR].
585  //
586  // PARAMETERS:
587  // pOutput - [out] output buffer, receives samples in interleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements
588  // pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements
589  // uChannelCount - [in] number of channels, must be > 1
590  // uFrameCount - [in] number of frames of valid data, must be > 0
591  //
592  // RETURN VALUE:
593  // void
595 inline void Interleave (__out_ecount((uChannelCount*uFrameCount)/4) XVECTOR* __restrict pOutput, __in_ecount((uChannelCount*uFrameCount)/4) const XVECTOR* __restrict pInput, const UINT32 uChannelCount, const UINT32 uFrameCount)
596 {
597  DSPASSERT(pOutput != NULL);
598  DSPASSERT(pInput != NULL);
599  DSPASSERT(uChannelCount > 1);
600  DSPASSERT(uFrameCount > 0);
601 
602  FLOAT32* __restrict pfOutput = (FLOAT32* __restrict)pOutput;
603  const FLOAT32* __restrict pfInput = (const FLOAT32* __restrict)pInput;
604 
605 
606  for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
607  for (UINT32 uFrame=0; uFrame<uFrameCount; ++uFrame) {
608  pfOutput[uFrame * uChannelCount + uChannel] = pfInput[uChannel * uFrameCount + uFrame];
609  }
610  }
611 }
612 
613 
614 
615 
616 
617 //--------------------------------------------------------------------------//
619  // DESCRIPTION:
620  // This function applies a 2^N-sample FFT and unswizzles the result such
621  // that the samples are in order of increasing frequency.
622  // Audio is first deinterleaved if multichannel.
623  //
624  // PARAMETERS:
625  // pReal - [inout] real components, must have at least (1<<uLog2Length*uChannelCount)/4 elements
626  // pImaginary - [out] imaginary components, must have at least (1<<uLog2Length*uChannelCount)/4 elements
627  // pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
628  // uChannelCount - [in] number of channels, must be within [1, 6]
629  // uLog2Length - [in] LOG (base 2) of FFT length in frames, must within [2, 9]
630  //
631  // RETURN VALUE:
632  // void
634 inline void FFTInterleaved (__inout_ecount((1<<uLog2Length*uChannelCount)/4) XVECTOR* __restrict pReal, __out_ecount((1<<uLog2Length*uChannelCount)/4) XVECTOR* __restrict pImaginary, __in_ecount(1<<uLog2Length) const XVECTOR* __restrict pUnityTable, const UINT32 uChannelCount, const UINT32 uLog2Length)
635 {
636  DSPASSERT(pReal != NULL);
637  DSPASSERT(pImaginary != NULL);
638  DSPASSERT(pUnityTable != NULL);
639  DSPASSERT((UINT_PTR)pReal % 16 == 0);
640  DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
641  DSPASSERT((UINT_PTR)pUnityTable % 16 == 0);
642  DSPASSERT(uChannelCount > 0 && uChannelCount <= 6);
643  DSPASSERT(uLog2Length >= 2 && uLog2Length <= 9);
644 
645  XVECTOR vRealTemp[768];
646  XVECTOR vImaginaryTemp[768];
647  const UINT32 uLength = UINT32(1 << uLog2Length);
648 
649 
650  if (uChannelCount > 1) {
651  Deinterleave(vRealTemp, pReal, uChannelCount, uLength);
652  } else {
653  CopyMemory(vRealTemp, pReal, (uLength>>2)*sizeof(XVECTOR));
654  }
655  for (UINT32 u=0; u<uChannelCount*(uLength>>2); u++) {
656  vImaginaryTemp[u] = _mm_setzero_ps();
657  }
658 
659  if (uLength > 16) {
660  for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
661  FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
662  }
663  } else if (uLength == 16) {
664  for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
665  FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
666  }
667  } else if (uLength == 8) {
668  for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
669  FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
670  }
671  } else if (uLength == 4) {
672  for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
673  FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
674  }
675  }
676 
677  for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
678  FFTUnswizzle(&pReal[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
679  FFTUnswizzle(&pImaginary[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], uLog2Length);
680  }
681 }
682 
683 
685  // DESCRIPTION:
686  // This function applies a 2^N-sample inverse FFT.
687  // Audio is interleaved if multichannel.
688  //
689  // PARAMETERS:
690  // pReal - [inout] real components, must have at least (1<<uLog2Length*uChannelCount)/4 elements
691  // pImaginary - [out] imaginary components, must have at least (1<<uLog2Length*uChannelCount)/4 elements
692  // pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
693  // uChannelCount - [in] number of channels, must be > 0
694  // uLog2Length - [in] LOG (base 2) of FFT length in frames, must within [2, 10]
695  //
696  // RETURN VALUE:
697  // void
699 inline void IFFTDeinterleaved (__inout_ecount((1<<uLog2Length*uChannelCount)/4) XVECTOR* __restrict pReal, __out_ecount((1<<uLog2Length*uChannelCount)/4) XVECTOR* __restrict pImaginary, __in_ecount(1<<uLog2Length) const XVECTOR* __restrict pUnityTable, const UINT32 uChannelCount, const UINT32 uLog2Length)
700 {
701  DSPASSERT(pReal != NULL);
702  DSPASSERT(pImaginary != NULL);
703  DSPASSERT(pUnityTable != NULL);
704  DSPASSERT((UINT_PTR)pReal % 16 == 0);
705  DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
706  DSPASSERT((UINT_PTR)pUnityTable % 16 == 0);
707  DSPASSERT(uChannelCount > 0 && uChannelCount <= 6);
708  DSPASSERT(uLog2Length >= 2 && uLog2Length <= 9);
709 
710  XVECTOR vRealTemp[768];
711  XVECTOR vImaginaryTemp[768];
712  const UINT32 uLength = UINT32(1 << uLog2Length);
713 
714 
715  const XVECTOR vRnp = _mm_set_ps1(1.0f/uLength);
716  const XVECTOR vRnm = _mm_set_ps1(-1.0f/uLength);
717  for (UINT32 u=0; u<uChannelCount*(uLength>>2); u++) {
718  vRealTemp[u] = _mm_mul_ps(pReal[u], vRnp);
719  vImaginaryTemp[u] = _mm_mul_ps(pImaginary[u], vRnm);
720  }
721 
722  if (uLength > 16) {
723  for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
724  FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
725  }
726  } else if (uLength == 16) {
727  for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
728  FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
729  }
730  } else if (uLength == 8) {
731  for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
732  FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
733  }
734  } else if (uLength == 4) {
735  for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
736  FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
737  }
738  }
739 
740  for (UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
741  FFTUnswizzle(&vImaginaryTemp[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
742  }
743  if (uChannelCount > 1) {
744  Interleave(pReal, vImaginaryTemp, uChannelCount, uLength);
745  } else {
746  CopyMemory(pReal, vImaginaryTemp, (uLength>>2)*sizeof(XVECTOR));
747  }
748 }
749 
750 
751 #pragma warning(pop)
752 }; // namespace XDSP
753 //---------------------------------<-EOF->----------------------------------//
754 
__in_ecount(4) CONST FLOAT *pF
__forceinline void FFT4(__inout_ecount(uCount) XVECTOR *__restrict pReal, __inout_ecount(uCount) XVECTOR *__restrict pImaginary, const UINT32 uCount=1)
Definition: xdsp.h:225
void Interleave(__out_ecount((uChannelCount *uFrameCount)/4) XVECTOR *__restrict pOutput, __in_ecount((uChannelCount *uFrameCount)/4) const XVECTOR *__restrict pInput, const UINT32 uChannelCount, const UINT32 uFrameCount)
Definition: xdsp.h:595
GLdouble GLdouble GLdouble r
Definition: glext.h:6406
#define FALSE
Definition: stb_vorbis.h:232
__out_ecount(4) FLOAT *WINAPI D3DXSHMultiply2(__out_ecount(4) FLOAT *pOut
GLfloat f
Definition: glext.h:8207
bf_uint8_t r1
Definition: connect_ps4.c:74
bf_uint8_t r3
Definition: connect_ps4.c:80
void IFFTDeinterleaved(__inout_ecount((1<< uLog2Length *uChannelCount)/4) XVECTOR *__restrict pReal, __out_ecount((1<< uLog2Length *uChannelCount)/4) XVECTOR *__restrict pImaginary, __in_ecount(1<< uLog2Length) const XVECTOR *__restrict pUnityTable, const UINT32 uChannelCount, const UINT32 uLog2Length)
Definition: xdsp.h:699
bf_uint8_t r2
Definition: connect_ps4.c:76
void FFT(__inout_ecount((uLength *uCount)/4) XVECTOR *__restrict pReal, __inout_ecount((uLength *uCount)/4) XVECTOR *__restrict pImaginary, __in_ecount(uLength *uCount) const XVECTOR *__restrict pUnityTable, const UINT32 uLength, const UINT32 uCount=1)
Definition: xdsp.h:348
GLint i1
Definition: nx_glsym.h:308
__forceinline void FFT8(__inout_ecount(uCount *2) XVECTOR *__restrict pReal, __inout_ecount(uCount *2) XVECTOR *__restrict pImaginary, const UINT32 uCount=1)
Definition: xdsp.h:252
#define NULL
Pointer to 0.
Definition: gctypes.h:65
#define UINT_PTR
Definition: Common.h:66
#define ISPOWEROF2(n)
Definition: xdsp.h:46
__forceinline void ButterflyDIT4_4(__inout XVECTORREF r0, __inout XVECTORREF r1, __inout XVECTORREF r2, __inout XVECTORREF r3, __inout XVECTORREF i0, __inout XVECTORREF i1, __inout XVECTORREF i2, __inout XVECTORREF i3, __in_ecount(uStride *4) const XVECTOR *__restrict pUnityTableReal, __in_ecount(uStride *4) const XVECTOR *__restrict pUnityTableImaginary, const UINT32 uStride, const BOOL fLast)
Definition: xdsp.h:160
void FFTInterleaved(__inout_ecount((1<< uLog2Length *uChannelCount)/4) XVECTOR *__restrict pReal, __out_ecount((1<< uLog2Length *uChannelCount)/4) XVECTOR *__restrict pImaginary, __in_ecount(1<< uLog2Length) const XVECTOR *__restrict pUnityTable, const UINT32 uChannelCount, const UINT32 uLog2Length)
Definition: xdsp.h:634
GLint GLint i2
Definition: nx_glsym.h:308
void FFTPolar(__out_ecount(uLength/4) XVECTOR *__restrict pOutput, __in_ecount(uLength/4) const XVECTOR *__restrict pInputReal, __in_ecount(uLength/4) const XVECTOR *__restrict pInputImaginary, const UINT32 uLength)
Definition: xdsp.h:512
float FLOAT32
Definition: xapobase.h:60
__forceinline void ButterflyDIT4_1(__inout XVECTORREF r1, __inout XVECTORREF i1)
Definition: xdsp.h:117
__m128 XVECTOR
Definition: xdsp.h:60
XVECTOR & XVECTORREF
Definition: xdsp.h:61
void Deinterleave(__out_ecount((uChannelCount *uFrameCount)/4) XVECTOR *__restrict pOutput, __in_ecount((uChannelCount *uFrameCount)/4) const XVECTOR *__restrict pInput, const UINT32 uChannelCount, const UINT32 uFrameCount)
Definition: xdsp.h:559
unsigned int BOOL
Definition: gctypes.h:51
uint32_t UINT32
Definition: coretypes.h:10
GLint j
Definition: nx_glsym.h:307
#define DSPASSERT(exp)
Definition: xdsp.h:40
const XVECTOR & XVECTORREFC
Definition: xdsp.h:62
void FFTInitializeUnityTable(__out_ecount(uLength) XVECTOR *__restrict pUnityTable, UINT32 uLength)
Definition: xdsp.h:418
__forceinline void vmulComplex(__out XVECTORREF rResult, __out XVECTORREF iResult, __in XVECTORREFC r1, __in XVECTORREFC i1, __in XVECTORREFC r2, __in XVECTORREFC i2)
Definition: xdsp.h:67
#define TRUE
Definition: stb_vorbis.h:231
__forceinline void FFT16(__inout_ecount(uCount *4) XVECTOR *__restrict pReal, __inout_ecount(uCount *4) XVECTOR *__restrict pImaginary, const UINT32 uCount=1)
Definition: xdsp.h:302
Definition: xdsp.h:51
float4 i3
Definition: foo.h:3
GLdouble n
Definition: glext.h:8396
void FFTUnswizzle(__out_ecount((1<< uLog2Length)/4) XVECTOR *__restrict pOutput, __in_ecount((1<< uLog2Length)/4) const XVECTOR *__restrict pInput, const UINT32 uLog2Length)
Definition: xdsp.h:461