PandA-2024.02
fft.c
Go to the documentation of this file.
1 /*
2 Implementations based on:
3 V. Volkov and B. Kazian. Fitting fft onto the g80 architecture. 2008.
4 */
5 
6 #include "fft.h"
7 
9 #define THREADS 64
10 #define cmplx_M_x(a_x, a_y, b_x, b_y) (a_x*b_x - a_y *b_y)
11 #define cmplx_M_y(a_x, a_y, b_x, b_y) (a_x*b_y + a_y *b_x)
12 #define cmplx_MUL_x(a_x, a_y, b_x, b_y ) (a_x*b_x - a_y*b_y)
13 #define cmplx_MUL_y(a_x, a_y, b_x, b_y ) (a_x*b_y + a_y*b_x)
14 #define cmplx_mul_x(a_x, a_y, b_x, b_y) (a_x*b_x - a_y*b_y)
15 #define cmplx_mul_y(a_x, a_y, b_x, b_y) (a_x*b_y + a_y*b_x)
16 #define cmplx_add_x(a_x, b_x) (a_x + b_x)
17 #define cmplx_add_y(a_y, b_y) (a_y + b_y)
18 #define cmplx_sub_x(a_x, b_x) (a_x - b_x)
19 #define cmplx_sub_y(a_y, b_y) (a_y - b_y)
20 #define cm_fl_mul_x(a_x, b) (b*a_x)
21 #define cm_fl_mul_y(a_y, b) (b*a_y)
22 
23 void twiddles8(TYPE a_x[8], TYPE a_y[8], int i, int n){
24  int reversed8[8] = {0,4,2,6,1,5,3,7};
25  int j;
26  TYPE phi, tmp, phi_x, phi_y;
27 
28  twiddles:for(j=1; j < 8; j++){
29  phi = ((-2*PI*reversed8[j]/n)*i);
30  phi_x = cos(phi);
31  phi_y = sin(phi);
32  tmp = a_x[j];
33  a_x[j] = cmplx_M_x(a_x[j], a_y[j], phi_x, phi_y);
34  a_y[j] = cmplx_M_y(tmp, a_y[j], phi_x, phi_y);
35  }
36 }
38 
39 #define FF2(a0_x, a0_y, a1_x, a1_y){ \
40  TYPE c0_x = *a0_x; \
41  TYPE c0_y = *a0_y; \
42  *a0_x = cmplx_add_x(c0_x, *a1_x); \
43  *a0_y = cmplx_add_y(c0_y, *a1_y); \
44  *a1_x = cmplx_sub_x(c0_x, *a1_x); \
45  *a1_y = cmplx_sub_y(c0_y, *a1_y); \
46 }
47 
48 #define FFT4(a0_x, a0_y, a1_x, a1_y, a2_x, a2_y, a3_x, a3_y){ \
49  TYPE exp_1_44_x; \
50  TYPE exp_1_44_y; \
51  TYPE tmp; \
52  exp_1_44_x = 0.0; \
53  exp_1_44_y = -1.0; \
54  FF2( a0_x, a0_y, a2_x, a2_y); \
55  FF2( a1_x, a1_y, a3_x, a3_y); \
56  tmp = *a3_x; \
57  *a3_x = *a3_x*exp_1_44_x-*a3_y*exp_1_44_y; \
58  *a3_y = tmp*exp_1_44_y - *a3_y*exp_1_44_x; \
59  FF2( a0_x, a0_y, a1_x, a1_y ); \
60  FF2( a2_x, a2_y, a3_x, a3_y ); \
61 }
62 
63 #define FFT8(a_x, a_y) \
64 { \
65  TYPE exp_1_8_x, exp_1_4_x, exp_3_8_x; \
66  TYPE exp_1_8_y, exp_1_4_y, exp_3_8_y; \
67  TYPE tmp_1; \
68  exp_1_8_x = 1; \
69  exp_1_8_y = -1; \
70  exp_1_4_x = 0; \
71  exp_1_4_y = -1; \
72  exp_3_8_x = -1; \
73  exp_3_8_y = -1; \
74  FF2( &a_x[0], &a_y[0], &a_x[4], &a_y[4]); \
75  FF2( &a_x[1], &a_y[1], &a_x[5], &a_y[5]); \
76  FF2( &a_x[2], &a_y[2], &a_x[6], &a_y[6]); \
77  FF2( &a_x[3], &a_y[3], &a_x[7], &a_y[7]); \
78  tmp_1 = a_x[5]; \
79  a_x[5] = cm_fl_mul_x( cmplx_mul_x(a_x[5], a_y[5], exp_1_8_x, exp_1_8_y), M_SQRT1_2 ); \
80  a_y[5] = cm_fl_mul_y( cmplx_mul_y(tmp_1, a_y[5], exp_1_8_x, exp_1_8_y) , M_SQRT1_2 ); \
81  tmp_1 = a_x[6]; \
82  a_x[6] = cmplx_mul_x( a_x[6], a_y[6], exp_1_4_x , exp_1_4_y); \
83  a_y[6] = cmplx_mul_y( tmp_1, a_y[6], exp_1_4_x , exp_1_4_y); \
84  tmp_1 = a_x[7]; \
85  a_x[7] = cm_fl_mul_x( cmplx_mul_x(a_x[7], a_y[7], exp_3_8_x, exp_3_8_y), M_SQRT1_2 ); \
86  a_y[7] = cm_fl_mul_y( cmplx_mul_y(tmp_1, a_y[7], exp_3_8_x, exp_3_8_y) , M_SQRT1_2 ); \
87  FFT4( &a_x[0], &a_y[0], &a_x[1], &a_y[1], &a_x[2], &a_y[2], &a_x[3], &a_y[3] ); \
88  FFT4( &a_x[4], &a_y[4], &a_x[5], &a_y[5], &a_x[6], &a_y[6], &a_x[7], &a_y[7] ); \
89 }
90 
91 void loadx8(TYPE a_x[], TYPE x[], int offset, int sx){
92  a_x[0] = x[0*sx+offset];
93  a_x[1] = x[1*sx+offset];
94  a_x[2] = x[2*sx+offset];
95  a_x[3] = x[3*sx+offset];
96  a_x[4] = x[4*sx+offset];
97  a_x[5] = x[5*sx+offset];
98  a_x[6] = x[6*sx+offset];
99  a_x[7] = x[7*sx+offset];
100 }
101 
102 void loady8(TYPE a_y[], TYPE x[], int offset, int sx){
103  a_y[0] = x[0*sx+offset];
104  a_y[1] = x[1*sx+offset];
105  a_y[2] = x[2*sx+offset];
106  a_y[3] = x[3*sx+offset];
107  a_y[4] = x[4*sx+offset];
108  a_y[5] = x[5*sx+offset];
109  a_y[6] = x[6*sx+offset];
110  a_y[7] = x[7*sx+offset];
111 }
112 
113 void fft1D_512(TYPE work_x[512], TYPE work_y[512]){
114  int tid, hi, lo, stride;
115  int reversed[] = {0,4,2,6,1,5,3,7};
116  TYPE DATA_x[THREADS*8];
117  TYPE DATA_y[THREADS*8];
118 
119  TYPE data_x[ 8 ];
120  TYPE data_y[ 8 ];
121 
122  TYPE smem[8*8*9];
123 
124  stride = THREADS;
125 
126  //Do it all at once...
127 loop1 : for(tid = 0; tid < THREADS; tid++){
128  //GLOBAL_LOAD...
129  data_x[0] = work_x[0*stride+tid];
130  data_x[1] = work_x[1*stride+tid];
131  data_x[2] = work_x[2*stride+tid];
132  data_x[3] = work_x[3*stride+tid];
133  data_x[4] = work_x[4*stride+tid];
134  data_x[5] = work_x[5*stride+tid];
135  data_x[6] = work_x[6*stride+tid];
136  data_x[7] = work_x[7*stride+tid];
137 
138  data_y[0] = work_y[0*stride+tid];
139  data_y[1] = work_y[1*stride+tid];
140  data_y[2] = work_y[2*stride+tid];
141  data_y[3] = work_y[3*stride+tid];
142  data_y[4] = work_y[4*stride+tid];
143  data_y[5] = work_y[5*stride+tid];
144  data_y[6] = work_y[6*stride+tid];
145  data_y[7] = work_y[7*stride+tid];
146 
147  //First 8 point FFT...
148  FFT8(data_x, data_y);
149 
150  //First Twiddle
151  twiddles8(data_x, data_y, tid, 512);
152 
153  //save for fence
154  DATA_x[tid*8] = data_x[0];
155  DATA_x[tid*8 + 1] = data_x[1];
156  DATA_x[tid*8 + 2] = data_x[2];
157  DATA_x[tid*8 + 3] = data_x[3];
158  DATA_x[tid*8 + 4] = data_x[4];
159  DATA_x[tid*8 + 5] = data_x[5];
160  DATA_x[tid*8 + 6] = data_x[6];
161  DATA_x[tid*8 + 7] = data_x[7];
162 
163  DATA_y[tid*8] = data_y[0];
164  DATA_y[tid*8 + 1] = data_y[1];
165  DATA_y[tid*8 + 2] = data_y[2];
166  DATA_y[tid*8 + 3] = data_y[3];
167  DATA_y[tid*8 + 4] = data_y[4];
168  DATA_y[tid*8 + 5] = data_y[5];
169  DATA_y[tid*8 + 6] = data_y[6];
170  DATA_y[tid*8 + 7] = data_y[7];
171  }
172  int sx, offset;
173  sx = 66;
174 loop2 : for(tid = 0; tid < 64; tid++){
175  hi = tid>>3;
176  lo = tid&7;
177  offset = hi*8+lo;
178  smem[0*sx+offset] = DATA_x[tid*8 + 0];
179  smem[4*sx+offset] = DATA_x[tid*8 + 1];
180  smem[1*sx+offset] = DATA_x[tid*8 + 4];
181  smem[5*sx+offset] = DATA_x[tid*8 + 5];
182  smem[2*sx+offset] = DATA_x[tid*8 + 2];
183  smem[6*sx+offset] = DATA_x[tid*8 + 3];
184  smem[3*sx+offset] = DATA_x[tid*8 + 6];
185  smem[7*sx+offset] = DATA_x[tid*8 + 7];
186  }
187  sx = 8;
188 loop3 : for(tid = 0; tid < 64; tid++){
189  hi = tid>>3;
190  lo = tid&7;
191  offset = lo*66+hi;
192 
193  DATA_x[tid*8 +0] = smem[0*sx+offset];
194  DATA_x[tid*8 +4] = smem[4*sx+offset];
195  DATA_x[tid*8 +1] = smem[1*sx+offset];
196  DATA_x[tid*8 +5] = smem[5*sx+offset];
197  DATA_x[tid*8 +2] = smem[2*sx+offset];
198  DATA_x[tid*8 +6] = smem[6*sx+offset];
199  DATA_x[tid*8 +3] = smem[3*sx+offset];
200  DATA_x[tid*8 +7] = smem[7*sx+offset];
201  }
202 
203  sx = 66;
204 loop4 : for(tid = 0; tid < 64; tid++){
205  hi = tid>>3;
206  lo = tid&7;
207  offset = hi*8+lo;
208 
209  smem[0*sx+offset] = DATA_y[tid*8 + 0];
210  smem[4*sx+offset] = DATA_y[tid*8 + 1];
211  smem[1*sx+offset] = DATA_y[tid*8 + 4];
212  smem[5*sx+offset] = DATA_y[tid*8 + 5];
213  smem[2*sx+offset] = DATA_y[tid*8 + 2];
214  smem[6*sx+offset] = DATA_y[tid*8 + 3];
215  smem[3*sx+offset] = DATA_y[tid*8 + 6];
216  smem[7*sx+offset] = DATA_y[tid*8 + 7];
217  }
218 
219 loop5 : for(tid = 0; tid < 64; tid++){
220  data_y[0] = DATA_y[tid*8 + 0];
221  data_y[1] = DATA_y[tid*8 + 1];
222  data_y[2] = DATA_y[tid*8 + 2];
223  data_y[3] = DATA_y[tid*8 + 3];
224  data_y[4] = DATA_y[tid*8 + 4];
225  data_y[5] = DATA_y[tid*8 + 5];
226  data_y[6] = DATA_y[tid*8 + 6];
227  data_y[7] = DATA_y[tid*8 + 7];
228 
229  hi = tid>>3;
230  lo = tid&7;
231 
232  loady8(data_y, smem, lo*66+hi, 8);
233 
234  DATA_y[tid*8] = data_y[0];
235  DATA_y[tid*8 + 1] = data_y[1];
236  DATA_y[tid*8 + 2] = data_y[2];
237  DATA_y[tid*8 + 3] = data_y[3];
238  DATA_y[tid*8 + 4] = data_y[4];
239  DATA_y[tid*8 + 5] = data_y[5];
240  DATA_y[tid*8 + 6] = data_y[6];
241  DATA_y[tid*8 + 7] = data_y[7];
242  }
243 
244 loop6 : for(tid = 0; tid < 64; tid++){
245  data_x[0] = DATA_x[tid*8 + 0];
246  data_x[1] = DATA_x[tid*8 + 1];
247  data_x[2] = DATA_x[tid*8 + 2];
248  data_x[3] = DATA_x[tid*8 + 3];
249  data_x[4] = DATA_x[tid*8 + 4];
250  data_x[5] = DATA_x[tid*8 + 5];
251  data_x[6] = DATA_x[tid*8 + 6];
252  data_x[7] = DATA_x[tid*8 + 7];
253 
254  data_y[0] = DATA_y[tid*8 + 0];
255  data_y[1] = DATA_y[tid*8 + 1];
256  data_y[2] = DATA_y[tid*8 + 2];
257  data_y[3] = DATA_y[tid*8 + 3];
258  data_y[4] = DATA_y[tid*8 + 4];
259  data_y[5] = DATA_y[tid*8 + 5];
260  data_y[6] = DATA_y[tid*8 + 6];
261  data_y[7] = DATA_y[tid*8 + 7];
262 
263  //Second FFT8...
264  FFT8(data_x, data_y);
265 
266  //Calculate hi for second twiddle calculation...
267  hi = tid>>3;
268 
269  //Second twiddles calc, use hi and 64 stride version as defined in G80/SHOC...
270  twiddles8(data_x, data_y, hi, 64);
271 
272  //Save for final transpose...
273  DATA_x[tid*8] = data_x[0];
274  DATA_x[tid*8 + 1] = data_x[1];
275  DATA_x[tid*8 + 2] = data_x[2];
276  DATA_x[tid*8 + 3] = data_x[3];
277  DATA_x[tid*8 + 4] = data_x[4];
278  DATA_x[tid*8 + 5] = data_x[5];
279  DATA_x[tid*8 + 6] = data_x[6];
280  DATA_x[tid*8 + 7] = data_x[7];
281 
282  DATA_y[tid*8] = data_y[0];
283  DATA_y[tid*8 + 1] = data_y[1];
284  DATA_y[tid*8 + 2] = data_y[2];
285  DATA_y[tid*8 + 3] = data_y[3];
286  DATA_y[tid*8 + 4] = data_y[4];
287  DATA_y[tid*8 + 5] = data_y[5];
288  DATA_y[tid*8 + 6] = data_y[6];
289  DATA_y[tid*8 + 7] = data_y[7];
290  }
291 
292  //Transpose..
293  sx = 72;
294 loop7 : for(tid = 0; tid < 64; tid++){
295  hi = tid>>3;
296  lo = tid&7;
297  offset = hi*8+lo;
298  smem[0*sx+offset] = DATA_x[tid*8 + 0];
299  smem[4*sx+offset] = DATA_x[tid*8 + 1];
300  smem[1*sx+offset] = DATA_x[tid*8 + 4];
301  smem[5*sx+offset] = DATA_x[tid*8 + 5];
302  smem[2*sx+offset] = DATA_x[tid*8 + 2];
303  smem[6*sx+offset] = DATA_x[tid*8 + 3];
304  smem[3*sx+offset] = DATA_x[tid*8 + 6];
305  smem[7*sx+offset] = DATA_x[tid*8 + 7];
306  }
307 
308  sx = 8;
309 loop8 : for(tid = 0; tid < 64; tid++){
310  hi = tid>>3;
311  lo = tid&7;
312  offset = hi*72+lo;
313 
314  DATA_x[tid*8 +0] = smem[0*sx+offset];
315  DATA_x[tid*8 +4] = smem[4*sx+offset];
316  DATA_x[tid*8 +1] = smem[1*sx+offset];
317  DATA_x[tid*8 +5] = smem[5*sx+offset];
318  DATA_x[tid*8 +2] = smem[2*sx+offset];
319  DATA_x[tid*8 +6] = smem[6*sx+offset];
320  DATA_x[tid*8 +3] = smem[3*sx+offset];
321  DATA_x[tid*8 +7] = smem[7*sx+offset];
322  }
323 
324  sx = 72;
325 loop9 : for(tid = 0; tid < 64; tid++){
326  hi = tid>>3;
327  lo = tid&7;
328  offset = hi*8+lo;
329 
330  smem[0*sx+offset] = DATA_y[tid*8 + 0];
331  smem[4*sx+offset] = DATA_y[tid*8 + 1];
332  smem[1*sx+offset] = DATA_y[tid*8 + 4];
333  smem[5*sx+offset] = DATA_y[tid*8 + 5];
334  smem[2*sx+offset] = DATA_y[tid*8 + 2];
335  smem[6*sx+offset] = DATA_y[tid*8 + 3];
336  smem[3*sx+offset] = DATA_y[tid*8 + 6];
337  smem[7*sx+offset] = DATA_y[tid*8 + 7];
338  }
339 
340 loop10 : for(tid = 0; tid < 64; tid++){
341  data_y[0] = DATA_y[tid*8 + 0];
342  data_y[1] = DATA_y[tid*8 + 1];
343  data_y[2] = DATA_y[tid*8 + 2];
344  data_y[3] = DATA_y[tid*8 + 3];
345  data_y[4] = DATA_y[tid*8 + 4];
346  data_y[5] = DATA_y[tid*8 + 5];
347  data_y[6] = DATA_y[tid*8 + 6];
348  data_y[7] = DATA_y[tid*8 + 7];
349 
350  hi = tid>>3;
351  lo = tid&7;
352 
353  loady8(data_y, smem, hi*72+lo, 8);
354 
355  DATA_y[tid*8 + 0] = data_y[0];
356  DATA_y[tid*8 + 1] = data_y[1];
357  DATA_y[tid*8 + 2] = data_y[2];
358  DATA_y[tid*8 + 3] = data_y[3];
359  DATA_y[tid*8 + 4] = data_y[4];
360  DATA_y[tid*8 + 5] = data_y[5];
361  DATA_y[tid*8 + 6] = data_y[6];
362  DATA_y[tid*8 + 7] = data_y[7];
363  }
364 
365 loop11 : for(tid = 0; tid < 64; tid++){
366  //Load post-trans
367  data_y[0] = DATA_y[tid*8];
368  data_y[1] = DATA_y[tid*8 + 1];
369  data_y[2] = DATA_y[tid*8 + 2];
370  data_y[3] = DATA_y[tid*8 + 3];
371  data_y[4] = DATA_y[tid*8 + 4];
372  data_y[5] = DATA_y[tid*8 + 5];
373  data_y[6] = DATA_y[tid*8 + 6];
374  data_y[7] = DATA_y[tid*8 + 7];
375 
376  data_x[0] = DATA_x[tid*8];
377  data_x[1] = DATA_x[tid*8 + 1];
378  data_x[2] = DATA_x[tid*8 + 2];
379  data_x[3] = DATA_x[tid*8 + 3];
380  data_x[4] = DATA_x[tid*8 + 4];
381  data_x[5] = DATA_x[tid*8 + 5];
382  data_x[6] = DATA_x[tid*8 + 6];
383  data_x[7] = DATA_x[tid*8 + 7];
384 
385  //Final 8pt FFT...
386  FFT8(data_x, data_y);
387 
388  //Global store
389  work_x[0*stride+tid] = data_x[reversed[0]];
390  work_x[1*stride+tid] = data_x[reversed[1]];
391  work_x[2*stride+tid] = data_x[reversed[2]];
392  work_x[3*stride+tid] = data_x[reversed[3]];
393  work_x[4*stride+tid] = data_x[reversed[4]];
394  work_x[5*stride+tid] = data_x[reversed[5]];
395  work_x[6*stride+tid] = data_x[reversed[6]];
396  work_x[7*stride+tid] = data_x[reversed[7]];
397 
398  work_y[0*stride+tid] = data_y[reversed[0]];
399  work_y[1*stride+tid] = data_y[reversed[1]];
400  work_y[2*stride+tid] = data_y[reversed[2]];
401  work_y[3*stride+tid] = data_y[reversed[3]];
402  work_y[4*stride+tid] = data_y[reversed[4]];
403  work_y[5*stride+tid] = data_y[reversed[5]];
404  work_y[6*stride+tid] = data_y[reversed[6]];
405  work_y[7*stride+tid] = data_y[reversed[7]];
406  }
407 }
#define cmplx_M_x(a_x, a_y, b_x, b_y)
Definition: fft.c:10
#define FFT8(a_x, a_y)
Definition: fft.c:63
#define TYPE
Definition: backprop.h:21
void loady8(TYPE a_y[], TYPE x[], int offset, int sx)
Definition: fft.c:102
#define PI
Definition: fft.h:18
void twiddles8(TYPE a_x[8], TYPE a_y[8], int i, int n)
Definition: fft.c:23
uint_fast16_t i
Definition: support.h:78
unsigned offset[NUM_VERTICES+1]
Definition: graph.h:3
void loadx8(TYPE a_x[], TYPE x[], int offset, int sx)
Definition: fft.c:91
#define cmplx_M_y(a_x, a_y, b_x, b_y)
Definition: fft.c:11
void fft1D_512(TYPE work_x[512], TYPE work_y[512])
Definition: fft.c:113
x
Return the smallest n such that 2^n >= _x.
#define THREADS
Definition: fft.c:9

Generated on Mon Feb 12 2024 13:02:49 for PandA-2024.02 by doxygen 1.8.13