10 #define cmplx_M_x(a_x, a_y, b_x, b_y) (a_x*b_x - a_y *b_y) 11 #define cmplx_M_y(a_x, a_y, b_x, b_y) (a_x*b_y + a_y *b_x) 12 #define cmplx_MUL_x(a_x, a_y, b_x, b_y ) (a_x*b_x - a_y*b_y) 13 #define cmplx_MUL_y(a_x, a_y, b_x, b_y ) (a_x*b_y + a_y*b_x) 14 #define cmplx_mul_x(a_x, a_y, b_x, b_y) (a_x*b_x - a_y*b_y) 15 #define cmplx_mul_y(a_x, a_y, b_x, b_y) (a_x*b_y + a_y*b_x) 16 #define cmplx_add_x(a_x, b_x) (a_x + b_x) 17 #define cmplx_add_y(a_y, b_y) (a_y + b_y) 18 #define cmplx_sub_x(a_x, b_x) (a_x - b_x) 19 #define cmplx_sub_y(a_y, b_y) (a_y - b_y) 20 #define cm_fl_mul_x(a_x, b) (b*a_x) 21 #define cm_fl_mul_y(a_y, b) (b*a_y) 24 int reversed8[8] = {0,4,2,6,1,5,3,7};
28 twiddles:
for(j=1; j < 8; j++){
29 phi = ((-2*
PI*reversed8[j]/n)*i);
33 a_x[j] =
cmplx_M_x(a_x[j], a_y[j], phi_x, phi_y);
34 a_y[j] =
cmplx_M_y(tmp, a_y[j], phi_x, phi_y);
39 #define FF2(a0_x, a0_y, a1_x, a1_y){ \ 42 *a0_x = cmplx_add_x(c0_x, *a1_x); \ 43 *a0_y = cmplx_add_y(c0_y, *a1_y); \ 44 *a1_x = cmplx_sub_x(c0_x, *a1_x); \ 45 *a1_y = cmplx_sub_y(c0_y, *a1_y); \ 48 #define FFT4(a0_x, a0_y, a1_x, a1_y, a2_x, a2_y, a3_x, a3_y){ \ 54 FF2( a0_x, a0_y, a2_x, a2_y); \ 55 FF2( a1_x, a1_y, a3_x, a3_y); \ 57 *a3_x = *a3_x*exp_1_44_x-*a3_y*exp_1_44_y; \ 58 *a3_y = tmp*exp_1_44_y - *a3_y*exp_1_44_x; \ 59 FF2( a0_x, a0_y, a1_x, a1_y ); \ 60 FF2( a2_x, a2_y, a3_x, a3_y ); \ 63 #define FFT8(a_x, a_y) \ 65 TYPE exp_1_8_x, exp_1_4_x, exp_3_8_x; \ 66 TYPE exp_1_8_y, exp_1_4_y, exp_3_8_y; \ 74 FF2( &a_x[0], &a_y[0], &a_x[4], &a_y[4]); \ 75 FF2( &a_x[1], &a_y[1], &a_x[5], &a_y[5]); \ 76 FF2( &a_x[2], &a_y[2], &a_x[6], &a_y[6]); \ 77 FF2( &a_x[3], &a_y[3], &a_x[7], &a_y[7]); \ 79 a_x[5] = cm_fl_mul_x( cmplx_mul_x(a_x[5], a_y[5], exp_1_8_x, exp_1_8_y), M_SQRT1_2 ); \ 80 a_y[5] = cm_fl_mul_y( cmplx_mul_y(tmp_1, a_y[5], exp_1_8_x, exp_1_8_y) , M_SQRT1_2 ); \ 82 a_x[6] = cmplx_mul_x( a_x[6], a_y[6], exp_1_4_x , exp_1_4_y); \ 83 a_y[6] = cmplx_mul_y( tmp_1, a_y[6], exp_1_4_x , exp_1_4_y); \ 85 a_x[7] = cm_fl_mul_x( cmplx_mul_x(a_x[7], a_y[7], exp_3_8_x, exp_3_8_y), M_SQRT1_2 ); \ 86 a_y[7] = cm_fl_mul_y( cmplx_mul_y(tmp_1, a_y[7], exp_3_8_x, exp_3_8_y) , M_SQRT1_2 ); \ 87 FFT4( &a_x[0], &a_y[0], &a_x[1], &a_y[1], &a_x[2], &a_y[2], &a_x[3], &a_y[3] ); \ 88 FFT4( &a_x[4], &a_y[4], &a_x[5], &a_y[5], &a_x[6], &a_y[6], &a_x[7], &a_y[7] ); \ 114 int tid, hi, lo, stride;
115 int reversed[] = {0,4,2,6,1,5,3,7};
127 loop1 :
for(tid = 0; tid <
THREADS; tid++){
129 data_x[0] = work_x[0*stride+tid];
130 data_x[1] = work_x[1*stride+tid];
131 data_x[2] = work_x[2*stride+tid];
132 data_x[3] = work_x[3*stride+tid];
133 data_x[4] = work_x[4*stride+tid];
134 data_x[5] = work_x[5*stride+tid];
135 data_x[6] = work_x[6*stride+tid];
136 data_x[7] = work_x[7*stride+tid];
138 data_y[0] = work_y[0*stride+tid];
139 data_y[1] = work_y[1*stride+tid];
140 data_y[2] = work_y[2*stride+tid];
141 data_y[3] = work_y[3*stride+tid];
142 data_y[4] = work_y[4*stride+tid];
143 data_y[5] = work_y[5*stride+tid];
144 data_y[6] = work_y[6*stride+tid];
145 data_y[7] = work_y[7*stride+tid];
148 FFT8(data_x, data_y);
154 DATA_x[tid*8] = data_x[0];
155 DATA_x[tid*8 + 1] = data_x[1];
156 DATA_x[tid*8 + 2] = data_x[2];
157 DATA_x[tid*8 + 3] = data_x[3];
158 DATA_x[tid*8 + 4] = data_x[4];
159 DATA_x[tid*8 + 5] = data_x[5];
160 DATA_x[tid*8 + 6] = data_x[6];
161 DATA_x[tid*8 + 7] = data_x[7];
163 DATA_y[tid*8] = data_y[0];
164 DATA_y[tid*8 + 1] = data_y[1];
165 DATA_y[tid*8 + 2] = data_y[2];
166 DATA_y[tid*8 + 3] = data_y[3];
167 DATA_y[tid*8 + 4] = data_y[4];
168 DATA_y[tid*8 + 5] = data_y[5];
169 DATA_y[tid*8 + 6] = data_y[6];
170 DATA_y[tid*8 + 7] = data_y[7];
174 loop2 :
for(tid = 0; tid < 64; tid++){
178 smem[0*sx+
offset] = DATA_x[tid*8 + 0];
179 smem[4*sx+
offset] = DATA_x[tid*8 + 1];
180 smem[1*sx+
offset] = DATA_x[tid*8 + 4];
181 smem[5*sx+
offset] = DATA_x[tid*8 + 5];
182 smem[2*sx+
offset] = DATA_x[tid*8 + 2];
183 smem[6*sx+
offset] = DATA_x[tid*8 + 3];
184 smem[3*sx+
offset] = DATA_x[tid*8 + 6];
185 smem[7*sx+
offset] = DATA_x[tid*8 + 7];
188 loop3 :
for(tid = 0; tid < 64; tid++){
193 DATA_x[tid*8 +0] = smem[0*sx+
offset];
194 DATA_x[tid*8 +4] = smem[4*sx+
offset];
195 DATA_x[tid*8 +1] = smem[1*sx+
offset];
196 DATA_x[tid*8 +5] = smem[5*sx+
offset];
197 DATA_x[tid*8 +2] = smem[2*sx+
offset];
198 DATA_x[tid*8 +6] = smem[6*sx+
offset];
199 DATA_x[tid*8 +3] = smem[3*sx+
offset];
200 DATA_x[tid*8 +7] = smem[7*sx+
offset];
204 loop4 :
for(tid = 0; tid < 64; tid++){
209 smem[0*sx+
offset] = DATA_y[tid*8 + 0];
210 smem[4*sx+
offset] = DATA_y[tid*8 + 1];
211 smem[1*sx+
offset] = DATA_y[tid*8 + 4];
212 smem[5*sx+
offset] = DATA_y[tid*8 + 5];
213 smem[2*sx+
offset] = DATA_y[tid*8 + 2];
214 smem[6*sx+
offset] = DATA_y[tid*8 + 3];
215 smem[3*sx+
offset] = DATA_y[tid*8 + 6];
216 smem[7*sx+
offset] = DATA_y[tid*8 + 7];
219 loop5 :
for(tid = 0; tid < 64; tid++){
220 data_y[0] = DATA_y[tid*8 + 0];
221 data_y[1] = DATA_y[tid*8 + 1];
222 data_y[2] = DATA_y[tid*8 + 2];
223 data_y[3] = DATA_y[tid*8 + 3];
224 data_y[4] = DATA_y[tid*8 + 4];
225 data_y[5] = DATA_y[tid*8 + 5];
226 data_y[6] = DATA_y[tid*8 + 6];
227 data_y[7] = DATA_y[tid*8 + 7];
232 loady8(data_y, smem, lo*66+hi, 8);
234 DATA_y[tid*8] = data_y[0];
235 DATA_y[tid*8 + 1] = data_y[1];
236 DATA_y[tid*8 + 2] = data_y[2];
237 DATA_y[tid*8 + 3] = data_y[3];
238 DATA_y[tid*8 + 4] = data_y[4];
239 DATA_y[tid*8 + 5] = data_y[5];
240 DATA_y[tid*8 + 6] = data_y[6];
241 DATA_y[tid*8 + 7] = data_y[7];
244 loop6 :
for(tid = 0; tid < 64; tid++){
245 data_x[0] = DATA_x[tid*8 + 0];
246 data_x[1] = DATA_x[tid*8 + 1];
247 data_x[2] = DATA_x[tid*8 + 2];
248 data_x[3] = DATA_x[tid*8 + 3];
249 data_x[4] = DATA_x[tid*8 + 4];
250 data_x[5] = DATA_x[tid*8 + 5];
251 data_x[6] = DATA_x[tid*8 + 6];
252 data_x[7] = DATA_x[tid*8 + 7];
254 data_y[0] = DATA_y[tid*8 + 0];
255 data_y[1] = DATA_y[tid*8 + 1];
256 data_y[2] = DATA_y[tid*8 + 2];
257 data_y[3] = DATA_y[tid*8 + 3];
258 data_y[4] = DATA_y[tid*8 + 4];
259 data_y[5] = DATA_y[tid*8 + 5];
260 data_y[6] = DATA_y[tid*8 + 6];
261 data_y[7] = DATA_y[tid*8 + 7];
264 FFT8(data_x, data_y);
273 DATA_x[tid*8] = data_x[0];
274 DATA_x[tid*8 + 1] = data_x[1];
275 DATA_x[tid*8 + 2] = data_x[2];
276 DATA_x[tid*8 + 3] = data_x[3];
277 DATA_x[tid*8 + 4] = data_x[4];
278 DATA_x[tid*8 + 5] = data_x[5];
279 DATA_x[tid*8 + 6] = data_x[6];
280 DATA_x[tid*8 + 7] = data_x[7];
282 DATA_y[tid*8] = data_y[0];
283 DATA_y[tid*8 + 1] = data_y[1];
284 DATA_y[tid*8 + 2] = data_y[2];
285 DATA_y[tid*8 + 3] = data_y[3];
286 DATA_y[tid*8 + 4] = data_y[4];
287 DATA_y[tid*8 + 5] = data_y[5];
288 DATA_y[tid*8 + 6] = data_y[6];
289 DATA_y[tid*8 + 7] = data_y[7];
294 loop7 :
for(tid = 0; tid < 64; tid++){
298 smem[0*sx+
offset] = DATA_x[tid*8 + 0];
299 smem[4*sx+
offset] = DATA_x[tid*8 + 1];
300 smem[1*sx+
offset] = DATA_x[tid*8 + 4];
301 smem[5*sx+
offset] = DATA_x[tid*8 + 5];
302 smem[2*sx+
offset] = DATA_x[tid*8 + 2];
303 smem[6*sx+
offset] = DATA_x[tid*8 + 3];
304 smem[3*sx+
offset] = DATA_x[tid*8 + 6];
305 smem[7*sx+
offset] = DATA_x[tid*8 + 7];
309 loop8 :
for(tid = 0; tid < 64; tid++){
314 DATA_x[tid*8 +0] = smem[0*sx+
offset];
315 DATA_x[tid*8 +4] = smem[4*sx+
offset];
316 DATA_x[tid*8 +1] = smem[1*sx+
offset];
317 DATA_x[tid*8 +5] = smem[5*sx+
offset];
318 DATA_x[tid*8 +2] = smem[2*sx+
offset];
319 DATA_x[tid*8 +6] = smem[6*sx+
offset];
320 DATA_x[tid*8 +3] = smem[3*sx+
offset];
321 DATA_x[tid*8 +7] = smem[7*sx+
offset];
325 loop9 :
for(tid = 0; tid < 64; tid++){
330 smem[0*sx+
offset] = DATA_y[tid*8 + 0];
331 smem[4*sx+
offset] = DATA_y[tid*8 + 1];
332 smem[1*sx+
offset] = DATA_y[tid*8 + 4];
333 smem[5*sx+
offset] = DATA_y[tid*8 + 5];
334 smem[2*sx+
offset] = DATA_y[tid*8 + 2];
335 smem[6*sx+
offset] = DATA_y[tid*8 + 3];
336 smem[3*sx+
offset] = DATA_y[tid*8 + 6];
337 smem[7*sx+
offset] = DATA_y[tid*8 + 7];
340 loop10 :
for(tid = 0; tid < 64; tid++){
341 data_y[0] = DATA_y[tid*8 + 0];
342 data_y[1] = DATA_y[tid*8 + 1];
343 data_y[2] = DATA_y[tid*8 + 2];
344 data_y[3] = DATA_y[tid*8 + 3];
345 data_y[4] = DATA_y[tid*8 + 4];
346 data_y[5] = DATA_y[tid*8 + 5];
347 data_y[6] = DATA_y[tid*8 + 6];
348 data_y[7] = DATA_y[tid*8 + 7];
353 loady8(data_y, smem, hi*72+lo, 8);
355 DATA_y[tid*8 + 0] = data_y[0];
356 DATA_y[tid*8 + 1] = data_y[1];
357 DATA_y[tid*8 + 2] = data_y[2];
358 DATA_y[tid*8 + 3] = data_y[3];
359 DATA_y[tid*8 + 4] = data_y[4];
360 DATA_y[tid*8 + 5] = data_y[5];
361 DATA_y[tid*8 + 6] = data_y[6];
362 DATA_y[tid*8 + 7] = data_y[7];
365 loop11 :
for(tid = 0; tid < 64; tid++){
367 data_y[0] = DATA_y[tid*8];
368 data_y[1] = DATA_y[tid*8 + 1];
369 data_y[2] = DATA_y[tid*8 + 2];
370 data_y[3] = DATA_y[tid*8 + 3];
371 data_y[4] = DATA_y[tid*8 + 4];
372 data_y[5] = DATA_y[tid*8 + 5];
373 data_y[6] = DATA_y[tid*8 + 6];
374 data_y[7] = DATA_y[tid*8 + 7];
376 data_x[0] = DATA_x[tid*8];
377 data_x[1] = DATA_x[tid*8 + 1];
378 data_x[2] = DATA_x[tid*8 + 2];
379 data_x[3] = DATA_x[tid*8 + 3];
380 data_x[4] = DATA_x[tid*8 + 4];
381 data_x[5] = DATA_x[tid*8 + 5];
382 data_x[6] = DATA_x[tid*8 + 6];
383 data_x[7] = DATA_x[tid*8 + 7];
386 FFT8(data_x, data_y);
389 work_x[0*stride+tid] = data_x[reversed[0]];
390 work_x[1*stride+tid] = data_x[reversed[1]];
391 work_x[2*stride+tid] = data_x[reversed[2]];
392 work_x[3*stride+tid] = data_x[reversed[3]];
393 work_x[4*stride+tid] = data_x[reversed[4]];
394 work_x[5*stride+tid] = data_x[reversed[5]];
395 work_x[6*stride+tid] = data_x[reversed[6]];
396 work_x[7*stride+tid] = data_x[reversed[7]];
398 work_y[0*stride+tid] = data_y[reversed[0]];
399 work_y[1*stride+tid] = data_y[reversed[1]];
400 work_y[2*stride+tid] = data_y[reversed[2]];
401 work_y[3*stride+tid] = data_y[reversed[3]];
402 work_y[4*stride+tid] = data_y[reversed[4]];
403 work_y[5*stride+tid] = data_y[reversed[5]];
404 work_y[6*stride+tid] = data_y[reversed[6]];
405 work_y[7*stride+tid] = data_y[reversed[7]];
#define cmplx_M_x(a_x, a_y, b_x, b_y)
void loady8(TYPE a_y[], TYPE x[], int offset, int sx)
void twiddles8(TYPE a_x[8], TYPE a_y[8], int i, int n)
unsigned offset[NUM_VERTICES+1]
void loadx8(TYPE a_x[], TYPE x[], int offset, int sx)
#define cmplx_M_y(a_x, a_y, b_x, b_y)
void fft1D_512(TYPE work_x[512], TYPE work_y[512])
x
Return the smallest n such that 2^n >= _x.