1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
9 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11 * Copyright (c) 2001-2012, The GROMACS development team,
12 * check out http://www.gromacs.org for more information.
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * If you want to redistribute modifications, please consider that
20 * scientific software is very special. Version control is crucial -
21 * bugs must be traceable. We will be happy to consider code for
22 * inclusion in the official distribution, but derived work must not
23 * be called official GROMACS. Details are found in the README & COPYING
24 * files - if they are missing, get the official version at www.gromacs.org.
26 * To help us fund GROMACS development, we humbly ask that you cite
27 * the papers on the package - you can find them in the top README file.
29 * For more info, check our website at http://www.gromacs.org
32 * Groningen Machine for Chemical Simulation
/* Preprocessor setup: parallel-environment probe, epsilon fallbacks, and the
 * mutex used to serialize non-thread-safe FFTW3 calls.
 * NOTE(review): this listing omits interior lines (the embedded original line
 * numbers jump), so the #if/#else/#endif fragments below belong to
 * conditional blocks whose opening directives are not visible here. */
44 #define GMX_PARALLEL_ENV_INITIALIZED 1
47 #define GMX_PARALLEL_ENV_INITIALIZED gmx_parallel_env_initialized()
62 /* requires fftw compiled with openmp */
63 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
/* Some compilers do not provide the double-underscore epsilon macros;
   fall back to the standard <float.h> names. */
71 #ifndef __FLT_EPSILON__
72 #define __FLT_EPSILON__ FLT_EPSILON
73 #define __DBL_EPSILON__ DBL_EPSILON
80 #include "gmx_fatal.h"
/* All FFTW planner/destroy calls in this file must be wrapped in
   FFTW_LOCK/FFTW_UNLOCK; only FFTW(execute) may run concurrently. */
85 /* none of the fftw3 calls, except execute(), are thread-safe, so
86 we need to serialize them with this mutex. */
87 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
89 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex)
90 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex)
91 #else /* GMX_THREAD_MPI */
94 #endif /* GMX_THREAD_MPI */
95 #endif /* GMX_FFT_FFTW3 */
/* Presumably returns the larger of a and b (a local double-valued max,
   avoiding a dependence on C99 fmax) -- body not visible in this excerpt;
   TODO confirm against the full source. */
97 static double fft5d_fmax(double a, double b){
101 /* largest factor smaller than sqrt */
/* Returns the first divisor i of z found by the (omitted) loop; per the
   comment above, the scan starts near sqrt(z), so this is the largest
   factor of z not exceeding sqrt(z). Loop header not visible here. */
102 static int lfactor(int z) {
105 if (z%i==0) return i;
/* Returns a divisor of z found by a scan whose bounds are omitted from this
   excerpt; by analogy with lfactor() presumably the largest factor within
   some limit -- TODO confirm against the full source. */
110 static int l2factor(int z) {
114 if (z%i==0) return i;
118 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
119 static int lpfactor(int z) {
/* Splits z into a factor f and its cofactor z/f (computation of f omitted
   from this excerpt) and keeps the larger prime factor of the two parts. */
122 return fft5d_fmax(lpfactor(f),lpfactor(z/f));
126 #ifdef HAVE_GETTIMEOFDAY
127 #include <sys/time.h>
/* Wall-clock time in seconds as a double, built from a gettimeofday()
   timeval; the enclosing function's signature and the gettimeofday() call
   itself are omitted from this excerpt. */
131 return tv.tv_sec+tv.tv_usec*1e-6;
/* Returns the maximum value in the integer array a of length s. */
140 static int vmax(int* a, int s) {
144 if (a[i]>max) max=a[i];
/* Allocate 'size' bytes, zero-initialized and aligned to a 32-byte boundary.
   The raw pointer is intentionally leaked (no way to recover it from the
   aligned pointer), so this must only be used for allocations that live for
   the whole run. */
150 copied here from fftgrid, because:
151 1. function there not publicly available
152 2. not sure whether we keep fftgrid
153 3. less dependencies for fft5d
155 Only used for non-fftw case
158 gmx_calloc_aligned(size_t size)
162 /*We initialize by zero for Valgrind
163 For non-divisible case we communicate more than the data.
164 If we don't initialize the data we communicate uninitialized data*/
165 p0 = calloc(size+32,1);
/* NOTE(review): the message below reads "Failed to allocated" -- grammar
   typo in a runtime string; fix upstream, not in this comments-only pass. */
169 gmx_fatal(FARGS,"Failed to allocated %u bytes of aligned memory.",size+32);
/* Round the raw pointer up to the next 32-byte boundary. */
172 p = (void *) (((size_t) p0 + 32) & (~((size_t) 31)));
174 /* Yeah, yeah, we cannot free this pointer, but who cares... */
/* Build an fft5d plan for a 3D FFT of global size NG x MG x KG, decomposed
 * over a 2D processor grid described by comm[2]. Computes the per-step
 * local/transposed sizes and offsets, allocates the four work buffers
 * (returned via rlin/rlout/rlout2/rlout3 unless FFT5D_NOMALLOC), creates
 * either a single FFTW 3D plan (serial, out-of-place case) or per-step,
 * per-thread 1D plans, and optionally FFTW-MPI transpose plans.
 * NOTE(review): this listing omits interior lines (embedded numbering
 * jumps), so several braces/branches are fragments of omitted constructs. */
179 /* NxMxK the size of the data
180 * comm communicator to use for fft5d
181 * P0 number of processor in 1st axes (can be null for automatic)
182 * lin is allocated by fft5d because size of array is only known after planning phase
183 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
185 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
188 int P[2],bMaster,prank[2],i,t;
190 int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
191 int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
192 int C[3],rC[3],nP[2];
194 t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
/* --- Query the processor grid: size/rank in each of the two
       decomposition dimensions (defaults apply when not parallel). --- */
198 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
200 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
202 MPI_Comm_size(comm[0],&P[0]);
203 MPI_Comm_rank(comm[0],&prank[0]);
212 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
214 MPI_Comm_size(comm[1],&P[1]);
215 MPI_Comm_rank(comm[1],&prank[1]);
224 bMaster=(prank[0]==0&&prank[1]==0);
229 fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
230 P[0],P[1],prank[0],prank[1]);
235 fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
236 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
237 /* The check below is not correct, one prime factor 11 or 13 is ok.
238 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
239 printf("WARNING: FFT very slow with prime factors larger 7\n");
240 printf("Change FFT size or in case you cannot change it look at\n");
241 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
246 if (NG==0 || MG==0 || KG==0) {
247 if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
/* Keep the real (untrimmed) global sizes; for r2c the complex data in the
   transformed dimension shrinks to n/2+1. */
251 rNG=NG;rMG=MG;rKG=KG;
253 if (flags&FFT5D_REALCOMPLEX) {
254 if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
256 if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
/* --- Per-rank slab sizes and offsets along each dimension, for both
       processor-grid directions (suffix 0/1). Needed globally because the
       transposes must know every rank's share, not just our own. --- */
262 /*for transpose we need to know the size for each processor not only our own size*/
264 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
265 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
266 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
267 oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
268 oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
269 oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
/* Offsets: rank i starts at i*ceil(global/P); the last rank may get less. */
275 oN0[i]=i*ceil((double)NG/P[0]);
276 oM0[i]=i*ceil((double)MG/P[0]);
277 oK0[i]=i*ceil((double)KG/P[0]);
287 oN1[i]=i*ceil((double)NG/P[1]);
288 oM1[i]=i*ceil((double)MG/P[1]);
289 oK1[i]=i*ceil((double)KG/P[1]);
/* Per-rank lengths derived from consecutive offsets. */
296 for (i=0;i<P[0]-1;i++)
298 N0[i]=oN0[i+1]-oN0[i];
299 M0[i]=oM0[i+1]-oM0[i];
300 K0[i]=oK0[i+1]-oK0[i];
302 N0[P[0]-1]=NG-oN0[P[0]-1];
303 M0[P[0]-1]=MG-oM0[P[0]-1];
304 K0[P[0]-1]=KG-oK0[P[0]-1];
305 for (i=0;i<P[1]-1;i++)
307 N1[i]=oN1[i+1]-oN1[i];
308 M1[i]=oM1[i+1]-oM1[i];
309 K1[i]=oK1[i+1]-oK1[i];
311 N1[P[1]-1]=NG-oN1[P[1]-1];
312 M1[P[1]-1]=MG-oM1[P[1]-1];
313 K1[P[1]-1]=KG-oK1[P[1]-1];
/* --- Local sizes per FFT step s=0..2 in the rotated coordinate system;
       which arrays feed each step depends on FFT5D_ORDER_YZ. --- */
315 /* for step 1-3 the local N,M,K sizes of the transposed system
316 C: contiguous dimension, and nP: number of processor in subcommunicator
320 pM[0] = M0[prank[0]];
321 oM[0] = oM0[prank[0]];
322 pK[0] = K1[prank[1]];
323 oK[0] = oK1[prank[1]];
326 if (!(flags&FFT5D_ORDER_YZ)) {
327 N[0] = vmax(N1,P[1]);
329 K[0] = vmax(K1,P[1]);
330 pN[0] = N1[prank[1]];
336 N[1] = vmax(K0,P[0]);
337 pN[1] = K0[prank[0]];
342 M[1] = vmax(M0,P[0]);
343 pM[1] = M0[prank[0]];
344 oM[1] = oM0[prank[0]];
346 pK[1] = N1[prank[1]];
347 oK[1] = oN1[prank[1]];
353 M[2] = vmax(K0,P[0]);
354 pM[2] = K0[prank[0]];
355 oM[2] = oK0[prank[0]];
356 K[2] = vmax(N1,P[1]);
357 pK[2] = N1[prank[1]];
358 oK[2] = oN1[prank[1]];
359 free(N0); free(oN0); /*these are not used for this order*/
360 free(M1); free(oM1); /*the rest is freed in destroy*/
362 N[0] = vmax(N0,P[0]);
363 M[0] = vmax(M0,P[0]);
365 pN[0] = N0[prank[0]];
371 N[1] = vmax(M1,P[1]);
372 pN[1] = M1[prank[1]];
378 pM[1] = N0[prank[0]];
379 oM[1] = oN0[prank[0]];
380 K[1] = vmax(K1,P[1]);
381 pK[1] = K1[prank[1]];
382 oK[1] = oK1[prank[1]];
388 M[2] = vmax(N0,P[0]);
389 pM[2] = N0[prank[0]];
390 oM[2] = oN0[prank[0]];
391 K[2] = vmax(M1,P[1]);
392 pK[2] = M1[prank[1]];
393 oK[2] = oM1[prank[1]];
394 free(N1); free(oN1); /*these are not used for this order*/
395 free(K0); free(oK0); /*the rest is freed in destroy*/
399 Difference between x-y-z regarding 2d decomposition is whether they are
400 distributed along axis 1, 2 or both
/* --- Work-buffer size: large enough for the biggest intermediate of any
       step (padded to maximum per-rank slabs so AllToAll chunks match). --- */
403 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
404 lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
405 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
406 if (!(flags&FFT5D_NOMALLOC)) {
407 lin = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
408 lout = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
409 lout2 = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
410 lout3 = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
418 plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
423 fprintf(debug, "Running on %d threads\n",nthreads);
/* --- Serial, out-of-place, single-threaded case: try one FFTW 3D plan
       via the guru interface instead of three 1D passes. --- */
426 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
427 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
428 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
429 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
430 * and that the 3d plan is faster than the 1d plan.
432 if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) { /*don't do 3d plan in parallel or if in_place requested */
433 int fftwflags=FFTW_DESTROY_INPUT;
435 int inNG=NG,outMG=MG,outKG=KG;
438 if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
439 if (flags&FFT5D_REALCOMPLEX) {
440 if (!(flags&FFT5D_BACKWARD)) { /*input pointer is not complex*/
442 } else { /*output pointer is not complex*/
443 if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
/* Guru-interface strides: input/output element strides (is/os) encode the
   storage order of the three dimensions for each direction/order variant. */
448 if (!(flags&FFT5D_BACKWARD)) {
453 dims[0].is = inNG*MG; /*N M K*/
456 if (!(flags&FFT5D_ORDER_YZ)) {
457 dims[0].os = MG; /*M K N*/
461 dims[0].os = 1; /*K N M*/
466 if (!(flags&FFT5D_ORDER_YZ)) {
475 dims[0].os = outMG*KG;
487 dims[0].os = outKG*NG;
493 #ifdef FFT5D_FFTW_THREADS
494 FFTW(plan_with_nthreads)(nthreads);
497 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
498 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
499 /*howmany*/ 0, /*howmany_dims*/0 ,
500 (real*)lin, (FFTW(complex) *)lout,
501 /*flags*/ fftwflags);
502 } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
503 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
504 /*howmany*/ 0, /*howmany_dims*/0 ,
505 (FFTW(complex) *)lin, (real*)lout,
506 /*flags*/ fftwflags);
508 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
509 /*howmany*/ 0, /*howmany_dims*/0 ,
510 (FFTW(complex) *)lin, (FFTW(complex) *)lout,
511 /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
514 #ifdef FFT5D_FFTW_THREADS
515 FFTW(plan_with_nthreads)(1);
/* --- Fallback / decomposed case: per-step, per-thread 1D many-FFT plans. --- */
520 if (!plan->p3d) { /* for decomposition and if 3d plan did not work */
521 #endif /* GMX_FFT_FFTW3 */
525 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
526 s,rC[s],M[s],pK[s],C[s],lsize);
528 plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
530 /* Make sure that the init routines are only called by one thread at a time and in order
531 (later is only important to not confuse valgrind)
533 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
534 for(t=0; t<nthreads; t++)
/* Each thread transforms its contiguous share of the pM*pK lines. */
536 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
538 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
539 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
541 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
549 if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
550 plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
552 plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
/* Optional FFTW-MPI transpose plans instead of plain MPI_Alltoall. */
554 #ifdef FFT5D_MPI_TRANSPOSE
557 if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
558 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
560 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
/* --- Fill the plan struct with everything computed above. --- */
571 plan->NG=NG;plan->MG=MG;plan->KG=KG;
574 plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
575 plan->oM[s]=oM[s];plan->oK[s]=oK[s];
576 plan->C[s]=C[s];plan->rC[s]=rC[s];
577 plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
580 plan->P[s]=nP[s];plan->coor[s]=prank[s];
583 /* plan->fftorder=fftorder;
584 plan->direction=direction;
585 plan->realcomplex=realcomplex;
588 plan->nthreads=nthreads;
/* Reorder the local block from contiguous layout into P equal-sized chunks
   (one per destination rank, each padded to maxN*maxM*maxK) in preparation
   for MPI_Alltoall. The [starty..endy]/[startz..endz] window bounds the
   slice handled by the calling thread. */
608 /*here x,y,z and N,M,K is in rotated coordinate system!!
609 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
610 maxN,maxM,maxK is max size of local data
611 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
612 NG, MG, KG is size of global data*/
613 static void splitaxes(t_complex* lout,const t_complex* lin,
614 int maxN,int maxM,int maxK, int pN, int pM, int pK,
615 int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
618 int in_i,out_i,in_z,out_z,in_y,out_y;
621 for (z=startz; z<endz+1; z++) /*3. z l*/
636 for (i=0; i<P; i++) /*index cube along long axis*/
638 out_i = out_z + i*maxN*maxM*maxK;
640 for (y=s_y;y<e_y;y++) { /*2. y k*/
641 out_y = out_i + y*maxN;
643 for (x=0;x<N[i];x++) { /*1. x j*/
644 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
645 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
646 /*before split data contiguous - thus if different processor get different amount oN is different*/
/* Inverse of splitaxes after the AllToAll: gather the P per-rank chunks
   back into one contiguous array while transposing the first and third
   local dimensions. The [startx..endx]/[starty..endy] window bounds the
   slice handled by the calling thread. */
653 /*make axis contiguous again (after AllToAll) and also do local transpose*/
654 /*transpose mayor and major dimension
656 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
657 N,M,K local dimensions
659 static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
660 int maxN,int maxM,int maxK,int pN, int pM, int pK,
661 int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
664 int out_i,in_i,out_x,in_x,out_z,in_z;
667 for (x=startx;x<endx+1;x++) /*1.j*/
689 for (i=0;i<P;i++) /*index cube along long axis*/
691 out_i = out_x + oK[i];
692 in_i = in_x + i*maxM*maxN*maxK;
693 for (z=0;z<K[i];z++) /*3.l*/
696 in_z = in_i + z*maxM*maxN;
697 for (y=s_y;y<e_y;y++) /*2.k*/
699 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
/* Inverse of splitaxes after the AllToAll, transposing the first and second
   local dimensions instead (see joinAxesTrans13 for the 1<->3 variant).
   The [startx..endx]/[startz..endz] window bounds the slice handled by the
   calling thread. */
706 /*make axis contiguous again (after AllToAll) and also do local transpose
707 transpose mayor and middle dimension
709 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
712 static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
713 int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
715 int out_i,in_i,out_z,in_z,out_x,in_x;
718 for (z=startz; z<endz+1; z++)
739 for (i=0; i<P; i++) /*index cube along long axis*/
741 out_i = out_z + oM[i];
742 in_i = in_z + i*maxM*maxN*maxK;
743 for (x=s_x;x<e_x;x++)
745 out_x = out_i + x*MG;
749 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
/* Presumably rotates the entries of the 3-element array x cyclically (used
   when stepping through the transposed coordinate systems) -- body not
   visible in this excerpt; TODO confirm against the full source. */
757 static void rotate(int x[]) {
/* Map the local, transposed data layout at step s back to original input
   coordinates: for each original axis i, write its stride (xs), local length
   (xl) and decomposition offset (xc); NG receives the three global sizes.
   Used only by the debug print/compare helpers. */
767 /*compute the offset to compare or print transposed local data in original input coordinates
768 xs matrix dimension size, xl dimension length, xc decomposition offset
769 s: step in computation = number of transposes*/
770 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
771 /* int direction = plan->direction;
772 int fftorder = plan->fftorder;*/
776 int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
777 *C=plan->C, *rC=plan->rC;
779 NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
/* Which axis permutation is in effect after s transposes depends on the
   transpose order (FFT5D_ORDER_YZ or not). */
781 if (!(plan->flags&FFT5D_ORDER_YZ)) {
783 case 0: o=XYZ; break;
784 case 1: o=ZYX; break;
785 case 2: o=YZX; break;
790 case 0: o=XYZ; break;
791 case 1: o=YXZ; break;
792 case 2: o=ZXY; break;
/* pos[i]: position (1=contiguous, 2=middle, 3=outer) of original axis i in
   the current permuted layout. */
798 case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
799 case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
800 case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
801 case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
802 case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
803 case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
805 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
807 /*xs, xl give dimension size and data length in local transposed coordinate system
808 for 0(/1/2): x(/y/z) in original coordinate system*/
811 case 1: xs[i]=1; xc[i]=0; xl[i]=C[s];break;
812 case 2: xs[i]=C[s]; xc[i]=oM[s]; xl[i]=pM[s];break;
813 case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
816 /*input order is different for test program to match FFTW order
817 (important for complex to real)*/
818 if (plan->flags&FFT5D_BACKWARD) {
823 if (plan->flags&FFT5D_ORDER_YZ) {
/* r2c/c2r: the transformed dimension holds rC (real) rather than C
   (complex) valid entries at the relevant step. */
830 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
/* Debug helper: print the local data at step s to the debug log, addressed
   in original input coordinates via compute_offsets(). 'txt' is a format
   string receiving the two processor-grid coordinates and s. */
835 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
837 int *coor = plan->coor;
838 int xs[3],xl[3],xc[3],NG[3];
/* ll: number of real values printed per element (1 for real data, 2 for
   complex). */
839 int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
840 compute_offsets(plan,xs,xl,xc,NG,s);
841 fprintf(debug,txt,coor[0],coor[1],s);
842 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
843 for(z=0;z<xl[2];z++) {
844 for(y=0;y<xl[1];y++) {
845 fprintf(debug,"%d %d: ",coor[0],coor[1]);
846 for (x=0;x<xl[0];x++) {
848 fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
/* Execute the planned 3D FFT for one thread: either the single FFTW 3D plan
 * (serial case), or three 1D FFT passes with split + AllToAll + join
 * (local transpose) between them. 'thread' selects this thread's share of
 * each pass; timing is accumulated into 'times' by thread 0 only. */
857 void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
858 t_complex *lin = plan->lin;
859 t_complex *lout = plan->lout;
860 t_complex *lout2 = plan->lout2;
861 t_complex *lout3 = plan->lout3;
862 t_complex *fftout,*joinin;
864 gmx_fft_t **p1d=plan->p1d;
865 #ifdef FFT5D_MPI_TRANSPOSE
866 FFTW(plan) *mpip=plan->mpip;
869 MPI_Comm *cart=plan->cart;
872 double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
873 int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
874 *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
875 int s=0,tstart,tend,bParallelDim;
/* Fast path: a 3D FFTW plan exists (serial, out-of-place case). */
889 FFTW(execute)(plan->p3d);
893 times->fft+=MPI_Wtime()-time;
904 if (plan->flags&FFT5D_DEBUG && thread == 0)
906 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
909 for (s=0;s<2;s++) { /*loop over first two FFT steps (corner rotations)*/
912 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] !=0 && P[s]>1 )
922 /* ---------- START FFT ------------ */
924 if (times!=0 && thread == 0)
/* Each thread transforms its contiguous share of the pM*pK FFT lines. */
944 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
945 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
947 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
950 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,fftout+tstart);
955 if (times != NULL && thread == 0)
957 time_fft+=MPI_Wtime()-time;
960 if (plan->flags&FFT5D_DEBUG && thread == 0)
962 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
964 /* ---------- END FFT ------------ */
966 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in this dimension)*/
969 if (times != NULL && thread == 0)
976 1. (most outer) axes (x) is split into P[s] parts of size N[s]
980 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
982 splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
984 #pragma omp barrier /*barrier required before AllToAll (all input has to be there) - before timing to make timing more accurate*/
986 if (times != NULL && thread == 0)
988 time_local+=MPI_Wtime()-time;
992 /* ---------- END SPLIT , START TRANSPOSE------------ */
1002 wallcycle_start(times,ewcPME_FFTCOMM);
/* Exchange the per-rank chunks: FFTW-MPI transpose plan if configured,
   otherwise plain MPI_Alltoall with the step-dependent chunk size. */
1004 #ifdef FFT5D_MPI_TRANSPOSE
1005 FFTW(execute)(mpip[s]);
1008 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
1009 MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
1011 MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
1013 gmx_incons("fft5d MPI call without MPI configuration");
1015 #endif /*FFT5D_MPI_TRANSPOSE*/
1019 time_mpi[s]=MPI_Wtime()-time;
1022 wallcycle_stop(times,ewcPME_FFTCOMM);
1026 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1028 /* ---------- END SPLIT + TRANSPOSE------------ */
1030 /* ---------- START JOIN ------------ */
1032 if (times != NULL && thread == 0)
1043 /*bring back in matrix form
1044 thus make new 1. axes contiguous
1045 also local transpose 1 and 2/3
1046 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1048 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
1051 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1052 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1053 joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
1059 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1060 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1061 joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
1066 if (times != NULL && thread == 0)
1068 time_local+=MPI_Wtime()-time;
1071 if (plan->flags&FFT5D_DEBUG && thread == 0)
1073 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1075 /* ---------- END JOIN ------------ */
1077 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1078 } /* for(s=0;s<2;s++) */
1080 if (times != NULL && thread == 0)
/* Third and final FFT pass (s==2), no transpose afterwards. */
1086 if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
1088 /* ----------- FFT ----------- */
1089 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1090 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
1091 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
1093 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,lout+tstart);
1095 /* ------------ END FFT ---------*/
1098 if (times != NULL && thread == 0)
1100 time_fft+=MPI_Wtime()-time;
1102 times->fft+=time_fft;
1103 times->local+=time_local;
1104 times->mpi2+=time_mpi[1];
1105 times->mpi1+=time_mpi[0];
1109 if (plan->flags&FFT5D_DEBUG && thread == 0)
1111 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
/* Release the resources owned by a plan: per-step/per-thread 1D FFT plans,
   the per-step index arrays, and (for FFTW builds) the MPI-transpose and 3D
   plans. The aligned lin/lout buffers are intentionally not freed (see
   gmx_calloc_aligned). */
1115 void fft5d_destroy(fft5d_plan plan) {
1121 for (t=0;t<plan->nthreads;t++)
1123 gmx_many_fft_destroy(plan->p1d[s][t]);
1129 free(plan->iNin[s]);
1134 free(plan->oNin[s]);
1139 free(plan->iNout[s]);
1144 free(plan->oNout[s]);
1148 #ifdef GMX_FFT_FFTW3
/* NOTE(review): "FFT5D_MPI_TRANSPOS" (here and at the matching #endif) does
   not match the "FFT5D_MPI_TRANSPOSE" macro used everywhere else in this
   file. If that is a typo, this branch never compiles in and the mpip[]
   transpose plans are leaked -- verify against the full source and build. */
1150 #ifdef FFT5D_MPI_TRANSPOS
1153 FFTW(destroy_plan)(plan->mpip[s]);
1157 FFTW(destroy_plan)(plan->p3d);
1159 #endif /* FFT5D_MPI_TRANSPOS */
1160 #endif /* GMX_FFT_FFTW3 */
1162 /*We can't free lin/lout here - is allocated by gmx_calloc_aligned which can't be freed*/
1165 #ifdef FFT5D_THREADS
1166 #ifdef FFT5D_FFTW_THREADS
1167 /*FFTW(cleanup_threads)();*/
/* Accessor exposing selected local decomposition sizes and the processor
   coordinates from the plan; body not visible in this excerpt. The 0/1
   suffixes refer to the processor-grid dimension the quantity is divided
   by, not the FFT step. */
1174 /*Is this better than direct access of plan? enough data?
1175 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1176 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
/* Convenience wrapper around fft5d_plan_3d(): builds a 2D cartesian
   processor grid of P0 x (size/P0) ranks from a single communicator
   (P0==0 picks a factorization automatically via lfactor) and creates the
   two row/column sub-communicators the planner needs. */
1186 /*same as fft5d_plan_3d but with cartesian communicator and automatic splitting
1187 of processor dimensions*/
1188 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
1189 MPI_Comm cart[2]={0};
/* rdim1/rdim2 select which cartesian dimension each sub-communicator keeps. */
1196 int rdim1[] = {0,1}, rdim2[] = {1,0};
1198 MPI_Comm_size(comm,&size);
1199 MPI_Comm_rank(comm,&prank);
1201 if (P0==0) P0 = lfactor(size);
1204 if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1208 P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1210 /*Difference between x-y-z regarding 2d decomposition is whether they are
1211 distributed along axis 1, 2 or both*/
1213 MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1214 MPI_Cart_get(gcart,2,P,wrap,coor);
1215 MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1216 MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1218 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
1223 /*prints in original coordinate system of data (as the input to FFT)*/
1224 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1225 int xs[3],xl[3],xc[3],NG[3];
1227 int *coor = plan->coor;
1228 int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1229 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1234 compute_offsets(plan,xs,xl,xc,NG,2);
1235 if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1236 for (z=0;z<xl[2];z++) {
1237 for(y=0;y<xl[1];y++) {
1238 if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1239 for (x=0;x<xl[0];x++) {
1240 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1242 a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1243 if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1245 b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1247 b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1248 if (plan->flags&FFT5D_DEBUG) {
1249 printf("%f %f, ",a,b);
1251 if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1252 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1254 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1257 if (plan->flags&FFT5D_DEBUG) printf(",");
1259 if (plan->flags&FFT5D_DEBUG) printf("\n");