1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
9 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11 * Copyright (c) 2001-2012, The GROMACS development team,
12 * check out http://www.gromacs.org for more information.
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * If you want to redistribute modifications, please consider that
20 * scientific software is very special. Version control is crucial -
21 * bugs must be traceable. We will be happy to consider code for
22 * inclusion in the official distribution, but derived work must not
23 * be called official GROMACS. Details are found in the README & COPYING
24 * files - if they are missing, get the official version at www.gromacs.org.
26 * To help us fund GROMACS development, we humbly ask that you cite
27 * the papers on the package - you can find them in the top README file.
29 * For more info, check our website at http://www.gromacs.org
32 * Groningen Machine for Chemical Simulation
44 #define GMX_PARALLEL_ENV_INITIALIZED 1
47 #define GMX_PARALLEL_ENV_INITIALIZED 1
49 #define GMX_PARALLEL_ENV_INITIALIZED 0
65 /* requires fftw compiled with openmp */
66 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
75 #ifndef __FLT_EPSILON__
76 #define __FLT_EPSILON__ FLT_EPSILON
77 #define __DBL_EPSILON__ DBL_EPSILON
84 #include "gmx_fatal.h"
89 /* none of the fftw3 calls, except execute(), are thread-safe, so
90 we need to serialize them with this mutex. */
91 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
93 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex)
94 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex)
95 #else /* GMX_THREAD_MPI */
98 #endif /* GMX_THREAD_MPI */
99 #endif /* GMX_FFT_FFTW3 */
101 static double fft5d_fmax(double a, double b){
105 /* largest factor smaller than sqrt */
106 static int lfactor(int z) {
109 if (z%i==0) return i;
114 static int l2factor(int z) {
118 if (z%i==0) return i;
122 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
123 static int lpfactor(int z) {
126 return fft5d_fmax(lpfactor(f),lpfactor(z/f));
130 #ifdef HAVE_GETTIMEOFDAY
131 #include <sys/time.h>
135 return tv.tv_sec+tv.tv_usec*1e-6;
144 static int vmax(int* a, int s) {
148 if (a[i]>max) max=a[i];
154 /* NxMxK the size of the data
155 * comm communicator to use for fft5d
156 * P0 number of processor in 1st axes (can be null for automatic)
157 * lin is allocated by fft5d because size of array is only known after planning phase
158 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
160 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
163 int P[2],bMaster,prank[2],i,t;
165 int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
166 int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
167 int C[3],rC[3],nP[2];
169 t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
173 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
175 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
177 MPI_Comm_size(comm[0],&P[0]);
178 MPI_Comm_rank(comm[0],&prank[0]);
187 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
189 MPI_Comm_size(comm[1],&P[1]);
190 MPI_Comm_rank(comm[1],&prank[1]);
199 bMaster=(prank[0]==0&&prank[1]==0);
204 fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
205 P[0],P[1],prank[0],prank[1]);
210 fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
211 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
212 /* The check below is not correct, one prime factor 11 or 13 is ok.
213 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
214 printf("WARNING: FFT very slow with prime factors larger 7\n");
215 printf("Change FFT size or in case you cannot change it look at\n");
216 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
221 if (NG==0 || MG==0 || KG==0) {
222 if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
226 rNG=NG;rMG=MG;rKG=KG;
228 if (flags&FFT5D_REALCOMPLEX) {
229 if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
231 if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
237 /*for transpose we need to know the size for each processor not only our own size*/
239 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
240 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
241 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
242 oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
243 oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
244 oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
250 oN0[i]=i*ceil((double)NG/P[0]);
251 oM0[i]=i*ceil((double)MG/P[0]);
252 oK0[i]=i*ceil((double)KG/P[0]);
262 oN1[i]=i*ceil((double)NG/P[1]);
263 oM1[i]=i*ceil((double)MG/P[1]);
264 oK1[i]=i*ceil((double)KG/P[1]);
271 for (i=0;i<P[0]-1;i++)
273 N0[i]=oN0[i+1]-oN0[i];
274 M0[i]=oM0[i+1]-oM0[i];
275 K0[i]=oK0[i+1]-oK0[i];
277 N0[P[0]-1]=NG-oN0[P[0]-1];
278 M0[P[0]-1]=MG-oM0[P[0]-1];
279 K0[P[0]-1]=KG-oK0[P[0]-1];
280 for (i=0;i<P[1]-1;i++)
282 N1[i]=oN1[i+1]-oN1[i];
283 M1[i]=oM1[i+1]-oM1[i];
284 K1[i]=oK1[i+1]-oK1[i];
286 N1[P[1]-1]=NG-oN1[P[1]-1];
287 M1[P[1]-1]=MG-oM1[P[1]-1];
288 K1[P[1]-1]=KG-oK1[P[1]-1];
290 /* for step 1-3 the local N,M,K sizes of the transposed system
291 C: contiguous dimension, and nP: number of processor in subcommunicator
295 pM[0] = M0[prank[0]];
296 oM[0] = oM0[prank[0]];
297 pK[0] = K1[prank[1]];
298 oK[0] = oK1[prank[1]];
301 if (!(flags&FFT5D_ORDER_YZ)) {
302 N[0] = vmax(N1,P[1]);
304 K[0] = vmax(K1,P[1]);
305 pN[0] = N1[prank[1]];
311 N[1] = vmax(K0,P[0]);
312 pN[1] = K0[prank[0]];
317 M[1] = vmax(M0,P[0]);
318 pM[1] = M0[prank[0]];
319 oM[1] = oM0[prank[0]];
321 pK[1] = N1[prank[1]];
322 oK[1] = oN1[prank[1]];
328 M[2] = vmax(K0,P[0]);
329 pM[2] = K0[prank[0]];
330 oM[2] = oK0[prank[0]];
331 K[2] = vmax(N1,P[1]);
332 pK[2] = N1[prank[1]];
333 oK[2] = oN1[prank[1]];
334 free(N0); free(oN0); /*these are not used for this order*/
335 free(M1); free(oM1); /*the rest is freed in destroy*/
337 N[0] = vmax(N0,P[0]);
338 M[0] = vmax(M0,P[0]);
340 pN[0] = N0[prank[0]];
346 N[1] = vmax(M1,P[1]);
347 pN[1] = M1[prank[1]];
353 pM[1] = N0[prank[0]];
354 oM[1] = oN0[prank[0]];
355 K[1] = vmax(K1,P[1]);
356 pK[1] = K1[prank[1]];
357 oK[1] = oK1[prank[1]];
363 M[2] = vmax(N0,P[0]);
364 pM[2] = N0[prank[0]];
365 oM[2] = oN0[prank[0]];
366 K[2] = vmax(M1,P[1]);
367 pK[2] = M1[prank[1]];
368 oK[2] = oM1[prank[1]];
369 free(N1); free(oN1); /*these are not used for this order*/
370 free(K0); free(oK0); /*the rest is freed in destroy*/
372 N[2]=pN[2]=-1; /*not used*/
375 Difference between x-y-z regarding 2d decomposition is whether they are
376 distributed along axis 1, 2 or both
379 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
380 lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
381 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
382 if (!(flags&FFT5D_NOMALLOC)) {
383 snew_aligned(lin, lsize, 32);
384 snew_aligned(lout, lsize, 32);
385 snew_aligned(lout2, lsize, 32);
386 snew_aligned(lout3, lsize, 32);
394 plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
399 fprintf(debug, "Running on %d threads\n",nthreads);
402 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
403 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
404 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
405 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
406 * and that the 3d plan is faster than the 1d plan.
408 if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) { /*don't do 3d plan in parallel or if in_place requested */
409 int fftwflags=FFTW_DESTROY_INPUT;
411 int inNG=NG,outMG=MG,outKG=KG;
414 if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
415 if (flags&FFT5D_REALCOMPLEX) {
416 if (!(flags&FFT5D_BACKWARD)) { /*input pointer is not complex*/
418 } else { /*output pointer is not complex*/
419 if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
424 if (!(flags&FFT5D_BACKWARD)) {
429 dims[0].is = inNG*MG; /*N M K*/
432 if (!(flags&FFT5D_ORDER_YZ)) {
433 dims[0].os = MG; /*M K N*/
437 dims[0].os = 1; /*K N M*/
442 if (!(flags&FFT5D_ORDER_YZ)) {
451 dims[0].os = outMG*KG;
463 dims[0].os = outKG*NG;
469 #ifdef FFT5D_FFTW_THREADS
470 FFTW(plan_with_nthreads)(nthreads);
473 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
474 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
475 /*howmany*/ 0, /*howmany_dims*/0 ,
476 (real*)lin, (FFTW(complex) *)lout,
477 /*flags*/ fftwflags);
478 } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
479 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
480 /*howmany*/ 0, /*howmany_dims*/0 ,
481 (FFTW(complex) *)lin, (real*)lout,
482 /*flags*/ fftwflags);
484 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
485 /*howmany*/ 0, /*howmany_dims*/0 ,
486 (FFTW(complex) *)lin, (FFTW(complex) *)lout,
487 /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
490 #ifdef FFT5D_FFTW_THREADS
491 FFTW(plan_with_nthreads)(1);
496 if (!plan->p3d) { /* for decomposition and if 3d plan did not work */
497 #endif /* GMX_FFT_FFTW3 */
501 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
502 s,rC[s],M[s],pK[s],C[s],lsize);
504 plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
506 /* Make sure that the init routines are only called by one thread at a time and in order
507 (later is only important to not confuse valgrind)
509 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
510 for(t=0; t<nthreads; t++)
512 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
514 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
515 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
517 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
525 if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
526 plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
528 plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
530 #ifdef FFT5D_MPI_TRANSPOSE
533 if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
534 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
536 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
547 plan->NG=NG;plan->MG=MG;plan->KG=KG;
550 plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
551 plan->oM[s]=oM[s];plan->oK[s]=oK[s];
552 plan->C[s]=C[s];plan->rC[s]=rC[s];
553 plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
556 plan->P[s]=nP[s];plan->coor[s]=prank[s];
559 /* plan->fftorder=fftorder;
560 plan->direction=direction;
561 plan->realcomplex=realcomplex;
564 plan->nthreads=nthreads;
584 /*here x,y,z and N,M,K is in rotated coordinate system!!
585 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
586 maxN,maxM,maxK is max size of local data
587 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
588 NG, MG, KG is size of global data*/
589 static void splitaxes(t_complex* lout,const t_complex* lin,
590 int maxN,int maxM,int maxK, int pN, int pM, int pK,
591 int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
594 int in_i,out_i,in_z,out_z,in_y,out_y;
597 for (z=startz; z<endz+1; z++) /*3. z l*/
612 for (i=0; i<P; i++) /*index cube along long axis*/
614 out_i = out_z + i*maxN*maxM*maxK;
616 for (y=s_y;y<e_y;y++) { /*2. y k*/
617 out_y = out_i + y*maxN;
619 for (x=0;x<N[i];x++) { /*1. x j*/
620 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
621 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
622 /*before split data contiguos - thus if different processor get different amount oN is different*/
629 /*make axis contiguous again (after AllToAll) and also do local transpose*/
630 /*transpose mayor and major dimension
632 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
633 N,M,K local dimensions
635 static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
636 int maxN,int maxM,int maxK,int pN, int pM, int pK,
637 int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
640 int out_i,in_i,out_x,in_x,out_z,in_z;
643 for (x=startx;x<endx+1;x++) /*1.j*/
665 for (i=0;i<P;i++) /*index cube along long axis*/
667 out_i = out_x + oK[i];
668 in_i = in_x + i*maxM*maxN*maxK;
669 for (z=0;z<K[i];z++) /*3.l*/
672 in_z = in_i + z*maxM*maxN;
673 for (y=s_y;y<e_y;y++) /*2.k*/
675 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
682 /*make axis contiguous again (after AllToAll) and also do local transpose
683 tranpose mayor and middle dimension
685 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
688 static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
689 int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
691 int out_i,in_i,out_z,in_z,out_x,in_x;
694 for (z=startz; z<endz+1; z++)
715 for (i=0; i<P; i++) /*index cube along long axis*/
717 out_i = out_z + oM[i];
718 in_i = in_z + i*maxM*maxN*maxK;
719 for (x=s_x;x<e_x;x++)
721 out_x = out_i + x*MG;
725 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
733 static void rotate_offsets(int x[]) {
743 /*compute the offset to compare or print transposed local data in original input coordinates
744 xs matrix dimension size, xl dimension length, xc decomposition offset
745 s: step in computation = number of transposes*/
746 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
747 /* int direction = plan->direction;
748 int fftorder = plan->fftorder;*/
752 int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
753 *C=plan->C, *rC=plan->rC;
755 NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
757 if (!(plan->flags&FFT5D_ORDER_YZ)) {
759 case 0: o=XYZ; break;
760 case 1: o=ZYX; break;
761 case 2: o=YZX; break;
766 case 0: o=XYZ; break;
767 case 1: o=YXZ; break;
768 case 2: o=ZXY; break;
774 case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
775 case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
776 case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
777 case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
778 case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
779 case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
781 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
783 /*xs, xl give dimension size and data length in local transposed coordinate system
784 for 0(/1/2): x(/y/z) in original coordinate system*/
787 case 1: xs[i]=1; xc[i]=0; xl[i]=C[s];break;
788 case 2: xs[i]=C[s]; xc[i]=oM[s]; xl[i]=pM[s];break;
789 case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
792 /*input order is different for test program to match FFTW order
793 (important for complex to real)*/
794 if (plan->flags&FFT5D_BACKWARD) {
799 if (plan->flags&FFT5D_ORDER_YZ) {
806 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
811 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
813 int *coor = plan->coor;
814 int xs[3],xl[3],xc[3],NG[3];
815 int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
816 compute_offsets(plan,xs,xl,xc,NG,s);
817 fprintf(debug,txt,coor[0],coor[1],s);
818 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
819 for(z=0;z<xl[2];z++) {
820 for(y=0;y<xl[1];y++) {
821 fprintf(debug,"%d %d: ",coor[0],coor[1]);
822 for (x=0;x<xl[0];x++) {
824 fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
833 void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
834 t_complex *lin = plan->lin;
835 t_complex *lout = plan->lout;
836 t_complex *lout2 = plan->lout2;
837 t_complex *lout3 = plan->lout3;
838 t_complex *fftout,*joinin;
840 gmx_fft_t **p1d=plan->p1d;
841 #ifdef FFT5D_MPI_TRANSPOSE
842 FFTW(plan) *mpip=plan->mpip;
845 MPI_Comm *cart=plan->cart;
848 double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
849 int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
850 *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
851 int s=0,tstart,tend,bParallelDim;
865 FFTW(execute)(plan->p3d);
869 times->fft+=MPI_Wtime()-time;
880 if (plan->flags&FFT5D_DEBUG && thread == 0)
882 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
885 for (s=0;s<2;s++) { /*loop over first two FFT steps (corner rotations)*/
888 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s]!=MPI_COMM_NULL && P[s]>1)
898 /* ---------- START FFT ------------ */
900 if (times!=0 && thread == 0)
920 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
921 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
923 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
926 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,fftout+tstart);
931 if (times != NULL && thread == 0)
933 time_fft+=MPI_Wtime()-time;
936 if (plan->flags&FFT5D_DEBUG && thread == 0)
938 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
940 /* ---------- END FFT ------------ */
942 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
945 if (times != NULL && thread == 0)
952 1. (most outer) axes (x) is split into P[s] parts of size N[s]
956 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
958 splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
960 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
962 if (times != NULL && thread == 0)
964 time_local+=MPI_Wtime()-time;
968 /* ---------- END SPLIT , START TRANSPOSE------------ */
978 wallcycle_start(times,ewcPME_FFTCOMM);
980 #ifdef FFT5D_MPI_TRANSPOSE
981 FFTW(execute)(mpip[s]);
984 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
985 MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
987 MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
989 gmx_incons("fft5d MPI call without MPI configuration");
991 #endif /*FFT5D_MPI_TRANSPOSE*/
995 time_mpi[s]=MPI_Wtime()-time;
998 wallcycle_stop(times,ewcPME_FFTCOMM);
1002 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1004 /* ---------- END SPLIT + TRANSPOSE------------ */
1006 /* ---------- START JOIN ------------ */
1008 if (times != NULL && thread == 0)
1019 /*bring back in matrix form
1020 thus make new 1. axes contiguos
1021 also local transpose 1 and 2/3
1022 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1024 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
1027 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1028 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1029 joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
1035 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1036 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1037 joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
1042 if (times != NULL && thread == 0)
1044 time_local+=MPI_Wtime()-time;
1047 if (plan->flags&FFT5D_DEBUG && thread == 0)
1049 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1051 /* ---------- END JOIN ------------ */
1053 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1054 } /* for(s=0;s<2;s++) */
1056 if (times != NULL && thread == 0)
1062 if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
1064 /* ----------- FFT ----------- */
1065 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1066 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
1067 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
1069 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,lout+tstart);
1071 /* ------------ END FFT ---------*/
1074 if (times != NULL && thread == 0)
1076 time_fft+=MPI_Wtime()-time;
1078 times->fft+=time_fft;
1079 times->local+=time_local;
1080 times->mpi2+=time_mpi[1];
1081 times->mpi1+=time_mpi[0];
1085 if (plan->flags&FFT5D_DEBUG && thread == 0)
1087 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1091 void fft5d_destroy(fft5d_plan plan) {
1097 for (t=0;t<plan->nthreads;t++)
1099 gmx_many_fft_destroy(plan->p1d[s][t]);
1105 free(plan->iNin[s]);
1110 free(plan->oNin[s]);
1115 free(plan->iNout[s]);
1120 free(plan->oNout[s]);
1124 #ifdef GMX_FFT_FFTW3
1126 #ifdef FFT5D_MPI_TRANSPOS
1129 FFTW(destroy_plan)(plan->mpip[s]);
1133 FFTW(destroy_plan)(plan->p3d);
1135 #endif /* FFT5D_MPI_TRANSPOS */
1136 #endif /* GMX_FFT_FFTW3 */
1138 if (!(plan->flags&FFT5D_NOMALLOC))
1140 sfree_aligned(plan->lin);
1141 sfree_aligned(plan->lout);
1142 sfree_aligned(plan->lout2);
1143 sfree_aligned(plan->lout3);
1146 #ifdef FFT5D_THREADS
1147 #ifdef FFT5D_FFTW_THREADS
1148 /*FFTW(cleanup_threads)();*/
1155 /*Is this better than direct access of plan? enough data?
1156 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1157 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
1167 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1168 of processor dimensions*/
1169 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
1170 MPI_Comm cart[2]={0};
1177 int rdim1[] = {0,1}, rdim2[] = {1,0};
1179 MPI_Comm_size(comm,&size);
1180 MPI_Comm_rank(comm,&prank);
1182 if (P0==0) P0 = lfactor(size);
1185 if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1189 P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1191 /*Difference between x-y-z regarding 2d decomposition is whether they are
1192 distributed along axis 1, 2 or both*/
1194 MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1195 MPI_Cart_get(gcart,2,P,wrap,coor);
1196 MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1197 MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1199 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
1204 /*prints in original coordinate system of data (as the input to FFT)*/
1205 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1206 int xs[3],xl[3],xc[3],NG[3];
1208 int *coor = plan->coor;
1209 int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1210 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1215 compute_offsets(plan,xs,xl,xc,NG,2);
1216 if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1217 for (z=0;z<xl[2];z++) {
1218 for(y=0;y<xl[1];y++) {
1219 if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1220 for (x=0;x<xl[0];x++) {
1221 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1223 a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1224 if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1226 b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1228 b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1229 if (plan->flags&FFT5D_DEBUG) {
1230 printf("%f %f, ",a,b);
1232 if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1233 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1235 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1238 if (plan->flags&FFT5D_DEBUG) printf(",");
1240 if (plan->flags&FFT5D_DEBUG) printf("\n");