2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2012, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
48 #define GMX_PARALLEL_ENV_INITIALIZED 1
51 #define GMX_PARALLEL_ENV_INITIALIZED 1
53 #define GMX_PARALLEL_ENV_INITIALIZED 0
65 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
67 /* requires fftw compiled with openmp */
68 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
77 #ifndef __FLT_EPSILON__
78 #define __FLT_EPSILON__ FLT_EPSILON
79 #define __DBL_EPSILON__ DBL_EPSILON
86 #include "gmx_fatal.h"
91 /* none of the fftw3 calls, except execute(), are thread-safe, so
92 we need to serialize them with this mutex. */
93 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
95 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex)
96 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex)
97 #else /* GMX_THREAD_MPI */
100 #endif /* GMX_THREAD_MPI */
101 #endif /* GMX_FFT_FFTW3 */
103 static double fft5d_fmax(double a, double b){
107 /* largest factor smaller than sqrt */
108 static int lfactor(int z) {
111 if (z%i==0) return i;
116 static int l2factor(int z) {
120 if (z%i==0) return i;
124 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
125 static int lpfactor(int z) {
128 return fft5d_fmax(lpfactor(f),lpfactor(z/f));
132 #ifdef HAVE_GETTIMEOFDAY
133 #include <sys/time.h>
137 return tv.tv_sec+tv.tv_usec*1e-6;
146 static int vmax(int* a, int s) {
150 if (a[i]>max) max=a[i];
156 /* NxMxK the size of the data
157 * comm communicator to use for fft5d
158 * P0 number of processor in 1st axes (can be null for automatic)
159 * lin is allocated by fft5d because size of array is only known after planning phase
160 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
162 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
165 int P[2],bMaster,prank[2],i,t;
167 int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
168 int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
169 int C[3],rC[3],nP[2];
171 t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
175 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
177 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
179 MPI_Comm_size(comm[0],&P[0]);
180 MPI_Comm_rank(comm[0],&prank[0]);
189 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
191 MPI_Comm_size(comm[1],&P[1]);
192 MPI_Comm_rank(comm[1],&prank[1]);
201 bMaster=(prank[0]==0&&prank[1]==0);
206 fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
207 P[0],P[1],prank[0],prank[1]);
212 fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
213 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
214 /* The check below is not correct, one prime factor 11 or 13 is ok.
215 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
216 printf("WARNING: FFT very slow with prime factors larger 7\n");
217 printf("Change FFT size or in case you cannot change it look at\n");
218 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
223 if (NG==0 || MG==0 || KG==0) {
224 if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
228 rNG=NG;rMG=MG;rKG=KG;
230 if (flags&FFT5D_REALCOMPLEX) {
231 if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
233 if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
239 /*for transpose we need to know the size for each processor not only our own size*/
241 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
242 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
243 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
244 oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
245 oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
246 oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
252 oN0[i]=i*ceil((double)NG/P[0]);
253 oM0[i]=i*ceil((double)MG/P[0]);
254 oK0[i]=i*ceil((double)KG/P[0]);
264 oN1[i]=i*ceil((double)NG/P[1]);
265 oM1[i]=i*ceil((double)MG/P[1]);
266 oK1[i]=i*ceil((double)KG/P[1]);
273 for (i=0;i<P[0]-1;i++)
275 N0[i]=oN0[i+1]-oN0[i];
276 M0[i]=oM0[i+1]-oM0[i];
277 K0[i]=oK0[i+1]-oK0[i];
279 N0[P[0]-1]=NG-oN0[P[0]-1];
280 M0[P[0]-1]=MG-oM0[P[0]-1];
281 K0[P[0]-1]=KG-oK0[P[0]-1];
282 for (i=0;i<P[1]-1;i++)
284 N1[i]=oN1[i+1]-oN1[i];
285 M1[i]=oM1[i+1]-oM1[i];
286 K1[i]=oK1[i+1]-oK1[i];
288 N1[P[1]-1]=NG-oN1[P[1]-1];
289 M1[P[1]-1]=MG-oM1[P[1]-1];
290 K1[P[1]-1]=KG-oK1[P[1]-1];
292 /* for step 1-3 the local N,M,K sizes of the transposed system
293 C: contiguous dimension, and nP: number of processor in subcommunicator
297 pM[0] = M0[prank[0]];
298 oM[0] = oM0[prank[0]];
299 pK[0] = K1[prank[1]];
300 oK[0] = oK1[prank[1]];
303 if (!(flags&FFT5D_ORDER_YZ)) {
304 N[0] = vmax(N1,P[1]);
306 K[0] = vmax(K1,P[1]);
307 pN[0] = N1[prank[1]];
313 N[1] = vmax(K0,P[0]);
314 pN[1] = K0[prank[0]];
319 M[1] = vmax(M0,P[0]);
320 pM[1] = M0[prank[0]];
321 oM[1] = oM0[prank[0]];
323 pK[1] = N1[prank[1]];
324 oK[1] = oN1[prank[1]];
330 M[2] = vmax(K0,P[0]);
331 pM[2] = K0[prank[0]];
332 oM[2] = oK0[prank[0]];
333 K[2] = vmax(N1,P[1]);
334 pK[2] = N1[prank[1]];
335 oK[2] = oN1[prank[1]];
336 free(N0); free(oN0); /*these are not used for this order*/
337 free(M1); free(oM1); /*the rest is freed in destroy*/
339 N[0] = vmax(N0,P[0]);
340 M[0] = vmax(M0,P[0]);
342 pN[0] = N0[prank[0]];
348 N[1] = vmax(M1,P[1]);
349 pN[1] = M1[prank[1]];
355 pM[1] = N0[prank[0]];
356 oM[1] = oN0[prank[0]];
357 K[1] = vmax(K1,P[1]);
358 pK[1] = K1[prank[1]];
359 oK[1] = oK1[prank[1]];
365 M[2] = vmax(N0,P[0]);
366 pM[2] = N0[prank[0]];
367 oM[2] = oN0[prank[0]];
368 K[2] = vmax(M1,P[1]);
369 pK[2] = M1[prank[1]];
370 oK[2] = oM1[prank[1]];
371 free(N1); free(oN1); /*these are not used for this order*/
372 free(K0); free(oK0); /*the rest is freed in destroy*/
374 N[2]=pN[2]=-1; /*not used*/
377 Difference between x-y-z regarding 2d decomposition is whether they are
378 distributed along axis 1, 2 or both
381 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
382 lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
383 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
384 if (!(flags&FFT5D_NOMALLOC)) {
385 snew_aligned(lin, lsize, 32);
386 snew_aligned(lout, lsize, 32);
389 /* We need extra transpose buffers to avoid OpenMP barriers */
390 snew_aligned(lout2, lsize, 32);
391 snew_aligned(lout3, lsize, 32);
395 /* We can reuse the buffers to avoid cache misses */
414 plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
419 fprintf(debug, "Running on %d threads\n",nthreads);
422 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
423 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
424 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
425 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
426 * and that the 3d plan is faster than the 1d plan.
428 if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) { /*don't do 3d plan in parallel or if in_place requested */
429 int fftwflags=FFTW_DESTROY_INPUT;
431 int inNG=NG,outMG=MG,outKG=KG;
434 if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
435 if (flags&FFT5D_REALCOMPLEX) {
436 if (!(flags&FFT5D_BACKWARD)) { /*input pointer is not complex*/
438 } else { /*output pointer is not complex*/
439 if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
444 if (!(flags&FFT5D_BACKWARD)) {
449 dims[0].is = inNG*MG; /*N M K*/
452 if (!(flags&FFT5D_ORDER_YZ)) {
453 dims[0].os = MG; /*M K N*/
457 dims[0].os = 1; /*K N M*/
462 if (!(flags&FFT5D_ORDER_YZ)) {
471 dims[0].os = outMG*KG;
483 dims[0].os = outKG*NG;
489 #ifdef FFT5D_FFTW_THREADS
490 FFTW(plan_with_nthreads)(nthreads);
493 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
494 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
495 /*howmany*/ 0, /*howmany_dims*/0 ,
496 (real*)lin, (FFTW(complex) *)lout,
497 /*flags*/ fftwflags);
498 } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
499 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
500 /*howmany*/ 0, /*howmany_dims*/0 ,
501 (FFTW(complex) *)lin, (real*)lout,
502 /*flags*/ fftwflags);
504 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
505 /*howmany*/ 0, /*howmany_dims*/0 ,
506 (FFTW(complex) *)lin, (FFTW(complex) *)lout,
507 /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
510 #ifdef FFT5D_FFTW_THREADS
511 FFTW(plan_with_nthreads)(1);
516 if (!plan->p3d) { /* for decomposition and if 3d plan did not work */
517 #endif /* GMX_FFT_FFTW3 */
521 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
522 s,rC[s],M[s],pK[s],C[s],lsize);
524 plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
526 /* Make sure that the init routines are only called by one thread at a time and in order
527 (later is only important to not confuse valgrind)
529 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
530 for(t=0; t<nthreads; t++)
533 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
535 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
536 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
538 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
546 if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
547 plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
549 plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
551 #ifdef FFT5D_MPI_TRANSPOSE
554 if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
555 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
557 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
568 plan->NG=NG;plan->MG=MG;plan->KG=KG;
571 plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
572 plan->oM[s]=oM[s];plan->oK[s]=oK[s];
573 plan->C[s]=C[s];plan->rC[s]=rC[s];
574 plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
577 plan->P[s]=nP[s];plan->coor[s]=prank[s];
580 /* plan->fftorder=fftorder;
581 plan->direction=direction;
582 plan->realcomplex=realcomplex;
585 plan->nthreads=nthreads;
605 /*here x,y,z and N,M,K is in rotated coordinate system!!
606 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
607 maxN,maxM,maxK is max size of local data
608 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
609 NG, MG, KG is size of global data*/
610 static void splitaxes(t_complex* lout,const t_complex* lin,
611 int maxN,int maxM,int maxK, int pN, int pM, int pK,
612 int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
615 int in_i,out_i,in_z,out_z,in_y,out_y;
618 for (z=startz; z<endz+1; z++) /*3. z l*/
633 for (i=0; i<P; i++) /*index cube along long axis*/
635 out_i = out_z + i*maxN*maxM*maxK;
637 for (y=s_y;y<e_y;y++) { /*2. y k*/
638 out_y = out_i + y*maxN;
640 for (x=0;x<N[i];x++) { /*1. x j*/
641 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
642 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
643 /*before split data contiguos - thus if different processor get different amount oN is different*/
650 /*make axis contiguous again (after AllToAll) and also do local transpose*/
651 /*transpose mayor and major dimension
653 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
654 N,M,K local dimensions
656 static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
657 int maxN,int maxM,int maxK,int pN, int pM, int pK,
658 int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
661 int out_i,in_i,out_x,in_x,out_z,in_z;
664 for (x=startx;x<endx+1;x++) /*1.j*/
686 for (i=0;i<P;i++) /*index cube along long axis*/
688 out_i = out_x + oK[i];
689 in_i = in_x + i*maxM*maxN*maxK;
690 for (z=0;z<K[i];z++) /*3.l*/
693 in_z = in_i + z*maxM*maxN;
694 for (y=s_y;y<e_y;y++) /*2.k*/
696 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
703 /*make axis contiguous again (after AllToAll) and also do local transpose
704 tranpose mayor and middle dimension
706 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
709 static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
710 int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
712 int out_i,in_i,out_z,in_z,out_x,in_x;
715 for (z=startz; z<endz+1; z++)
736 for (i=0; i<P; i++) /*index cube along long axis*/
738 out_i = out_z + oM[i];
739 in_i = in_z + i*maxM*maxN*maxK;
740 for (x=s_x;x<e_x;x++)
742 out_x = out_i + x*MG;
746 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
754 static void rotate_offsets(int x[]) {
764 /*compute the offset to compare or print transposed local data in original input coordinates
765 xs matrix dimension size, xl dimension length, xc decomposition offset
766 s: step in computation = number of transposes*/
767 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
768 /* int direction = plan->direction;
769 int fftorder = plan->fftorder;*/
773 int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
774 *C=plan->C, *rC=plan->rC;
776 NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
778 if (!(plan->flags&FFT5D_ORDER_YZ)) {
780 case 0: o=XYZ; break;
781 case 1: o=ZYX; break;
782 case 2: o=YZX; break;
787 case 0: o=XYZ; break;
788 case 1: o=YXZ; break;
789 case 2: o=ZXY; break;
795 case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
796 case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
797 case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
798 case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
799 case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
800 case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
802 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
804 /*xs, xl give dimension size and data length in local transposed coordinate system
805 for 0(/1/2): x(/y/z) in original coordinate system*/
808 case 1: xs[i]=1; xc[i]=0; xl[i]=C[s];break;
809 case 2: xs[i]=C[s]; xc[i]=oM[s]; xl[i]=pM[s];break;
810 case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
813 /*input order is different for test program to match FFTW order
814 (important for complex to real)*/
815 if (plan->flags&FFT5D_BACKWARD) {
820 if (plan->flags&FFT5D_ORDER_YZ) {
827 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
832 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
834 int *coor = plan->coor;
835 int xs[3],xl[3],xc[3],NG[3];
836 int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
837 compute_offsets(plan,xs,xl,xc,NG,s);
838 fprintf(debug,txt,coor[0],coor[1],s);
839 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
840 for(z=0;z<xl[2];z++) {
841 for(y=0;y<xl[1];y++) {
842 fprintf(debug,"%d %d: ",coor[0],coor[1]);
843 for (x=0;x<xl[0];x++) {
845 fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
854 void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
855 t_complex *lin = plan->lin;
856 t_complex *lout = plan->lout;
857 t_complex *lout2 = plan->lout2;
858 t_complex *lout3 = plan->lout3;
859 t_complex *fftout,*joinin;
861 gmx_fft_t **p1d=plan->p1d;
862 #ifdef FFT5D_MPI_TRANSPOSE
863 FFTW(plan) *mpip=plan->mpip;
866 MPI_Comm *cart=plan->cart;
869 double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
870 int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
871 *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
872 int s=0,tstart,tend,bParallelDim;
886 FFTW(execute)(plan->p3d);
890 times->fft+=MPI_Wtime()-time;
901 if (plan->flags&FFT5D_DEBUG && thread == 0)
903 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
906 for (s=0;s<2;s++) { /*loop over first two FFT steps (corner rotations)*/
909 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s]!=MPI_COMM_NULL && P[s]>1)
919 /* ---------- START FFT ------------ */
921 if (times!=0 && thread == 0)
927 if (bParallelDim || plan->nthreads == 1) {
941 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
942 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
944 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
947 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,fftout+tstart);
952 if (times != NULL && thread == 0)
954 time_fft+=MPI_Wtime()-time;
957 if (plan->flags&FFT5D_DEBUG && thread == 0)
959 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
961 /* ---------- END FFT ------------ */
963 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
966 if (times != NULL && thread == 0)
973 1. (most outer) axes (x) is split into P[s] parts of size N[s]
977 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
979 splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
981 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
983 if (times != NULL && thread == 0)
985 time_local+=MPI_Wtime()-time;
989 /* ---------- END SPLIT , START TRANSPOSE------------ */
999 wallcycle_start(times,ewcPME_FFTCOMM);
1001 #ifdef FFT5D_MPI_TRANSPOSE
1002 FFTW(execute)(mpip[s]);
1005 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
1006 MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
1008 MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
1010 gmx_incons("fft5d MPI call without MPI configuration");
1012 #endif /*FFT5D_MPI_TRANSPOSE*/
1016 time_mpi[s]=MPI_Wtime()-time;
1019 wallcycle_stop(times,ewcPME_FFTCOMM);
1023 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1025 /* ---------- END SPLIT + TRANSPOSE------------ */
1027 /* ---------- START JOIN ------------ */
1029 if (times != NULL && thread == 0)
1040 /*bring back in matrix form
1041 thus make new 1. axes contiguos
1042 also local transpose 1 and 2/3
1043 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1045 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
1048 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1049 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1050 joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
1056 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1057 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1058 joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
1063 if (times != NULL && thread == 0)
1065 time_local+=MPI_Wtime()-time;
1068 if (plan->flags&FFT5D_DEBUG && thread == 0)
1070 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1072 /* ---------- END JOIN ------------ */
1074 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1075 } /* for(s=0;s<2;s++) */
1077 if (times != NULL && thread == 0)
1083 if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
1085 /* ----------- FFT ----------- */
1086 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1087 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
1088 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
1090 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,lout+tstart);
1092 /* ------------ END FFT ---------*/
1095 if (times != NULL && thread == 0)
1097 time_fft+=MPI_Wtime()-time;
1099 times->fft+=time_fft;
1100 times->local+=time_local;
1101 times->mpi2+=time_mpi[1];
1102 times->mpi1+=time_mpi[0];
1106 if (plan->flags&FFT5D_DEBUG && thread == 0)
1108 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1112 void fft5d_destroy(fft5d_plan plan) {
1119 for (t=0;t<plan->nthreads;t++)
1121 gmx_many_fft_destroy(plan->p1d[s][t]);
1127 free(plan->iNin[s]);
1132 free(plan->oNin[s]);
1137 free(plan->iNout[s]);
1142 free(plan->oNout[s]);
1146 #ifdef GMX_FFT_FFTW3
1148 #ifdef FFT5D_MPI_TRANSPOS
1151 FFTW(destroy_plan)(plan->mpip[s]);
1155 FFTW(destroy_plan)(plan->p3d);
1157 #endif /* FFT5D_MPI_TRANSPOS */
1158 #endif /* GMX_FFT_FFTW3 */
1160 if (!(plan->flags&FFT5D_NOMALLOC))
1162 sfree_aligned(plan->lin);
1163 sfree_aligned(plan->lout);
1164 if (plan->nthreads > 1)
1166 sfree_aligned(plan->lout2);
1167 sfree_aligned(plan->lout3);
1171 #ifdef FFT5D_THREADS
1172 #ifdef FFT5D_FFTW_THREADS
1173 /*FFTW(cleanup_threads)();*/
1180 /*Is this better than direct access of plan? enough data?
1181 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1182 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
1192 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1193 of processor dimensions*/
1194 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
1195 MPI_Comm cart[2]={0};
1202 int rdim1[] = {0,1}, rdim2[] = {1,0};
1204 MPI_Comm_size(comm,&size);
1205 MPI_Comm_rank(comm,&prank);
1207 if (P0==0) P0 = lfactor(size);
1210 if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1214 P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1216 /*Difference between x-y-z regarding 2d decomposition is whether they are
1217 distributed along axis 1, 2 or both*/
1219 MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1220 MPI_Cart_get(gcart,2,P,wrap,coor);
1221 MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1222 MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1224 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
1229 /*prints in original coordinate system of data (as the input to FFT)*/
1230 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1231 int xs[3],xl[3],xc[3],NG[3];
1233 int *coor = plan->coor;
1234 int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1235 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1240 compute_offsets(plan,xs,xl,xc,NG,2);
1241 if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1242 for (z=0;z<xl[2];z++) {
1243 for(y=0;y<xl[1];y++) {
1244 if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1245 for (x=0;x<xl[0];x++) {
1246 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1248 a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1249 if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1251 b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1253 b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1254 if (plan->flags&FFT5D_DEBUG) {
1255 printf("%f %f, ",a,b);
1257 if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1258 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1260 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1263 if (plan->flags&FFT5D_DEBUG) printf(",");
1265 if (plan->flags&FFT5D_DEBUG) printf("\n");