1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
9 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11 * Copyright (c) 2001-2012, The GROMACS development team,
12 * check out http://www.gromacs.org for more information.
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * If you want to redistribute modifications, please consider that
20 * scientific software is very special. Version control is crucial -
21 * bugs must be traceable. We will be happy to consider code for
22 * inclusion in the official distribution, but derived work must not
23 * be called official GROMACS. Details are found in the README & COPYING
24 * files - if they are missing, get the official version at www.gromacs.org.
26 * To help us fund GROMACS development, we humbly ask that you cite
27 * the papers on the package - you can find them in the top README file.
29 * For more info, check our website at http://www.gromacs.org
32 * Groningen Machine for Chemical Simulation
45 #define GMX_PARALLEL_ENV_INITIALIZED 1
48 #define GMX_PARALLEL_ENV_INITIALIZED 1
50 #define GMX_PARALLEL_ENV_INITIALIZED 0
62 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
67 /* requires fftw compiled with openmp */
68 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
77 #ifndef __FLT_EPSILON__
78 #define __FLT_EPSILON__ FLT_EPSILON
79 #define __DBL_EPSILON__ DBL_EPSILON
86 #include "gmx_fatal.h"
90 #include "thread_mpi/mutex.h"
91 #include "gromacs/utility/exceptions.h"
92 /* none of the fftw3 calls, except execute(), are thread-safe, so
93 we need to serialize them with this mutex. */
94 static tMPI::mutex big_fftw_mutex;
95 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
96 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
97 #endif /* GMX_FFT_FFTW3 */
99 /* largest factor smaller than sqrt */
100 static int lfactor(int z) {
102 for (i=static_cast<int>(sqrt(static_cast<double>(z)));;i--)
103 if (z%i==0) return i;
108 static int l2factor(int z) {
112 if (z%i==0) return i;
116 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
117 static int lpfactor(int z) {
120 return std::max(lpfactor(f),lpfactor(z/f));
124 #ifdef HAVE_GETTIMEOFDAY
125 #include <sys/time.h>
129 return tv.tv_sec+tv.tv_usec*1e-6;
138 static int vmax(int* a, int s) {
142 if (a[i]>max) max=a[i];
148 /* NxMxK the size of the data
149 * comm communicator to use for fft5d
150 * P0 number of processor in 1st axes (can be null for automatic)
151 * lin is allocated by fft5d because size of array is only known after planning phase
152 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
154 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
157 int P[2],bMaster,prank[2],i,t;
159 int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
160 int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
161 int C[3],rC[3],nP[2];
163 t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
167 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
169 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
171 MPI_Comm_size(comm[0],&P[0]);
172 MPI_Comm_rank(comm[0],&prank[0]);
181 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
183 MPI_Comm_size(comm[1],&P[1]);
184 MPI_Comm_rank(comm[1],&prank[1]);
193 bMaster=(prank[0]==0&&prank[1]==0);
198 fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
199 P[0],P[1],prank[0],prank[1]);
204 fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
205 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
206 /* The check below is not correct, one prime factor 11 or 13 is ok.
207 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
208 printf("WARNING: FFT very slow with prime factors larger 7\n");
209 printf("Change FFT size or in case you cannot change it look at\n");
210 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
215 if (NG==0 || MG==0 || KG==0) {
216 if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
220 rNG=NG;rMG=MG;rKG=KG;
222 if (flags&FFT5D_REALCOMPLEX) {
223 if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
225 if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
231 /*for transpose we need to know the size for each processor not only our own size*/
233 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
234 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
235 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
236 oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
237 oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
238 oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
244 oN0[i]=i*ceil((double)NG/P[0]);
245 oM0[i]=i*ceil((double)MG/P[0]);
246 oK0[i]=i*ceil((double)KG/P[0]);
256 oN1[i]=i*ceil((double)NG/P[1]);
257 oM1[i]=i*ceil((double)MG/P[1]);
258 oK1[i]=i*ceil((double)KG/P[1]);
265 for (i=0;i<P[0]-1;i++)
267 N0[i]=oN0[i+1]-oN0[i];
268 M0[i]=oM0[i+1]-oM0[i];
269 K0[i]=oK0[i+1]-oK0[i];
271 N0[P[0]-1]=NG-oN0[P[0]-1];
272 M0[P[0]-1]=MG-oM0[P[0]-1];
273 K0[P[0]-1]=KG-oK0[P[0]-1];
274 for (i=0;i<P[1]-1;i++)
276 N1[i]=oN1[i+1]-oN1[i];
277 M1[i]=oM1[i+1]-oM1[i];
278 K1[i]=oK1[i+1]-oK1[i];
280 N1[P[1]-1]=NG-oN1[P[1]-1];
281 M1[P[1]-1]=MG-oM1[P[1]-1];
282 K1[P[1]-1]=KG-oK1[P[1]-1];
284 /* for step 1-3 the local N,M,K sizes of the transposed system
285 C: contiguous dimension, and nP: number of processor in subcommunicator
289 pM[0] = M0[prank[0]];
290 oM[0] = oM0[prank[0]];
291 pK[0] = K1[prank[1]];
292 oK[0] = oK1[prank[1]];
295 if (!(flags&FFT5D_ORDER_YZ)) {
296 N[0] = vmax(N1,P[1]);
298 K[0] = vmax(K1,P[1]);
299 pN[0] = N1[prank[1]];
305 N[1] = vmax(K0,P[0]);
306 pN[1] = K0[prank[0]];
311 M[1] = vmax(M0,P[0]);
312 pM[1] = M0[prank[0]];
313 oM[1] = oM0[prank[0]];
315 pK[1] = N1[prank[1]];
316 oK[1] = oN1[prank[1]];
322 M[2] = vmax(K0,P[0]);
323 pM[2] = K0[prank[0]];
324 oM[2] = oK0[prank[0]];
325 K[2] = vmax(N1,P[1]);
326 pK[2] = N1[prank[1]];
327 oK[2] = oN1[prank[1]];
328 free(N0); free(oN0); /*these are not used for this order*/
329 free(M1); free(oM1); /*the rest is freed in destroy*/
331 N[0] = vmax(N0,P[0]);
332 M[0] = vmax(M0,P[0]);
334 pN[0] = N0[prank[0]];
340 N[1] = vmax(M1,P[1]);
341 pN[1] = M1[prank[1]];
347 pM[1] = N0[prank[0]];
348 oM[1] = oN0[prank[0]];
349 K[1] = vmax(K1,P[1]);
350 pK[1] = K1[prank[1]];
351 oK[1] = oK1[prank[1]];
357 M[2] = vmax(N0,P[0]);
358 pM[2] = N0[prank[0]];
359 oM[2] = oN0[prank[0]];
360 K[2] = vmax(M1,P[1]);
361 pK[2] = M1[prank[1]];
362 oK[2] = oM1[prank[1]];
363 free(N1); free(oN1); /*these are not used for this order*/
364 free(K0); free(oK0); /*the rest is freed in destroy*/
366 N[2]=pN[2]=-1; /*not used*/
369 Difference between x-y-z regarding 2d decomposition is whether they are
370 distributed along axis 1, 2 or both
373 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
374 lsize = std::max(N[0]*M[0]*K[0]*nP[0],std::max(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
375 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
376 if (!(flags&FFT5D_NOMALLOC)) {
377 snew_aligned(lin, lsize, 32);
378 snew_aligned(lout, lsize, 32);
379 snew_aligned(lout2, lsize, 32);
380 snew_aligned(lout3, lsize, 32);
388 plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
393 fprintf(debug, "Running on %d threads\n",nthreads);
396 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
397 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
398 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
399 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
400 * and that the 3d plan is faster than the 1d plan.
402 if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) { /*don't do 3d plan in parallel or if in_place requested */
403 int fftwflags=FFTW_DESTROY_INPUT;
405 int inNG=NG,outMG=MG,outKG=KG;
408 if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
409 if (flags&FFT5D_REALCOMPLEX) {
410 if (!(flags&FFT5D_BACKWARD)) { /*input pointer is not complex*/
412 } else { /*output pointer is not complex*/
413 if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
418 if (!(flags&FFT5D_BACKWARD)) {
423 dims[0].is = inNG*MG; /*N M K*/
426 if (!(flags&FFT5D_ORDER_YZ)) {
427 dims[0].os = MG; /*M K N*/
431 dims[0].os = 1; /*K N M*/
436 if (!(flags&FFT5D_ORDER_YZ)) {
445 dims[0].os = outMG*KG;
457 dims[0].os = outKG*NG;
463 #ifdef FFT5D_FFTW_THREADS
464 FFTW(plan_with_nthreads)(nthreads);
467 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
468 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
469 /*howmany*/ 0, /*howmany_dims*/0 ,
470 (real*)lin, (FFTW(complex) *)lout,
471 /*flags*/ fftwflags);
472 } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
473 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
474 /*howmany*/ 0, /*howmany_dims*/0 ,
475 (FFTW(complex) *)lin, (real*)lout,
476 /*flags*/ fftwflags);
478 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
479 /*howmany*/ 0, /*howmany_dims*/0 ,
480 (FFTW(complex) *)lin, (FFTW(complex) *)lout,
481 /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
484 #ifdef FFT5D_FFTW_THREADS
485 FFTW(plan_with_nthreads)(1);
490 if (!plan->p3d) { /* for decomposition and if 3d plan did not work */
491 #endif /* GMX_FFT_FFTW3 */
495 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
496 s,rC[s],M[s],pK[s],C[s],lsize);
498 plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
500 /* Make sure that the init routines are only called by one thread at a time and in order
501 (later is only important to not confuse valgrind)
503 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
504 for(t=0; t<nthreads; t++)
506 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
508 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
509 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
511 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
519 if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
520 plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
522 plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
524 #ifdef FFT5D_MPI_TRANSPOSE
527 if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
528 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
530 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
541 plan->NG=NG;plan->MG=MG;plan->KG=KG;
544 plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
545 plan->oM[s]=oM[s];plan->oK[s]=oK[s];
546 plan->C[s]=C[s];plan->rC[s]=rC[s];
547 plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
550 plan->P[s]=nP[s];plan->coor[s]=prank[s];
553 /* plan->fftorder=fftorder;
554 plan->direction=direction;
555 plan->realcomplex=realcomplex;
558 plan->nthreads=nthreads;
578 /*here x,y,z and N,M,K is in rotated coordinate system!!
579 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
580 maxN,maxM,maxK is max size of local data
581 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
582 NG, MG, KG is size of global data*/
583 static void splitaxes(t_complex* lout,const t_complex* lin,
584 int maxN,int maxM,int maxK, int pN, int pM, int pK,
585 int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
588 int in_i,out_i,in_z,out_z,in_y,out_y;
591 for (z=startz; z<endz+1; z++) /*3. z l*/
606 for (i=0; i<P; i++) /*index cube along long axis*/
608 out_i = out_z + i*maxN*maxM*maxK;
610 for (y=s_y;y<e_y;y++) { /*2. y k*/
611 out_y = out_i + y*maxN;
613 for (x=0;x<N[i];x++) { /*1. x j*/
614 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
615 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
616 /*before split data contiguos - thus if different processor get different amount oN is different*/
623 /*make axis contiguous again (after AllToAll) and also do local transpose*/
624 /*transpose mayor and major dimension
626 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
627 N,M,K local dimensions
629 static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
630 int maxN,int maxM,int maxK,int pN, int pM, int pK,
631 int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
634 int out_i,in_i,out_x,in_x,out_z,in_z;
637 for (x=startx;x<endx+1;x++) /*1.j*/
659 for (i=0;i<P;i++) /*index cube along long axis*/
661 out_i = out_x + oK[i];
662 in_i = in_x + i*maxM*maxN*maxK;
663 for (z=0;z<K[i];z++) /*3.l*/
666 in_z = in_i + z*maxM*maxN;
667 for (y=s_y;y<e_y;y++) /*2.k*/
669 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
676 /*make axis contiguous again (after AllToAll) and also do local transpose
677 tranpose mayor and middle dimension
679 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
682 static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
683 int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
685 int out_i,in_i,out_z,in_z,out_x,in_x;
688 for (z=startz; z<endz+1; z++)
709 for (i=0; i<P; i++) /*index cube along long axis*/
711 out_i = out_z + oM[i];
712 in_i = in_z + i*maxM*maxN*maxK;
713 for (x=s_x;x<e_x;x++)
715 out_x = out_i + x*MG;
719 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
727 static void rotate_offsets(int x[]) {
737 /*compute the offset to compare or print transposed local data in original input coordinates
738 xs matrix dimension size, xl dimension length, xc decomposition offset
739 s: step in computation = number of transposes*/
740 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
741 /* int direction = plan->direction;
742 int fftorder = plan->fftorder;*/
746 int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
747 *C=plan->C, *rC=plan->rC;
749 NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
751 if (!(plan->flags&FFT5D_ORDER_YZ)) {
753 case 0: o=XYZ; break;
754 case 1: o=ZYX; break;
755 case 2: o=YZX; break;
760 case 0: o=XYZ; break;
761 case 1: o=YXZ; break;
762 case 2: o=ZXY; break;
768 case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
769 case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
770 case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
771 case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
772 case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
773 case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
775 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
777 /*xs, xl give dimension size and data length in local transposed coordinate system
778 for 0(/1/2): x(/y/z) in original coordinate system*/
781 case 1: xs[i]=1; xc[i]=0; xl[i]=C[s];break;
782 case 2: xs[i]=C[s]; xc[i]=oM[s]; xl[i]=pM[s];break;
783 case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
786 /*input order is different for test program to match FFTW order
787 (important for complex to real)*/
788 if (plan->flags&FFT5D_BACKWARD) {
793 if (plan->flags&FFT5D_ORDER_YZ) {
800 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
805 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
807 int *coor = plan->coor;
808 int xs[3],xl[3],xc[3],NG[3];
809 int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
810 compute_offsets(plan,xs,xl,xc,NG,s);
811 fprintf(debug,txt,coor[0],coor[1],s);
812 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
813 for(z=0;z<xl[2];z++) {
814 for(y=0;y<xl[1];y++) {
815 fprintf(debug,"%d %d: ",coor[0],coor[1]);
816 for (x=0;x<xl[0];x++) {
818 fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
827 void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
828 t_complex *lin = plan->lin;
829 t_complex *lout = plan->lout;
830 t_complex *lout2 = plan->lout2;
831 t_complex *lout3 = plan->lout3;
832 t_complex *fftout,*joinin;
834 gmx_fft_t **p1d=plan->p1d;
835 #ifdef FFT5D_MPI_TRANSPOSE
836 FFTW(plan) *mpip=plan->mpip;
839 MPI_Comm *cart=plan->cart;
842 double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
844 int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
845 *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
846 int s=0,tstart,tend,bParallelDim;
860 FFTW(execute)(plan->p3d);
864 times->fft+=MPI_Wtime()-time;
875 if (plan->flags&FFT5D_DEBUG && thread == 0)
877 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
880 for (s=0;s<2;s++) { /*loop over first two FFT steps (corner rotations)*/
883 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s]!=MPI_COMM_NULL && P[s]>1)
893 /* ---------- START FFT ------------ */
895 if (times!=0 && thread == 0)
915 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
916 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
918 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
921 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,fftout+tstart);
926 if (times != NULL && thread == 0)
928 time_fft+=MPI_Wtime()-time;
931 if (plan->flags&FFT5D_DEBUG && thread == 0)
933 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
935 /* ---------- END FFT ------------ */
937 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
940 if (times != NULL && thread == 0)
947 1. (most outer) axes (x) is split into P[s] parts of size N[s]
951 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
953 splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
955 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
957 if (times != NULL && thread == 0)
959 time_local+=MPI_Wtime()-time;
963 /* ---------- END SPLIT , START TRANSPOSE------------ */
973 wallcycle_start(times,ewcPME_FFTCOMM);
975 #ifdef FFT5D_MPI_TRANSPOSE
976 FFTW(execute)(mpip[s]);
979 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
980 MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
982 MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
984 gmx_incons("fft5d MPI call without MPI configuration");
986 #endif /*FFT5D_MPI_TRANSPOSE*/
990 time_mpi[s]=MPI_Wtime()-time;
993 wallcycle_stop(times,ewcPME_FFTCOMM);
997 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
999 /* ---------- END SPLIT + TRANSPOSE------------ */
1001 /* ---------- START JOIN ------------ */
1003 if (times != NULL && thread == 0)
1014 /*bring back in matrix form
1015 thus make new 1. axes contiguos
1016 also local transpose 1 and 2/3
1017 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1019 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
1022 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1023 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1024 joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
1030 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1031 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1032 joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
1037 if (times != NULL && thread == 0)
1039 time_local+=MPI_Wtime()-time;
1042 if (plan->flags&FFT5D_DEBUG && thread == 0)
1044 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1046 /* ---------- END JOIN ------------ */
1048 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1049 } /* for(s=0;s<2;s++) */
1051 if (times != NULL && thread == 0)
1057 if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
1059 /* ----------- FFT ----------- */
1060 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1061 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
1062 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
1064 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,lout+tstart);
1066 /* ------------ END FFT ---------*/
1069 if (times != NULL && thread == 0)
1071 time_fft+=MPI_Wtime()-time;
1073 times->fft+=time_fft;
1074 times->local+=time_local;
1075 times->mpi2+=time_mpi[1];
1076 times->mpi1+=time_mpi[0];
1080 if (plan->flags&FFT5D_DEBUG && thread == 0)
1082 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1086 void fft5d_destroy(fft5d_plan plan) {
1092 for (t=0;t<plan->nthreads;t++)
1094 gmx_many_fft_destroy(plan->p1d[s][t]);
1100 free(plan->iNin[s]);
1105 free(plan->oNin[s]);
1110 free(plan->iNout[s]);
1115 free(plan->oNout[s]);
1119 #ifdef GMX_FFT_FFTW3
1121 #ifdef FFT5D_MPI_TRANSPOS
1124 FFTW(destroy_plan)(plan->mpip[s]);
1126 #endif /* FFT5D_MPI_TRANSPOS */
1129 FFTW(destroy_plan)(plan->p3d);
1132 #endif /* GMX_FFT_FFTW3 */
1134 if (!(plan->flags&FFT5D_NOMALLOC))
1136 sfree_aligned(plan->lin);
1137 sfree_aligned(plan->lout);
1138 sfree_aligned(plan->lout2);
1139 sfree_aligned(plan->lout3);
1142 #ifdef FFT5D_THREADS
1143 #ifdef FFT5D_FFTW_THREADS
1144 /*FFTW(cleanup_threads)();*/
1151 /*Is this better than direct access of plan? enough data?
1152 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1153 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
1163 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1164 of processor dimensions*/
1165 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
1166 MPI_Comm cart[2]={0};
1173 int rdim1[] = {0,1}, rdim2[] = {1,0};
1175 MPI_Comm_size(comm,&size);
1176 MPI_Comm_rank(comm,&prank);
1178 if (P0==0) P0 = lfactor(size);
1181 if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1185 P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1187 /*Difference between x-y-z regarding 2d decomposition is whether they are
1188 distributed along axis 1, 2 or both*/
1190 MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1191 MPI_Cart_get(gcart,2,P,wrap,coor);
1192 MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1193 MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1195 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
1200 /*prints in original coordinate system of data (as the input to FFT)*/
1201 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1202 int xs[3],xl[3],xc[3],NG[3];
1204 int *coor = plan->coor;
1205 int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1206 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1211 compute_offsets(plan,xs,xl,xc,NG,2);
1212 if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1213 for (z=0;z<xl[2];z++) {
1214 for(y=0;y<xl[1];y++) {
1215 if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1216 for (x=0;x<xl[0];x++) {
1217 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1219 a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1220 if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1222 b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1224 b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1225 if (plan->flags&FFT5D_DEBUG) {
1226 printf("%f %f, ",a,b);
1228 if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1229 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1231 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1234 if (plan->flags&FFT5D_DEBUG) printf(",");
1236 if (plan->flags&FFT5D_DEBUG) printf("\n");