1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
9 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11 * Copyright (c) 2001-2012, The GROMACS development team,
12 * check out http://www.gromacs.org for more information.
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * If you want to redistribute modifications, please consider that
20 * scientific software is very special. Version control is crucial -
21 * bugs must be traceable. We will be happy to consider code for
22 * inclusion in the official distribution, but derived work must not
23 * be called official GROMACS. Details are found in the README & COPYING
24 * files - if they are missing, get the official version at www.gromacs.org.
26 * To help us fund GROMACS development, we humbly ask that you cite
27 * the papers on the package - you can find them in the top README file.
29 * For more info, check our website at http://www.gromacs.org
32 * Groningen Machine for Chemical Simulation
44 #define GMX_PARALLEL_ENV_INITIALIZED 1
47 #define GMX_PARALLEL_ENV_INITIALIZED 1
49 #define GMX_PARALLEL_ENV_INITIALIZED 0
61 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
63 /* requires fftw compiled with openmp */
64 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
73 #ifndef __FLT_EPSILON__
74 #define __FLT_EPSILON__ FLT_EPSILON
75 #define __DBL_EPSILON__ DBL_EPSILON
82 #include "gmx_fatal.h"
87 /* none of the fftw3 calls, except execute(), are thread-safe, so
88 we need to serialize them with this mutex. */
89 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
91 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex)
92 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex)
93 #else /* GMX_THREAD_MPI */
96 #endif /* GMX_THREAD_MPI */
97 #endif /* GMX_FFT_FFTW3 */
99 static double fft5d_fmax(double a, double b){
103 /* largest factor smaller than sqrt */
104 static int lfactor(int z) {
107 if (z%i==0) return i;
112 static int l2factor(int z) {
116 if (z%i==0) return i;
120 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
121 static int lpfactor(int z) {
124 return fft5d_fmax(lpfactor(f),lpfactor(z/f));
128 #ifdef HAVE_GETTIMEOFDAY
129 #include <sys/time.h>
133 return tv.tv_sec+tv.tv_usec*1e-6;
142 static int vmax(int* a, int s) {
146 if (a[i]>max) max=a[i];
152 /* NxMxK the size of the data
153 * comm communicator to use for fft5d
154 * P0 number of processor in 1st axes (can be null for automatic)
155 * lin is allocated by fft5d because size of array is only known after planning phase
156 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
158 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
161 int P[2],bMaster,prank[2],i,t;
163 int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
164 int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
165 int C[3],rC[3],nP[2];
167 t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
171 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
173 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
175 MPI_Comm_size(comm[0],&P[0]);
176 MPI_Comm_rank(comm[0],&prank[0]);
185 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
187 MPI_Comm_size(comm[1],&P[1]);
188 MPI_Comm_rank(comm[1],&prank[1]);
197 bMaster=(prank[0]==0&&prank[1]==0);
202 fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
203 P[0],P[1],prank[0],prank[1]);
208 fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
209 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
210 /* The check below is not correct, one prime factor 11 or 13 is ok.
211 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
212 printf("WARNING: FFT very slow with prime factors larger 7\n");
213 printf("Change FFT size or in case you cannot change it look at\n");
214 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
219 if (NG==0 || MG==0 || KG==0) {
220 if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
224 rNG=NG;rMG=MG;rKG=KG;
226 if (flags&FFT5D_REALCOMPLEX) {
227 if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
229 if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
235 /*for transpose we need to know the size for each processor not only our own size*/
237 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
238 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
239 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
240 oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
241 oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
242 oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
248 oN0[i]=i*ceil((double)NG/P[0]);
249 oM0[i]=i*ceil((double)MG/P[0]);
250 oK0[i]=i*ceil((double)KG/P[0]);
260 oN1[i]=i*ceil((double)NG/P[1]);
261 oM1[i]=i*ceil((double)MG/P[1]);
262 oK1[i]=i*ceil((double)KG/P[1]);
269 for (i=0;i<P[0]-1;i++)
271 N0[i]=oN0[i+1]-oN0[i];
272 M0[i]=oM0[i+1]-oM0[i];
273 K0[i]=oK0[i+1]-oK0[i];
275 N0[P[0]-1]=NG-oN0[P[0]-1];
276 M0[P[0]-1]=MG-oM0[P[0]-1];
277 K0[P[0]-1]=KG-oK0[P[0]-1];
278 for (i=0;i<P[1]-1;i++)
280 N1[i]=oN1[i+1]-oN1[i];
281 M1[i]=oM1[i+1]-oM1[i];
282 K1[i]=oK1[i+1]-oK1[i];
284 N1[P[1]-1]=NG-oN1[P[1]-1];
285 M1[P[1]-1]=MG-oM1[P[1]-1];
286 K1[P[1]-1]=KG-oK1[P[1]-1];
288 /* for step 1-3 the local N,M,K sizes of the transposed system
289 C: contiguous dimension, and nP: number of processor in subcommunicator
293 pM[0] = M0[prank[0]];
294 oM[0] = oM0[prank[0]];
295 pK[0] = K1[prank[1]];
296 oK[0] = oK1[prank[1]];
299 if (!(flags&FFT5D_ORDER_YZ)) {
300 N[0] = vmax(N1,P[1]);
302 K[0] = vmax(K1,P[1]);
303 pN[0] = N1[prank[1]];
309 N[1] = vmax(K0,P[0]);
310 pN[1] = K0[prank[0]];
315 M[1] = vmax(M0,P[0]);
316 pM[1] = M0[prank[0]];
317 oM[1] = oM0[prank[0]];
319 pK[1] = N1[prank[1]];
320 oK[1] = oN1[prank[1]];
326 M[2] = vmax(K0,P[0]);
327 pM[2] = K0[prank[0]];
328 oM[2] = oK0[prank[0]];
329 K[2] = vmax(N1,P[1]);
330 pK[2] = N1[prank[1]];
331 oK[2] = oN1[prank[1]];
332 free(N0); free(oN0); /*these are not used for this order*/
333 free(M1); free(oM1); /*the rest is freed in destroy*/
335 N[0] = vmax(N0,P[0]);
336 M[0] = vmax(M0,P[0]);
338 pN[0] = N0[prank[0]];
344 N[1] = vmax(M1,P[1]);
345 pN[1] = M1[prank[1]];
351 pM[1] = N0[prank[0]];
352 oM[1] = oN0[prank[0]];
353 K[1] = vmax(K1,P[1]);
354 pK[1] = K1[prank[1]];
355 oK[1] = oK1[prank[1]];
361 M[2] = vmax(N0,P[0]);
362 pM[2] = N0[prank[0]];
363 oM[2] = oN0[prank[0]];
364 K[2] = vmax(M1,P[1]);
365 pK[2] = M1[prank[1]];
366 oK[2] = oM1[prank[1]];
367 free(N1); free(oN1); /*these are not used for this order*/
368 free(K0); free(oK0); /*the rest is freed in destroy*/
370 N[2]=pN[2]=-1; /*not used*/
373 Difference between x-y-z regarding 2d decomposition is whether they are
374 distributed along axis 1, 2 or both
377 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
378 lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
379 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
380 if (!(flags&FFT5D_NOMALLOC)) {
381 snew_aligned(lin, lsize, 32);
382 snew_aligned(lout, lsize, 32);
385 /* We need extra transpose buffers to avoid OpenMP barriers */
386 snew_aligned(lout2, lsize, 32);
387 snew_aligned(lout3, lsize, 32);
391 /* We can reuse the buffers to avoid cache misses */
410 plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
415 fprintf(debug, "Running on %d threads\n",nthreads);
418 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
419 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
420 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
421 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
422 * and that the 3d plan is faster than the 1d plan.
424 if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) { /*don't do 3d plan in parallel or if in_place requested */
425 int fftwflags=FFTW_DESTROY_INPUT;
427 int inNG=NG,outMG=MG,outKG=KG;
430 if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
431 if (flags&FFT5D_REALCOMPLEX) {
432 if (!(flags&FFT5D_BACKWARD)) { /*input pointer is not complex*/
434 } else { /*output pointer is not complex*/
435 if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
440 if (!(flags&FFT5D_BACKWARD)) {
445 dims[0].is = inNG*MG; /*N M K*/
448 if (!(flags&FFT5D_ORDER_YZ)) {
449 dims[0].os = MG; /*M K N*/
453 dims[0].os = 1; /*K N M*/
458 if (!(flags&FFT5D_ORDER_YZ)) {
467 dims[0].os = outMG*KG;
479 dims[0].os = outKG*NG;
485 #ifdef FFT5D_FFTW_THREADS
486 FFTW(plan_with_nthreads)(nthreads);
489 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
490 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
491 /*howmany*/ 0, /*howmany_dims*/0 ,
492 (real*)lin, (FFTW(complex) *)lout,
493 /*flags*/ fftwflags);
494 } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
495 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
496 /*howmany*/ 0, /*howmany_dims*/0 ,
497 (FFTW(complex) *)lin, (real*)lout,
498 /*flags*/ fftwflags);
500 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
501 /*howmany*/ 0, /*howmany_dims*/0 ,
502 (FFTW(complex) *)lin, (FFTW(complex) *)lout,
503 /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
506 #ifdef FFT5D_FFTW_THREADS
507 FFTW(plan_with_nthreads)(1);
512 if (!plan->p3d) { /* for decomposition and if 3d plan did not work */
513 #endif /* GMX_FFT_FFTW3 */
517 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
518 s,rC[s],M[s],pK[s],C[s],lsize);
520 plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
522 /* Make sure that the init routines are only called by one thread at a time and in order
523 (later is only important to not confuse valgrind)
525 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
526 for(t=0; t<nthreads; t++)
529 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
531 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
532 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
534 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
542 if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
543 plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
545 plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
547 #ifdef FFT5D_MPI_TRANSPOSE
550 if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
551 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
553 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
564 plan->NG=NG;plan->MG=MG;plan->KG=KG;
567 plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
568 plan->oM[s]=oM[s];plan->oK[s]=oK[s];
569 plan->C[s]=C[s];plan->rC[s]=rC[s];
570 plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
573 plan->P[s]=nP[s];plan->coor[s]=prank[s];
576 /* plan->fftorder=fftorder;
577 plan->direction=direction;
578 plan->realcomplex=realcomplex;
581 plan->nthreads=nthreads;
601 /*here x,y,z and N,M,K is in rotated coordinate system!!
602 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
603 maxN,maxM,maxK is max size of local data
604 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
605 NG, MG, KG is size of global data*/
606 static void splitaxes(t_complex* lout,const t_complex* lin,
607 int maxN,int maxM,int maxK, int pN, int pM, int pK,
608 int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
611 int in_i,out_i,in_z,out_z,in_y,out_y;
614 for (z=startz; z<endz+1; z++) /*3. z l*/
629 for (i=0; i<P; i++) /*index cube along long axis*/
631 out_i = out_z + i*maxN*maxM*maxK;
633 for (y=s_y;y<e_y;y++) { /*2. y k*/
634 out_y = out_i + y*maxN;
636 for (x=0;x<N[i];x++) { /*1. x j*/
637 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
638 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
639 /*before split data contiguos - thus if different processor get different amount oN is different*/
646 /*make axis contiguous again (after AllToAll) and also do local transpose*/
647 /*transpose mayor and major dimension
649 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
650 N,M,K local dimensions
652 static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
653 int maxN,int maxM,int maxK,int pN, int pM, int pK,
654 int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
657 int out_i,in_i,out_x,in_x,out_z,in_z;
660 for (x=startx;x<endx+1;x++) /*1.j*/
682 for (i=0;i<P;i++) /*index cube along long axis*/
684 out_i = out_x + oK[i];
685 in_i = in_x + i*maxM*maxN*maxK;
686 for (z=0;z<K[i];z++) /*3.l*/
689 in_z = in_i + z*maxM*maxN;
690 for (y=s_y;y<e_y;y++) /*2.k*/
692 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
699 /*make axis contiguous again (after AllToAll) and also do local transpose
700 tranpose mayor and middle dimension
702 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
705 static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
706 int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
708 int out_i,in_i,out_z,in_z,out_x,in_x;
711 for (z=startz; z<endz+1; z++)
732 for (i=0; i<P; i++) /*index cube along long axis*/
734 out_i = out_z + oM[i];
735 in_i = in_z + i*maxM*maxN*maxK;
736 for (x=s_x;x<e_x;x++)
738 out_x = out_i + x*MG;
742 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
750 static void rotate_offsets(int x[]) {
760 /*compute the offset to compare or print transposed local data in original input coordinates
761 xs matrix dimension size, xl dimension length, xc decomposition offset
762 s: step in computation = number of transposes*/
763 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
764 /* int direction = plan->direction;
765 int fftorder = plan->fftorder;*/
769 int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
770 *C=plan->C, *rC=plan->rC;
772 NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
774 if (!(plan->flags&FFT5D_ORDER_YZ)) {
776 case 0: o=XYZ; break;
777 case 1: o=ZYX; break;
778 case 2: o=YZX; break;
783 case 0: o=XYZ; break;
784 case 1: o=YXZ; break;
785 case 2: o=ZXY; break;
791 case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
792 case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
793 case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
794 case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
795 case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
796 case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
798 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
800 /*xs, xl give dimension size and data length in local transposed coordinate system
801 for 0(/1/2): x(/y/z) in original coordinate system*/
804 case 1: xs[i]=1; xc[i]=0; xl[i]=C[s];break;
805 case 2: xs[i]=C[s]; xc[i]=oM[s]; xl[i]=pM[s];break;
806 case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
809 /*input order is different for test program to match FFTW order
810 (important for complex to real)*/
811 if (plan->flags&FFT5D_BACKWARD) {
816 if (plan->flags&FFT5D_ORDER_YZ) {
823 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
828 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
830 int *coor = plan->coor;
831 int xs[3],xl[3],xc[3],NG[3];
832 int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
833 compute_offsets(plan,xs,xl,xc,NG,s);
834 fprintf(debug,txt,coor[0],coor[1],s);
835 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
836 for(z=0;z<xl[2];z++) {
837 for(y=0;y<xl[1];y++) {
838 fprintf(debug,"%d %d: ",coor[0],coor[1]);
839 for (x=0;x<xl[0];x++) {
841 fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
850 void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
851 t_complex *lin = plan->lin;
852 t_complex *lout = plan->lout;
853 t_complex *lout2 = plan->lout2;
854 t_complex *lout3 = plan->lout3;
855 t_complex *fftout,*joinin;
857 gmx_fft_t **p1d=plan->p1d;
858 #ifdef FFT5D_MPI_TRANSPOSE
859 FFTW(plan) *mpip=plan->mpip;
862 MPI_Comm *cart=plan->cart;
865 double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
866 int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
867 *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
868 int s=0,tstart,tend,bParallelDim;
882 FFTW(execute)(plan->p3d);
886 times->fft+=MPI_Wtime()-time;
897 if (plan->flags&FFT5D_DEBUG && thread == 0)
899 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
902 for (s=0;s<2;s++) { /*loop over first two FFT steps (corner rotations)*/
905 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s]!=MPI_COMM_NULL && P[s]>1)
915 /* ---------- START FFT ------------ */
917 if (times!=0 && thread == 0)
923 if (bParallelDim || plan->nthreads == 1) {
937 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
938 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
940 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
943 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,fftout+tstart);
948 if (times != NULL && thread == 0)
950 time_fft+=MPI_Wtime()-time;
953 if (plan->flags&FFT5D_DEBUG && thread == 0)
955 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
957 /* ---------- END FFT ------------ */
959 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
962 if (times != NULL && thread == 0)
969 1. (most outer) axes (x) is split into P[s] parts of size N[s]
973 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
975 splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
977 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
979 if (times != NULL && thread == 0)
981 time_local+=MPI_Wtime()-time;
985 /* ---------- END SPLIT , START TRANSPOSE------------ */
995 wallcycle_start(times,ewcPME_FFTCOMM);
997 #ifdef FFT5D_MPI_TRANSPOSE
998 FFTW(execute)(mpip[s]);
1001 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
1002 MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
1004 MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
1006 gmx_incons("fft5d MPI call without MPI configuration");
1008 #endif /*FFT5D_MPI_TRANSPOSE*/
1012 time_mpi[s]=MPI_Wtime()-time;
1015 wallcycle_stop(times,ewcPME_FFTCOMM);
1019 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1021 /* ---------- END SPLIT + TRANSPOSE------------ */
1023 /* ---------- START JOIN ------------ */
1025 if (times != NULL && thread == 0)
1036 /*bring back in matrix form
1037 thus make new 1. axes contiguos
1038 also local transpose 1 and 2/3
1039 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1041 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
1044 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1045 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1046 joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
1052 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1053 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1054 joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
1059 if (times != NULL && thread == 0)
1061 time_local+=MPI_Wtime()-time;
1064 if (plan->flags&FFT5D_DEBUG && thread == 0)
1066 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1068 /* ---------- END JOIN ------------ */
1070 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1071 } /* for(s=0;s<2;s++) */
1073 if (times != NULL && thread == 0)
1079 if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
1081 /* ----------- FFT ----------- */
1082 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1083 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
1084 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
1086 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,lout+tstart);
1088 /* ------------ END FFT ---------*/
1091 if (times != NULL && thread == 0)
1093 time_fft+=MPI_Wtime()-time;
1095 times->fft+=time_fft;
1096 times->local+=time_local;
1097 times->mpi2+=time_mpi[1];
1098 times->mpi1+=time_mpi[0];
1102 if (plan->flags&FFT5D_DEBUG && thread == 0)
1104 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1108 void fft5d_destroy(fft5d_plan plan) {
1111 /* Note that we expect plan->lin and plan->lout to be freed elsewhere */
1112 if (plan->nthreads > 1)
1122 for (t=0;t<plan->nthreads;t++)
1124 gmx_many_fft_destroy(plan->p1d[s][t]);
1130 free(plan->iNin[s]);
1135 free(plan->oNin[s]);
1140 free(plan->iNout[s]);
1145 free(plan->oNout[s]);
1149 #ifdef GMX_FFT_FFTW3
1151 #ifdef FFT5D_MPI_TRANSPOS
1154 FFTW(destroy_plan)(plan->mpip[s]);
1158 FFTW(destroy_plan)(plan->p3d);
1160 #endif /* FFT5D_MPI_TRANSPOS */
1161 #endif /* GMX_FFT_FFTW3 */
1163 if (!(plan->flags&FFT5D_NOMALLOC))
1165 sfree_aligned(plan->lin);
1166 sfree_aligned(plan->lout);
1167 sfree_aligned(plan->lout2);
1168 sfree_aligned(plan->lout3);
1171 #ifdef FFT5D_THREADS
1172 #ifdef FFT5D_FFTW_THREADS
1173 /*FFTW(cleanup_threads)();*/
1180 /*Is this better than direct access of plan? enough data?
1181 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1182 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
1192 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1193 of processor dimensions*/
1194 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
1195 MPI_Comm cart[2]={0};
1202 int rdim1[] = {0,1}, rdim2[] = {1,0};
1204 MPI_Comm_size(comm,&size);
1205 MPI_Comm_rank(comm,&prank);
1207 if (P0==0) P0 = lfactor(size);
1210 if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1214 P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1216 /*Difference between x-y-z regarding 2d decomposition is whether they are
1217 distributed along axis 1, 2 or both*/
1219 MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1220 MPI_Cart_get(gcart,2,P,wrap,coor);
1221 MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1222 MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1224 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
1229 /*prints in original coordinate system of data (as the input to FFT)*/
1230 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1231 int xs[3],xl[3],xc[3],NG[3];
1233 int *coor = plan->coor;
1234 int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1235 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1240 compute_offsets(plan,xs,xl,xc,NG,2);
1241 if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1242 for (z=0;z<xl[2];z++) {
1243 for(y=0;y<xl[1];y++) {
1244 if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1245 for (x=0;x<xl[0];x++) {
1246 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1248 a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1249 if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1251 b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1253 b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1254 if (plan->flags&FFT5D_DEBUG) {
1255 printf("%f %f, ",a,b);
1257 if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1258 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1260 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1263 if (plan->flags&FFT5D_DEBUG) printf(",");
1265 if (plan->flags&FFT5D_DEBUG) printf("\n");