1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
9 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11 * Copyright (c) 2001-2012, The GROMACS development team,
12 * check out http://www.gromacs.org for more information.
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * If you want to redistribute modifications, please consider that
20 * scientific software is very special. Version control is crucial -
21 * bugs must be traceable. We will be happy to consider code for
22 * inclusion in the official distribution, but derived work must not
23 * be called official GROMACS. Details are found in the README & COPYING
24 * files - if they are missing, get the official version at www.gromacs.org.
26 * To help us fund GROMACS development, we humbly ask that you cite
27 * the papers on the package - you can find them in the top README file.
29 * For more info, check our website at http://www.gromacs.org
32 * Groningen Machine for Chemical Simulation
44 #define GMX_PARALLEL_ENV_INITIALIZED 1
47 #define GMX_PARALLEL_ENV_INITIALIZED 1
49 #define GMX_PARALLEL_ENV_INITIALIZED 0
61 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
66 /* requires fftw compiled with openmp */
67 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
76 #ifndef __FLT_EPSILON__
77 #define __FLT_EPSILON__ FLT_EPSILON
78 #define __DBL_EPSILON__ DBL_EPSILON
85 #include "gmx_fatal.h"
90 /* none of the fftw3 calls, except execute(), are thread-safe, so
91 we need to serialize them with this mutex. */
92 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
94 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex)
95 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex)
96 #else /* GMX_THREAD_MPI */
99 #endif /* GMX_THREAD_MPI */
100 #endif /* GMX_FFT_FFTW3 */
102 static double fft5d_fmax(double a, double b){
106 /* largest factor smaller than sqrt */
107 static int lfactor(int z) {
110 if (z%i==0) return i;
115 static int l2factor(int z) {
119 if (z%i==0) return i;
123 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
124 static int lpfactor(int z) {
127 return fft5d_fmax(lpfactor(f),lpfactor(z/f));
131 #ifdef HAVE_GETTIMEOFDAY
132 #include <sys/time.h>
136 return tv.tv_sec+tv.tv_usec*1e-6;
145 static int vmax(int* a, int s) {
149 if (a[i]>max) max=a[i];
155 /* NxMxK the size of the data
156 * comm communicator to use for fft5d
157 * P0 number of processor in 1st axes (can be null for automatic)
158 * lin is allocated by fft5d because size of array is only known after planning phase
159 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
161 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
164 int P[2],bMaster,prank[2],i,t;
166 int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
167 int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
168 int C[3],rC[3],nP[2];
170 t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
174 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
176 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
178 MPI_Comm_size(comm[0],&P[0]);
179 MPI_Comm_rank(comm[0],&prank[0]);
188 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
190 MPI_Comm_size(comm[1],&P[1]);
191 MPI_Comm_rank(comm[1],&prank[1]);
200 bMaster=(prank[0]==0&&prank[1]==0);
205 fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
206 P[0],P[1],prank[0],prank[1]);
211 fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
212 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
213 /* The check below is not correct, one prime factor 11 or 13 is ok.
214 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
215 printf("WARNING: FFT very slow with prime factors larger 7\n");
216 printf("Change FFT size or in case you cannot change it look at\n");
217 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
222 if (NG==0 || MG==0 || KG==0) {
223 if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
227 rNG=NG;rMG=MG;rKG=KG;
229 if (flags&FFT5D_REALCOMPLEX) {
230 if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
232 if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
238 /*for transpose we need to know the size for each processor not only our own size*/
240 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
241 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
242 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
243 oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
244 oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
245 oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
251 oN0[i]=i*ceil((double)NG/P[0]);
252 oM0[i]=i*ceil((double)MG/P[0]);
253 oK0[i]=i*ceil((double)KG/P[0]);
263 oN1[i]=i*ceil((double)NG/P[1]);
264 oM1[i]=i*ceil((double)MG/P[1]);
265 oK1[i]=i*ceil((double)KG/P[1]);
272 for (i=0;i<P[0]-1;i++)
274 N0[i]=oN0[i+1]-oN0[i];
275 M0[i]=oM0[i+1]-oM0[i];
276 K0[i]=oK0[i+1]-oK0[i];
278 N0[P[0]-1]=NG-oN0[P[0]-1];
279 M0[P[0]-1]=MG-oM0[P[0]-1];
280 K0[P[0]-1]=KG-oK0[P[0]-1];
281 for (i=0;i<P[1]-1;i++)
283 N1[i]=oN1[i+1]-oN1[i];
284 M1[i]=oM1[i+1]-oM1[i];
285 K1[i]=oK1[i+1]-oK1[i];
287 N1[P[1]-1]=NG-oN1[P[1]-1];
288 M1[P[1]-1]=MG-oM1[P[1]-1];
289 K1[P[1]-1]=KG-oK1[P[1]-1];
291 /* for step 1-3 the local N,M,K sizes of the transposed system
292 C: contiguous dimension, and nP: number of processor in subcommunicator
296 pM[0] = M0[prank[0]];
297 oM[0] = oM0[prank[0]];
298 pK[0] = K1[prank[1]];
299 oK[0] = oK1[prank[1]];
302 if (!(flags&FFT5D_ORDER_YZ)) {
303 N[0] = vmax(N1,P[1]);
305 K[0] = vmax(K1,P[1]);
306 pN[0] = N1[prank[1]];
312 N[1] = vmax(K0,P[0]);
313 pN[1] = K0[prank[0]];
318 M[1] = vmax(M0,P[0]);
319 pM[1] = M0[prank[0]];
320 oM[1] = oM0[prank[0]];
322 pK[1] = N1[prank[1]];
323 oK[1] = oN1[prank[1]];
329 M[2] = vmax(K0,P[0]);
330 pM[2] = K0[prank[0]];
331 oM[2] = oK0[prank[0]];
332 K[2] = vmax(N1,P[1]);
333 pK[2] = N1[prank[1]];
334 oK[2] = oN1[prank[1]];
335 free(N0); free(oN0); /*these are not used for this order*/
336 free(M1); free(oM1); /*the rest is freed in destroy*/
338 N[0] = vmax(N0,P[0]);
339 M[0] = vmax(M0,P[0]);
341 pN[0] = N0[prank[0]];
347 N[1] = vmax(M1,P[1]);
348 pN[1] = M1[prank[1]];
354 pM[1] = N0[prank[0]];
355 oM[1] = oN0[prank[0]];
356 K[1] = vmax(K1,P[1]);
357 pK[1] = K1[prank[1]];
358 oK[1] = oK1[prank[1]];
364 M[2] = vmax(N0,P[0]);
365 pM[2] = N0[prank[0]];
366 oM[2] = oN0[prank[0]];
367 K[2] = vmax(M1,P[1]);
368 pK[2] = M1[prank[1]];
369 oK[2] = oM1[prank[1]];
370 free(N1); free(oN1); /*these are not used for this order*/
371 free(K0); free(oK0); /*the rest is freed in destroy*/
373 N[2]=pN[2]=-1; /*not used*/
376 Difference between x-y-z regarding 2d decomposition is whether they are
377 distributed along axis 1, 2 or both
380 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
381 lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
382 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
383 if (!(flags&FFT5D_NOMALLOC)) {
384 snew_aligned(lin, lsize, 32);
385 snew_aligned(lout, lsize, 32);
386 snew_aligned(lout2, lsize, 32);
387 snew_aligned(lout3, lsize, 32);
395 plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
400 fprintf(debug, "Running on %d threads\n",nthreads);
403 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
404 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
405 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
406 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
407 * and that the 3d plan is faster than the 1d plan.
409 if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) { /*don't do 3d plan in parallel or if in_place requested */
410 int fftwflags=FFTW_DESTROY_INPUT;
412 int inNG=NG,outMG=MG,outKG=KG;
415 if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
416 if (flags&FFT5D_REALCOMPLEX) {
417 if (!(flags&FFT5D_BACKWARD)) { /*input pointer is not complex*/
419 } else { /*output pointer is not complex*/
420 if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
425 if (!(flags&FFT5D_BACKWARD)) {
430 dims[0].is = inNG*MG; /*N M K*/
433 if (!(flags&FFT5D_ORDER_YZ)) {
434 dims[0].os = MG; /*M K N*/
438 dims[0].os = 1; /*K N M*/
443 if (!(flags&FFT5D_ORDER_YZ)) {
452 dims[0].os = outMG*KG;
464 dims[0].os = outKG*NG;
470 #ifdef FFT5D_FFTW_THREADS
471 FFTW(plan_with_nthreads)(nthreads);
474 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
475 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
476 /*howmany*/ 0, /*howmany_dims*/0 ,
477 (real*)lin, (FFTW(complex) *)lout,
478 /*flags*/ fftwflags);
479 } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
480 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
481 /*howmany*/ 0, /*howmany_dims*/0 ,
482 (FFTW(complex) *)lin, (real*)lout,
483 /*flags*/ fftwflags);
485 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
486 /*howmany*/ 0, /*howmany_dims*/0 ,
487 (FFTW(complex) *)lin, (FFTW(complex) *)lout,
488 /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
491 #ifdef FFT5D_FFTW_THREADS
492 FFTW(plan_with_nthreads)(1);
497 if (!plan->p3d) { /* for decomposition and if 3d plan did not work */
498 #endif /* GMX_FFT_FFTW3 */
502 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
503 s,rC[s],M[s],pK[s],C[s],lsize);
505 plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
507 /* Make sure that the init routines are only called by one thread at a time and in order
508 (later is only important to not confuse valgrind)
510 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
511 for(t=0; t<nthreads; t++)
513 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
515 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
516 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
518 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
526 if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
527 plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
529 plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
531 #ifdef FFT5D_MPI_TRANSPOSE
534 if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
535 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
537 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
548 plan->NG=NG;plan->MG=MG;plan->KG=KG;
551 plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
552 plan->oM[s]=oM[s];plan->oK[s]=oK[s];
553 plan->C[s]=C[s];plan->rC[s]=rC[s];
554 plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
557 plan->P[s]=nP[s];plan->coor[s]=prank[s];
560 /* plan->fftorder=fftorder;
561 plan->direction=direction;
562 plan->realcomplex=realcomplex;
565 plan->nthreads=nthreads;
585 /*here x,y,z and N,M,K is in rotated coordinate system!!
586 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
587 maxN,maxM,maxK is max size of local data
588 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
589 NG, MG, KG is size of global data*/
590 static void splitaxes(t_complex* lout,const t_complex* lin,
591 int maxN,int maxM,int maxK, int pN, int pM, int pK,
592 int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
595 int in_i,out_i,in_z,out_z,in_y,out_y;
598 for (z=startz; z<endz+1; z++) /*3. z l*/
613 for (i=0; i<P; i++) /*index cube along long axis*/
615 out_i = out_z + i*maxN*maxM*maxK;
617 for (y=s_y;y<e_y;y++) { /*2. y k*/
618 out_y = out_i + y*maxN;
620 for (x=0;x<N[i];x++) { /*1. x j*/
621 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
622 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
623 /*before split data contiguos - thus if different processor get different amount oN is different*/
630 /*make axis contiguous again (after AllToAll) and also do local transpose*/
631 /*transpose mayor and major dimension
633 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
634 N,M,K local dimensions
636 static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
637 int maxN,int maxM,int maxK,int pN, int pM, int pK,
638 int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
641 int out_i,in_i,out_x,in_x,out_z,in_z;
644 for (x=startx;x<endx+1;x++) /*1.j*/
666 for (i=0;i<P;i++) /*index cube along long axis*/
668 out_i = out_x + oK[i];
669 in_i = in_x + i*maxM*maxN*maxK;
670 for (z=0;z<K[i];z++) /*3.l*/
673 in_z = in_i + z*maxM*maxN;
674 for (y=s_y;y<e_y;y++) /*2.k*/
676 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
683 /*make axis contiguous again (after AllToAll) and also do local transpose
684 tranpose mayor and middle dimension
686 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
689 static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
690 int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
692 int out_i,in_i,out_z,in_z,out_x,in_x;
695 for (z=startz; z<endz+1; z++)
716 for (i=0; i<P; i++) /*index cube along long axis*/
718 out_i = out_z + oM[i];
719 in_i = in_z + i*maxM*maxN*maxK;
720 for (x=s_x;x<e_x;x++)
722 out_x = out_i + x*MG;
726 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
734 static void rotate_offsets(int x[]) {
744 /*compute the offset to compare or print transposed local data in original input coordinates
745 xs matrix dimension size, xl dimension length, xc decomposition offset
746 s: step in computation = number of transposes*/
747 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
748 /* int direction = plan->direction;
749 int fftorder = plan->fftorder;*/
753 int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
754 *C=plan->C, *rC=plan->rC;
756 NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
758 if (!(plan->flags&FFT5D_ORDER_YZ)) {
760 case 0: o=XYZ; break;
761 case 1: o=ZYX; break;
762 case 2: o=YZX; break;
767 case 0: o=XYZ; break;
768 case 1: o=YXZ; break;
769 case 2: o=ZXY; break;
775 case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
776 case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
777 case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
778 case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
779 case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
780 case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
782 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
784 /*xs, xl give dimension size and data length in local transposed coordinate system
785 for 0(/1/2): x(/y/z) in original coordinate system*/
788 case 1: xs[i]=1; xc[i]=0; xl[i]=C[s];break;
789 case 2: xs[i]=C[s]; xc[i]=oM[s]; xl[i]=pM[s];break;
790 case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
793 /*input order is different for test program to match FFTW order
794 (important for complex to real)*/
795 if (plan->flags&FFT5D_BACKWARD) {
800 if (plan->flags&FFT5D_ORDER_YZ) {
807 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
812 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
814 int *coor = plan->coor;
815 int xs[3],xl[3],xc[3],NG[3];
816 int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
817 compute_offsets(plan,xs,xl,xc,NG,s);
818 fprintf(debug,txt,coor[0],coor[1],s);
819 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
820 for(z=0;z<xl[2];z++) {
821 for(y=0;y<xl[1];y++) {
822 fprintf(debug,"%d %d: ",coor[0],coor[1]);
823 for (x=0;x<xl[0];x++) {
825 fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
834 void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
835 t_complex *lin = plan->lin;
836 t_complex *lout = plan->lout;
837 t_complex *lout2 = plan->lout2;
838 t_complex *lout3 = plan->lout3;
839 t_complex *fftout,*joinin;
841 gmx_fft_t **p1d=plan->p1d;
842 #ifdef FFT5D_MPI_TRANSPOSE
843 FFTW(plan) *mpip=plan->mpip;
846 MPI_Comm *cart=plan->cart;
849 double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
850 int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
851 *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
852 int s=0,tstart,tend,bParallelDim;
866 FFTW(execute)(plan->p3d);
870 times->fft+=MPI_Wtime()-time;
881 if (plan->flags&FFT5D_DEBUG && thread == 0)
883 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
886 for (s=0;s<2;s++) { /*loop over first two FFT steps (corner rotations)*/
889 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s]!=MPI_COMM_NULL && P[s]>1)
899 /* ---------- START FFT ------------ */
901 if (times!=0 && thread == 0)
921 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
922 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
924 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
927 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,fftout+tstart);
932 if (times != NULL && thread == 0)
934 time_fft+=MPI_Wtime()-time;
937 if (plan->flags&FFT5D_DEBUG && thread == 0)
939 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
941 /* ---------- END FFT ------------ */
943 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
946 if (times != NULL && thread == 0)
953 1. (most outer) axes (x) is split into P[s] parts of size N[s]
957 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
959 splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
961 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
963 if (times != NULL && thread == 0)
965 time_local+=MPI_Wtime()-time;
969 /* ---------- END SPLIT , START TRANSPOSE------------ */
979 wallcycle_start(times,ewcPME_FFTCOMM);
981 #ifdef FFT5D_MPI_TRANSPOSE
982 FFTW(execute)(mpip[s]);
985 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
986 MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
988 MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
990 gmx_incons("fft5d MPI call without MPI configuration");
992 #endif /*FFT5D_MPI_TRANSPOSE*/
996 time_mpi[s]=MPI_Wtime()-time;
999 wallcycle_stop(times,ewcPME_FFTCOMM);
1003 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1005 /* ---------- END SPLIT + TRANSPOSE------------ */
1007 /* ---------- START JOIN ------------ */
1009 if (times != NULL && thread == 0)
1020 /*bring back in matrix form
1021 thus make new 1. axes contiguos
1022 also local transpose 1 and 2/3
1023 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1025 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
1028 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1029 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1030 joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
1036 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1037 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1038 joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
1043 if (times != NULL && thread == 0)
1045 time_local+=MPI_Wtime()-time;
1048 if (plan->flags&FFT5D_DEBUG && thread == 0)
1050 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1052 /* ---------- END JOIN ------------ */
1054 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1055 } /* for(s=0;s<2;s++) */
1057 if (times != NULL && thread == 0)
1063 if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
1065 /* ----------- FFT ----------- */
1066 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1067 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
1068 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
1070 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,lout+tstart);
1072 /* ------------ END FFT ---------*/
1075 if (times != NULL && thread == 0)
1077 time_fft+=MPI_Wtime()-time;
1079 times->fft+=time_fft;
1080 times->local+=time_local;
1081 times->mpi2+=time_mpi[1];
1082 times->mpi1+=time_mpi[0];
1086 if (plan->flags&FFT5D_DEBUG && thread == 0)
1088 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1092 void fft5d_destroy(fft5d_plan plan) {
1098 for (t=0;t<plan->nthreads;t++)
1100 gmx_many_fft_destroy(plan->p1d[s][t]);
1106 free(plan->iNin[s]);
1111 free(plan->oNin[s]);
1116 free(plan->iNout[s]);
1121 free(plan->oNout[s]);
1125 #ifdef GMX_FFT_FFTW3
1127 #ifdef FFT5D_MPI_TRANSPOS
1130 FFTW(destroy_plan)(plan->mpip[s]);
1134 FFTW(destroy_plan)(plan->p3d);
1136 #endif /* FFT5D_MPI_TRANSPOS */
1137 #endif /* GMX_FFT_FFTW3 */
1139 if (!(plan->flags&FFT5D_NOMALLOC))
1141 sfree_aligned(plan->lin);
1142 sfree_aligned(plan->lout);
1143 sfree_aligned(plan->lout2);
1144 sfree_aligned(plan->lout3);
1147 #ifdef FFT5D_THREADS
1148 #ifdef FFT5D_FFTW_THREADS
1149 /*FFTW(cleanup_threads)();*/
1156 /*Is this better than direct access of plan? enough data?
1157 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1158 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
1168 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1169 of processor dimensions*/
1170 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
1171 MPI_Comm cart[2]={0};
1178 int rdim1[] = {0,1}, rdim2[] = {1,0};
1180 MPI_Comm_size(comm,&size);
1181 MPI_Comm_rank(comm,&prank);
1183 if (P0==0) P0 = lfactor(size);
1186 if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1190 P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1192 /*Difference between x-y-z regarding 2d decomposition is whether they are
1193 distributed along axis 1, 2 or both*/
1195 MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1196 MPI_Cart_get(gcart,2,P,wrap,coor);
1197 MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1198 MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1200 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
1205 /*prints in original coordinate system of data (as the input to FFT)*/
1206 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1207 int xs[3],xl[3],xc[3],NG[3];
1209 int *coor = plan->coor;
1210 int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1211 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1216 compute_offsets(plan,xs,xl,xc,NG,2);
1217 if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1218 for (z=0;z<xl[2];z++) {
1219 for(y=0;y<xl[1];y++) {
1220 if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1221 for (x=0;x<xl[0];x++) {
1222 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1224 a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1225 if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1227 b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1229 b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1230 if (plan->flags&FFT5D_DEBUG) {
1231 printf("%f %f, ",a,b);
1233 if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1234 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1236 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1239 if (plan->flags&FFT5D_DEBUG) printf(",");
1241 if (plan->flags&FFT5D_DEBUG) printf("\n");