1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
10 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12 * Copyright (c) 2001-2008, The GROMACS development team,
13 * check out http://www.gromacs.org for more information.
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version 2
18 * of the License, or (at your option) any later version.
20 * If you want to redistribute modifications, please consider that
21 * scientific software is very special. Version control is crucial -
22 * bugs must be traceable. We will be happy to consider code for
23 * inclusion in the official distribution, but derived work must not
24 * be called official GROMACS. Details are found in the README & COPYING
25 * files - if they are missing, get the official version at www.gromacs.org.
27 * To help us fund GROMACS development, we humbly ask that you cite
28 * the papers on the package - you can find them in the top README file.
30 * For more info, check our website at http://www.gromacs.org
33 * Groningen Machine for Chemical Simulation
/* Configuration macros for fft5d.  GMX_PARALLEL_ENV_INITIALIZED is 1 when
   MPI is unconditionally available, otherwise it queries at runtime. */
45 #define GMX_PARALLEL_ENV_INITIALIZED 1
48 #define GMX_PARALLEL_ENV_INITIALIZED gmx_parallel_env_initialized()
60 /* requires fftw compiled with openmp */
61 #define FFT5D_FFTW_THREADS
/* NOTE(review): __FLT_EPSILON__/__DBL_EPSILON__ are reserved identifiers
   (double leading underscore); defined here only as a fallback for
   compilers that lack the GCC built-ins -- consider renaming upstream. */
69 #ifndef __FLT_EPSILON__
70 #define __FLT_EPSILON__ FLT_EPSILON
71 #define __DBL_EPSILON__ DBL_EPSILON
78 #include "gmx_fatal.h"
/* Serialize all fftw3 planner/cleanup calls under GMX_THREADS: wrap every
   fftw3 API call except execute() in FFTW_LOCK ... FFTW_UNLOCK. */
83 /* none of the fftw3 calls, except execute(), are thread-safe, so
84 we need to serialize them with this mutex. */
85 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
87 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex);
88 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex);
89 #else /* GMX_THREADS */
92 #endif /* GMX_THREADS */
93 #endif /* GMX_FFT_FFTW3 */
/* Return the larger of a and b (local max helper; body not visible in
   this fragment). */
95 static double fft5d_fmax(double a, double b){
99 /* largest factor smaller than sqrt */
/* The first divisor of z found is returned; the enclosing loop header is
   not visible in this fragment (presumably scans i downward from
   sqrt(z) -- TODO confirm against full source). */
100 static int lfactor(int z) {
103 if (z%i==0) return i;
/* Largest factor of z; returns the first divisor found.  Loop header not
   visible in this fragment -- presumably scans i downward from z/2, so the
   result is the largest proper factor (TODO confirm). */
108 static int l2factor(int z) {
112 if (z%i==0) return i;
116 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
117 static int lpfactor(int z) {
/* recurse on a factor f and its cofactor z/f; fft5d_fmax on the two
   (small) integer results picks the larger prime factor */
120 return fft5d_fmax(lpfactor(f),lpfactor(z/f));
/* Wall-clock time via gettimeofday; the enclosing function signature is
   not visible in this fragment (presumably a static double-returning
   timer -- TODO confirm against full source). */
124 #ifdef HAVE_GETTIMEOFDAY
125 #include <sys/time.h>
/* seconds as double: whole seconds plus microseconds */
129 return tv.tv_sec+tv.tv_usec*1e-6;
/* Return the maximum of the first s entries of integer array a. */
138 static int vmax(int* a, int s) {
142 if (a[i]>max) max=a[i];
148 copied here from fftgrid, because:
149 1. function there not publicly available
150 2. not sure whether we keep fftgrid
151 3. less dependencies for fft5d
153 Only used for non-fftw case
/* Allocate 'size' zero-initialized bytes aligned to a 32-byte boundary.
   The returned pointer is offset into the calloc'ed region, so callers
   must never pass it to free() (see comment below). */
156 gmx_calloc_aligned(size_t size)
160 /*We initialize by zero for Valgrind
161 For non-divisible case we communicate more than the data.
162 If we don't initialize the data we communicate uninitialized data*/
163 p0 = calloc(size+32,1);
/* fixed: grammar ("allocate") and format/argument mismatch -- size_t must
   not be printed with %u (UB on LP64); cast to unsigned long for %lu */
167 gmx_fatal(FARGS,"Failed to allocate %lu bytes of aligned memory.",(unsigned long)(size+32));
/* round up past the calloc'ed start, then mask down to a 32-byte boundary */
170 p = (void *) (((size_t) p0 + 32) & (~((size_t) 31)));
172 /* Yeah, yeah, we cannot free this pointer, but who cares... */
177 /* NxMxK the size of the data
178 * comm communicator to use for fft5d
179 * P0 number of processor in 1st axes (can be null for automatic)
180 * lin is allocated by fft5d because size of array is only known after planning phase */
/* Plans a 2D-decomposed parallel 3D FFT.  NOTE(review): this listing is a
   fragment with interior lines missing; comments below only describe what
   the visible code establishes. */
181 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout)
184 int P[2],bMaster,prank[2],i;
186 int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
187 int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
188 int C[3],rC[3],nP[2];
190 t_complex *lin=0,*lout=0;
194 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
/* query rank/size of each decomposition sub-communicator; the serial
   fallback branches are on lines missing from this fragment */
196 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != 0)
198 MPI_Comm_size(comm[0],&P[0]);
199 MPI_Comm_rank(comm[0],&prank[0]);
208 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != 0)
210 MPI_Comm_size(comm[1],&P[1]);
211 MPI_Comm_rank(comm[1],&prank[1]);
220 bMaster=(prank[0]==0&&prank[1]==0);
225 fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
226 P[0],P[1],prank[0],prank[1]);
231 fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
232 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
233 /* The check below is not correct, one prime factor 11 or 13 is ok.
234 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
235 printf("WARNING: FFT very slow with prime factors larger 7\n");
236 printf("Change FFT size or in case you cannot change it look at\n");
237 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
/* reject empty grids early */
242 if (NG==0 || MG==0 || KG==0) {
243 if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
/* remember real-space sizes before the r2c half-spectrum adjustment */
247 rNG=NG;rMG=MG;rKG=KG;
249 if (flags&FFT5D_REALCOMPLEX) {
250 if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
252 if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
258 /*for transpose we need to know the size for each processor not only our own size*/
260 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
261 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
262 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
263 oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
264 oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
265 oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
/* per-rank start offsets along each global axis for both grid dimensions */
271 oN0[i]=i*ceil((double)NG/P[0]);
272 oM0[i]=i*ceil((double)MG/P[0]);
273 oK0[i]=i*ceil((double)KG/P[0]);
283 oN1[i]=i*ceil((double)NG/P[1]);
284 oM1[i]=i*ceil((double)MG/P[1]);
285 oK1[i]=i*ceil((double)KG/P[1]);
/* per-rank chunk lengths = differences of consecutive offsets; the last
   rank takes the (possibly smaller) remainder */
292 for (i=0;i<P[0]-1;i++)
294 N0[i]=oN0[i+1]-oN0[i];
295 M0[i]=oM0[i+1]-oM0[i];
296 K0[i]=oK0[i+1]-oK0[i];
298 N0[P[0]-1]=NG-oN0[P[0]-1];
299 M0[P[0]-1]=MG-oM0[P[0]-1];
300 K0[P[0]-1]=KG-oK0[P[0]-1];
301 for (i=0;i<P[1]-1;i++)
303 N1[i]=oN1[i+1]-oN1[i];
304 M1[i]=oM1[i+1]-oM1[i];
305 K1[i]=oK1[i+1]-oK1[i];
307 N1[P[1]-1]=NG-oN1[P[1]-1];
308 M1[P[1]-1]=MG-oM1[P[1]-1];
309 K1[P[1]-1]=KG-oK1[P[1]-1];
311 /* for step 1-3 the local N,M,K sizes of the transposed system
312 C: contiguous dimension, and nP: number of processor in subcommunicator
316 pM[0] = M0[prank[0]];
317 oM[0] = oM0[prank[0]];
318 pK[0] = K1[prank[1]];
319 oK[0] = oK1[prank[1]];
/* per-step (s=0..2) max and local sizes for the two transpose orders */
322 if (!(flags&FFT5D_ORDER_YZ)) {
323 N[0] = vmax(N1,P[1]);
325 K[0] = vmax(K1,P[1]);
326 pN[0] = N1[prank[1]];
332 N[1] = vmax(K0,P[0]);
333 pN[1] = K0[prank[0]];
338 M[1] = vmax(M0,P[0]);
339 pM[1] = M0[prank[0]];
340 oM[1] = oM0[prank[0]];
342 pK[1] = N1[prank[1]];
343 oK[1] = oN1[prank[1]];
349 M[2] = vmax(K0,P[0]);
350 pM[2] = K0[prank[0]];
351 oM[2] = oK0[prank[0]];
352 K[2] = vmax(N1,P[1]);
353 pK[2] = N1[prank[1]];
354 oK[2] = oN1[prank[1]];
355 free(N0); free(oN0); /*these are not used for this order*/
356 free(M1); free(oM1); /*the rest is freed in destroy*/
358 N[0] = vmax(N0,P[0]);
359 M[0] = vmax(M0,P[0]);
361 pN[0] = N0[prank[0]];
367 N[1] = vmax(M1,P[1]);
368 pN[1] = M1[prank[1]];
374 pM[1] = N0[prank[0]];
375 oM[1] = oN0[prank[0]];
376 K[1] = vmax(K1,P[1]);
377 pK[1] = K1[prank[1]];
378 oK[1] = oK1[prank[1]];
384 M[2] = vmax(N0,P[0]);
385 pM[2] = N0[prank[0]];
386 oM[2] = oN0[prank[0]];
387 K[2] = vmax(M1,P[1]);
388 pK[2] = M1[prank[1]];
389 oK[2] = oM1[prank[1]];
390 free(N1); free(oN1); /*these are not used for this order*/
391 free(K0); free(oK0); /*the rest is freed in destroy*/
395 Difference between x-y-z regarding 2d decomposition is whether they are
396 distributed along axis 1, 2 or both
/* local buffer size: largest of the three intermediate layouts, so lin
   and lout can hold the data at every step */
399 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
400 lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
401 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
402 if (!(flags&FFT5D_NOMALLOC)) {
403 lin = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
404 lout = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
410 plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
414 #ifdef FFT5D_FFTW_THREADS
415 FFTW(init_threads)();
421 nthreads = omp_get_num_threads();
424 if (prank[0] == 0 && prank[1] == 0)
426 printf("Running fftw on %d threads\n",nthreads);
428 FFTW(plan_with_nthreads)(nthreads);
432 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but instead only 1D plans */
433 if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1))) { /*don't do 3d plan in parallel or if in_place requested */
434 int fftwflags=FFTW_DESTROY_INPUT;
436 int inNG=NG,outMG=MG,outKG=KG;
439 if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
440 if (flags&FFT5D_REALCOMPLEX) {
441 if (!(flags&FFT5D_BACKWARD)) { /*input pointer is not complex*/
443 } else { /*output pointer is not complex*/
444 if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
/* set up fftw guru iodims (is/os = input/output strides) for the chosen
   axis order */
449 if (!(flags&FFT5D_BACKWARD)) {
454 dims[0].is = inNG*MG; /*N M K*/
457 if (!(flags&FFT5D_ORDER_YZ)) {
458 dims[0].os = MG; /*M K N*/
462 dims[0].os = 1; /*K N M*/
467 if (!(flags&FFT5D_ORDER_YZ)) {
476 dims[0].os = outMG*KG;
488 dims[0].os = outKG*NG;
493 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
494 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
495 /*howmany*/ 0, /*howmany_dims*/0 ,
496 (real*)lin, (FFTW(complex) *)lout,
497 /*flags*/ fftwflags);
498 } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
499 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
500 /*howmany*/ 0, /*howmany_dims*/0 ,
501 (FFTW(complex) *)lin, (real*)lout,
502 /*flags*/ fftwflags);
504 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
505 /*howmany*/ 0, /*howmany_dims*/0 ,
506 (FFTW(complex) *)lin, (FFTW(complex) *)lout,
507 /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
511 if (!plan->p3d) { /* for decomposition and if 3d plan did not work */
512 #endif /* GMX_FFT_FFTW3 */
516 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
517 s,rC[s],M[s],pK[s],C[s],lsize);
/* 1D many-FFT per step: real transform only on the real<->complex step */
519 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
520 gmx_fft_init_many_1d_real( &plan->p1d[s], rC[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
522 gmx_fft_init_many_1d ( &plan->p1d[s], C[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
528 if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
529 plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
531 plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
533 #ifdef FFT5D_MPI_TRANSPOSE
536 if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
537 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
539 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
/* record all per-step geometry in the plan for execute/destroy */
548 plan->NG=NG;plan->MG=MG;plan->KG=KG;
551 plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
552 plan->oM[s]=oM[s];plan->oK[s]=oK[s];
553 plan->C[s]=C[s];plan->rC[s]=rC[s];
554 plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
557 plan->P[s]=nP[s];plan->coor[s]=prank[s];
560 /* plan->fftorder=fftorder;
561 plan->direction=direction;
562 plan->realcomplex=realcomplex;
582 /*here x,y,z and N,M,K is in rotated coordinate system!!
583 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
584 maxN,maxM,maxK is max size of local data
585 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
586 NG, MG, KG is size of global data*/
/* Repacks contiguous local data (lin) into P equal-sized per-destination
   chunks (lout) in preparation for MPI_Alltoall. */
587 static void splitaxes(t_complex* lout,const t_complex* lin,
588 int maxN,int maxM,int maxK, int pN, int pM, int pK,
589 int P,int NG,int *N, int* oN)
592 int in_i,out_i,in_z,out_z,in_y,out_y;
597 /* In the thread parallel case we want to loop over z and i
598 * in a single for loop to allow for better load balancing.
600 #pragma omp parallel for private(z,in_z,out_z,i,in_i,out_i,y,in_y,out_y,x) schedule(static)
601 for (zi=0; zi<pK*P; zi++)
606 for (z=0; z<pK; z++) /*3. z l*/
612 #ifndef FFT5D_THREADS
613 for (i=0; i<P; i++) /*index cube along long axis*/
/* chunk i starts at a fixed maxN*maxM*maxK stride on the packed side and
   at source offset oN[i] on the contiguous side */
616 in_i = in_z + i*maxN*maxM*maxK;
617 out_i = out_z + oN[i];
618 for (y=0;y<pM;y++) { /*2. y k*/
619 in_y = in_i + y*maxN;
620 out_y = out_i + y*NG;
621 for (x=0;x<N[i];x++) { /*1. x j*/
622 lout[in_y+x] = lin[out_y+x];
623 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
624 /*before split data contiguous - thus if different processor get different amount oN is different*/
631 /*make axis contiguous again (after AllToAll) and also do local transpose*/
632 /*transpose mayor and major dimension
634 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
635 N,M,K local dimensions
/* Inverse of splitaxes for the 1<->3 transpose: gathers the P received
   chunks (lout) back into contiguous local data (lin), swapping the
   fastest and slowest dimensions in the process. */
637 static void joinAxesTrans13(t_complex* lin,const t_complex* lout,
638 int maxN,int maxM,int maxK,int pN, int pM, int pK,
639 int P,int KG, int* K, int* oK)
642 int in_i,out_i,in_x,out_x,in_z,out_z;
647 /* In the thread parallel case we want to loop over x and i
648 * in a single for loop to allow for better load balancing.
650 #pragma omp parallel for private(x,in_x,out_x,i,in_i,out_i,z,in_z,out_z,y) schedule(static)
651 for (xi=0; xi<pN*P; xi++)
656 for (x=0;x<pN;x++) /*1.j*/
662 #ifndef FFT5D_THREADS
663 for (i=0;i<P;i++) /*index cube along long axis*/
/* received chunk i occupies a fixed maxM*maxN*maxK stride */
667 out_i = out_x + i*maxM*maxN*maxK;
668 for (z=0;z<K[i];z++) /*3.l*/
671 out_z = out_i + z*maxM*maxN;
672 for (y=0;y<pM;y++) { /*2.k*/
673 lin[in_z+y*KG] = lout[out_z+y*maxN];
680 /*make axis contiguous again (after AllToAll) and also do local transpose
681 transpose mayor and middle dimension
683 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
/* Inverse of splitaxes for the 1<->2 transpose: gathers the P received
   chunks (lout) back into contiguous local data (lin), swapping the
   fastest and middle dimensions. */
686 static void joinAxesTrans12(t_complex* lin,const t_complex* lout,int maxN,int maxM,int maxK,int pN, int pM, int pK,
687 int P,int MG, int* M, int* oM) {
689 int in_i,out_i,in_z,out_z,in_x,out_x;
694 /* In the thread parallel case we want to loop over z and i
695 * in a single for loop to allow for better load balancing.
697 #pragma omp parallel for private(i,in_i,out_i,z,in_z,out_z,in_x,out_x,x,y) schedule(static)
698 for (zi=0; zi<pK*P; zi++)
709 #ifndef FFT5D_THREADS
710 for (i=0; i<P; i++) /*index cube along long axis*/
714 out_i = out_z + i*maxM*maxN*maxK;
718 for (y=0;y<M[i];y++) {
719 lin[in_x+y] = lout[out_x+y*maxN];
/* In-place rotation of the elements of x (body not visible in this
   fragment; presumably cycles the three coordinate entries -- TODO
   confirm against full source). */
727 static void rotate(int x[]) {
737 /*compute the offset to compare or print transposed local data in original input coordinates
738 xs matrix dimension size, xl dimension length, xc decomposition offset
739 s: step in computation = number of transposes*/
740 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
741 /* int direction = plan->direction;
742 int fftorder = plan->fftorder;*/
744 int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
747 *C=plan->C, *rC=plan->rC;
749 NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
/* axis order after s transposes depends on which transpose order is used */
751 if (!(plan->flags&FFT5D_ORDER_YZ)) {
753 case 0: o=XYZ; break;
754 case 1: o=ZYX; break;
755 case 2: o=YZX; break;
760 case 0: o=XYZ; break;
761 case 1: o=YXZ; break;
762 case 2: o=ZXY; break;
/* pos maps each original axis (x,y,z) to its rank (1=fastest .. 3=slowest)
   in the current transposed layout */
768 case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
769 case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
770 case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
771 case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
772 case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
773 case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
775 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
777 /*xs, xl give dimension size and data length in local transposed coordinate system
778 for 0(/1/2): x(/y/z) in original coordinate system*/
781 case 1: xs[i]=1; xc[i]=0; xl[i]=C[s];break;
782 case 2: xs[i]=C[s]; xc[i]=oM[s]; xl[i]=pM[s];break;
783 case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
786 /*input order is different for test program to match FFTW order
787 (important for complex to real)*/
788 if (plan->flags&FFT5D_BACKWARD) {
793 if (plan->flags&FFT5D_ORDER_YZ) {
/* real<->complex step: only the real part count rC is valid there */
800 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
/* Debug helper: prints the local data block at step s, addressed via
   compute_offsets.  NOTE(review): txt is used directly as a printf format
   (expects two %d for the processor coordinates plus one %d for s) --
   callers pass literals only; do not pass external strings. */
805 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
807 int *coor = plan->coor;
808 int xs[3],xl[3],xc[3],NG[3];
/* ll: values printed per element -- 1 for the real step, 2 for complex */
809 int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
810 compute_offsets(plan,xs,xl,xc,NG,s);
811 fprintf(debug,txt,coor[0],coor[1],s);
812 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
813 for(z=0;z<xl[2];z++) {
814 for(y=0;y<xl[1];y++) {
815 fprintf(debug,"%d %d: ",coor[0],coor[1]);
816 for (x=0;x<xl[0];x++) {
818 fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
/* Executes the planned FFT: either the single fftw 3D plan, or the
   5-step sequence 1D-FFT / split / AllToAll / join-transpose repeated per
   decomposition dimension, accumulating timings into 'times'. */
827 void fft5d_execute(fft5d_plan plan,fft5d_time times) {
828 t_complex *lin = plan->lin;
829 t_complex *lout = plan->lout;
831 gmx_fft_t *p1d=plan->p1d;
832 #ifdef FFT5D_MPI_TRANSPOSE
833 FFTW(plan) *mpip=plan->mpip;
836 MPI_Comm *cart=plan->cart;
839 double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
840 int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
841 *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
/* fast path: a single fftw 3D plan exists (serial, out-of-place case) */
849 FFTW(execute)(plan->p3d);
851 times->fft+=MPI_Wtime()-time;
857 if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: copy in lin\n", s, plan);
/* 1D FFT along the contiguous axis; the real transform applies only at
   the real<->complex step */
862 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0) {
863 gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
865 gmx_fft_many_1d( p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin,lout);
868 time_fft+=MPI_Wtime()-time;
870 if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
/* communication needed only when this dimension is actually decomposed */
873 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] !=0 && P[s]>1 )
877 /*prepare for AllToAll
878 1. (most outer) axes (x) is split into P[s] parts of size N[s]
880 splitaxes(lin,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s]);
884 time_local+=MPI_Wtime()-time;
890 #ifdef FFT5D_MPI_TRANSPOSE
891 FFTW(execute)(mpip[s]);
893 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
894 MPI_Alltoall(lin,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
896 MPI_Alltoall(lin,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
897 #endif /*FFT5D_MPI_TRANSPOSE*/
899 time_mpi[s]=MPI_Wtime()-time;
906 /*bring back in matrix form
907 thus make new 1. axes contiguous
908 also local transpose 1 and 2/3 */
909 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
910 joinAxesTrans13(lin,lout,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
912 joinAxesTrans12(lin,lout,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
914 time_local+=MPI_Wtime()-time;
916 if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
918 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
/* final (third) 1D FFT; in-place result aliases lin when requested */
923 if (plan->flags&FFT5D_INPLACE) lout=lin;
924 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
925 gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
927 gmx_fft_many_1d( p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin,lout);
931 time_fft+=MPI_Wtime()-time;
932 if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
933 /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
936 times->fft+=time_fft;
937 times->local+=time_local;
938 times->mpi2+=time_mpi[1];
939 times->mpi1+=time_mpi[0];
/* Releases all resources held by the plan: 1D FFT plans, the per-step
   index arrays, and (when enabled) the fftw MPI transpose plans and the
   fftw thread state.  lin/lout are intentionally NOT freed (see below). */
943 void fft5d_destroy(fft5d_plan plan) {
946 gmx_many_fft_destroy(plan->p1d[s]);
955 if (plan->iNout[s]) {
956 free(plan->iNout[s]);
959 if (plan->oNout[s]) {
960 free(plan->oNout[s]);
/* fixed: the guard was misspelled FFT5D_MPI_TRANSPOS, so it never matched
   the FFT5D_MPI_TRANSPOSE macro used at plan creation and the fftw MPI
   transpose plans were leaked */
966 #ifdef FFT5D_MPI_TRANSPOSE
968 FFTW(destroy_plan)(plan->mpip[s]);
969 #endif /* FFT5D_MPI_TRANSPOSE */
970 #endif /* GMX_FFT_FFTW3 */
972 /*We can't free lin/lout here - is allocated by gmx_calloc_aligned which can't be freed*/
976 #ifdef FFT5D_FFTW_THREADS
977 FFTW(cleanup_threads)();
984 /*Is this better than direct access of plan? enough data?
985 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
/* Accessor exposing the plan's local sizes and processor coordinates to
   callers (body not visible in this fragment). */
986 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
996 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
997 of processor dimensions*/
998 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout) {
999 MPI_Comm cart[2]={0};
/* remain_dims masks for MPI_Cart_sub: keep dim 1 resp. dim 0 */
1006 int rdim1[] = {0,1}, rdim2[] = {1,0};
1008 MPI_Comm_size(comm,&size);
1009 MPI_Comm_rank(comm,&prank);
/* P0==0 requests automatic choice: largest factor of size <= sqrt(size) */
1011 if (P0==0) P0 = lfactor(size);
1013 if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1017 P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1019 /*Difference between x-y-z regarding 2d decomposition is whether they are
1020 distributed along axis 1, 2 or both*/
1022 MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1023 MPI_Cart_get(gcart,2,P,wrap,coor);
/* split the 2D grid into row and column sub-communicators */
1024 MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1025 MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1027 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout);
1032 /*prints in original coordinate system of data (as the input to FFT)*/
/* Test helper: compares the FFT result (lin) element-wise against a
   reference array (in), optionally normalizing by the total transform
   size.  bothLocal selects whether 'in' is indexed locally or globally.
   Function continues past the end of this fragment. */
1033 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1034 int xs[3],xl[3],xc[3],NG[3];
1036 int *coor = plan->coor;
1037 int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1038 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1043 compute_offsets(plan,xs,xl,xc,NG,2);
1044 if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1045 for (z=0;z<xl[2];z++) {
1046 for(y=0;y<xl[1];y++) {
1047 if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1048 for (x=0;x<xl[0];x++) {
1049 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1051 a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
/* undo the missing 1/(N*M*K) scaling of the forward+backward round trip */
1052 if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1054 b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1056 b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1057 if (plan->flags&FFT5D_DEBUG) {
1058 printf("%f %f, ",a,b);
/* tolerance scales with problem size to absorb accumulated round-off */
1060 if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1061 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1063 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1066 if (plan->flags&FFT5D_DEBUG) printf(",");
1068 if (plan->flags&FFT5D_DEBUG) printf("\n");