2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2009,2010,2012,2013,2014,2015,2016, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
50 #include "gromacs/utility/exceptions.h"
51 #include "gromacs/utility/fatalerror.h"
52 #include "gromacs/utility/gmxmpi.h"
53 #include "gromacs/utility/smalloc.h"
56 #define GMX_PARALLEL_ENV_INITIALIZED 1
59 #define GMX_PARALLEL_ENV_INITIALIZED 1
61 #define GMX_PARALLEL_ENV_INITIALIZED 0
66 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
68 /* requires fftw compiled with openmp */
69 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
72 #ifndef __FLT_EPSILON__
73 #define __FLT_EPSILON__ FLT_EPSILON
74 #define __DBL_EPSILON__ DBL_EPSILON
83 #include "gromacs/utility/exceptions.h"
84 #include "gromacs/utility/mutex.h"
85 /* none of the fftw3 calls, except execute(), are thread-safe, so
86 we need to serialize them with this mutex. */
87 static gmx::Mutex big_fftw_mutex;
88 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
89 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
90 #endif /* GMX_FFT_FFTW3 */
93 /* largest factor smaller than sqrt */
94 static int lfactor(int z)
96 int i = static_cast<int>(sqrt(static_cast<double>(z)));
106 static int l2factor(int z)
124 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
125 static int lpfactor(int z)
132 return std::max(lpfactor(f), lpfactor(z/f));
136 #if HAVE_GETTIMEOFDAY
137 #include <sys/time.h>
141 gettimeofday(&tv, 0);
142 return tv.tv_sec+tv.tv_usec*1e-6;
152 static int vmax(int* a, int s)
155 for (i = 0; i < s; i++)
166 /* NxMxK the size of the data
167 * comm communicator to use for fft5d
168 * P0 number of processor in 1st axes (can be null for automatic)
169 * lin is allocated by fft5d because size of array is only known after planning phase
170 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
172 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
175 int P[2], bMaster, prank[2], i, t;
177 int *N0 = 0, *N1 = 0, *M0 = 0, *M1 = 0, *K0 = 0, *K1 = 0, *oN0 = 0, *oN1 = 0, *oM0 = 0, *oM1 = 0, *oK0 = 0, *oK1 = 0;
178 int N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {0}, *oNin[3] = {0}, *iNout[3] = {0}, *oNout[3] = {0};
179 int C[3], rC[3], nP[2];
181 t_complex *lin = 0, *lout = 0, *lout2 = 0, *lout3 = 0;
185 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
187 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
189 MPI_Comm_size(comm[0], &P[0]);
190 MPI_Comm_rank(comm[0], &prank[0]);
199 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
201 MPI_Comm_size(comm[1], &P[1]);
202 MPI_Comm_rank(comm[1], &prank[1]);
211 bMaster = (prank[0] == 0 && prank[1] == 0);
216 fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n",
217 P[0], P[1], prank[0], prank[1]);
224 fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
225 NG, MG, KG, P[0], P[1], (flags&FFT5D_REALCOMPLEX) > 0, (flags&FFT5D_BACKWARD) > 0, (flags&FFT5D_ORDER_YZ) > 0, (flags&FFT5D_DEBUG) > 0);
227 /* The check below is not correct, one prime factor 11 or 13 is ok.
228 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
229 printf("WARNING: FFT very slow with prime factors larger 7\n");
230 printf("Change FFT size or in case you cannot change it look at\n");
231 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
236 if (NG == 0 || MG == 0 || KG == 0)
240 printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
245 rNG = NG; rMG = MG; rKG = KG;
247 if (flags&FFT5D_REALCOMPLEX)
249 if (!(flags&FFT5D_BACKWARD))
255 if (!(flags&FFT5D_ORDER_YZ))
267 /*for transpose we need to know the size for each processor not only our own size*/
269 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
270 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
271 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
272 oN0 = (int*)malloc(P[0]*sizeof(int)); oN1 = (int*)malloc(P[1]*sizeof(int));
273 oM0 = (int*)malloc(P[0]*sizeof(int)); oM1 = (int*)malloc(P[1]*sizeof(int));
274 oK0 = (int*)malloc(P[0]*sizeof(int)); oK1 = (int*)malloc(P[1]*sizeof(int));
276 for (i = 0; i < P[0]; i++)
280 oN0[i] = i*ceil((double)NG/P[0]);
281 oM0[i] = i*ceil((double)MG/P[0]);
282 oK0[i] = i*ceil((double)KG/P[0]);
284 oN0[i] = (NG*i)/P[0];
285 oM0[i] = (MG*i)/P[0];
286 oK0[i] = (KG*i)/P[0];
289 for (i = 0; i < P[1]; i++)
292 oN1[i] = i*ceil((double)NG/P[1]);
293 oM1[i] = i*ceil((double)MG/P[1]);
294 oK1[i] = i*ceil((double)KG/P[1]);
296 oN1[i] = (NG*i)/P[1];
297 oM1[i] = (MG*i)/P[1];
298 oK1[i] = (KG*i)/P[1];
301 for (i = 0; i < P[0]-1; i++)
303 N0[i] = oN0[i+1]-oN0[i];
304 M0[i] = oM0[i+1]-oM0[i];
305 K0[i] = oK0[i+1]-oK0[i];
307 N0[P[0]-1] = NG-oN0[P[0]-1];
308 M0[P[0]-1] = MG-oM0[P[0]-1];
309 K0[P[0]-1] = KG-oK0[P[0]-1];
310 for (i = 0; i < P[1]-1; i++)
312 N1[i] = oN1[i+1]-oN1[i];
313 M1[i] = oM1[i+1]-oM1[i];
314 K1[i] = oK1[i+1]-oK1[i];
316 N1[P[1]-1] = NG-oN1[P[1]-1];
317 M1[P[1]-1] = MG-oM1[P[1]-1];
318 K1[P[1]-1] = KG-oK1[P[1]-1];
320 /* for step 1-3 the local N,M,K sizes of the transposed system
321 C: contiguous dimension, and nP: number of processor in subcommunicator
325 pM[0] = M0[prank[0]];
326 oM[0] = oM0[prank[0]];
327 pK[0] = K1[prank[1]];
328 oK[0] = oK1[prank[1]];
331 if (!(flags&FFT5D_ORDER_YZ))
333 N[0] = vmax(N1, P[1]);
335 K[0] = vmax(K1, P[1]);
336 pN[0] = N1[prank[1]];
342 N[1] = vmax(K0, P[0]);
343 pN[1] = K0[prank[0]];
348 M[1] = vmax(M0, P[0]);
349 pM[1] = M0[prank[0]];
350 oM[1] = oM0[prank[0]];
352 pK[1] = N1[prank[1]];
353 oK[1] = oN1[prank[1]];
359 M[2] = vmax(K0, P[0]);
360 pM[2] = K0[prank[0]];
361 oM[2] = oK0[prank[0]];
362 K[2] = vmax(N1, P[1]);
363 pK[2] = N1[prank[1]];
364 oK[2] = oN1[prank[1]];
365 free(N0); free(oN0); /*these are not used for this order*/
366 free(M1); free(oM1); /*the rest is freed in destroy*/
370 N[0] = vmax(N0, P[0]);
371 M[0] = vmax(M0, P[0]);
373 pN[0] = N0[prank[0]];
379 N[1] = vmax(M1, P[1]);
380 pN[1] = M1[prank[1]];
386 pM[1] = N0[prank[0]];
387 oM[1] = oN0[prank[0]];
388 K[1] = vmax(K1, P[1]);
389 pK[1] = K1[prank[1]];
390 oK[1] = oK1[prank[1]];
396 M[2] = vmax(N0, P[0]);
397 pM[2] = N0[prank[0]];
398 oM[2] = oN0[prank[0]];
399 K[2] = vmax(M1, P[1]);
400 pK[2] = M1[prank[1]];
401 oK[2] = oM1[prank[1]];
402 free(N1); free(oN1); /*these are not used for this order*/
403 free(K0); free(oK0); /*the rest is freed in destroy*/
405 N[2] = pN[2] = -1; /*not used*/
408 Difference between x-y-z regarding 2d decomposition is whether they are
409 distributed along axis 1, 2 or both
412 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
413 lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
414 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
415 if (!(flags&FFT5D_NOMALLOC))
417 snew_aligned(lin, lsize, 32);
418 snew_aligned(lout, lsize, 32);
421 /* We need extra transpose buffers to avoid OpenMP barriers */
422 snew_aligned(lout2, lsize, 32);
423 snew_aligned(lout3, lsize, 32);
427 /* We can reuse the buffers to avoid cache misses */
448 plan = (fft5d_plan)calloc(1, sizeof(struct fft5d_plan_t));
453 fprintf(debug, "Running on %d threads\n", nthreads);
457 /* Don't add more stuff here! We have already had at least one bug because we are reimplementing
458 * the low-level FFT interface instead of using the Gromacs FFT module. If we need more
459 * generic functionality it is far better to extend the interface so we can use it for
460 * all FFT libraries instead of writing FFTW-specific code here.
463 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
464 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
465 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
466 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
467 * and that the 3d plan is faster than the 1d plan.
469 if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
471 int fftwflags = FFTW_DESTROY_INPUT;
473 int inNG = NG, outMG = MG, outKG = KG;
477 fftwflags |= (flags & FFT5D_NOMEASURE) ? FFTW_ESTIMATE : FFTW_MEASURE;
479 if (flags&FFT5D_REALCOMPLEX)
481 if (!(flags&FFT5D_BACKWARD)) /*input pointer is not complex*/
485 else /*output pointer is not complex*/
487 if (!(flags&FFT5D_ORDER_YZ))
498 if (!(flags&FFT5D_BACKWARD))
504 dims[0].is = inNG*MG; /*N M K*/
507 if (!(flags&FFT5D_ORDER_YZ))
509 dims[0].os = MG; /*M K N*/
515 dims[0].os = 1; /*K N M*/
522 if (!(flags&FFT5D_ORDER_YZ))
532 dims[0].os = outMG*KG;
546 dims[0].os = outKG*NG;
552 #ifdef FFT5D_FFTW_THREADS
553 FFTW(plan_with_nthreads)(nthreads);
556 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
558 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
559 /*howmany*/ 0, /*howmany_dims*/ 0,
560 (real*)lin, (FFTW(complex) *) lout,
561 /*flags*/ fftwflags);
563 else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD))
565 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
566 /*howmany*/ 0, /*howmany_dims*/ 0,
567 (FFTW(complex) *) lin, (real*)lout,
568 /*flags*/ fftwflags);
572 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
573 /*howmany*/ 0, /*howmany_dims*/ 0,
574 (FFTW(complex) *) lin, (FFTW(complex) *) lout,
575 /*sign*/ (flags&FFT5D_BACKWARD) ? 1 : -1, /*flags*/ fftwflags);
578 #ifdef FFT5D_FFTW_THREADS
579 FFTW(plan_with_nthreads)(1);
584 if (!plan->p3d) /* for decomposition and if 3d plan did not work */
586 #endif /* GMX_FFT_FFTW3 */
587 for (s = 0; s < 3; s++)
591 fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
592 s, rC[s], M[s], pK[s], C[s], lsize);
594 plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
596 /* Make sure that the init routines are only called by one thread at a time and in order
597 (later is only important to not confuse valgrind)
599 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
600 for (t = 0; t < nthreads; t++)
606 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
608 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s == 0) || ((flags&FFT5D_BACKWARD) && s == 2)))
610 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
614 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
617 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
625 if ((flags&FFT5D_ORDER_YZ)) /*plan->cart is in the order of transposes */
627 plan->cart[0] = comm[0]; plan->cart[1] = comm[1];
631 plan->cart[1] = comm[0]; plan->cart[0] = comm[1];
633 #ifdef FFT5D_MPI_TRANSPOSE
635 for (s = 0; s < 2; s++)
637 if ((s == 0 && !(flags&FFT5D_ORDER_YZ)) || (s == 1 && (flags&FFT5D_ORDER_YZ)))
639 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
643 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
655 plan->NG = NG; plan->MG = MG; plan->KG = KG;
657 for (s = 0; s < 3; s++)
659 plan->N[s] = N[s]; plan->M[s] = M[s]; plan->K[s] = K[s]; plan->pN[s] = pN[s]; plan->pM[s] = pM[s]; plan->pK[s] = pK[s];
660 plan->oM[s] = oM[s]; plan->oK[s] = oK[s];
661 plan->C[s] = C[s]; plan->rC[s] = rC[s];
662 plan->iNin[s] = iNin[s]; plan->oNin[s] = oNin[s]; plan->iNout[s] = iNout[s]; plan->oNout[s] = oNout[s];
664 for (s = 0; s < 2; s++)
666 plan->P[s] = nP[s]; plan->coor[s] = prank[s];
669 /* plan->fftorder=fftorder;
670 plan->direction=direction;
671 plan->realcomplex=realcomplex;
674 plan->nthreads = nthreads;
694 /*here x,y,z and N,M,K is in rotated coordinate system!!
695 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
696 maxN,maxM,maxK is max size of local data
697 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
698 NG, MG, KG is size of global data*/
699 static void splitaxes(t_complex* lout, const t_complex* lin,
700 int maxN, int maxM, int maxK, int pM,
701 int P, int NG, int *N, int* oN, int starty, int startz, int endy, int endz)
704 int in_i, out_i, in_z, out_z, in_y, out_y;
707 for (z = startz; z < endz+1; z++) /*3. z l*/
728 for (i = 0; i < P; i++) /*index cube along long axis*/
730 out_i = out_z + i*maxN*maxM*maxK;
732 for (y = s_y; y < e_y; y++) /*2. y k*/
734 out_y = out_i + y*maxN;
736 for (x = 0; x < N[i]; x++) /*1. x j*/
738 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
739 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
740 /*before split data contiguos - thus if different processor get different amount oN is different*/
747 /*make axis contiguous again (after AllToAll) and also do local transpose*/
748 /*transpose mayor and major dimension
750 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
751 N,M,K local dimensions
753 static void joinAxesTrans13(t_complex* lout, const t_complex* lin,
754 int maxN, int maxM, int maxK, int pM,
755 int P, int KG, int* K, int* oK, int starty, int startx, int endy, int endx)
758 int out_i, in_i, out_x, in_x, out_z, in_z;
761 for (x = startx; x < endx+1; x++) /*1.j*/
783 for (i = 0; i < P; i++) /*index cube along long axis*/
785 out_i = out_x + oK[i];
786 in_i = in_x + i*maxM*maxN*maxK;
787 for (z = 0; z < K[i]; z++) /*3.l*/
790 in_z = in_i + z*maxM*maxN;
791 for (y = s_y; y < e_y; y++) /*2.k*/
793 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
800 /*make axis contiguous again (after AllToAll) and also do local transpose
801 tranpose mayor and middle dimension
803 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
806 static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN,
807 int P, int MG, int* M, int* oM, int startx, int startz, int endx, int endz)
810 int out_i, in_i, out_z, in_z, out_x, in_x;
813 for (z = startz; z < endz+1; z++)
834 for (i = 0; i < P; i++) /*index cube along long axis*/
836 out_i = out_z + oM[i];
837 in_i = in_z + i*maxM*maxN*maxK;
838 for (x = s_x; x < e_x; x++)
840 out_x = out_i + x*MG;
842 for (y = 0; y < M[i]; y++)
844 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
852 static void rotate_offsets(int x[])
863 /*compute the offset to compare or print transposed local data in original input coordinates
864 xs matrix dimension size, xl dimension length, xc decomposition offset
865 s: step in computation = number of transposes*/
866 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
868 /* int direction = plan->direction;
869 int fftorder = plan->fftorder;*/
873 int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK,
874 *C = plan->C, *rC = plan->rC;
876 NG[0] = plan->NG; NG[1] = plan->MG; NG[2] = plan->KG;
878 if (!(plan->flags&FFT5D_ORDER_YZ))
882 case 0: o = XYZ; break;
883 case 1: o = ZYX; break;
884 case 2: o = YZX; break;
892 case 0: o = XYZ; break;
893 case 1: o = YXZ; break;
894 case 2: o = ZXY; break;
901 case XYZ: pos[0] = 1; pos[1] = 2; pos[2] = 3; break;
902 case XZY: pos[0] = 1; pos[1] = 3; pos[2] = 2; break;
903 case YXZ: pos[0] = 2; pos[1] = 1; pos[2] = 3; break;
904 case YZX: pos[0] = 3; pos[1] = 1; pos[2] = 2; break;
905 case ZXY: pos[0] = 2; pos[1] = 3; pos[2] = 1; break;
906 case ZYX: pos[0] = 3; pos[1] = 2; pos[2] = 1; break;
908 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
910 /*xs, xl give dimension size and data length in local transposed coordinate system
911 for 0(/1/2): x(/y/z) in original coordinate system*/
912 for (i = 0; i < 3; i++)
916 case 1: xs[i] = 1; xc[i] = 0; xl[i] = C[s]; break;
917 case 2: xs[i] = C[s]; xc[i] = oM[s]; xl[i] = pM[s]; break;
918 case 3: xs[i] = C[s]*pM[s]; xc[i] = oK[s]; xl[i] = pK[s]; break;
921 /*input order is different for test program to match FFTW order
922 (important for complex to real)*/
923 if (plan->flags&FFT5D_BACKWARD)
929 if (plan->flags&FFT5D_ORDER_YZ)
937 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s == 0) || ((plan->flags&FFT5D_BACKWARD) && s == 2)))
943 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
946 int *coor = plan->coor;
947 int xs[3], xl[3], xc[3], NG[3];
948 int ll = (plan->flags&FFT5D_REALCOMPLEX) ? 1 : 2;
949 compute_offsets(plan, xs, xl, xc, NG, s);
950 fprintf(debug, txt, coor[0], coor[1]);
951 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
952 for (z = 0; z < xl[2]; z++)
954 for (y = 0; y < xl[1]; y++)
956 fprintf(debug, "%d %d: ", coor[0], coor[1]);
957 for (x = 0; x < xl[0]; x++)
959 for (l = 0; l < ll; l++)
961 fprintf(debug, "%f ", ((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
965 fprintf(debug, "\n");
970 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
972 t_complex *lin = plan->lin;
973 t_complex *lout = plan->lout;
974 t_complex *lout2 = plan->lout2;
975 t_complex *lout3 = plan->lout3;
976 t_complex *fftout, *joinin;
978 gmx_fft_t **p1d = plan->p1d;
979 #ifdef FFT5D_MPI_TRANSPOSE
980 FFTW(plan) *mpip = plan->mpip;
983 MPI_Comm *cart = plan->cart;
986 double time_fft = 0, time_local = 0, time_mpi[2] = {0}, time = 0;
988 int *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
989 *C = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout, **oNout = plan->oNout;
990 int s = 0, tstart, tend, bParallelDim;
1004 FFTW(execute)(plan->p3d);
1008 times->fft += MPI_Wtime()-time;
1019 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1021 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1024 for (s = 0; s < 2; s++) /*loop over first two FFT steps (corner rotations)*/
1028 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1038 /* ---------- START FFT ------------ */
1040 if (times != 0 && thread == 0)
1046 if (bParallelDim || plan->nthreads == 1)
1062 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1063 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s == 0)
1065 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, fftout+tstart);
1069 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, fftout+tstart);
1074 if (times != NULL && thread == 0)
1076 time_fft += MPI_Wtime()-time;
1079 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1081 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1083 /* ---------- END FFT ------------ */
1085 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1089 if (times != NULL && thread == 0)
1096 1. (most outer) axes (x) is split into P[s] parts of size N[s]
1100 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
1102 splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1104 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1106 if (times != NULL && thread == 0)
1108 time_local += MPI_Wtime()-time;
1112 /* ---------- END SPLIT , START TRANSPOSE------------ */
1122 wallcycle_start(times, ewcPME_FFTCOMM);
1124 #ifdef FFT5D_MPI_TRANSPOSE
1125 FFTW(execute)(mpip[s]);
1128 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1130 MPI_Alltoall((real *)lout2, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1134 MPI_Alltoall((real *)lout2, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1137 gmx_incons("fft5d MPI call without MPI configuration");
1139 #endif /*FFT5D_MPI_TRANSPOSE*/
1143 time_mpi[s] = MPI_Wtime()-time;
1146 wallcycle_stop(times, ewcPME_FFTCOMM);
1150 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1152 /* ---------- END SPLIT + TRANSPOSE------------ */
1154 /* ---------- START JOIN ------------ */
1156 if (times != NULL && thread == 0)
1170 /*bring back in matrix form
1171 thus make new 1. axes contiguos
1172 also local transpose 1 and 2/3
1173 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1175 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1179 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1180 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1181 joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1188 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1189 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1190 joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
1195 if (times != NULL && thread == 0)
1197 time_local += MPI_Wtime()-time;
1200 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1202 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1204 /* ---------- END JOIN ------------ */
1206 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1207 } /* for(s=0;s<2;s++) */
1209 if (times != NULL && thread == 0)
1215 if (plan->flags&FFT5D_INPLACE)
1217 lout = lin; /*in place currently not supported*/
1220 /* ----------- FFT ----------- */
1221 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1222 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1224 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, lout+tstart);
1228 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, lout+tstart);
1230 /* ------------ END FFT ---------*/
1233 if (times != NULL && thread == 0)
1235 time_fft += MPI_Wtime()-time;
1237 times->fft += time_fft;
1238 times->local += time_local;
1239 times->mpi2 += time_mpi[1];
1240 times->mpi1 += time_mpi[0];
1244 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1246 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1250 void fft5d_destroy(fft5d_plan plan)
1254 for (s = 0; s < 3; s++)
1258 for (t = 0; t < plan->nthreads; t++)
1260 gmx_many_fft_destroy(plan->p1d[s][t]);
1266 free(plan->iNin[s]);
1271 free(plan->oNin[s]);
1276 free(plan->iNout[s]);
1281 free(plan->oNout[s]);
1287 #ifdef FFT5D_MPI_TRANSPOS
1288 for (s = 0; s < 2; s++)
1290 FFTW(destroy_plan)(plan->mpip[s]);
1292 #endif /* FFT5D_MPI_TRANSPOS */
1295 FFTW(destroy_plan)(plan->p3d);
1298 #endif /* GMX_FFT_FFTW3 */
1300 if (!(plan->flags&FFT5D_NOMALLOC))
1302 sfree_aligned(plan->lin);
1303 sfree_aligned(plan->lout);
1304 if (plan->nthreads > 1)
1306 sfree_aligned(plan->lout2);
1307 sfree_aligned(plan->lout3);
1311 #ifdef FFT5D_THREADS
1312 #ifdef FFT5D_FFTW_THREADS
1313 /*FFTW(cleanup_threads)();*/
1320 /*Is this better than direct access of plan? enough data?
1321 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1322 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1333 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1334 of processor dimensions*/
1335 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
1337 MPI_Comm cart[2] = {0};
1339 int size = 1, prank = 0;
1342 int wrap[] = {0, 0};
1344 int rdim1[] = {0, 1}, rdim2[] = {1, 0};
1346 MPI_Comm_size(comm, &size);
1347 MPI_Comm_rank(comm, &prank);
1357 printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1362 P[0] = P0; P[1] = size/P0; /*number of processors in the two dimensions*/
1364 /*Difference between x-y-z regarding 2d decomposition is whether they are
1365 distributed along axis 1, 2 or both*/
1367 MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1368 MPI_Cart_get(gcart, 2, P, wrap, coor);
1369 MPI_Cart_sub(gcart, rdim1, &cart[0]);
1370 MPI_Cart_sub(gcart, rdim2, &cart[1]);
1375 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1380 /*prints in original coordinate system of data (as the input to FFT)*/
1381 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1383 int xs[3], xl[3], xc[3], NG[3];
1385 int *coor = plan->coor;
1386 int ll = 2; /*compare ll values per element (has to be 2 for complex)*/
1387 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1392 compute_offsets(plan, xs, xl, xc, NG, 2);
1393 if (plan->flags&FFT5D_DEBUG)
1395 printf("Compare2\n");
1397 for (z = 0; z < xl[2]; z++)
1399 for (y = 0; y < xl[1]; y++)
1401 if (plan->flags&FFT5D_DEBUG)
1403 printf("%d %d: ", coor[0], coor[1]);
1405 for (x = 0; x < xl[0]; x++)
1407 for (l = 0; l < ll; l++) /*loop over real/complex parts*/
1410 a = ((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1413 a /= plan->rC[0]*plan->rC[1]*plan->rC[2];
1417 b = ((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1421 b = ((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1423 if (plan->flags&FFT5D_DEBUG)
1425 printf("%f %f, ", a, b);
1429 if (fabs(a-b) > 2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS)
1431 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n", coor[0], coor[1], x, y, z, a, b);
1433 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1436 if (plan->flags&FFT5D_DEBUG)
1441 if (plan->flags&FFT5D_DEBUG)