2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2009,2010,2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
50 #include "gromacs/utility/fatalerror.h"
51 #include "gromacs/utility/gmxmpi.h"
52 #include "gromacs/utility/smalloc.h"
55 #define GMX_PARALLEL_ENV_INITIALIZED 1
58 #define GMX_PARALLEL_ENV_INITIALIZED 1
60 #define GMX_PARALLEL_ENV_INITIALIZED 0
65 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
67 /* requires fftw compiled with openmp */
68 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
71 #ifndef __FLT_EPSILON__
72 #define __FLT_EPSILON__ FLT_EPSILON
73 #define __DBL_EPSILON__ DBL_EPSILON
81 #include "thread_mpi/mutex.h"
83 #include "gromacs/utility/exceptions.h"
84 /* none of the fftw3 calls, except execute(), are thread-safe, so
85 we need to serialize them with this mutex. */
86 static tMPI::mutex big_fftw_mutex;
87 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
88 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
89 #endif /* GMX_FFT_FFTW3 */
92 /* largest factor smaller than sqrt */
93 static int lfactor(int z)
95 int i = static_cast<int>(sqrt(static_cast<double>(z)));
105 static int l2factor(int z)
123 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
124 static int lpfactor(int z)
131 return std::max(lpfactor(f), lpfactor(z/f));
135 #ifdef HAVE_GETTIMEOFDAY
136 #include <sys/time.h>
140 gettimeofday(&tv, 0);
141 return tv.tv_sec+tv.tv_usec*1e-6;
151 static int vmax(int* a, int s)
154 for (i = 0; i < s; i++)
165 /* NxMxK the size of the data
166 * comm communicator to use for fft5d
167 * P0 number of processor in 1st axes (can be null for automatic)
168 * lin is allocated by fft5d because size of array is only known after planning phase
169 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
171 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
174 int P[2], bMaster, prank[2], i, t;
176 int *N0 = 0, *N1 = 0, *M0 = 0, *M1 = 0, *K0 = 0, *K1 = 0, *oN0 = 0, *oN1 = 0, *oM0 = 0, *oM1 = 0, *oK0 = 0, *oK1 = 0;
177 int N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {0}, *oNin[3] = {0}, *iNout[3] = {0}, *oNout[3] = {0};
178 int C[3], rC[3], nP[2];
180 t_complex *lin = 0, *lout = 0, *lout2 = 0, *lout3 = 0;
184 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
186 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
188 MPI_Comm_size(comm[0], &P[0]);
189 MPI_Comm_rank(comm[0], &prank[0]);
198 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
200 MPI_Comm_size(comm[1], &P[1]);
201 MPI_Comm_rank(comm[1], &prank[1]);
210 bMaster = (prank[0] == 0 && prank[1] == 0);
215 fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n",
216 P[0], P[1], prank[0], prank[1]);
223 fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
224 NG, MG, KG, P[0], P[1], (flags&FFT5D_REALCOMPLEX) > 0, (flags&FFT5D_BACKWARD) > 0, (flags&FFT5D_ORDER_YZ) > 0, (flags&FFT5D_DEBUG) > 0);
226 /* The check below is not correct, one prime factor 11 or 13 is ok.
227 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
228 printf("WARNING: FFT very slow with prime factors larger 7\n");
229 printf("Change FFT size or in case you cannot change it look at\n");
230 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
235 if (NG == 0 || MG == 0 || KG == 0)
239 printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
244 rNG = NG; rMG = MG; rKG = KG;
246 if (flags&FFT5D_REALCOMPLEX)
248 if (!(flags&FFT5D_BACKWARD))
254 if (!(flags&FFT5D_ORDER_YZ))
266 /*for transpose we need to know the size for each processor not only our own size*/
268 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
269 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
270 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
271 oN0 = (int*)malloc(P[0]*sizeof(int)); oN1 = (int*)malloc(P[1]*sizeof(int));
272 oM0 = (int*)malloc(P[0]*sizeof(int)); oM1 = (int*)malloc(P[1]*sizeof(int));
273 oK0 = (int*)malloc(P[0]*sizeof(int)); oK1 = (int*)malloc(P[1]*sizeof(int));
275 for (i = 0; i < P[0]; i++)
279 oN0[i] = i*ceil((double)NG/P[0]);
280 oM0[i] = i*ceil((double)MG/P[0]);
281 oK0[i] = i*ceil((double)KG/P[0]);
283 oN0[i] = (NG*i)/P[0];
284 oM0[i] = (MG*i)/P[0];
285 oK0[i] = (KG*i)/P[0];
288 for (i = 0; i < P[1]; i++)
291 oN1[i] = i*ceil((double)NG/P[1]);
292 oM1[i] = i*ceil((double)MG/P[1]);
293 oK1[i] = i*ceil((double)KG/P[1]);
295 oN1[i] = (NG*i)/P[1];
296 oM1[i] = (MG*i)/P[1];
297 oK1[i] = (KG*i)/P[1];
300 for (i = 0; i < P[0]-1; i++)
302 N0[i] = oN0[i+1]-oN0[i];
303 M0[i] = oM0[i+1]-oM0[i];
304 K0[i] = oK0[i+1]-oK0[i];
306 N0[P[0]-1] = NG-oN0[P[0]-1];
307 M0[P[0]-1] = MG-oM0[P[0]-1];
308 K0[P[0]-1] = KG-oK0[P[0]-1];
309 for (i = 0; i < P[1]-1; i++)
311 N1[i] = oN1[i+1]-oN1[i];
312 M1[i] = oM1[i+1]-oM1[i];
313 K1[i] = oK1[i+1]-oK1[i];
315 N1[P[1]-1] = NG-oN1[P[1]-1];
316 M1[P[1]-1] = MG-oM1[P[1]-1];
317 K1[P[1]-1] = KG-oK1[P[1]-1];
319 /* for step 1-3 the local N,M,K sizes of the transposed system
320 C: contiguous dimension, and nP: number of processor in subcommunicator
324 pM[0] = M0[prank[0]];
325 oM[0] = oM0[prank[0]];
326 pK[0] = K1[prank[1]];
327 oK[0] = oK1[prank[1]];
330 if (!(flags&FFT5D_ORDER_YZ))
332 N[0] = vmax(N1, P[1]);
334 K[0] = vmax(K1, P[1]);
335 pN[0] = N1[prank[1]];
341 N[1] = vmax(K0, P[0]);
342 pN[1] = K0[prank[0]];
347 M[1] = vmax(M0, P[0]);
348 pM[1] = M0[prank[0]];
349 oM[1] = oM0[prank[0]];
351 pK[1] = N1[prank[1]];
352 oK[1] = oN1[prank[1]];
358 M[2] = vmax(K0, P[0]);
359 pM[2] = K0[prank[0]];
360 oM[2] = oK0[prank[0]];
361 K[2] = vmax(N1, P[1]);
362 pK[2] = N1[prank[1]];
363 oK[2] = oN1[prank[1]];
364 free(N0); free(oN0); /*these are not used for this order*/
365 free(M1); free(oM1); /*the rest is freed in destroy*/
369 N[0] = vmax(N0, P[0]);
370 M[0] = vmax(M0, P[0]);
372 pN[0] = N0[prank[0]];
378 N[1] = vmax(M1, P[1]);
379 pN[1] = M1[prank[1]];
385 pM[1] = N0[prank[0]];
386 oM[1] = oN0[prank[0]];
387 K[1] = vmax(K1, P[1]);
388 pK[1] = K1[prank[1]];
389 oK[1] = oK1[prank[1]];
395 M[2] = vmax(N0, P[0]);
396 pM[2] = N0[prank[0]];
397 oM[2] = oN0[prank[0]];
398 K[2] = vmax(M1, P[1]);
399 pK[2] = M1[prank[1]];
400 oK[2] = oM1[prank[1]];
401 free(N1); free(oN1); /*these are not used for this order*/
402 free(K0); free(oK0); /*the rest is freed in destroy*/
404 N[2] = pN[2] = -1; /*not used*/
407 Difference between x-y-z regarding 2d decomposition is whether they are
408 distributed along axis 1, 2 or both
411 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
412 lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
413 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
414 if (!(flags&FFT5D_NOMALLOC))
416 snew_aligned(lin, lsize, 32);
417 snew_aligned(lout, lsize, 32);
420 /* We need extra transpose buffers to avoid OpenMP barriers */
421 snew_aligned(lout2, lsize, 32);
422 snew_aligned(lout3, lsize, 32);
426 /* We can reuse the buffers to avoid cache misses */
447 plan = (fft5d_plan)calloc(1, sizeof(struct fft5d_plan_t));
452 fprintf(debug, "Running on %d threads\n", nthreads);
455 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
456 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
457 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
458 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
459 * and that the 3d plan is faster than the 1d plan.
461 if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
463 int fftwflags = FFTW_DESTROY_INPUT;
465 int inNG = NG, outMG = MG, outKG = KG;
468 if (!(flags&FFT5D_NOMEASURE))
470 fftwflags |= FFTW_MEASURE;
472 if (flags&FFT5D_REALCOMPLEX)
474 if (!(flags&FFT5D_BACKWARD)) /*input pointer is not complex*/
478 else /*output pointer is not complex*/
480 if (!(flags&FFT5D_ORDER_YZ))
491 if (!(flags&FFT5D_BACKWARD))
497 dims[0].is = inNG*MG; /*N M K*/
500 if (!(flags&FFT5D_ORDER_YZ))
502 dims[0].os = MG; /*M K N*/
508 dims[0].os = 1; /*K N M*/
515 if (!(flags&FFT5D_ORDER_YZ))
525 dims[0].os = outMG*KG;
539 dims[0].os = outKG*NG;
545 #ifdef FFT5D_FFTW_THREADS
546 FFTW(plan_with_nthreads)(nthreads);
549 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
551 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
552 /*howmany*/ 0, /*howmany_dims*/ 0,
553 (real*)lin, (FFTW(complex) *) lout,
554 /*flags*/ fftwflags);
556 else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD))
558 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
559 /*howmany*/ 0, /*howmany_dims*/ 0,
560 (FFTW(complex) *) lin, (real*)lout,
561 /*flags*/ fftwflags);
565 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
566 /*howmany*/ 0, /*howmany_dims*/ 0,
567 (FFTW(complex) *) lin, (FFTW(complex) *) lout,
568 /*sign*/ (flags&FFT5D_BACKWARD) ? 1 : -1, /*flags*/ fftwflags);
571 #ifdef FFT5D_FFTW_THREADS
572 FFTW(plan_with_nthreads)(1);
577 if (!plan->p3d) /* for decomposition and if 3d plan did not work */
579 #endif /* GMX_FFT_FFTW3 */
580 for (s = 0; s < 3; s++)
584 fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
585 s, rC[s], M[s], pK[s], C[s], lsize);
587 plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
589 /* Make sure that the init routines are only called by one thread at a time and in order
590 (later is only important to not confuse valgrind)
592 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
593 for (t = 0; t < nthreads; t++)
597 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
599 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s == 0) || ((flags&FFT5D_BACKWARD) && s == 2)))
601 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
605 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
614 if ((flags&FFT5D_ORDER_YZ)) /*plan->cart is in the order of transposes */
616 plan->cart[0] = comm[0]; plan->cart[1] = comm[1];
620 plan->cart[1] = comm[0]; plan->cart[0] = comm[1];
622 #ifdef FFT5D_MPI_TRANSPOSE
624 for (s = 0; s < 2; s++)
626 if ((s == 0 && !(flags&FFT5D_ORDER_YZ)) || (s == 1 && (flags&FFT5D_ORDER_YZ)))
628 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
632 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
644 plan->NG = NG; plan->MG = MG; plan->KG = KG;
646 for (s = 0; s < 3; s++)
648 plan->N[s] = N[s]; plan->M[s] = M[s]; plan->K[s] = K[s]; plan->pN[s] = pN[s]; plan->pM[s] = pM[s]; plan->pK[s] = pK[s];
649 plan->oM[s] = oM[s]; plan->oK[s] = oK[s];
650 plan->C[s] = C[s]; plan->rC[s] = rC[s];
651 plan->iNin[s] = iNin[s]; plan->oNin[s] = oNin[s]; plan->iNout[s] = iNout[s]; plan->oNout[s] = oNout[s];
653 for (s = 0; s < 2; s++)
655 plan->P[s] = nP[s]; plan->coor[s] = prank[s];
658 /* plan->fftorder=fftorder;
659 plan->direction=direction;
660 plan->realcomplex=realcomplex;
663 plan->nthreads = nthreads;
683 /*here x,y,z and N,M,K is in rotated coordinate system!!
684 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
685 maxN,maxM,maxK is max size of local data
686 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
687 NG, MG, KG is size of global data*/
688 static void splitaxes(t_complex* lout, const t_complex* lin,
689 int maxN, int maxM, int maxK, int pM,
690 int P, int NG, int *N, int* oN, int starty, int startz, int endy, int endz)
693 int in_i, out_i, in_z, out_z, in_y, out_y;
696 for (z = startz; z < endz+1; z++) /*3. z l*/
717 for (i = 0; i < P; i++) /*index cube along long axis*/
719 out_i = out_z + i*maxN*maxM*maxK;
721 for (y = s_y; y < e_y; y++) /*2. y k*/
723 out_y = out_i + y*maxN;
725 for (x = 0; x < N[i]; x++) /*1. x j*/
727 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
728 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
729 /*before split data contiguos - thus if different processor get different amount oN is different*/
736 /*make axis contiguous again (after AllToAll) and also do local transpose*/
737 /*transpose mayor and major dimension
739 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
740 N,M,K local dimensions
742 static void joinAxesTrans13(t_complex* lout, const t_complex* lin,
743 int maxN, int maxM, int maxK, int pM,
744 int P, int KG, int* K, int* oK, int starty, int startx, int endy, int endx)
747 int out_i, in_i, out_x, in_x, out_z, in_z;
750 for (x = startx; x < endx+1; x++) /*1.j*/
772 for (i = 0; i < P; i++) /*index cube along long axis*/
774 out_i = out_x + oK[i];
775 in_i = in_x + i*maxM*maxN*maxK;
776 for (z = 0; z < K[i]; z++) /*3.l*/
779 in_z = in_i + z*maxM*maxN;
780 for (y = s_y; y < e_y; y++) /*2.k*/
782 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
789 /*make axis contiguous again (after AllToAll) and also do local transpose
790 tranpose mayor and middle dimension
792 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
795 static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN,
796 int P, int MG, int* M, int* oM, int startx, int startz, int endx, int endz)
799 int out_i, in_i, out_z, in_z, out_x, in_x;
802 for (z = startz; z < endz+1; z++)
823 for (i = 0; i < P; i++) /*index cube along long axis*/
825 out_i = out_z + oM[i];
826 in_i = in_z + i*maxM*maxN*maxK;
827 for (x = s_x; x < e_x; x++)
829 out_x = out_i + x*MG;
831 for (y = 0; y < M[i]; y++)
833 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
841 static void rotate_offsets(int x[])
852 /*compute the offset to compare or print transposed local data in original input coordinates
853 xs matrix dimension size, xl dimension length, xc decomposition offset
854 s: step in computation = number of transposes*/
855 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
857 /* int direction = plan->direction;
858 int fftorder = plan->fftorder;*/
862 int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK,
863 *C = plan->C, *rC = plan->rC;
865 NG[0] = plan->NG; NG[1] = plan->MG; NG[2] = plan->KG;
867 if (!(plan->flags&FFT5D_ORDER_YZ))
871 case 0: o = XYZ; break;
872 case 1: o = ZYX; break;
873 case 2: o = YZX; break;
881 case 0: o = XYZ; break;
882 case 1: o = YXZ; break;
883 case 2: o = ZXY; break;
890 case XYZ: pos[0] = 1; pos[1] = 2; pos[2] = 3; break;
891 case XZY: pos[0] = 1; pos[1] = 3; pos[2] = 2; break;
892 case YXZ: pos[0] = 2; pos[1] = 1; pos[2] = 3; break;
893 case YZX: pos[0] = 3; pos[1] = 1; pos[2] = 2; break;
894 case ZXY: pos[0] = 2; pos[1] = 3; pos[2] = 1; break;
895 case ZYX: pos[0] = 3; pos[1] = 2; pos[2] = 1; break;
897 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
899 /*xs, xl give dimension size and data length in local transposed coordinate system
900 for 0(/1/2): x(/y/z) in original coordinate system*/
901 for (i = 0; i < 3; i++)
905 case 1: xs[i] = 1; xc[i] = 0; xl[i] = C[s]; break;
906 case 2: xs[i] = C[s]; xc[i] = oM[s]; xl[i] = pM[s]; break;
907 case 3: xs[i] = C[s]*pM[s]; xc[i] = oK[s]; xl[i] = pK[s]; break;
910 /*input order is different for test program to match FFTW order
911 (important for complex to real)*/
912 if (plan->flags&FFT5D_BACKWARD)
918 if (plan->flags&FFT5D_ORDER_YZ)
926 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s == 0) || ((plan->flags&FFT5D_BACKWARD) && s == 2)))
932 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
935 int *coor = plan->coor;
936 int xs[3], xl[3], xc[3], NG[3];
937 int ll = (plan->flags&FFT5D_REALCOMPLEX) ? 1 : 2;
938 compute_offsets(plan, xs, xl, xc, NG, s);
939 fprintf(debug, txt, coor[0], coor[1], s);
940 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
941 for (z = 0; z < xl[2]; z++)
943 for (y = 0; y < xl[1]; y++)
945 fprintf(debug, "%d %d: ", coor[0], coor[1]);
946 for (x = 0; x < xl[0]; x++)
948 for (l = 0; l < ll; l++)
950 fprintf(debug, "%f ", ((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
954 fprintf(debug, "\n");
959 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
961 t_complex *lin = plan->lin;
962 t_complex *lout = plan->lout;
963 t_complex *lout2 = plan->lout2;
964 t_complex *lout3 = plan->lout3;
965 t_complex *fftout, *joinin;
967 gmx_fft_t **p1d = plan->p1d;
968 #ifdef FFT5D_MPI_TRANSPOSE
969 FFTW(plan) *mpip = plan->mpip;
972 MPI_Comm *cart = plan->cart;
975 double time_fft = 0, time_local = 0, time_mpi[2] = {0}, time = 0;
977 int *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
978 *C = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout, **oNout = plan->oNout;
979 int s = 0, tstart, tend, bParallelDim;
993 FFTW(execute)(plan->p3d);
997 times->fft += MPI_Wtime()-time;
1008 if (plan->flags&FFT5D_DEBUG && thread == 0)
1010 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1013 for (s = 0; s < 2; s++) /*loop over first two FFT steps (corner rotations)*/
1017 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1027 /* ---------- START FFT ------------ */
1029 if (times != 0 && thread == 0)
1035 if (bParallelDim || plan->nthreads == 1)
1051 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1052 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s == 0)
1054 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, fftout+tstart);
1058 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, fftout+tstart);
1063 if (times != NULL && thread == 0)
1065 time_fft += MPI_Wtime()-time;
1068 if (plan->flags&FFT5D_DEBUG && thread == 0)
1070 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1072 /* ---------- END FFT ------------ */
1074 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1078 if (times != NULL && thread == 0)
1085 1. (most outer) axes (x) is split into P[s] parts of size N[s]
1089 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
1091 splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1093 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1095 if (times != NULL && thread == 0)
1097 time_local += MPI_Wtime()-time;
1101 /* ---------- END SPLIT , START TRANSPOSE------------ */
1111 wallcycle_start(times, ewcPME_FFTCOMM);
1113 #ifdef FFT5D_MPI_TRANSPOSE
1114 FFTW(execute)(mpip[s]);
1117 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1119 MPI_Alltoall((real *)lout2, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1123 MPI_Alltoall((real *)lout2, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1126 gmx_incons("fft5d MPI call without MPI configuration");
1128 #endif /*FFT5D_MPI_TRANSPOSE*/
1132 time_mpi[s] = MPI_Wtime()-time;
1135 wallcycle_stop(times, ewcPME_FFTCOMM);
1139 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1141 /* ---------- END SPLIT + TRANSPOSE------------ */
1143 /* ---------- START JOIN ------------ */
1145 if (times != NULL && thread == 0)
1159 /*bring back in matrix form
1160 thus make new 1. axes contiguos
1161 also local transpose 1 and 2/3
1162 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1164 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1168 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1169 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1170 joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1177 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1178 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1179 joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
1184 if (times != NULL && thread == 0)
1186 time_local += MPI_Wtime()-time;
1189 if (plan->flags&FFT5D_DEBUG && thread == 0)
1191 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1193 /* ---------- END JOIN ------------ */
1195 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1196 } /* for(s=0;s<2;s++) */
1198 if (times != NULL && thread == 0)
1204 if (plan->flags&FFT5D_INPLACE)
1206 lout = lin; /*in place currently not supported*/
1209 /* ----------- FFT ----------- */
1210 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1211 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1213 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, lout+tstart);
1217 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, lout+tstart);
1219 /* ------------ END FFT ---------*/
1222 if (times != NULL && thread == 0)
1224 time_fft += MPI_Wtime()-time;
1226 times->fft += time_fft;
1227 times->local += time_local;
1228 times->mpi2 += time_mpi[1];
1229 times->mpi1 += time_mpi[0];
1233 if (plan->flags&FFT5D_DEBUG && thread == 0)
1235 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1239 void fft5d_destroy(fft5d_plan plan)
1243 for (s = 0; s < 3; s++)
1247 for (t = 0; t < plan->nthreads; t++)
1249 gmx_many_fft_destroy(plan->p1d[s][t]);
1255 free(plan->iNin[s]);
1260 free(plan->oNin[s]);
1265 free(plan->iNout[s]);
1270 free(plan->oNout[s]);
1274 #ifdef GMX_FFT_FFTW3
1276 #ifdef FFT5D_MPI_TRANSPOS
1277 for (s = 0; s < 2; s++)
1279 FFTW(destroy_plan)(plan->mpip[s]);
1281 #endif /* FFT5D_MPI_TRANSPOS */
1284 FFTW(destroy_plan)(plan->p3d);
1287 #endif /* GMX_FFT_FFTW3 */
1289 if (!(plan->flags&FFT5D_NOMALLOC))
1291 sfree_aligned(plan->lin);
1292 sfree_aligned(plan->lout);
1293 if (plan->nthreads > 1)
1295 sfree_aligned(plan->lout2);
1296 sfree_aligned(plan->lout3);
1300 #ifdef FFT5D_THREADS
1301 #ifdef FFT5D_FFTW_THREADS
1302 /*FFTW(cleanup_threads)();*/
1309 /*Is this better than direct access of plan? enough data?
1310 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1311 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1322 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1323 of processor dimensions*/
1324 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
1326 MPI_Comm cart[2] = {0};
1328 int size = 1, prank = 0;
1331 int wrap[] = {0, 0};
1333 int rdim1[] = {0, 1}, rdim2[] = {1, 0};
1335 MPI_Comm_size(comm, &size);
1336 MPI_Comm_rank(comm, &prank);
1346 printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1351 P[0] = P0; P[1] = size/P0; /*number of processors in the two dimensions*/
1353 /*Difference between x-y-z regarding 2d decomposition is whether they are
1354 distributed along axis 1, 2 or both*/
1356 MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1357 MPI_Cart_get(gcart, 2, P, wrap, coor);
1358 MPI_Cart_sub(gcart, rdim1, &cart[0]);
1359 MPI_Cart_sub(gcart, rdim2, &cart[1]);
1364 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1369 /*prints in original coordinate system of data (as the input to FFT)*/
1370 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1372 int xs[3], xl[3], xc[3], NG[3];
1374 int *coor = plan->coor;
1375 int ll = 2; /*compare ll values per element (has to be 2 for complex)*/
1376 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1381 compute_offsets(plan, xs, xl, xc, NG, 2);
1382 if (plan->flags&FFT5D_DEBUG)
1384 printf("Compare2\n");
1386 for (z = 0; z < xl[2]; z++)
1388 for (y = 0; y < xl[1]; y++)
1390 if (plan->flags&FFT5D_DEBUG)
1392 printf("%d %d: ", coor[0], coor[1]);
1394 for (x = 0; x < xl[0]; x++)
1396 for (l = 0; l < ll; l++) /*loop over real/complex parts*/
1399 a = ((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1402 a /= plan->rC[0]*plan->rC[1]*plan->rC[2];
1406 b = ((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1410 b = ((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1412 if (plan->flags&FFT5D_DEBUG)
1414 printf("%f %f, ", a, b);
1418 if (fabs(a-b) > 2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS)
1420 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n", coor[0], coor[1], x, y, z, a, b);
1422 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1425 if (plan->flags&FFT5D_DEBUG)
1430 if (plan->flags&FFT5D_DEBUG)