2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2009,2010,2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
46 #define GMX_PARALLEL_ENV_INITIALIZED 1
49 #define GMX_PARALLEL_ENV_INITIALIZED 1
51 #define GMX_PARALLEL_ENV_INITIALIZED 0
55 #include "gromacs/utility/gmxmpi.h"
58 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
60 /* requires fftw compiled with openmp */
61 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
68 #include "gromacs/utility/smalloc.h"
70 #ifndef __FLT_EPSILON__
71 #define __FLT_EPSILON__ FLT_EPSILON
72 #define __DBL_EPSILON__ DBL_EPSILON
79 #include "gromacs/utility/fatalerror.h"
83 #include "thread_mpi/mutex.h"
84 #include "gromacs/utility/exceptions.h"
85 /* none of the fftw3 calls, except execute(), are thread-safe, so
86 we need to serialize them with this mutex. */
87 static tMPI::mutex big_fftw_mutex;
88 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
89 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
90 #endif /* GMX_FFT_FFTW3 */
92 /* largest factor smaller than sqrt */
93 static int lfactor(int z)
96 for (i = static_cast<int>(sqrt(static_cast<double>(z)));; i--)
107 static int l2factor(int z)
124 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
125 static int lpfactor(int z)
132 return std::max(lpfactor(f), lpfactor(z/f));
136 #ifdef HAVE_GETTIMEOFDAY
137 #include <sys/time.h>
141 gettimeofday(&tv, 0);
142 return tv.tv_sec+tv.tv_usec*1e-6;
152 static int vmax(int* a, int s)
155 for (i = 0; i < s; i++)
166 /* NxMxK the size of the data
167 * comm communicator to use for fft5d
168 * P0 number of processor in 1st axes (can be null for automatic)
169 * lin is allocated by fft5d because size of array is only known after planning phase
170 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
172 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
175 int P[2], bMaster, prank[2], i, t;
177 int *N0 = 0, *N1 = 0, *M0 = 0, *M1 = 0, *K0 = 0, *K1 = 0, *oN0 = 0, *oN1 = 0, *oM0 = 0, *oM1 = 0, *oK0 = 0, *oK1 = 0;
178 int N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {0}, *oNin[3] = {0}, *iNout[3] = {0}, *oNout[3] = {0};
179 int C[3], rC[3], nP[2];
181 t_complex *lin = 0, *lout = 0, *lout2 = 0, *lout3 = 0;
185 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
187 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
189 MPI_Comm_size(comm[0], &P[0]);
190 MPI_Comm_rank(comm[0], &prank[0]);
199 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
201 MPI_Comm_size(comm[1], &P[1]);
202 MPI_Comm_rank(comm[1], &prank[1]);
211 bMaster = (prank[0] == 0 && prank[1] == 0);
216 fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n",
217 P[0], P[1], prank[0], prank[1]);
224 fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
225 NG, MG, KG, P[0], P[1], (flags&FFT5D_REALCOMPLEX) > 0, (flags&FFT5D_BACKWARD) > 0, (flags&FFT5D_ORDER_YZ) > 0, (flags&FFT5D_DEBUG) > 0);
227 /* The check below is not correct, one prime factor 11 or 13 is ok.
228 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
229 printf("WARNING: FFT very slow with prime factors larger 7\n");
230 printf("Change FFT size or in case you cannot change it look at\n");
231 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
236 if (NG == 0 || MG == 0 || KG == 0)
240 printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
245 rNG = NG; rMG = MG; rKG = KG;
247 if (flags&FFT5D_REALCOMPLEX)
249 if (!(flags&FFT5D_BACKWARD))
255 if (!(flags&FFT5D_ORDER_YZ))
267 /*for transpose we need to know the size for each processor not only our own size*/
269 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
270 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
271 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
272 oN0 = (int*)malloc(P[0]*sizeof(int)); oN1 = (int*)malloc(P[1]*sizeof(int));
273 oM0 = (int*)malloc(P[0]*sizeof(int)); oM1 = (int*)malloc(P[1]*sizeof(int));
274 oK0 = (int*)malloc(P[0]*sizeof(int)); oK1 = (int*)malloc(P[1]*sizeof(int));
276 for (i = 0; i < P[0]; i++)
280 oN0[i] = i*ceil((double)NG/P[0]);
281 oM0[i] = i*ceil((double)MG/P[0]);
282 oK0[i] = i*ceil((double)KG/P[0]);
284 oN0[i] = (NG*i)/P[0];
285 oM0[i] = (MG*i)/P[0];
286 oK0[i] = (KG*i)/P[0];
289 for (i = 0; i < P[1]; i++)
292 oN1[i] = i*ceil((double)NG/P[1]);
293 oM1[i] = i*ceil((double)MG/P[1]);
294 oK1[i] = i*ceil((double)KG/P[1]);
296 oN1[i] = (NG*i)/P[1];
297 oM1[i] = (MG*i)/P[1];
298 oK1[i] = (KG*i)/P[1];
301 for (i = 0; i < P[0]-1; i++)
303 N0[i] = oN0[i+1]-oN0[i];
304 M0[i] = oM0[i+1]-oM0[i];
305 K0[i] = oK0[i+1]-oK0[i];
307 N0[P[0]-1] = NG-oN0[P[0]-1];
308 M0[P[0]-1] = MG-oM0[P[0]-1];
309 K0[P[0]-1] = KG-oK0[P[0]-1];
310 for (i = 0; i < P[1]-1; i++)
312 N1[i] = oN1[i+1]-oN1[i];
313 M1[i] = oM1[i+1]-oM1[i];
314 K1[i] = oK1[i+1]-oK1[i];
316 N1[P[1]-1] = NG-oN1[P[1]-1];
317 M1[P[1]-1] = MG-oM1[P[1]-1];
318 K1[P[1]-1] = KG-oK1[P[1]-1];
320 /* for step 1-3 the local N,M,K sizes of the transposed system
321 C: contiguous dimension, and nP: number of processor in subcommunicator
325 pM[0] = M0[prank[0]];
326 oM[0] = oM0[prank[0]];
327 pK[0] = K1[prank[1]];
328 oK[0] = oK1[prank[1]];
331 if (!(flags&FFT5D_ORDER_YZ))
333 N[0] = vmax(N1, P[1]);
335 K[0] = vmax(K1, P[1]);
336 pN[0] = N1[prank[1]];
342 N[1] = vmax(K0, P[0]);
343 pN[1] = K0[prank[0]];
348 M[1] = vmax(M0, P[0]);
349 pM[1] = M0[prank[0]];
350 oM[1] = oM0[prank[0]];
352 pK[1] = N1[prank[1]];
353 oK[1] = oN1[prank[1]];
359 M[2] = vmax(K0, P[0]);
360 pM[2] = K0[prank[0]];
361 oM[2] = oK0[prank[0]];
362 K[2] = vmax(N1, P[1]);
363 pK[2] = N1[prank[1]];
364 oK[2] = oN1[prank[1]];
365 free(N0); free(oN0); /*these are not used for this order*/
366 free(M1); free(oM1); /*the rest is freed in destroy*/
370 N[0] = vmax(N0, P[0]);
371 M[0] = vmax(M0, P[0]);
373 pN[0] = N0[prank[0]];
379 N[1] = vmax(M1, P[1]);
380 pN[1] = M1[prank[1]];
386 pM[1] = N0[prank[0]];
387 oM[1] = oN0[prank[0]];
388 K[1] = vmax(K1, P[1]);
389 pK[1] = K1[prank[1]];
390 oK[1] = oK1[prank[1]];
396 M[2] = vmax(N0, P[0]);
397 pM[2] = N0[prank[0]];
398 oM[2] = oN0[prank[0]];
399 K[2] = vmax(M1, P[1]);
400 pK[2] = M1[prank[1]];
401 oK[2] = oM1[prank[1]];
402 free(N1); free(oN1); /*these are not used for this order*/
403 free(K0); free(oK0); /*the rest is freed in destroy*/
405 N[2] = pN[2] = -1; /*not used*/
408 Difference between x-y-z regarding 2d decomposition is whether they are
409 distributed along axis 1, 2 or both
412 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
413 lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
414 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
415 if (!(flags&FFT5D_NOMALLOC))
417 snew_aligned(lin, lsize, 32);
418 snew_aligned(lout, lsize, 32);
421 /* We need extra transpose buffers to avoid OpenMP barriers */
422 snew_aligned(lout2, lsize, 32);
423 snew_aligned(lout3, lsize, 32);
427 /* We can reuse the buffers to avoid cache misses */
448 plan = (fft5d_plan)calloc(1, sizeof(struct fft5d_plan_t));
453 fprintf(debug, "Running on %d threads\n", nthreads);
456 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
457 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
458 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
459 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
460 * and that the 3d plan is faster than the 1d plan.
462 if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
464 int fftwflags = FFTW_DESTROY_INPUT;
466 int inNG = NG, outMG = MG, outKG = KG;
469 if (!(flags&FFT5D_NOMEASURE))
471 fftwflags |= FFTW_MEASURE;
473 if (flags&FFT5D_REALCOMPLEX)
475 if (!(flags&FFT5D_BACKWARD)) /*input pointer is not complex*/
479 else /*output pointer is not complex*/
481 if (!(flags&FFT5D_ORDER_YZ))
492 if (!(flags&FFT5D_BACKWARD))
498 dims[0].is = inNG*MG; /*N M K*/
501 if (!(flags&FFT5D_ORDER_YZ))
503 dims[0].os = MG; /*M K N*/
509 dims[0].os = 1; /*K N M*/
516 if (!(flags&FFT5D_ORDER_YZ))
526 dims[0].os = outMG*KG;
540 dims[0].os = outKG*NG;
546 #ifdef FFT5D_FFTW_THREADS
547 FFTW(plan_with_nthreads)(nthreads);
550 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
552 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
553 /*howmany*/ 0, /*howmany_dims*/ 0,
554 (real*)lin, (FFTW(complex) *) lout,
555 /*flags*/ fftwflags);
557 else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD))
559 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
560 /*howmany*/ 0, /*howmany_dims*/ 0,
561 (FFTW(complex) *) lin, (real*)lout,
562 /*flags*/ fftwflags);
566 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
567 /*howmany*/ 0, /*howmany_dims*/ 0,
568 (FFTW(complex) *) lin, (FFTW(complex) *) lout,
569 /*sign*/ (flags&FFT5D_BACKWARD) ? 1 : -1, /*flags*/ fftwflags);
572 #ifdef FFT5D_FFTW_THREADS
573 FFTW(plan_with_nthreads)(1);
578 if (!plan->p3d) /* for decomposition and if 3d plan did not work */
580 #endif /* GMX_FFT_FFTW3 */
581 for (s = 0; s < 3; s++)
585 fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
586 s, rC[s], M[s], pK[s], C[s], lsize);
588 plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
590 /* Make sure that the init routines are only called by one thread at a time and in order
591 (later is only important to not confuse valgrind)
593 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
594 for (t = 0; t < nthreads; t++)
598 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
600 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s == 0) || ((flags&FFT5D_BACKWARD) && s == 2)))
602 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
606 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
615 if ((flags&FFT5D_ORDER_YZ)) /*plan->cart is in the order of transposes */
617 plan->cart[0] = comm[0]; plan->cart[1] = comm[1];
621 plan->cart[1] = comm[0]; plan->cart[0] = comm[1];
623 #ifdef FFT5D_MPI_TRANSPOSE
625 for (s = 0; s < 2; s++)
627 if ((s == 0 && !(flags&FFT5D_ORDER_YZ)) || (s == 1 && (flags&FFT5D_ORDER_YZ)))
629 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
633 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
645 plan->NG = NG; plan->MG = MG; plan->KG = KG;
647 for (s = 0; s < 3; s++)
649 plan->N[s] = N[s]; plan->M[s] = M[s]; plan->K[s] = K[s]; plan->pN[s] = pN[s]; plan->pM[s] = pM[s]; plan->pK[s] = pK[s];
650 plan->oM[s] = oM[s]; plan->oK[s] = oK[s];
651 plan->C[s] = C[s]; plan->rC[s] = rC[s];
652 plan->iNin[s] = iNin[s]; plan->oNin[s] = oNin[s]; plan->iNout[s] = iNout[s]; plan->oNout[s] = oNout[s];
654 for (s = 0; s < 2; s++)
656 plan->P[s] = nP[s]; plan->coor[s] = prank[s];
659 /* plan->fftorder=fftorder;
660 plan->direction=direction;
661 plan->realcomplex=realcomplex;
664 plan->nthreads = nthreads;
684 /*here x,y,z and N,M,K is in rotated coordinate system!!
685 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
686 maxN,maxM,maxK is max size of local data
687 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
688 NG, MG, KG is size of global data*/
689 static void splitaxes(t_complex* lout, const t_complex* lin,
690 int maxN, int maxM, int maxK, int pM,
691 int P, int NG, int *N, int* oN, int starty, int startz, int endy, int endz)
694 int in_i, out_i, in_z, out_z, in_y, out_y;
697 for (z = startz; z < endz+1; z++) /*3. z l*/
718 for (i = 0; i < P; i++) /*index cube along long axis*/
720 out_i = out_z + i*maxN*maxM*maxK;
722 for (y = s_y; y < e_y; y++) /*2. y k*/
724 out_y = out_i + y*maxN;
726 for (x = 0; x < N[i]; x++) /*1. x j*/
728 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
729 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
730 /*before split data contiguos - thus if different processor get different amount oN is different*/
737 /*make axis contiguous again (after AllToAll) and also do local transpose*/
738 /*transpose mayor and major dimension
740 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
741 N,M,K local dimensions
743 static void joinAxesTrans13(t_complex* lout, const t_complex* lin,
744 int maxN, int maxM, int maxK, int pM,
745 int P, int KG, int* K, int* oK, int starty, int startx, int endy, int endx)
748 int out_i, in_i, out_x, in_x, out_z, in_z;
751 for (x = startx; x < endx+1; x++) /*1.j*/
773 for (i = 0; i < P; i++) /*index cube along long axis*/
775 out_i = out_x + oK[i];
776 in_i = in_x + i*maxM*maxN*maxK;
777 for (z = 0; z < K[i]; z++) /*3.l*/
780 in_z = in_i + z*maxM*maxN;
781 for (y = s_y; y < e_y; y++) /*2.k*/
783 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
790 /*make axis contiguous again (after AllToAll) and also do local transpose
791 tranpose mayor and middle dimension
793 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
796 static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN,
797 int P, int MG, int* M, int* oM, int startx, int startz, int endx, int endz)
800 int out_i, in_i, out_z, in_z, out_x, in_x;
803 for (z = startz; z < endz+1; z++)
824 for (i = 0; i < P; i++) /*index cube along long axis*/
826 out_i = out_z + oM[i];
827 in_i = in_z + i*maxM*maxN*maxK;
828 for (x = s_x; x < e_x; x++)
830 out_x = out_i + x*MG;
832 for (y = 0; y < M[i]; y++)
834 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
842 static void rotate_offsets(int x[])
853 /*compute the offset to compare or print transposed local data in original input coordinates
854 xs matrix dimension size, xl dimension length, xc decomposition offset
855 s: step in computation = number of transposes*/
856 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
858 /* int direction = plan->direction;
859 int fftorder = plan->fftorder;*/
863 int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK,
864 *C = plan->C, *rC = plan->rC;
866 NG[0] = plan->NG; NG[1] = plan->MG; NG[2] = plan->KG;
868 if (!(plan->flags&FFT5D_ORDER_YZ))
872 case 0: o = XYZ; break;
873 case 1: o = ZYX; break;
874 case 2: o = YZX; break;
882 case 0: o = XYZ; break;
883 case 1: o = YXZ; break;
884 case 2: o = ZXY; break;
891 case XYZ: pos[0] = 1; pos[1] = 2; pos[2] = 3; break;
892 case XZY: pos[0] = 1; pos[1] = 3; pos[2] = 2; break;
893 case YXZ: pos[0] = 2; pos[1] = 1; pos[2] = 3; break;
894 case YZX: pos[0] = 3; pos[1] = 1; pos[2] = 2; break;
895 case ZXY: pos[0] = 2; pos[1] = 3; pos[2] = 1; break;
896 case ZYX: pos[0] = 3; pos[1] = 2; pos[2] = 1; break;
898 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
900 /*xs, xl give dimension size and data length in local transposed coordinate system
901 for 0(/1/2): x(/y/z) in original coordinate system*/
902 for (i = 0; i < 3; i++)
906 case 1: xs[i] = 1; xc[i] = 0; xl[i] = C[s]; break;
907 case 2: xs[i] = C[s]; xc[i] = oM[s]; xl[i] = pM[s]; break;
908 case 3: xs[i] = C[s]*pM[s]; xc[i] = oK[s]; xl[i] = pK[s]; break;
911 /*input order is different for test program to match FFTW order
912 (important for complex to real)*/
913 if (plan->flags&FFT5D_BACKWARD)
919 if (plan->flags&FFT5D_ORDER_YZ)
927 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s == 0) || ((plan->flags&FFT5D_BACKWARD) && s == 2)))
933 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
936 int *coor = plan->coor;
937 int xs[3], xl[3], xc[3], NG[3];
938 int ll = (plan->flags&FFT5D_REALCOMPLEX) ? 1 : 2;
939 compute_offsets(plan, xs, xl, xc, NG, s);
940 fprintf(debug, txt, coor[0], coor[1], s);
941 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
942 for (z = 0; z < xl[2]; z++)
944 for (y = 0; y < xl[1]; y++)
946 fprintf(debug, "%d %d: ", coor[0], coor[1]);
947 for (x = 0; x < xl[0]; x++)
949 for (l = 0; l < ll; l++)
951 fprintf(debug, "%f ", ((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
955 fprintf(debug, "\n");
960 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
962 t_complex *lin = plan->lin;
963 t_complex *lout = plan->lout;
964 t_complex *lout2 = plan->lout2;
965 t_complex *lout3 = plan->lout3;
966 t_complex *fftout, *joinin;
968 gmx_fft_t **p1d = plan->p1d;
969 #ifdef FFT5D_MPI_TRANSPOSE
970 FFTW(plan) *mpip = plan->mpip;
973 MPI_Comm *cart = plan->cart;
976 double time_fft = 0, time_local = 0, time_mpi[2] = {0}, time = 0;
978 int *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
979 *C = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout, **oNout = plan->oNout;
980 int s = 0, tstart, tend, bParallelDim;
994 FFTW(execute)(plan->p3d);
998 times->fft += MPI_Wtime()-time;
1009 if (plan->flags&FFT5D_DEBUG && thread == 0)
1011 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1014 for (s = 0; s < 2; s++) /*loop over first two FFT steps (corner rotations)*/
1018 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1028 /* ---------- START FFT ------------ */
1030 if (times != 0 && thread == 0)
1036 if (bParallelDim || plan->nthreads == 1)
1052 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1053 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s == 0)
1055 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, fftout+tstart);
1059 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, fftout+tstart);
1064 if (times != NULL && thread == 0)
1066 time_fft += MPI_Wtime()-time;
1069 if (plan->flags&FFT5D_DEBUG && thread == 0)
1071 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1073 /* ---------- END FFT ------------ */
1075 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1079 if (times != NULL && thread == 0)
1086 1. (most outer) axes (x) is split into P[s] parts of size N[s]
1090 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
1092 splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1094 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1096 if (times != NULL && thread == 0)
1098 time_local += MPI_Wtime()-time;
1102 /* ---------- END SPLIT , START TRANSPOSE------------ */
1112 wallcycle_start(times, ewcPME_FFTCOMM);
1114 #ifdef FFT5D_MPI_TRANSPOSE
1115 FFTW(execute)(mpip[s]);
1118 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1120 MPI_Alltoall((real *)lout2, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1124 MPI_Alltoall((real *)lout2, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1127 gmx_incons("fft5d MPI call without MPI configuration");
1129 #endif /*FFT5D_MPI_TRANSPOSE*/
1133 time_mpi[s] = MPI_Wtime()-time;
1136 wallcycle_stop(times, ewcPME_FFTCOMM);
1140 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1142 /* ---------- END SPLIT + TRANSPOSE------------ */
1144 /* ---------- START JOIN ------------ */
1146 if (times != NULL && thread == 0)
1160 /*bring back in matrix form
1161 thus make new 1. axes contiguos
1162 also local transpose 1 and 2/3
1163 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1165 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1169 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1170 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1171 joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1178 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1179 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1180 joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
1185 if (times != NULL && thread == 0)
1187 time_local += MPI_Wtime()-time;
1190 if (plan->flags&FFT5D_DEBUG && thread == 0)
1192 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1194 /* ---------- END JOIN ------------ */
1196 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1197 } /* for(s=0;s<2;s++) */
1199 if (times != NULL && thread == 0)
1205 if (plan->flags&FFT5D_INPLACE)
1207 lout = lin; /*in place currently not supported*/
1210 /* ----------- FFT ----------- */
1211 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1212 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1214 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, lout+tstart);
1218 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, lout+tstart);
1220 /* ------------ END FFT ---------*/
1223 if (times != NULL && thread == 0)
1225 time_fft += MPI_Wtime()-time;
1227 times->fft += time_fft;
1228 times->local += time_local;
1229 times->mpi2 += time_mpi[1];
1230 times->mpi1 += time_mpi[0];
1234 if (plan->flags&FFT5D_DEBUG && thread == 0)
1236 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1240 void fft5d_destroy(fft5d_plan plan)
1244 for (s = 0; s < 3; s++)
1248 for (t = 0; t < plan->nthreads; t++)
1250 gmx_many_fft_destroy(plan->p1d[s][t]);
1256 free(plan->iNin[s]);
1261 free(plan->oNin[s]);
1266 free(plan->iNout[s]);
1271 free(plan->oNout[s]);
1275 #ifdef GMX_FFT_FFTW3
1277 #ifdef FFT5D_MPI_TRANSPOS
1278 for (s = 0; s < 2; s++)
1280 FFTW(destroy_plan)(plan->mpip[s]);
1282 #endif /* FFT5D_MPI_TRANSPOS */
1285 FFTW(destroy_plan)(plan->p3d);
1288 #endif /* GMX_FFT_FFTW3 */
1290 if (!(plan->flags&FFT5D_NOMALLOC))
1292 sfree_aligned(plan->lin);
1293 sfree_aligned(plan->lout);
1294 if (plan->nthreads > 1)
1296 sfree_aligned(plan->lout2);
1297 sfree_aligned(plan->lout3);
1301 #ifdef FFT5D_THREADS
1302 #ifdef FFT5D_FFTW_THREADS
1303 /*FFTW(cleanup_threads)();*/
1310 /*Is this better than direct access of plan? enough data?
1311 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1312 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1323 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1324 of processor dimensions*/
1325 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
1327 MPI_Comm cart[2] = {0};
1329 int size = 1, prank = 0;
1332 int wrap[] = {0, 0};
1334 int rdim1[] = {0, 1}, rdim2[] = {1, 0};
1336 MPI_Comm_size(comm, &size);
1337 MPI_Comm_rank(comm, &prank);
1347 printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1352 P[0] = P0; P[1] = size/P0; /*number of processors in the two dimensions*/
1354 /*Difference between x-y-z regarding 2d decomposition is whether they are
1355 distributed along axis 1, 2 or both*/
1357 MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1358 MPI_Cart_get(gcart, 2, P, wrap, coor);
1359 MPI_Cart_sub(gcart, rdim1, &cart[0]);
1360 MPI_Cart_sub(gcart, rdim2, &cart[1]);
1365 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1370 /*prints in original coordinate system of data (as the input to FFT)*/
1371 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1373 int xs[3], xl[3], xc[3], NG[3];
1375 int *coor = plan->coor;
1376 int ll = 2; /*compare ll values per element (has to be 2 for complex)*/
1377 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1382 compute_offsets(plan, xs, xl, xc, NG, 2);
1383 if (plan->flags&FFT5D_DEBUG)
1385 printf("Compare2\n");
1387 for (z = 0; z < xl[2]; z++)
1389 for (y = 0; y < xl[1]; y++)
1391 if (plan->flags&FFT5D_DEBUG)
1393 printf("%d %d: ", coor[0], coor[1]);
1395 for (x = 0; x < xl[0]; x++)
1397 for (l = 0; l < ll; l++) /*loop over real/complex parts*/
1400 a = ((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1403 a /= plan->rC[0]*plan->rC[1]*plan->rC[2];
1407 b = ((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1411 b = ((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1413 if (plan->flags&FFT5D_DEBUG)
1415 printf("%f %f, ", a, b);
1419 if (fabs(a-b) > 2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS)
1421 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n", coor[0], coor[1], x, y, z, a, b);
1423 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1426 if (plan->flags&FFT5D_DEBUG)
1431 if (plan->flags&FFT5D_DEBUG)