2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2009,2010,2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
50 #include "gromacs/utility/fatalerror.h"
51 #include "gromacs/utility/gmxmpi.h"
52 #include "gromacs/utility/smalloc.h"
55 #define GMX_PARALLEL_ENV_INITIALIZED 1
58 #define GMX_PARALLEL_ENV_INITIALIZED 1
60 #define GMX_PARALLEL_ENV_INITIALIZED 0
65 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
67 /* requires fftw compiled with openmp */
68 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
71 #ifndef __FLT_EPSILON__
72 #define __FLT_EPSILON__ FLT_EPSILON
73 #define __DBL_EPSILON__ DBL_EPSILON
81 #include "gromacs/utility/exceptions.h"
82 #include "gromacs/utility/mutex.h"
83 /* none of the fftw3 calls, except execute(), are thread-safe, so
84 we need to serialize them with this mutex. */
85 static gmx::Mutex big_fftw_mutex;
86 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
87 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
88 #endif /* GMX_FFT_FFTW3 */
91 /* largest factor smaller than sqrt */
92 static int lfactor(int z)
94 int i = static_cast<int>(sqrt(static_cast<double>(z)));
104 static int l2factor(int z)
122 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
123 static int lpfactor(int z)
130 return std::max(lpfactor(f), lpfactor(z/f));
134 #ifdef HAVE_GETTIMEOFDAY
135 #include <sys/time.h>
139 gettimeofday(&tv, 0);
140 return tv.tv_sec+tv.tv_usec*1e-6;
150 static int vmax(int* a, int s)
153 for (i = 0; i < s; i++)
164 /* NxMxK the size of the data
165 * comm communicator to use for fft5d
166 * P0 number of processor in 1st axes (can be null for automatic)
167 * lin is allocated by fft5d because size of array is only known after planning phase
168 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
170 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
173 int P[2], bMaster, prank[2], i, t;
175 int *N0 = 0, *N1 = 0, *M0 = 0, *M1 = 0, *K0 = 0, *K1 = 0, *oN0 = 0, *oN1 = 0, *oM0 = 0, *oM1 = 0, *oK0 = 0, *oK1 = 0;
176 int N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {0}, *oNin[3] = {0}, *iNout[3] = {0}, *oNout[3] = {0};
177 int C[3], rC[3], nP[2];
179 t_complex *lin = 0, *lout = 0, *lout2 = 0, *lout3 = 0;
183 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
185 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
187 MPI_Comm_size(comm[0], &P[0]);
188 MPI_Comm_rank(comm[0], &prank[0]);
197 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
199 MPI_Comm_size(comm[1], &P[1]);
200 MPI_Comm_rank(comm[1], &prank[1]);
209 bMaster = (prank[0] == 0 && prank[1] == 0);
214 fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n",
215 P[0], P[1], prank[0], prank[1]);
222 fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
223 NG, MG, KG, P[0], P[1], (flags&FFT5D_REALCOMPLEX) > 0, (flags&FFT5D_BACKWARD) > 0, (flags&FFT5D_ORDER_YZ) > 0, (flags&FFT5D_DEBUG) > 0);
225 /* The check below is not correct, one prime factor 11 or 13 is ok.
226 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
227 printf("WARNING: FFT very slow with prime factors larger 7\n");
228 printf("Change FFT size or in case you cannot change it look at\n");
229 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
234 if (NG == 0 || MG == 0 || KG == 0)
238 printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
243 rNG = NG; rMG = MG; rKG = KG;
245 if (flags&FFT5D_REALCOMPLEX)
247 if (!(flags&FFT5D_BACKWARD))
253 if (!(flags&FFT5D_ORDER_YZ))
265 /*for transpose we need to know the size for each processor not only our own size*/
267 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
268 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
269 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
270 oN0 = (int*)malloc(P[0]*sizeof(int)); oN1 = (int*)malloc(P[1]*sizeof(int));
271 oM0 = (int*)malloc(P[0]*sizeof(int)); oM1 = (int*)malloc(P[1]*sizeof(int));
272 oK0 = (int*)malloc(P[0]*sizeof(int)); oK1 = (int*)malloc(P[1]*sizeof(int));
274 for (i = 0; i < P[0]; i++)
278 oN0[i] = i*ceil((double)NG/P[0]);
279 oM0[i] = i*ceil((double)MG/P[0]);
280 oK0[i] = i*ceil((double)KG/P[0]);
282 oN0[i] = (NG*i)/P[0];
283 oM0[i] = (MG*i)/P[0];
284 oK0[i] = (KG*i)/P[0];
287 for (i = 0; i < P[1]; i++)
290 oN1[i] = i*ceil((double)NG/P[1]);
291 oM1[i] = i*ceil((double)MG/P[1]);
292 oK1[i] = i*ceil((double)KG/P[1]);
294 oN1[i] = (NG*i)/P[1];
295 oM1[i] = (MG*i)/P[1];
296 oK1[i] = (KG*i)/P[1];
299 for (i = 0; i < P[0]-1; i++)
301 N0[i] = oN0[i+1]-oN0[i];
302 M0[i] = oM0[i+1]-oM0[i];
303 K0[i] = oK0[i+1]-oK0[i];
305 N0[P[0]-1] = NG-oN0[P[0]-1];
306 M0[P[0]-1] = MG-oM0[P[0]-1];
307 K0[P[0]-1] = KG-oK0[P[0]-1];
308 for (i = 0; i < P[1]-1; i++)
310 N1[i] = oN1[i+1]-oN1[i];
311 M1[i] = oM1[i+1]-oM1[i];
312 K1[i] = oK1[i+1]-oK1[i];
314 N1[P[1]-1] = NG-oN1[P[1]-1];
315 M1[P[1]-1] = MG-oM1[P[1]-1];
316 K1[P[1]-1] = KG-oK1[P[1]-1];
318 /* for step 1-3 the local N,M,K sizes of the transposed system
319 C: contiguous dimension, and nP: number of processor in subcommunicator
323 pM[0] = M0[prank[0]];
324 oM[0] = oM0[prank[0]];
325 pK[0] = K1[prank[1]];
326 oK[0] = oK1[prank[1]];
329 if (!(flags&FFT5D_ORDER_YZ))
331 N[0] = vmax(N1, P[1]);
333 K[0] = vmax(K1, P[1]);
334 pN[0] = N1[prank[1]];
340 N[1] = vmax(K0, P[0]);
341 pN[1] = K0[prank[0]];
346 M[1] = vmax(M0, P[0]);
347 pM[1] = M0[prank[0]];
348 oM[1] = oM0[prank[0]];
350 pK[1] = N1[prank[1]];
351 oK[1] = oN1[prank[1]];
357 M[2] = vmax(K0, P[0]);
358 pM[2] = K0[prank[0]];
359 oM[2] = oK0[prank[0]];
360 K[2] = vmax(N1, P[1]);
361 pK[2] = N1[prank[1]];
362 oK[2] = oN1[prank[1]];
363 free(N0); free(oN0); /*these are not used for this order*/
364 free(M1); free(oM1); /*the rest is freed in destroy*/
368 N[0] = vmax(N0, P[0]);
369 M[0] = vmax(M0, P[0]);
371 pN[0] = N0[prank[0]];
377 N[1] = vmax(M1, P[1]);
378 pN[1] = M1[prank[1]];
384 pM[1] = N0[prank[0]];
385 oM[1] = oN0[prank[0]];
386 K[1] = vmax(K1, P[1]);
387 pK[1] = K1[prank[1]];
388 oK[1] = oK1[prank[1]];
394 M[2] = vmax(N0, P[0]);
395 pM[2] = N0[prank[0]];
396 oM[2] = oN0[prank[0]];
397 K[2] = vmax(M1, P[1]);
398 pK[2] = M1[prank[1]];
399 oK[2] = oM1[prank[1]];
400 free(N1); free(oN1); /*these are not used for this order*/
401 free(K0); free(oK0); /*the rest is freed in destroy*/
403 N[2] = pN[2] = -1; /*not used*/
406 Difference between x-y-z regarding 2d decomposition is whether they are
407 distributed along axis 1, 2 or both
410 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
411 lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
412 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
413 if (!(flags&FFT5D_NOMALLOC))
415 snew_aligned(lin, lsize, 32);
416 snew_aligned(lout, lsize, 32);
419 /* We need extra transpose buffers to avoid OpenMP barriers */
420 snew_aligned(lout2, lsize, 32);
421 snew_aligned(lout3, lsize, 32);
425 /* We can reuse the buffers to avoid cache misses */
446 plan = (fft5d_plan)calloc(1, sizeof(struct fft5d_plan_t));
451 fprintf(debug, "Running on %d threads\n", nthreads);
455 /* Don't add more stuff here! We have already had at least one bug because we are reimplementing
456 * the low-level FFT interface instead of using the Gromacs FFT module. If we need more
457 * generic functionality it is far better to extend the interface so we can use it for
458 * all FFT libraries instead of writing FFTW-specific code here.
461 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
462 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
463 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
464 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
465 * and that the 3d plan is faster than the 1d plan.
467 if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
469 int fftwflags = FFTW_DESTROY_INPUT;
471 int inNG = NG, outMG = MG, outKG = KG;
475 fftwflags |= (flags & FFT5D_NOMEASURE) ? FFTW_ESTIMATE : FFTW_MEASURE;
477 if (flags&FFT5D_REALCOMPLEX)
479 if (!(flags&FFT5D_BACKWARD)) /*input pointer is not complex*/
483 else /*output pointer is not complex*/
485 if (!(flags&FFT5D_ORDER_YZ))
496 if (!(flags&FFT5D_BACKWARD))
502 dims[0].is = inNG*MG; /*N M K*/
505 if (!(flags&FFT5D_ORDER_YZ))
507 dims[0].os = MG; /*M K N*/
513 dims[0].os = 1; /*K N M*/
520 if (!(flags&FFT5D_ORDER_YZ))
530 dims[0].os = outMG*KG;
544 dims[0].os = outKG*NG;
550 #ifdef FFT5D_FFTW_THREADS
551 FFTW(plan_with_nthreads)(nthreads);
554 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
556 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
557 /*howmany*/ 0, /*howmany_dims*/ 0,
558 (real*)lin, (FFTW(complex) *) lout,
559 /*flags*/ fftwflags);
561 else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD))
563 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
564 /*howmany*/ 0, /*howmany_dims*/ 0,
565 (FFTW(complex) *) lin, (real*)lout,
566 /*flags*/ fftwflags);
570 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
571 /*howmany*/ 0, /*howmany_dims*/ 0,
572 (FFTW(complex) *) lin, (FFTW(complex) *) lout,
573 /*sign*/ (flags&FFT5D_BACKWARD) ? 1 : -1, /*flags*/ fftwflags);
576 #ifdef FFT5D_FFTW_THREADS
577 FFTW(plan_with_nthreads)(1);
582 if (!plan->p3d) /* for decomposition and if 3d plan did not work */
584 #endif /* GMX_FFT_FFTW3 */
585 for (s = 0; s < 3; s++)
589 fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
590 s, rC[s], M[s], pK[s], C[s], lsize);
592 plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
594 /* Make sure that the init routines are only called by one thread at a time and in order
595 (later is only important to not confuse valgrind)
597 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
598 for (t = 0; t < nthreads; t++)
602 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
604 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s == 0) || ((flags&FFT5D_BACKWARD) && s == 2)))
606 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
610 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
619 if ((flags&FFT5D_ORDER_YZ)) /*plan->cart is in the order of transposes */
621 plan->cart[0] = comm[0]; plan->cart[1] = comm[1];
625 plan->cart[1] = comm[0]; plan->cart[0] = comm[1];
627 #ifdef FFT5D_MPI_TRANSPOSE
629 for (s = 0; s < 2; s++)
631 if ((s == 0 && !(flags&FFT5D_ORDER_YZ)) || (s == 1 && (flags&FFT5D_ORDER_YZ)))
633 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
637 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
649 plan->NG = NG; plan->MG = MG; plan->KG = KG;
651 for (s = 0; s < 3; s++)
653 plan->N[s] = N[s]; plan->M[s] = M[s]; plan->K[s] = K[s]; plan->pN[s] = pN[s]; plan->pM[s] = pM[s]; plan->pK[s] = pK[s];
654 plan->oM[s] = oM[s]; plan->oK[s] = oK[s];
655 plan->C[s] = C[s]; plan->rC[s] = rC[s];
656 plan->iNin[s] = iNin[s]; plan->oNin[s] = oNin[s]; plan->iNout[s] = iNout[s]; plan->oNout[s] = oNout[s];
658 for (s = 0; s < 2; s++)
660 plan->P[s] = nP[s]; plan->coor[s] = prank[s];
663 /* plan->fftorder=fftorder;
664 plan->direction=direction;
665 plan->realcomplex=realcomplex;
668 plan->nthreads = nthreads;
688 /*here x,y,z and N,M,K is in rotated coordinate system!!
689 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
690 maxN,maxM,maxK is max size of local data
691 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
692 NG, MG, KG is size of global data*/
693 static void splitaxes(t_complex* lout, const t_complex* lin,
694 int maxN, int maxM, int maxK, int pM,
695 int P, int NG, int *N, int* oN, int starty, int startz, int endy, int endz)
698 int in_i, out_i, in_z, out_z, in_y, out_y;
701 for (z = startz; z < endz+1; z++) /*3. z l*/
722 for (i = 0; i < P; i++) /*index cube along long axis*/
724 out_i = out_z + i*maxN*maxM*maxK;
726 for (y = s_y; y < e_y; y++) /*2. y k*/
728 out_y = out_i + y*maxN;
730 for (x = 0; x < N[i]; x++) /*1. x j*/
732 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
733 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
734 /*before split data contiguos - thus if different processor get different amount oN is different*/
741 /*make axis contiguous again (after AllToAll) and also do local transpose*/
742 /*transpose mayor and major dimension
744 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
745 N,M,K local dimensions
747 static void joinAxesTrans13(t_complex* lout, const t_complex* lin,
748 int maxN, int maxM, int maxK, int pM,
749 int P, int KG, int* K, int* oK, int starty, int startx, int endy, int endx)
752 int out_i, in_i, out_x, in_x, out_z, in_z;
755 for (x = startx; x < endx+1; x++) /*1.j*/
777 for (i = 0; i < P; i++) /*index cube along long axis*/
779 out_i = out_x + oK[i];
780 in_i = in_x + i*maxM*maxN*maxK;
781 for (z = 0; z < K[i]; z++) /*3.l*/
784 in_z = in_i + z*maxM*maxN;
785 for (y = s_y; y < e_y; y++) /*2.k*/
787 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
794 /*make axis contiguous again (after AllToAll) and also do local transpose
795 tranpose mayor and middle dimension
797 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
800 static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN,
801 int P, int MG, int* M, int* oM, int startx, int startz, int endx, int endz)
804 int out_i, in_i, out_z, in_z, out_x, in_x;
807 for (z = startz; z < endz+1; z++)
828 for (i = 0; i < P; i++) /*index cube along long axis*/
830 out_i = out_z + oM[i];
831 in_i = in_z + i*maxM*maxN*maxK;
832 for (x = s_x; x < e_x; x++)
834 out_x = out_i + x*MG;
836 for (y = 0; y < M[i]; y++)
838 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
846 static void rotate_offsets(int x[])
857 /*compute the offset to compare or print transposed local data in original input coordinates
858 xs matrix dimension size, xl dimension length, xc decomposition offset
859 s: step in computation = number of transposes*/
860 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
862 /* int direction = plan->direction;
863 int fftorder = plan->fftorder;*/
867 int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK,
868 *C = plan->C, *rC = plan->rC;
870 NG[0] = plan->NG; NG[1] = plan->MG; NG[2] = plan->KG;
872 if (!(plan->flags&FFT5D_ORDER_YZ))
876 case 0: o = XYZ; break;
877 case 1: o = ZYX; break;
878 case 2: o = YZX; break;
886 case 0: o = XYZ; break;
887 case 1: o = YXZ; break;
888 case 2: o = ZXY; break;
895 case XYZ: pos[0] = 1; pos[1] = 2; pos[2] = 3; break;
896 case XZY: pos[0] = 1; pos[1] = 3; pos[2] = 2; break;
897 case YXZ: pos[0] = 2; pos[1] = 1; pos[2] = 3; break;
898 case YZX: pos[0] = 3; pos[1] = 1; pos[2] = 2; break;
899 case ZXY: pos[0] = 2; pos[1] = 3; pos[2] = 1; break;
900 case ZYX: pos[0] = 3; pos[1] = 2; pos[2] = 1; break;
902 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
904 /*xs, xl give dimension size and data length in local transposed coordinate system
905 for 0(/1/2): x(/y/z) in original coordinate system*/
906 for (i = 0; i < 3; i++)
910 case 1: xs[i] = 1; xc[i] = 0; xl[i] = C[s]; break;
911 case 2: xs[i] = C[s]; xc[i] = oM[s]; xl[i] = pM[s]; break;
912 case 3: xs[i] = C[s]*pM[s]; xc[i] = oK[s]; xl[i] = pK[s]; break;
915 /*input order is different for test program to match FFTW order
916 (important for complex to real)*/
917 if (plan->flags&FFT5D_BACKWARD)
923 if (plan->flags&FFT5D_ORDER_YZ)
931 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s == 0) || ((plan->flags&FFT5D_BACKWARD) && s == 2)))
937 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
940 int *coor = plan->coor;
941 int xs[3], xl[3], xc[3], NG[3];
942 int ll = (plan->flags&FFT5D_REALCOMPLEX) ? 1 : 2;
943 compute_offsets(plan, xs, xl, xc, NG, s);
944 fprintf(debug, txt, coor[0], coor[1], s);
945 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
946 for (z = 0; z < xl[2]; z++)
948 for (y = 0; y < xl[1]; y++)
950 fprintf(debug, "%d %d: ", coor[0], coor[1]);
951 for (x = 0; x < xl[0]; x++)
953 for (l = 0; l < ll; l++)
955 fprintf(debug, "%f ", ((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
959 fprintf(debug, "\n");
964 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
966 t_complex *lin = plan->lin;
967 t_complex *lout = plan->lout;
968 t_complex *lout2 = plan->lout2;
969 t_complex *lout3 = plan->lout3;
970 t_complex *fftout, *joinin;
972 gmx_fft_t **p1d = plan->p1d;
973 #ifdef FFT5D_MPI_TRANSPOSE
974 FFTW(plan) *mpip = plan->mpip;
977 MPI_Comm *cart = plan->cart;
980 double time_fft = 0, time_local = 0, time_mpi[2] = {0}, time = 0;
982 int *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
983 *C = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout, **oNout = plan->oNout;
984 int s = 0, tstart, tend, bParallelDim;
998 FFTW(execute)(plan->p3d);
1002 times->fft += MPI_Wtime()-time;
1013 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1015 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1018 for (s = 0; s < 2; s++) /*loop over first two FFT steps (corner rotations)*/
1022 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1032 /* ---------- START FFT ------------ */
1034 if (times != 0 && thread == 0)
1040 if (bParallelDim || plan->nthreads == 1)
1056 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1057 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s == 0)
1059 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, fftout+tstart);
1063 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, fftout+tstart);
1068 if (times != NULL && thread == 0)
1070 time_fft += MPI_Wtime()-time;
1073 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1075 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1077 /* ---------- END FFT ------------ */
1079 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1083 if (times != NULL && thread == 0)
1090 1. (most outer) axes (x) is split into P[s] parts of size N[s]
1094 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
1096 splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1098 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1100 if (times != NULL && thread == 0)
1102 time_local += MPI_Wtime()-time;
1106 /* ---------- END SPLIT , START TRANSPOSE------------ */
1116 wallcycle_start(times, ewcPME_FFTCOMM);
1118 #ifdef FFT5D_MPI_TRANSPOSE
1119 FFTW(execute)(mpip[s]);
1122 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1124 MPI_Alltoall((real *)lout2, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1128 MPI_Alltoall((real *)lout2, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1131 gmx_incons("fft5d MPI call without MPI configuration");
1133 #endif /*FFT5D_MPI_TRANSPOSE*/
1137 time_mpi[s] = MPI_Wtime()-time;
1140 wallcycle_stop(times, ewcPME_FFTCOMM);
1144 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1146 /* ---------- END SPLIT + TRANSPOSE------------ */
1148 /* ---------- START JOIN ------------ */
1150 if (times != NULL && thread == 0)
1164 /*bring back in matrix form
1165 thus make new 1. axes contiguos
1166 also local transpose 1 and 2/3
1167 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1169 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1173 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1174 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1175 joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1182 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1183 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1184 joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
1189 if (times != NULL && thread == 0)
1191 time_local += MPI_Wtime()-time;
1194 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1196 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1198 /* ---------- END JOIN ------------ */
1200 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1201 } /* for(s=0;s<2;s++) */
1203 if (times != NULL && thread == 0)
1209 if (plan->flags&FFT5D_INPLACE)
1211 lout = lin; /*in place currently not supported*/
1214 /* ----------- FFT ----------- */
1215 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1216 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1218 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, lout+tstart);
1222 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, lout+tstart);
1224 /* ------------ END FFT ---------*/
1227 if (times != NULL && thread == 0)
1229 time_fft += MPI_Wtime()-time;
1231 times->fft += time_fft;
1232 times->local += time_local;
1233 times->mpi2 += time_mpi[1];
1234 times->mpi1 += time_mpi[0];
1238 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1240 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1244 void fft5d_destroy(fft5d_plan plan)
1248 for (s = 0; s < 3; s++)
1252 for (t = 0; t < plan->nthreads; t++)
1254 gmx_many_fft_destroy(plan->p1d[s][t]);
1260 free(plan->iNin[s]);
1265 free(plan->oNin[s]);
1270 free(plan->iNout[s]);
1275 free(plan->oNout[s]);
1279 #ifdef GMX_FFT_FFTW3
1281 #ifdef FFT5D_MPI_TRANSPOS
1282 for (s = 0; s < 2; s++)
1284 FFTW(destroy_plan)(plan->mpip[s]);
1286 #endif /* FFT5D_MPI_TRANSPOS */
1289 FFTW(destroy_plan)(plan->p3d);
1292 #endif /* GMX_FFT_FFTW3 */
1294 if (!(plan->flags&FFT5D_NOMALLOC))
1296 sfree_aligned(plan->lin);
1297 sfree_aligned(plan->lout);
1298 if (plan->nthreads > 1)
1300 sfree_aligned(plan->lout2);
1301 sfree_aligned(plan->lout3);
1305 #ifdef FFT5D_THREADS
1306 #ifdef FFT5D_FFTW_THREADS
1307 /*FFTW(cleanup_threads)();*/
1314 /*Is this better than direct access of plan? enough data?
1315 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1316 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1327 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1328 of processor dimensions*/
1329 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
1331 MPI_Comm cart[2] = {0};
1333 int size = 1, prank = 0;
1336 int wrap[] = {0, 0};
1338 int rdim1[] = {0, 1}, rdim2[] = {1, 0};
1340 MPI_Comm_size(comm, &size);
1341 MPI_Comm_rank(comm, &prank);
1351 printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1356 P[0] = P0; P[1] = size/P0; /*number of processors in the two dimensions*/
1358 /*Difference between x-y-z regarding 2d decomposition is whether they are
1359 distributed along axis 1, 2 or both*/
1361 MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1362 MPI_Cart_get(gcart, 2, P, wrap, coor);
1363 MPI_Cart_sub(gcart, rdim1, &cart[0]);
1364 MPI_Cart_sub(gcart, rdim2, &cart[1]);
1369 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1374 /*prints in original coordinate system of data (as the input to FFT)*/
1375 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1377 int xs[3], xl[3], xc[3], NG[3];
1379 int *coor = plan->coor;
1380 int ll = 2; /*compare ll values per element (has to be 2 for complex)*/
1381 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1386 compute_offsets(plan, xs, xl, xc, NG, 2);
1387 if (plan->flags&FFT5D_DEBUG)
1389 printf("Compare2\n");
1391 for (z = 0; z < xl[2]; z++)
1393 for (y = 0; y < xl[1]; y++)
1395 if (plan->flags&FFT5D_DEBUG)
1397 printf("%d %d: ", coor[0], coor[1]);
1399 for (x = 0; x < xl[0]; x++)
1401 for (l = 0; l < ll; l++) /*loop over real/complex parts*/
1404 a = ((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1407 a /= plan->rC[0]*plan->rC[1]*plan->rC[2];
1411 b = ((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1415 b = ((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1417 if (plan->flags&FFT5D_DEBUG)
1419 printf("%f %f, ", a, b);
1423 if (fabs(a-b) > 2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS)
1425 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n", coor[0], coor[1], x, y, z, a, b);
1427 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1430 if (plan->flags&FFT5D_DEBUG)
1435 if (plan->flags&FFT5D_DEBUG)