2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2009,2010,2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
50 #include "gromacs/utility/fatalerror.h"
51 #include "gromacs/utility/gmxmpi.h"
52 #include "gromacs/utility/smalloc.h"
55 #define GMX_PARALLEL_ENV_INITIALIZED 1
58 #define GMX_PARALLEL_ENV_INITIALIZED 1
60 #define GMX_PARALLEL_ENV_INITIALIZED 0
65 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
67 /* requires fftw compiled with openmp */
68 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
71 #ifndef __FLT_EPSILON__
72 #define __FLT_EPSILON__ FLT_EPSILON
73 #define __DBL_EPSILON__ DBL_EPSILON
82 #include "gromacs/utility/exceptions.h"
83 #include "gromacs/utility/mutex.h"
84 /* none of the fftw3 calls, except execute(), are thread-safe, so
85 we need to serialize them with this mutex. */
86 static gmx::Mutex big_fftw_mutex;
87 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
88 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
89 #endif /* GMX_FFT_FFTW3 */
92 /* largest factor smaller than sqrt */
93 static int lfactor(int z)
95 int i = static_cast<int>(sqrt(static_cast<double>(z)));
105 static int l2factor(int z)
123 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
124 static int lpfactor(int z)
131 return std::max(lpfactor(f), lpfactor(z/f));
135 #ifdef HAVE_GETTIMEOFDAY
136 #include <sys/time.h>
140 gettimeofday(&tv, 0);
141 return tv.tv_sec+tv.tv_usec*1e-6;
151 static int vmax(int* a, int s)
154 for (i = 0; i < s; i++)
165 /* NxMxK the size of the data
166 * comm communicator to use for fft5d
167 * P0 number of processor in 1st axes (can be null for automatic)
168 * lin is allocated by fft5d because size of array is only known after planning phase
169 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
171 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
174 int P[2], bMaster, prank[2], i, t;
176 int *N0 = 0, *N1 = 0, *M0 = 0, *M1 = 0, *K0 = 0, *K1 = 0, *oN0 = 0, *oN1 = 0, *oM0 = 0, *oM1 = 0, *oK0 = 0, *oK1 = 0;
177 int N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {0}, *oNin[3] = {0}, *iNout[3] = {0}, *oNout[3] = {0};
178 int C[3], rC[3], nP[2];
180 t_complex *lin = 0, *lout = 0, *lout2 = 0, *lout3 = 0;
184 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
186 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
188 MPI_Comm_size(comm[0], &P[0]);
189 MPI_Comm_rank(comm[0], &prank[0]);
198 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
200 MPI_Comm_size(comm[1], &P[1]);
201 MPI_Comm_rank(comm[1], &prank[1]);
210 bMaster = (prank[0] == 0 && prank[1] == 0);
215 fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n",
216 P[0], P[1], prank[0], prank[1]);
223 fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
224 NG, MG, KG, P[0], P[1], (flags&FFT5D_REALCOMPLEX) > 0, (flags&FFT5D_BACKWARD) > 0, (flags&FFT5D_ORDER_YZ) > 0, (flags&FFT5D_DEBUG) > 0);
226 /* The check below is not correct, one prime factor 11 or 13 is ok.
227 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
228 printf("WARNING: FFT very slow with prime factors larger 7\n");
229 printf("Change FFT size or in case you cannot change it look at\n");
230 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
235 if (NG == 0 || MG == 0 || KG == 0)
239 printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
244 rNG = NG; rMG = MG; rKG = KG;
246 if (flags&FFT5D_REALCOMPLEX)
248 if (!(flags&FFT5D_BACKWARD))
254 if (!(flags&FFT5D_ORDER_YZ))
266 /*for transpose we need to know the size for each processor not only our own size*/
268 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
269 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
270 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
271 oN0 = (int*)malloc(P[0]*sizeof(int)); oN1 = (int*)malloc(P[1]*sizeof(int));
272 oM0 = (int*)malloc(P[0]*sizeof(int)); oM1 = (int*)malloc(P[1]*sizeof(int));
273 oK0 = (int*)malloc(P[0]*sizeof(int)); oK1 = (int*)malloc(P[1]*sizeof(int));
275 for (i = 0; i < P[0]; i++)
279 oN0[i] = i*ceil((double)NG/P[0]);
280 oM0[i] = i*ceil((double)MG/P[0]);
281 oK0[i] = i*ceil((double)KG/P[0]);
283 oN0[i] = (NG*i)/P[0];
284 oM0[i] = (MG*i)/P[0];
285 oK0[i] = (KG*i)/P[0];
288 for (i = 0; i < P[1]; i++)
291 oN1[i] = i*ceil((double)NG/P[1]);
292 oM1[i] = i*ceil((double)MG/P[1]);
293 oK1[i] = i*ceil((double)KG/P[1]);
295 oN1[i] = (NG*i)/P[1];
296 oM1[i] = (MG*i)/P[1];
297 oK1[i] = (KG*i)/P[1];
300 for (i = 0; i < P[0]-1; i++)
302 N0[i] = oN0[i+1]-oN0[i];
303 M0[i] = oM0[i+1]-oM0[i];
304 K0[i] = oK0[i+1]-oK0[i];
306 N0[P[0]-1] = NG-oN0[P[0]-1];
307 M0[P[0]-1] = MG-oM0[P[0]-1];
308 K0[P[0]-1] = KG-oK0[P[0]-1];
309 for (i = 0; i < P[1]-1; i++)
311 N1[i] = oN1[i+1]-oN1[i];
312 M1[i] = oM1[i+1]-oM1[i];
313 K1[i] = oK1[i+1]-oK1[i];
315 N1[P[1]-1] = NG-oN1[P[1]-1];
316 M1[P[1]-1] = MG-oM1[P[1]-1];
317 K1[P[1]-1] = KG-oK1[P[1]-1];
319 /* for step 1-3 the local N,M,K sizes of the transposed system
320 C: contiguous dimension, and nP: number of processor in subcommunicator
324 pM[0] = M0[prank[0]];
325 oM[0] = oM0[prank[0]];
326 pK[0] = K1[prank[1]];
327 oK[0] = oK1[prank[1]];
330 if (!(flags&FFT5D_ORDER_YZ))
332 N[0] = vmax(N1, P[1]);
334 K[0] = vmax(K1, P[1]);
335 pN[0] = N1[prank[1]];
341 N[1] = vmax(K0, P[0]);
342 pN[1] = K0[prank[0]];
347 M[1] = vmax(M0, P[0]);
348 pM[1] = M0[prank[0]];
349 oM[1] = oM0[prank[0]];
351 pK[1] = N1[prank[1]];
352 oK[1] = oN1[prank[1]];
358 M[2] = vmax(K0, P[0]);
359 pM[2] = K0[prank[0]];
360 oM[2] = oK0[prank[0]];
361 K[2] = vmax(N1, P[1]);
362 pK[2] = N1[prank[1]];
363 oK[2] = oN1[prank[1]];
364 free(N0); free(oN0); /*these are not used for this order*/
365 free(M1); free(oM1); /*the rest is freed in destroy*/
369 N[0] = vmax(N0, P[0]);
370 M[0] = vmax(M0, P[0]);
372 pN[0] = N0[prank[0]];
378 N[1] = vmax(M1, P[1]);
379 pN[1] = M1[prank[1]];
385 pM[1] = N0[prank[0]];
386 oM[1] = oN0[prank[0]];
387 K[1] = vmax(K1, P[1]);
388 pK[1] = K1[prank[1]];
389 oK[1] = oK1[prank[1]];
395 M[2] = vmax(N0, P[0]);
396 pM[2] = N0[prank[0]];
397 oM[2] = oN0[prank[0]];
398 K[2] = vmax(M1, P[1]);
399 pK[2] = M1[prank[1]];
400 oK[2] = oM1[prank[1]];
401 free(N1); free(oN1); /*these are not used for this order*/
402 free(K0); free(oK0); /*the rest is freed in destroy*/
404 N[2] = pN[2] = -1; /*not used*/
407 Difference between x-y-z regarding 2d decomposition is whether they are
408 distributed along axis 1, 2 or both
411 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
412 lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
413 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
414 if (!(flags&FFT5D_NOMALLOC))
416 snew_aligned(lin, lsize, 32);
417 snew_aligned(lout, lsize, 32);
420 /* We need extra transpose buffers to avoid OpenMP barriers */
421 snew_aligned(lout2, lsize, 32);
422 snew_aligned(lout3, lsize, 32);
426 /* We can reuse the buffers to avoid cache misses */
447 plan = (fft5d_plan)calloc(1, sizeof(struct fft5d_plan_t));
452 fprintf(debug, "Running on %d threads\n", nthreads);
456 /* Don't add more stuff here! We have already had at least one bug because we are reimplementing
457 * the low-level FFT interface instead of using the Gromacs FFT module. If we need more
458 * generic functionality it is far better to extend the interface so we can use it for
459 * all FFT libraries instead of writing FFTW-specific code here.
462 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
463 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
464 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
465 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
466 * and that the 3d plan is faster than the 1d plan.
468 if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
470 int fftwflags = FFTW_DESTROY_INPUT;
472 int inNG = NG, outMG = MG, outKG = KG;
476 fftwflags |= (flags & FFT5D_NOMEASURE) ? FFTW_ESTIMATE : FFTW_MEASURE;
478 if (flags&FFT5D_REALCOMPLEX)
480 if (!(flags&FFT5D_BACKWARD)) /*input pointer is not complex*/
484 else /*output pointer is not complex*/
486 if (!(flags&FFT5D_ORDER_YZ))
497 if (!(flags&FFT5D_BACKWARD))
503 dims[0].is = inNG*MG; /*N M K*/
506 if (!(flags&FFT5D_ORDER_YZ))
508 dims[0].os = MG; /*M K N*/
514 dims[0].os = 1; /*K N M*/
521 if (!(flags&FFT5D_ORDER_YZ))
531 dims[0].os = outMG*KG;
545 dims[0].os = outKG*NG;
551 #ifdef FFT5D_FFTW_THREADS
552 FFTW(plan_with_nthreads)(nthreads);
555 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
557 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
558 /*howmany*/ 0, /*howmany_dims*/ 0,
559 (real*)lin, (FFTW(complex) *) lout,
560 /*flags*/ fftwflags);
562 else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD))
564 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
565 /*howmany*/ 0, /*howmany_dims*/ 0,
566 (FFTW(complex) *) lin, (real*)lout,
567 /*flags*/ fftwflags);
571 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
572 /*howmany*/ 0, /*howmany_dims*/ 0,
573 (FFTW(complex) *) lin, (FFTW(complex) *) lout,
574 /*sign*/ (flags&FFT5D_BACKWARD) ? 1 : -1, /*flags*/ fftwflags);
577 #ifdef FFT5D_FFTW_THREADS
578 FFTW(plan_with_nthreads)(1);
583 if (!plan->p3d) /* for decomposition and if 3d plan did not work */
585 #endif /* GMX_FFT_FFTW3 */
586 for (s = 0; s < 3; s++)
590 fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
591 s, rC[s], M[s], pK[s], C[s], lsize);
593 plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
595 /* Make sure that the init routines are only called by one thread at a time and in order
596 (later is only important to not confuse valgrind)
598 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
599 for (t = 0; t < nthreads; t++)
603 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
605 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s == 0) || ((flags&FFT5D_BACKWARD) && s == 2)))
607 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
611 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
620 if ((flags&FFT5D_ORDER_YZ)) /*plan->cart is in the order of transposes */
622 plan->cart[0] = comm[0]; plan->cart[1] = comm[1];
626 plan->cart[1] = comm[0]; plan->cart[0] = comm[1];
628 #ifdef FFT5D_MPI_TRANSPOSE
630 for (s = 0; s < 2; s++)
632 if ((s == 0 && !(flags&FFT5D_ORDER_YZ)) || (s == 1 && (flags&FFT5D_ORDER_YZ)))
634 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
638 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
650 plan->NG = NG; plan->MG = MG; plan->KG = KG;
652 for (s = 0; s < 3; s++)
654 plan->N[s] = N[s]; plan->M[s] = M[s]; plan->K[s] = K[s]; plan->pN[s] = pN[s]; plan->pM[s] = pM[s]; plan->pK[s] = pK[s];
655 plan->oM[s] = oM[s]; plan->oK[s] = oK[s];
656 plan->C[s] = C[s]; plan->rC[s] = rC[s];
657 plan->iNin[s] = iNin[s]; plan->oNin[s] = oNin[s]; plan->iNout[s] = iNout[s]; plan->oNout[s] = oNout[s];
659 for (s = 0; s < 2; s++)
661 plan->P[s] = nP[s]; plan->coor[s] = prank[s];
664 /* plan->fftorder=fftorder;
665 plan->direction=direction;
666 plan->realcomplex=realcomplex;
669 plan->nthreads = nthreads;
689 /*here x,y,z and N,M,K is in rotated coordinate system!!
690 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
691 maxN,maxM,maxK is max size of local data
692 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
693 NG, MG, KG is size of global data*/
694 static void splitaxes(t_complex* lout, const t_complex* lin,
695 int maxN, int maxM, int maxK, int pM,
696 int P, int NG, int *N, int* oN, int starty, int startz, int endy, int endz)
699 int in_i, out_i, in_z, out_z, in_y, out_y;
702 for (z = startz; z < endz+1; z++) /*3. z l*/
723 for (i = 0; i < P; i++) /*index cube along long axis*/
725 out_i = out_z + i*maxN*maxM*maxK;
727 for (y = s_y; y < e_y; y++) /*2. y k*/
729 out_y = out_i + y*maxN;
731 for (x = 0; x < N[i]; x++) /*1. x j*/
733 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
734 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
735 /*before split data contiguos - thus if different processor get different amount oN is different*/
742 /*make axis contiguous again (after AllToAll) and also do local transpose*/
743 /*transpose mayor and major dimension
745 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
746 N,M,K local dimensions
748 static void joinAxesTrans13(t_complex* lout, const t_complex* lin,
749 int maxN, int maxM, int maxK, int pM,
750 int P, int KG, int* K, int* oK, int starty, int startx, int endy, int endx)
753 int out_i, in_i, out_x, in_x, out_z, in_z;
756 for (x = startx; x < endx+1; x++) /*1.j*/
778 for (i = 0; i < P; i++) /*index cube along long axis*/
780 out_i = out_x + oK[i];
781 in_i = in_x + i*maxM*maxN*maxK;
782 for (z = 0; z < K[i]; z++) /*3.l*/
785 in_z = in_i + z*maxM*maxN;
786 for (y = s_y; y < e_y; y++) /*2.k*/
788 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
795 /*make axis contiguous again (after AllToAll) and also do local transpose
796 tranpose mayor and middle dimension
798 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
801 static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN,
802 int P, int MG, int* M, int* oM, int startx, int startz, int endx, int endz)
805 int out_i, in_i, out_z, in_z, out_x, in_x;
808 for (z = startz; z < endz+1; z++)
829 for (i = 0; i < P; i++) /*index cube along long axis*/
831 out_i = out_z + oM[i];
832 in_i = in_z + i*maxM*maxN*maxK;
833 for (x = s_x; x < e_x; x++)
835 out_x = out_i + x*MG;
837 for (y = 0; y < M[i]; y++)
839 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
847 static void rotate_offsets(int x[])
858 /*compute the offset to compare or print transposed local data in original input coordinates
859 xs matrix dimension size, xl dimension length, xc decomposition offset
860 s: step in computation = number of transposes*/
861 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
863 /* int direction = plan->direction;
864 int fftorder = plan->fftorder;*/
868 int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK,
869 *C = plan->C, *rC = plan->rC;
871 NG[0] = plan->NG; NG[1] = plan->MG; NG[2] = plan->KG;
873 if (!(plan->flags&FFT5D_ORDER_YZ))
877 case 0: o = XYZ; break;
878 case 1: o = ZYX; break;
879 case 2: o = YZX; break;
887 case 0: o = XYZ; break;
888 case 1: o = YXZ; break;
889 case 2: o = ZXY; break;
896 case XYZ: pos[0] = 1; pos[1] = 2; pos[2] = 3; break;
897 case XZY: pos[0] = 1; pos[1] = 3; pos[2] = 2; break;
898 case YXZ: pos[0] = 2; pos[1] = 1; pos[2] = 3; break;
899 case YZX: pos[0] = 3; pos[1] = 1; pos[2] = 2; break;
900 case ZXY: pos[0] = 2; pos[1] = 3; pos[2] = 1; break;
901 case ZYX: pos[0] = 3; pos[1] = 2; pos[2] = 1; break;
903 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
905 /*xs, xl give dimension size and data length in local transposed coordinate system
906 for 0(/1/2): x(/y/z) in original coordinate system*/
907 for (i = 0; i < 3; i++)
911 case 1: xs[i] = 1; xc[i] = 0; xl[i] = C[s]; break;
912 case 2: xs[i] = C[s]; xc[i] = oM[s]; xl[i] = pM[s]; break;
913 case 3: xs[i] = C[s]*pM[s]; xc[i] = oK[s]; xl[i] = pK[s]; break;
916 /*input order is different for test program to match FFTW order
917 (important for complex to real)*/
918 if (plan->flags&FFT5D_BACKWARD)
924 if (plan->flags&FFT5D_ORDER_YZ)
932 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s == 0) || ((plan->flags&FFT5D_BACKWARD) && s == 2)))
938 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
941 int *coor = plan->coor;
942 int xs[3], xl[3], xc[3], NG[3];
943 int ll = (plan->flags&FFT5D_REALCOMPLEX) ? 1 : 2;
944 compute_offsets(plan, xs, xl, xc, NG, s);
945 fprintf(debug, txt, coor[0], coor[1], s);
946 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
947 for (z = 0; z < xl[2]; z++)
949 for (y = 0; y < xl[1]; y++)
951 fprintf(debug, "%d %d: ", coor[0], coor[1]);
952 for (x = 0; x < xl[0]; x++)
954 for (l = 0; l < ll; l++)
956 fprintf(debug, "%f ", ((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
960 fprintf(debug, "\n");
965 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
967 t_complex *lin = plan->lin;
968 t_complex *lout = plan->lout;
969 t_complex *lout2 = plan->lout2;
970 t_complex *lout3 = plan->lout3;
971 t_complex *fftout, *joinin;
973 gmx_fft_t **p1d = plan->p1d;
974 #ifdef FFT5D_MPI_TRANSPOSE
975 FFTW(plan) *mpip = plan->mpip;
978 MPI_Comm *cart = plan->cart;
981 double time_fft = 0, time_local = 0, time_mpi[2] = {0}, time = 0;
983 int *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
984 *C = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout, **oNout = plan->oNout;
985 int s = 0, tstart, tend, bParallelDim;
999 FFTW(execute)(plan->p3d);
1003 times->fft += MPI_Wtime()-time;
1014 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1016 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1019 for (s = 0; s < 2; s++) /*loop over first two FFT steps (corner rotations)*/
1023 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1033 /* ---------- START FFT ------------ */
1035 if (times != 0 && thread == 0)
1041 if (bParallelDim || plan->nthreads == 1)
1057 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1058 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s == 0)
1060 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, fftout+tstart);
1064 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, fftout+tstart);
1069 if (times != NULL && thread == 0)
1071 time_fft += MPI_Wtime()-time;
1074 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1076 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1078 /* ---------- END FFT ------------ */
1080 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1084 if (times != NULL && thread == 0)
1091 1. (most outer) axes (x) is split into P[s] parts of size N[s]
1095 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
1097 splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1099 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1101 if (times != NULL && thread == 0)
1103 time_local += MPI_Wtime()-time;
1107 /* ---------- END SPLIT , START TRANSPOSE------------ */
1117 wallcycle_start(times, ewcPME_FFTCOMM);
1119 #ifdef FFT5D_MPI_TRANSPOSE
1120 FFTW(execute)(mpip[s]);
1123 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1125 MPI_Alltoall((real *)lout2, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1129 MPI_Alltoall((real *)lout2, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1132 gmx_incons("fft5d MPI call without MPI configuration");
1134 #endif /*FFT5D_MPI_TRANSPOSE*/
1138 time_mpi[s] = MPI_Wtime()-time;
1141 wallcycle_stop(times, ewcPME_FFTCOMM);
1145 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1147 /* ---------- END SPLIT + TRANSPOSE------------ */
1149 /* ---------- START JOIN ------------ */
1151 if (times != NULL && thread == 0)
1165 /*bring back in matrix form
1166 thus make new 1. axes contiguos
1167 also local transpose 1 and 2/3
1168 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1170 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1174 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1175 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1176 joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1183 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1184 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1185 joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
1190 if (times != NULL && thread == 0)
1192 time_local += MPI_Wtime()-time;
1195 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1197 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1199 /* ---------- END JOIN ------------ */
1201 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1202 } /* for(s=0;s<2;s++) */
1204 if (times != NULL && thread == 0)
1210 if (plan->flags&FFT5D_INPLACE)
1212 lout = lin; /*in place currently not supported*/
1215 /* ----------- FFT ----------- */
1216 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1217 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1219 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, lout+tstart);
1223 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, lout+tstart);
1225 /* ------------ END FFT ---------*/
1228 if (times != NULL && thread == 0)
1230 time_fft += MPI_Wtime()-time;
1232 times->fft += time_fft;
1233 times->local += time_local;
1234 times->mpi2 += time_mpi[1];
1235 times->mpi1 += time_mpi[0];
1239 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1241 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1245 void fft5d_destroy(fft5d_plan plan)
1249 for (s = 0; s < 3; s++)
1253 for (t = 0; t < plan->nthreads; t++)
1255 gmx_many_fft_destroy(plan->p1d[s][t]);
1261 free(plan->iNin[s]);
1266 free(plan->oNin[s]);
1271 free(plan->iNout[s]);
1276 free(plan->oNout[s]);
1282 #ifdef FFT5D_MPI_TRANSPOS
1283 for (s = 0; s < 2; s++)
1285 FFTW(destroy_plan)(plan->mpip[s]);
1287 #endif /* FFT5D_MPI_TRANSPOS */
1290 FFTW(destroy_plan)(plan->p3d);
1293 #endif /* GMX_FFT_FFTW3 */
1295 if (!(plan->flags&FFT5D_NOMALLOC))
1297 sfree_aligned(plan->lin);
1298 sfree_aligned(plan->lout);
1299 if (plan->nthreads > 1)
1301 sfree_aligned(plan->lout2);
1302 sfree_aligned(plan->lout3);
1306 #ifdef FFT5D_THREADS
1307 #ifdef FFT5D_FFTW_THREADS
1308 /*FFTW(cleanup_threads)();*/
1315 /*Is this better than direct access of plan? enough data?
1316 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1317 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1328 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1329 of processor dimensions*/
1330 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
1332 MPI_Comm cart[2] = {0};
1334 int size = 1, prank = 0;
1337 int wrap[] = {0, 0};
1339 int rdim1[] = {0, 1}, rdim2[] = {1, 0};
1341 MPI_Comm_size(comm, &size);
1342 MPI_Comm_rank(comm, &prank);
1352 printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1357 P[0] = P0; P[1] = size/P0; /*number of processors in the two dimensions*/
1359 /*Difference between x-y-z regarding 2d decomposition is whether they are
1360 distributed along axis 1, 2 or both*/
1362 MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1363 MPI_Cart_get(gcart, 2, P, wrap, coor);
1364 MPI_Cart_sub(gcart, rdim1, &cart[0]);
1365 MPI_Cart_sub(gcart, rdim2, &cart[1]);
1370 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1375 /*prints in original coordinate system of data (as the input to FFT)*/
1376 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1378 int xs[3], xl[3], xc[3], NG[3];
1380 int *coor = plan->coor;
1381 int ll = 2; /*compare ll values per element (has to be 2 for complex)*/
1382 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1387 compute_offsets(plan, xs, xl, xc, NG, 2);
1388 if (plan->flags&FFT5D_DEBUG)
1390 printf("Compare2\n");
1392 for (z = 0; z < xl[2]; z++)
1394 for (y = 0; y < xl[1]; y++)
1396 if (plan->flags&FFT5D_DEBUG)
1398 printf("%d %d: ", coor[0], coor[1]);
1400 for (x = 0; x < xl[0]; x++)
1402 for (l = 0; l < ll; l++) /*loop over real/complex parts*/
1405 a = ((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1408 a /= plan->rC[0]*plan->rC[1]*plan->rC[2];
1412 b = ((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1416 b = ((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1418 if (plan->flags&FFT5D_DEBUG)
1420 printf("%f %f, ", a, b);
1424 if (fabs(a-b) > 2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS)
1426 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n", coor[0], coor[1], x, y, z, a, b);
1428 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1431 if (plan->flags&FFT5D_DEBUG)
1436 if (plan->flags&FFT5D_DEBUG)