2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2009-2018, The GROMACS development team.
5 * Copyright (c) 2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
51 #include "gromacs/gpu_utils/gpu_utils.h"
52 #include "gromacs/gpu_utils/hostallocator.h"
53 #include "gromacs/gpu_utils/pinning.h"
54 #include "gromacs/utility/alignedallocator.h"
55 #include "gromacs/utility/exceptions.h"
56 #include "gromacs/utility/fatalerror.h"
57 #include "gromacs/utility/gmxassert.h"
58 #include "gromacs/utility/gmxmpi.h"
59 #include "gromacs/utility/smalloc.h"
62 # define GMX_PARALLEL_ENV_INITIALIZED 1
65 # define GMX_PARALLEL_ENV_INITIALIZED 1
67 # define GMX_PARALLEL_ENV_INITIALIZED 0
72 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
73 # define FFT5D_THREADS
74 /* requires fftw compiled with openmp */
75 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
78 #ifndef __FLT_EPSILON__
79 # define __FLT_EPSILON__ FLT_EPSILON
80 # define __DBL_EPSILON__ DBL_EPSILON
89 # include "gromacs/utility/exceptions.h"
90 # include "gromacs/utility/mutex.h"
91 /* none of the fftw3 calls, except execute(), are thread-safe, so
92 we need to serialize them with this mutex. */
93 static gmx::Mutex big_fftw_mutex;
97 big_fftw_mutex.lock(); \
99 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
100 # define FFTW_UNLOCK \
103 big_fftw_mutex.unlock(); \
105 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
106 #endif /* GMX_FFT_FFTW3 */
109 /* largest factor smaller than sqrt */
110 static int lfactor(int z)
112 int i = static_cast<int>(sqrt(static_cast<double>(z)));
122 # if HAVE_GETTIMEOFDAY
123 # include <sys/time.h>
127 gettimeofday(&tv, nullptr);
128 return tv.tv_sec + tv.tv_usec * 1e-6;
138 static int vmax(const int* a, int s)
141 for (i = 0; i < s; i++)
152 /* NxMxK the size of the data
153 * comm communicator to use for fft5d
154 * P0 number of processor in 1st axes (can be null for automatic)
155 * lin is allocated by fft5d because size of array is only known after planning phase
156 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
158 fft5d_plan fft5d_plan_3d(int NG,
168 gmx::PinningPolicy realGridAllocationPinningPolicy)
171 int P[2], prank[2], i;
174 int *N0 = nullptr, *N1 = nullptr, *M0 = nullptr, *M1 = nullptr, *K0 = nullptr, *K1 = nullptr,
175 *oN0 = nullptr, *oN1 = nullptr, *oM0 = nullptr, *oM1 = nullptr, *oK0 = nullptr, *oK1 = nullptr;
176 int N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3],
177 *iNin[3] = { nullptr }, *oNin[3] = { nullptr }, *iNout[3] = { nullptr },
178 *oNout[3] = { nullptr };
179 int C[3], rC[3], nP[2];
181 t_complex *lin = nullptr, *lout = nullptr, *lout2 = nullptr, *lout3 = nullptr;
185 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
187 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
189 MPI_Comm_size(comm[0], &P[0]);
190 MPI_Comm_rank(comm[0], &prank[0]);
199 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
201 MPI_Comm_size(comm[1], &P[1]);
202 MPI_Comm_rank(comm[1], &prank[1]);
211 bMaster = prank[0] == 0 && prank[1] == 0;
216 fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n", P[0], P[1], prank[0], prank[1]);
224 "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order "
225 "yz: %d, debug %d\n",
226 NG, MG, KG, P[0], P[1], int((flags & FFT5D_REALCOMPLEX) > 0),
227 int((flags & FFT5D_BACKWARD) > 0), int((flags & FFT5D_ORDER_YZ) > 0),
228 int((flags & FFT5D_DEBUG) > 0));
230 /* The check below is not correct, one prime factor 11 or 13 is ok.
231 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
232 printf("WARNING: FFT very slow with prime factors larger 7\n");
233 printf("Change FFT size or in case you cannot change it look at\n");
234 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
239 if (NG == 0 || MG == 0 || KG == 0)
243 printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
252 if (flags & FFT5D_REALCOMPLEX)
254 if (!(flags & FFT5D_BACKWARD))
260 if (!(flags & FFT5D_ORDER_YZ))
272 /*for transpose we need to know the size for each processor not only our own size*/
274 N0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
275 N1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
276 M0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
277 M1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
278 K0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
279 K1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
280 oN0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
281 oN1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
282 oM0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
283 oM1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
284 oK0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
285 oK1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
287 for (i = 0; i < P[0]; i++)
291 oN0[i] = i * ceil((double)NG / P[0]);
292 oM0[i] = i * ceil((double)MG / P[0]);
293 oK0[i] = i * ceil((double)KG / P[0]);
295 oN0[i] = (NG * i) / P[0];
296 oM0[i] = (MG * i) / P[0];
297 oK0[i] = (KG * i) / P[0];
300 for (i = 0; i < P[1]; i++)
303 oN1[i] = i * ceil((double)NG / P[1]);
304 oM1[i] = i * ceil((double)MG / P[1]);
305 oK1[i] = i * ceil((double)KG / P[1]);
307 oN1[i] = (NG * i) / P[1];
308 oM1[i] = (MG * i) / P[1];
309 oK1[i] = (KG * i) / P[1];
312 for (i = 0; P[0] > 0 && i < P[0] - 1; i++)
314 N0[i] = oN0[i + 1] - oN0[i];
315 M0[i] = oM0[i + 1] - oM0[i];
316 K0[i] = oK0[i + 1] - oK0[i];
318 N0[P[0] - 1] = NG - oN0[P[0] - 1];
319 M0[P[0] - 1] = MG - oM0[P[0] - 1];
320 K0[P[0] - 1] = KG - oK0[P[0] - 1];
321 for (i = 0; P[1] > 0 && i < P[1] - 1; i++)
323 N1[i] = oN1[i + 1] - oN1[i];
324 M1[i] = oM1[i + 1] - oM1[i];
325 K1[i] = oK1[i + 1] - oK1[i];
327 N1[P[1] - 1] = NG - oN1[P[1] - 1];
328 M1[P[1] - 1] = MG - oM1[P[1] - 1];
329 K1[P[1] - 1] = KG - oK1[P[1] - 1];
331 /* for step 1-3 the local N,M,K sizes of the transposed system
332 C: contiguous dimension, and nP: number of processor in subcommunicator
335 GMX_ASSERT(prank[0] < P[0], "Must have valid rank within communicator size");
336 GMX_ASSERT(prank[1] < P[1], "Must have valid rank within communicator size");
337 pM[0] = M0[prank[0]];
338 oM[0] = oM0[prank[0]];
339 pK[0] = K1[prank[1]];
340 oK[0] = oK1[prank[1]];
343 if (!(flags & FFT5D_ORDER_YZ))
345 N[0] = vmax(N1, P[1]);
347 K[0] = vmax(K1, P[1]);
348 pN[0] = N1[prank[1]];
354 N[1] = vmax(K0, P[0]);
355 pN[1] = K0[prank[0]];
360 M[1] = vmax(M0, P[0]);
361 pM[1] = M0[prank[0]];
362 oM[1] = oM0[prank[0]];
364 pK[1] = N1[prank[1]];
365 oK[1] = oN1[prank[1]];
371 M[2] = vmax(K0, P[0]);
372 pM[2] = K0[prank[0]];
373 oM[2] = oK0[prank[0]];
374 K[2] = vmax(N1, P[1]);
375 pK[2] = N1[prank[1]];
376 oK[2] = oN1[prank[1]];
378 free(oN0); /*these are not used for this order*/
380 free(oM1); /*the rest is freed in destroy*/
384 N[0] = vmax(N0, P[0]);
385 M[0] = vmax(M0, P[0]);
387 pN[0] = N0[prank[0]];
393 N[1] = vmax(M1, P[1]);
394 pN[1] = M1[prank[1]];
400 pM[1] = N0[prank[0]];
401 oM[1] = oN0[prank[0]];
402 K[1] = vmax(K1, P[1]);
403 pK[1] = K1[prank[1]];
404 oK[1] = oK1[prank[1]];
410 M[2] = vmax(N0, P[0]);
411 pM[2] = N0[prank[0]];
412 oM[2] = oN0[prank[0]];
413 K[2] = vmax(M1, P[1]);
414 pK[2] = M1[prank[1]];
415 oK[2] = oM1[prank[1]];
417 free(oN1); /*these are not used for this order*/
419 free(oK0); /*the rest is freed in destroy*/
421 N[2] = pN[2] = -1; /*not used*/
424 Difference between x-y-z regarding 2d decomposition is whether they are
425 distributed along axis 1, 2 or both
428 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
429 lsize = std::max(N[0] * M[0] * K[0] * nP[0], std::max(N[1] * M[1] * K[1] * nP[1], C[2] * M[2] * K[2]));
430 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
431 if (!(flags & FFT5D_NOMALLOC))
433 // only needed for PME GPU mixed mode
434 if (realGridAllocationPinningPolicy == gmx::PinningPolicy::PinnedIfSupported && GMX_GPU == GMX_GPU_CUDA)
436 const std::size_t numBytes = lsize * sizeof(t_complex);
437 lin = static_cast<t_complex*>(gmx::PageAlignedAllocationPolicy::malloc(numBytes));
438 gmx::pinBuffer(lin, numBytes);
442 snew_aligned(lin, lsize, 32);
444 snew_aligned(lout, lsize, 32);
447 /* We need extra transpose buffers to avoid OpenMP barriers */
448 snew_aligned(lout2, lsize, 32);
449 snew_aligned(lout3, lsize, 32);
453 /* We can reuse the buffers to avoid cache misses */
474 plan = static_cast<fft5d_plan>(calloc(1, sizeof(struct fft5d_plan_t)));
479 fprintf(debug, "Running on %d threads\n", nthreads);
483 /* Don't add more stuff here! We have already had at least one bug because we are reimplementing
484 * the low-level FFT interface instead of using the Gromacs FFT module. If we need more
485 * generic functionality it is far better to extend the interface so we can use it for
486 * all FFT libraries instead of writing FFTW-specific code here.
489 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
490 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be
491 * called from within a parallel region. For now deactivated. If it should be supported it has
492 * to made sure that that the execute of the 3d plan is in a master/serial block (since it
493 * contains it own parallel region) and that the 3d plan is faster than the 1d plan.
495 if ((!(flags & FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1))
496 && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
498 int fftwflags = FFTW_DESTROY_INPUT;
500 int inNG = NG, outMG = MG, outKG = KG;
504 fftwflags |= (flags & FFT5D_NOMEASURE) ? FFTW_ESTIMATE : FFTW_MEASURE;
506 if (flags & FFT5D_REALCOMPLEX)
508 if (!(flags & FFT5D_BACKWARD)) /*input pointer is not complex*/
512 else /*output pointer is not complex*/
514 if (!(flags & FFT5D_ORDER_YZ))
525 if (!(flags & FFT5D_BACKWARD))
531 dims[0].is = inNG * MG; /*N M K*/
534 if (!(flags & FFT5D_ORDER_YZ))
536 dims[0].os = MG; /*M K N*/
538 dims[2].os = MG * KG;
542 dims[0].os = 1; /*K N M*/
543 dims[1].os = KG * NG;
549 if (!(flags & FFT5D_ORDER_YZ))
556 dims[1].is = NG * MG;
559 dims[0].os = outMG * KG;
571 dims[2].is = NG * MG;
573 dims[0].os = outKG * NG;
578 # ifdef FFT5D_THREADS
579 # ifdef FFT5D_FFTW_THREADS
580 FFTW(plan_with_nthreads)(nthreads);
583 if ((flags & FFT5D_REALCOMPLEX) && !(flags & FFT5D_BACKWARD))
585 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
586 /*howmany*/ 0, /*howmany_dims*/ nullptr,
587 reinterpret_cast<real*>(lin),
588 reinterpret_cast<FFTW(complex)*>(lout),
589 /*flags*/ fftwflags);
591 else if ((flags & FFT5D_REALCOMPLEX) && (flags & FFT5D_BACKWARD))
593 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
594 /*howmany*/ 0, /*howmany_dims*/ nullptr,
595 reinterpret_cast<FFTW(complex)*>(lin),
596 reinterpret_cast<real*>(lout),
597 /*flags*/ fftwflags);
601 plan->p3d = FFTW(plan_guru_dft)(
603 /*howmany*/ 0, /*howmany_dims*/ nullptr, reinterpret_cast<FFTW(complex)*>(lin),
604 reinterpret_cast<FFTW(complex)*>(lout),
605 /*sign*/ (flags & FFT5D_BACKWARD) ? 1 : -1, /*flags*/ fftwflags);
607 # ifdef FFT5D_THREADS
608 # ifdef FFT5D_FFTW_THREADS
609 FFTW(plan_with_nthreads)(1);
614 if (!plan->p3d) /* for decomposition and if 3d plan did not work */
616 #endif /* GMX_FFT_FFTW3 */
617 for (s = 0; s < 3; s++)
621 fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n", s, rC[s], M[s],
624 plan->p1d[s] = static_cast<gmx_fft_t*>(malloc(sizeof(gmx_fft_t) * nthreads));
626 /* Make sure that the init routines are only called by one thread at a time and in order
627 (later is only important to not confuse valgrind)
629 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
630 for (int t = 0; t < nthreads; t++)
636 int tsize = ((t + 1) * pM[s] * pK[s] / nthreads) - (t * pM[s] * pK[s] / nthreads);
638 if ((flags & FFT5D_REALCOMPLEX)
639 && ((!(flags & FFT5D_BACKWARD) && s == 0)
640 || ((flags & FFT5D_BACKWARD) && s == 2)))
642 gmx_fft_init_many_1d_real(
643 &plan->p1d[s][t], rC[s], tsize,
644 (flags & FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0);
648 gmx_fft_init_many_1d(&plan->p1d[s][t], C[s], tsize,
649 (flags & FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0);
652 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
660 if ((flags & FFT5D_ORDER_YZ)) /*plan->cart is in the order of transposes */
662 plan->cart[0] = comm[0];
663 plan->cart[1] = comm[1];
667 plan->cart[1] = comm[0];
668 plan->cart[0] = comm[1];
670 #ifdef FFT5D_MPI_TRANSPOSE
672 for (s = 0; s < 2; s++)
674 if ((s == 0 && !(flags & FFT5D_ORDER_YZ)) || (s == 1 && (flags & FFT5D_ORDER_YZ)))
676 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s] * K[s] * pM[s] * 2, 1,
677 1, (real*)lout2, (real*)lout3,
678 plan->cart[s], FFTW_PATIENT);
682 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s] * pK[s] * M[s] * 2, 1,
683 1, (real*)lout2, (real*)lout3,
684 plan->cart[s], FFTW_PATIENT);
700 for (s = 0; s < 3; s++)
712 plan->iNin[s] = iNin[s];
713 plan->oNin[s] = oNin[s];
714 plan->iNout[s] = iNout[s];
715 plan->oNout[s] = oNout[s];
717 for (s = 0; s < 2; s++)
720 plan->coor[s] = prank[s];
723 /* plan->fftorder=fftorder;
724 plan->direction=direction;
725 plan->realcomplex=realcomplex;
728 plan->nthreads = nthreads;
729 plan->pinningPolicy = realGridAllocationPinningPolicy;
749 /*here x,y,z and N,M,K is in rotated coordinate system!!
750 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
751 maxN,maxM,maxK is max size of local data
752 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
753 NG, MG, KG is size of global data*/
754 static void splitaxes(t_complex* lout,
755 const t_complex* lin,
770 int in_i, out_i, in_z, out_z, in_y, out_y;
773 for (z = startz; z < endz + 1; z++) /*3. z l*/
791 out_z = z * maxN * maxM;
794 for (i = 0; i < P; i++) /*index cube along long axis*/
796 out_i = out_z + i * maxN * maxM * maxK;
798 for (y = s_y; y < e_y; y++) /*2. y k*/
800 out_y = out_i + y * maxN;
801 in_y = in_i + y * NG;
802 for (x = 0; x < N[i]; x++) /*1. x j*/
804 lout[out_y + x] = lin[in_y + x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
805 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
806 /*before split data contiguos - thus if different processor get different amount oN is different*/
813 /*make axis contiguous again (after AllToAll) and also do local transpose*/
814 /*transpose mayor and major dimension
816 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
817 N,M,K local dimensions
819 static void joinAxesTrans13(t_complex* lout,
820 const t_complex* lin,
835 int out_i, in_i, out_x, in_x, out_z, in_z;
838 for (x = startx; x < endx + 1; x++) /*1.j*/
860 for (i = 0; i < P; i++) /*index cube along long axis*/
862 out_i = out_x + oK[i];
863 in_i = in_x + i * maxM * maxN * maxK;
864 for (z = 0; z < K[i]; z++) /*3.l*/
867 in_z = in_i + z * maxM * maxN;
868 for (y = s_y; y < e_y; y++) /*2.k*/
870 lout[out_z + y * KG] = lin[in_z + y * maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
877 /*make axis contiguous again (after AllToAll) and also do local transpose
878 tranpose mayor and middle dimension
880 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
883 static void joinAxesTrans12(t_complex* lout,
884 const t_complex* lin,
899 int out_i, in_i, out_z, in_z, out_x, in_x;
902 for (z = startz; z < endz + 1; z++)
921 in_z = z * maxM * maxN;
923 for (i = 0; i < P; i++) /*index cube along long axis*/
925 out_i = out_z + oM[i];
926 in_i = in_z + i * maxM * maxN * maxK;
927 for (x = s_x; x < e_x; x++)
929 out_x = out_i + x * MG;
931 for (y = 0; y < M[i]; y++)
933 lout[out_x + y] = lin[in_x + y * maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
941 static void rotate_offsets(int x[])
952 /*compute the offset to compare or print transposed local data in original input coordinates
953 xs matrix dimension size, xl dimension length, xc decomposition offset
954 s: step in computation = number of transposes*/
955 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
957 /* int direction = plan->direction;
958 int fftorder = plan->fftorder;*/
962 int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK, *C = plan->C, *rC = plan->rC;
968 if (!(plan->flags & FFT5D_ORDER_YZ))
972 case 0: o = XYZ; break;
973 case 1: o = ZYX; break;
974 case 2: o = YZX; break;
982 case 0: o = XYZ; break;
983 case 1: o = YXZ; break;
984 case 2: o = ZXY; break;
1022 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
1024 /*xs, xl give dimension size and data length in local transposed coordinate system
1025 for 0(/1/2): x(/y/z) in original coordinate system*/
1026 for (i = 0; i < 3; i++)
1041 xs[i] = C[s] * pM[s];
1047 /*input order is different for test program to match FFTW order
1048 (important for complex to real)*/
1049 if (plan->flags & FFT5D_BACKWARD)
1055 if (plan->flags & FFT5D_ORDER_YZ)
1063 if ((plan->flags & FFT5D_REALCOMPLEX)
1064 && ((!(plan->flags & FFT5D_BACKWARD) && s == 0) || ((plan->flags & FFT5D_BACKWARD) && s == 2)))
1070 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
1073 int* coor = plan->coor;
1074 int xs[3], xl[3], xc[3], NG[3];
1075 int ll = (plan->flags & FFT5D_REALCOMPLEX) ? 1 : 2;
1076 compute_offsets(plan, xs, xl, xc, NG, s);
1077 fprintf(debug, txt, coor[0], coor[1]);
1078 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
1079 for (z = 0; z < xl[2]; z++)
1081 for (y = 0; y < xl[1]; y++)
1083 fprintf(debug, "%d %d: ", coor[0], coor[1]);
1084 for (x = 0; x < xl[0]; x++)
1086 for (l = 0; l < ll; l++)
1088 fprintf(debug, "%f ",
1089 reinterpret_cast<const real*>(
1090 lin)[(z * xs[2] + y * xs[1]) * 2 + (x * xs[0]) * ll + l]);
1092 fprintf(debug, ",");
1094 fprintf(debug, "\n");
1099 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
1101 t_complex* lin = plan->lin;
1102 t_complex* lout = plan->lout;
1103 t_complex* lout2 = plan->lout2;
1104 t_complex* lout3 = plan->lout3;
1105 t_complex *fftout, *joinin;
1107 gmx_fft_t** p1d = plan->p1d;
1108 #ifdef FFT5D_MPI_TRANSPOSE
1109 FFTW(plan)* mpip = plan->mpip;
1112 MPI_Comm* cart = plan->cart;
1115 double time_fft = 0, time_local = 0, time_mpi[2] = { 0 }, time = 0;
1117 int *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
1118 *C = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout,
1119 **oNout = plan->oNout;
1120 int s = 0, tstart, tend, bParallelDim;
1134 FFTW(execute)(plan->p3d);
1138 times->fft += MPI_Wtime() - time;
1149 if ((plan->flags & FFT5D_DEBUG) && thread == 0)
1151 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1154 for (s = 0; s < 2; s++) /*loop over first two FFT steps (corner rotations)*/
1158 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1168 /* ---------- START FFT ------------ */
1170 if (times != 0 && thread == 0)
1176 if (bParallelDim || plan->nthreads == 1)
1192 tstart = (thread * pM[s] * pK[s] / plan->nthreads) * C[s];
1193 if ((plan->flags & FFT5D_REALCOMPLEX) && !(plan->flags & FFT5D_BACKWARD) && s == 0)
1195 gmx_fft_many_1d_real(p1d[s][thread],
1196 (plan->flags & FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL
1197 : GMX_FFT_REAL_TO_COMPLEX,
1198 lin + tstart, fftout + tstart);
1202 gmx_fft_many_1d(p1d[s][thread],
1203 (plan->flags & FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,
1204 lin + tstart, fftout + tstart);
1208 if (times != NULL && thread == 0)
1210 time_fft += MPI_Wtime() - time;
1213 if ((plan->flags & FFT5D_DEBUG) && thread == 0)
1215 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1217 /* ---------- END FFT ------------ */
1219 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1223 if (times != NULL && thread == 0)
1230 1. (most outer) axes (x) is split into P[s] parts of size N[s]
1234 tend = ((thread + 1) * pM[s] * pK[s] / plan->nthreads);
1236 splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s],
1237 tstart % pM[s], tstart / pM[s], tend % pM[s], tend / pM[s]);
1239 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1241 if (times != NULL && thread == 0)
1243 time_local += MPI_Wtime() - time;
1247 /* ---------- END SPLIT , START TRANSPOSE------------ */
1257 wallcycle_start(times, ewcPME_FFTCOMM);
1259 #ifdef FFT5D_MPI_TRANSPOSE
1260 FFTW(execute)(mpip[s]);
1263 if ((s == 0 && !(plan->flags & FFT5D_ORDER_YZ))
1264 || (s == 1 && (plan->flags & FFT5D_ORDER_YZ)))
1266 MPI_Alltoall(reinterpret_cast<real*>(lout2),
1267 N[s] * pM[s] * K[s] * sizeof(t_complex) / sizeof(real),
1268 GMX_MPI_REAL, reinterpret_cast<real*>(lout3),
1269 N[s] * pM[s] * K[s] * sizeof(t_complex) / sizeof(real),
1270 GMX_MPI_REAL, cart[s]);
1274 MPI_Alltoall(reinterpret_cast<real*>(lout2),
1275 N[s] * M[s] * pK[s] * sizeof(t_complex) / sizeof(real),
1276 GMX_MPI_REAL, reinterpret_cast<real*>(lout3),
1277 N[s] * M[s] * pK[s] * sizeof(t_complex) / sizeof(real),
1278 GMX_MPI_REAL, cart[s]);
1281 GMX_RELEASE_ASSERT(false, "Invalid call to fft5d_execute");
1283 #endif /*FFT5D_MPI_TRANSPOSE*/
1287 time_mpi[s] = MPI_Wtime() - time;
1290 wallcycle_stop(times, ewcPME_FFTCOMM);
1294 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1296 /* ---------- END SPLIT + TRANSPOSE------------ */
1298 /* ---------- START JOIN ------------ */
1300 if (times != NULL && thread == 0)
1314 /*bring back in matrix form
1315 thus make new 1. axes contiguos
1316 also local transpose 1 and 2/3
1317 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1319 if ((s == 0 && !(plan->flags & FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags & FFT5D_ORDER_YZ)))
1323 tstart = (thread * pM[s] * pN[s] / plan->nthreads);
1324 tend = ((thread + 1) * pM[s] * pN[s] / plan->nthreads);
1325 joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s + 1], iNin[s + 1],
1326 oNin[s + 1], tstart % pM[s], tstart / pM[s], tend % pM[s], tend / pM[s]);
1333 tstart = (thread * pK[s] * pN[s] / plan->nthreads);
1334 tend = ((thread + 1) * pK[s] * pN[s] / plan->nthreads);
1335 joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s + 1], iNin[s + 1],
1336 oNin[s + 1], tstart % pN[s], tstart / pN[s], tend % pN[s], tend / pN[s]);
1341 if (times != NULL && thread == 0)
1343 time_local += MPI_Wtime() - time;
1346 if ((plan->flags & FFT5D_DEBUG) && thread == 0)
1348 print_localdata(lin, "%d %d: tranposed %d\n", s + 1, plan);
1350 /* ---------- END JOIN ------------ */
1352 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1353 } /* for(s=0;s<2;s++) */
1355 if (times != NULL && thread == 0)
1361 if (plan->flags & FFT5D_INPLACE)
1363 lout = lin; /*in place currently not supported*/
1365 /* ----------- FFT ----------- */
1366 tstart = (thread * pM[s] * pK[s] / plan->nthreads) * C[s];
1367 if ((plan->flags & FFT5D_REALCOMPLEX) && (plan->flags & FFT5D_BACKWARD))
1369 gmx_fft_many_1d_real(p1d[s][thread],
1370 (plan->flags & FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX,
1371 lin + tstart, lout + tstart);
1375 gmx_fft_many_1d(p1d[s][thread], (plan->flags & FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,
1376 lin + tstart, lout + tstart);
1378 /* ------------ END FFT ---------*/
1381 if (times != NULL && thread == 0)
1383 time_fft += MPI_Wtime() - time;
1385 times->fft += time_fft;
1386 times->local += time_local;
1387 times->mpi2 += time_mpi[1];
1388 times->mpi1 += time_mpi[0];
1392 if ((plan->flags & FFT5D_DEBUG) && thread == 0)
1394 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1398 void fft5d_destroy(fft5d_plan plan)
1402 for (s = 0; s < 3; s++)
1406 for (t = 0; t < plan->nthreads; t++)
1408 gmx_many_fft_destroy(plan->p1d[s][t]);
1414 free(plan->iNin[s]);
1415 plan->iNin[s] = nullptr;
1419 free(plan->oNin[s]);
1420 plan->oNin[s] = nullptr;
1424 free(plan->iNout[s]);
1425 plan->iNout[s] = nullptr;
1429 free(plan->oNout[s]);
1430 plan->oNout[s] = nullptr;
1435 # ifdef FFT5D_MPI_TRANSPOS
1436 for (s = 0; s < 2; s++)
1438 FFTW(destroy_plan)(plan->mpip[s]);
1440 # endif /* FFT5D_MPI_TRANSPOS */
1443 FFTW(destroy_plan)(plan->p3d);
1446 #endif /* GMX_FFT_FFTW3 */
1448 if (!(plan->flags & FFT5D_NOMALLOC))
1450 // only needed for PME GPU mixed mode
1451 if (plan->pinningPolicy == gmx::PinningPolicy::PinnedIfSupported && isHostMemoryPinned(plan->lin))
1453 gmx::unpinBuffer(plan->lin);
1455 sfree_aligned(plan->lin);
1456 sfree_aligned(plan->lout);
1457 if (plan->nthreads > 1)
1459 sfree_aligned(plan->lout2);
1460 sfree_aligned(plan->lout3);
1464 #ifdef FFT5D_THREADS
1465 # ifdef FFT5D_FFTW_THREADS
1466 /*FFTW(cleanup_threads)();*/
1473 /*Is this better than direct access of plan? enough data?
1474 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1475 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1486 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1487 of processor dimensions*/
1488 fft5d_plan fft5d_plan_3d_cart(int NG,
1500 MPI_Comm cart[2] = { MPI_COMM_NULL, MPI_COMM_NULL };
1502 int size = 1, prank = 0;
1505 int wrap[] = { 0, 0 };
1507 int rdim1[] = { 0, 1 }, rdim2[] = { 1, 0 };
1509 MPI_Comm_size(comm, &size);
1510 MPI_Comm_rank(comm, &prank);
1520 printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1526 P[1] = size / P0; /*number of processors in the two dimensions*/
1528 /*Difference between x-y-z regarding 2d decomposition is whether they are
1529 distributed along axis 1, 2 or both*/
1531 MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1532 MPI_Cart_get(gcart, 2, P, wrap, coor);
1533 MPI_Cart_sub(gcart, rdim1, &cart[0]);
1534 MPI_Cart_sub(gcart, rdim2, &cart[1]);
1539 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1543 /*prints in original coordinate system of data (as the input to FFT)*/
1544 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1546 int xs[3], xl[3], xc[3], NG[3];
1548 int* coor = plan->coor;
1549 int ll = 2; /*compare ll values per element (has to be 2 for complex)*/
1550 if ((plan->flags & FFT5D_REALCOMPLEX) && (plan->flags & FFT5D_BACKWARD))
1555 compute_offsets(plan, xs, xl, xc, NG, 2);
1556 if (plan->flags & FFT5D_DEBUG)
1558 printf("Compare2\n");
1560 for (z = 0; z < xl[2]; z++)
1562 for (y = 0; y < xl[1]; y++)
1564 if (plan->flags & FFT5D_DEBUG)
1566 printf("%d %d: ", coor[0], coor[1]);
1568 for (x = 0; x < xl[0]; x++)
1570 for (l = 0; l < ll; l++) /*loop over real/complex parts*/
1573 a = reinterpret_cast<const real*>(lin)[(z * xs[2] + y * xs[1]) * 2 + x * xs[0] * ll + l];
1576 a /= plan->rC[0] * plan->rC[1] * plan->rC[2];
1580 b = reinterpret_cast<const real*>(
1581 in)[((z + xc[2]) * NG[0] * NG[1] + (y + xc[1]) * NG[0]) * 2 + (x + xc[0]) * ll + l];
1585 b = reinterpret_cast<const real*>(
1586 in)[(z * xs[2] + y * xs[1]) * 2 + x * xs[0] * ll + l];
1588 if (plan->flags & FFT5D_DEBUG)
1590 printf("%f %f, ", a, b);
1594 if (std::fabs(a - b) > 2 * NG[0] * NG[1] * NG[2] * GMX_REAL_EPS)
1596 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",
1597 coor[0], coor[1], x, y, z, a, b);
1599 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1602 if (plan->flags & FFT5D_DEBUG)
1607 if (plan->flags & FFT5D_DEBUG)