2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2009-2018, The GROMACS development team.
5 * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
51 #include "gromacs/gpu_utils/gpu_utils.h"
52 #include "gromacs/gpu_utils/hostallocator.h"
53 #include "gromacs/gpu_utils/pinning.h"
54 #include "gromacs/utility/alignedallocator.h"
55 #include "gromacs/utility/exceptions.h"
56 #include "gromacs/utility/fatalerror.h"
57 #include "gromacs/utility/gmxassert.h"
58 #include "gromacs/utility/gmxmpi.h"
59 #include "gromacs/utility/smalloc.h"
62 # define GMX_PARALLEL_ENV_INITIALIZED 1
65 # define GMX_PARALLEL_ENV_INITIALIZED 1
67 # define GMX_PARALLEL_ENV_INITIALIZED 0
72 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
73 # define FFT5D_THREADS
74 /* requires fftw compiled with openmp */
75 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
78 #ifndef __FLT_EPSILON__
79 # define __FLT_EPSILON__ FLT_EPSILON
80 # define __DBL_EPSILON__ DBL_EPSILON
91 # include "gromacs/utility/exceptions.h"
93 /* none of the fftw3 calls, except execute(), are thread-safe, so
94 we need to serialize them with this mutex. */
95 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
96 static std::mutex big_fftw_mutex;
100 big_fftw_mutex.lock(); \
102 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
103 # define FFTW_UNLOCK \
106 big_fftw_mutex.unlock(); \
108 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
109 #endif /* GMX_FFT_FFTW3 */
112 /* largest factor smaller than sqrt */
113 static int lfactor(int z)
115 int i = static_cast<int>(sqrt(static_cast<double>(z)));
125 # if HAVE_GETTIMEOFDAY
126 # include <sys/time.h>
130 gettimeofday(&tv, nullptr);
131 return tv.tv_sec + tv.tv_usec * 1e-6;
141 static int vmax(const int* a, int s)
144 for (i = 0; i < s; i++)
155 /* NxMxK the size of the data
156 * comm communicator to use for fft5d
157 * P0 number of processor in 1st axes (can be null for automatic)
158 * lin is allocated by fft5d because size of array is only known after planning phase
159 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
161 fft5d_plan fft5d_plan_3d(int NG,
171 gmx::PinningPolicy realGridAllocationPinningPolicy)
174 int P[2], prank[2], i;
177 int *N0 = nullptr, *N1 = nullptr, *M0 = nullptr, *M1 = nullptr, *K0 = nullptr, *K1 = nullptr,
178 *oN0 = nullptr, *oN1 = nullptr, *oM0 = nullptr, *oM1 = nullptr, *oK0 = nullptr, *oK1 = nullptr;
179 int N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3],
180 *iNin[3] = { nullptr }, *oNin[3] = { nullptr }, *iNout[3] = { nullptr },
181 *oNout[3] = { nullptr };
182 int C[3], rC[3], nP[2];
184 t_complex *lin = nullptr, *lout = nullptr, *lout2 = nullptr, *lout3 = nullptr;
188 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
190 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
192 MPI_Comm_size(comm[0], &P[0]);
193 MPI_Comm_rank(comm[0], &prank[0]);
202 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
204 MPI_Comm_size(comm[1], &P[1]);
205 MPI_Comm_rank(comm[1], &prank[1]);
214 bMaster = prank[0] == 0 && prank[1] == 0;
219 fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n", P[0], P[1], prank[0], prank[1]);
227 "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order "
228 "yz: %d, debug %d\n",
234 int((flags & FFT5D_REALCOMPLEX) > 0),
235 int((flags & FFT5D_BACKWARD) > 0),
236 int((flags & FFT5D_ORDER_YZ) > 0),
237 int((flags & FFT5D_DEBUG) > 0));
239 /* The check below is not correct, one prime factor 11 or 13 is ok.
240 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
241 printf("WARNING: FFT very slow with prime factors larger 7\n");
242 printf("Change FFT size or in case you cannot change it look at\n");
243 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
248 if (NG == 0 || MG == 0 || KG == 0)
252 printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
261 if (flags & FFT5D_REALCOMPLEX)
263 if (!(flags & FFT5D_BACKWARD))
269 if (!(flags & FFT5D_ORDER_YZ))
281 /*for transpose we need to know the size for each processor not only our own size*/
283 N0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
284 N1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
285 M0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
286 M1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
287 K0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
288 K1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
289 oN0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
290 oN1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
291 oM0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
292 oM1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
293 oK0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
294 oK1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
296 for (i = 0; i < P[0]; i++)
300 oN0[i] = i * ceil((double)NG / P[0]);
301 oM0[i] = i * ceil((double)MG / P[0]);
302 oK0[i] = i * ceil((double)KG / P[0]);
304 oN0[i] = (NG * i) / P[0];
305 oM0[i] = (MG * i) / P[0];
306 oK0[i] = (KG * i) / P[0];
309 for (i = 0; i < P[1]; i++)
312 oN1[i] = i * ceil((double)NG / P[1]);
313 oM1[i] = i * ceil((double)MG / P[1]);
314 oK1[i] = i * ceil((double)KG / P[1]);
316 oN1[i] = (NG * i) / P[1];
317 oM1[i] = (MG * i) / P[1];
318 oK1[i] = (KG * i) / P[1];
321 for (i = 0; P[0] > 0 && i < P[0] - 1; i++)
323 N0[i] = oN0[i + 1] - oN0[i];
324 M0[i] = oM0[i + 1] - oM0[i];
325 K0[i] = oK0[i + 1] - oK0[i];
327 N0[P[0] - 1] = NG - oN0[P[0] - 1];
328 M0[P[0] - 1] = MG - oM0[P[0] - 1];
329 K0[P[0] - 1] = KG - oK0[P[0] - 1];
330 for (i = 0; P[1] > 0 && i < P[1] - 1; i++)
332 N1[i] = oN1[i + 1] - oN1[i];
333 M1[i] = oM1[i + 1] - oM1[i];
334 K1[i] = oK1[i + 1] - oK1[i];
336 N1[P[1] - 1] = NG - oN1[P[1] - 1];
337 M1[P[1] - 1] = MG - oM1[P[1] - 1];
338 K1[P[1] - 1] = KG - oK1[P[1] - 1];
340 /* for step 1-3 the local N,M,K sizes of the transposed system
341 C: contiguous dimension, and nP: number of processor in subcommunicator
344 GMX_ASSERT(prank[0] < P[0], "Must have valid rank within communicator size");
345 GMX_ASSERT(prank[1] < P[1], "Must have valid rank within communicator size");
346 pM[0] = M0[prank[0]];
347 oM[0] = oM0[prank[0]];
348 pK[0] = K1[prank[1]];
349 oK[0] = oK1[prank[1]];
352 if (!(flags & FFT5D_ORDER_YZ))
354 N[0] = vmax(N1, P[1]);
356 K[0] = vmax(K1, P[1]);
357 pN[0] = N1[prank[1]];
363 N[1] = vmax(K0, P[0]);
364 pN[1] = K0[prank[0]];
369 M[1] = vmax(M0, P[0]);
370 pM[1] = M0[prank[0]];
371 oM[1] = oM0[prank[0]];
373 pK[1] = N1[prank[1]];
374 oK[1] = oN1[prank[1]];
380 M[2] = vmax(K0, P[0]);
381 pM[2] = K0[prank[0]];
382 oM[2] = oK0[prank[0]];
383 K[2] = vmax(N1, P[1]);
384 pK[2] = N1[prank[1]];
385 oK[2] = oN1[prank[1]];
387 free(oN0); /*these are not used for this order*/
389 free(oM1); /*the rest is freed in destroy*/
393 N[0] = vmax(N0, P[0]);
394 M[0] = vmax(M0, P[0]);
396 pN[0] = N0[prank[0]];
402 N[1] = vmax(M1, P[1]);
403 pN[1] = M1[prank[1]];
409 pM[1] = N0[prank[0]];
410 oM[1] = oN0[prank[0]];
411 K[1] = vmax(K1, P[1]);
412 pK[1] = K1[prank[1]];
413 oK[1] = oK1[prank[1]];
419 M[2] = vmax(N0, P[0]);
420 pM[2] = N0[prank[0]];
421 oM[2] = oN0[prank[0]];
422 K[2] = vmax(M1, P[1]);
423 pK[2] = M1[prank[1]];
424 oK[2] = oM1[prank[1]];
426 free(oN1); /*these are not used for this order*/
428 free(oK0); /*the rest is freed in destroy*/
430 N[2] = pN[2] = -1; /*not used*/
433 Difference between x-y-z regarding 2d decomposition is whether they are
434 distributed along axis 1, 2 or both
437 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
438 lsize = std::max(N[0] * M[0] * K[0] * nP[0], std::max(N[1] * M[1] * K[1] * nP[1], C[2] * M[2] * K[2]));
439 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
440 if (!(flags & FFT5D_NOMALLOC))
442 // only needed for PME GPU mixed mode
443 if (GMX_GPU_CUDA && realGridAllocationPinningPolicy == gmx::PinningPolicy::PinnedIfSupported)
445 const std::size_t numBytes = lsize * sizeof(t_complex);
446 lin = static_cast<t_complex*>(gmx::PageAlignedAllocationPolicy::malloc(numBytes));
447 gmx::pinBuffer(lin, numBytes);
451 snew_aligned(lin, lsize, 32);
453 snew_aligned(lout, lsize, 32);
456 /* We need extra transpose buffers to avoid OpenMP barriers */
457 snew_aligned(lout2, lsize, 32);
458 snew_aligned(lout3, lsize, 32);
462 /* We can reuse the buffers to avoid cache misses */
483 plan = static_cast<fft5d_plan>(calloc(1, sizeof(struct fft5d_plan_t)));
488 fprintf(debug, "Running on %d threads\n", nthreads);
492 /* Don't add more stuff here! We have already had at least one bug because we are reimplementing
493 * the low-level FFT interface instead of using the Gromacs FFT module. If we need more
494 * generic functionality it is far better to extend the interface so we can use it for
495 * all FFT libraries instead of writing FFTW-specific code here.
498 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
499 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be
500 * called from within a parallel region. For now deactivated. If it should be supported it has
501 * to made sure that that the execute of the 3d plan is in a master/serial block (since it
502 * contains it own parallel region) and that the 3d plan is faster than the 1d plan.
504 if ((!(flags & FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1))
505 && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
507 int fftwflags = FFTW_DESTROY_INPUT;
509 int inNG = NG, outMG = MG, outKG = KG;
513 fftwflags |= (flags & FFT5D_NOMEASURE) ? FFTW_ESTIMATE : FFTW_MEASURE;
515 if (flags & FFT5D_REALCOMPLEX)
517 if (!(flags & FFT5D_BACKWARD)) /*input pointer is not complex*/
521 else /*output pointer is not complex*/
523 if (!(flags & FFT5D_ORDER_YZ))
534 if (!(flags & FFT5D_BACKWARD))
540 dims[0].is = inNG * MG; /*N M K*/
543 if (!(flags & FFT5D_ORDER_YZ))
545 dims[0].os = MG; /*M K N*/
547 dims[2].os = MG * KG;
551 dims[0].os = 1; /*K N M*/
552 dims[1].os = KG * NG;
558 if (!(flags & FFT5D_ORDER_YZ))
565 dims[1].is = NG * MG;
568 dims[0].os = outMG * KG;
580 dims[2].is = NG * MG;
582 dims[0].os = outKG * NG;
587 # ifdef FFT5D_THREADS
588 # ifdef FFT5D_FFTW_THREADS
589 FFTW(plan_with_nthreads)(nthreads);
592 if ((flags & FFT5D_REALCOMPLEX) && !(flags & FFT5D_BACKWARD))
594 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3,
597 /*howmany_dims*/ nullptr,
598 reinterpret_cast<real*>(lin),
599 reinterpret_cast<FFTW(complex)*>(lout),
600 /*flags*/ fftwflags);
602 else if ((flags & FFT5D_REALCOMPLEX) && (flags & FFT5D_BACKWARD))
604 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3,
607 /*howmany_dims*/ nullptr,
608 reinterpret_cast<FFTW(complex)*>(lin),
609 reinterpret_cast<real*>(lout),
610 /*flags*/ fftwflags);
614 plan->p3d = FFTW(plan_guru_dft)(
618 /*howmany_dims*/ nullptr,
619 reinterpret_cast<FFTW(complex)*>(lin),
620 reinterpret_cast<FFTW(complex)*>(lout),
621 /*sign*/ (flags & FFT5D_BACKWARD) ? 1 : -1,
622 /*flags*/ fftwflags);
624 # ifdef FFT5D_THREADS
625 # ifdef FFT5D_FFTW_THREADS
626 FFTW(plan_with_nthreads)(1);
631 if (!plan->p3d) /* for decomposition and if 3d plan did not work */
633 #endif /* GMX_FFT_FFTW3 */
634 for (s = 0; s < 3; s++)
638 fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n", s, rC[s], M[s], pK[s], C[s], lsize);
640 plan->p1d[s] = static_cast<gmx_fft_t*>(malloc(sizeof(gmx_fft_t) * nthreads));
642 /* Make sure that the init routines are only called by one thread at a time and in order
643 (later is only important to not confuse valgrind)
645 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
646 for (int t = 0; t < nthreads; t++)
652 int tsize = ((t + 1) * pM[s] * pK[s] / nthreads) - (t * pM[s] * pK[s] / nthreads);
654 if ((flags & FFT5D_REALCOMPLEX)
655 && ((!(flags & FFT5D_BACKWARD) && s == 0)
656 || ((flags & FFT5D_BACKWARD) && s == 2)))
658 gmx_fft_init_many_1d_real(
662 (flags & FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0);
666 gmx_fft_init_many_1d(&plan->p1d[s][t],
669 (flags & FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0);
672 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
680 if ((flags & FFT5D_ORDER_YZ)) /*plan->cart is in the order of transposes */
682 plan->cart[0] = comm[0];
683 plan->cart[1] = comm[1];
687 plan->cart[1] = comm[0];
688 plan->cart[0] = comm[1];
690 #ifdef FFT5D_MPI_TRANSPOSE
692 for (s = 0; s < 2; s++)
694 if ((s == 0 && !(flags & FFT5D_ORDER_YZ)) || (s == 1 && (flags & FFT5D_ORDER_YZ)))
696 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(
697 nP[s], nP[s], N[s] * K[s] * pM[s] * 2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
701 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(
702 nP[s], nP[s], N[s] * pK[s] * M[s] * 2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
718 for (s = 0; s < 3; s++)
730 plan->iNin[s] = iNin[s];
731 plan->oNin[s] = oNin[s];
732 plan->iNout[s] = iNout[s];
733 plan->oNout[s] = oNout[s];
735 for (s = 0; s < 2; s++)
738 plan->coor[s] = prank[s];
741 /* plan->fftorder=fftorder;
742 plan->direction=direction;
743 plan->realcomplex=realcomplex;
746 plan->nthreads = nthreads;
747 plan->pinningPolicy = realGridAllocationPinningPolicy;
767 /*here x,y,z and N,M,K is in rotated coordinate system!!
768 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
769 maxN,maxM,maxK is max size of local data
770 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
771 NG, MG, KG is size of global data*/
772 static void splitaxes(t_complex* lout,
773 const t_complex* lin,
788 int in_i, out_i, in_z, out_z, in_y, out_y;
791 for (z = startz; z < endz + 1; z++) /*3. z l*/
809 out_z = z * maxN * maxM;
812 for (i = 0; i < P; i++) /*index cube along long axis*/
814 out_i = out_z + i * maxN * maxM * maxK;
816 for (y = s_y; y < e_y; y++) /*2. y k*/
818 out_y = out_i + y * maxN;
819 in_y = in_i + y * NG;
820 for (x = 0; x < N[i]; x++) /*1. x j*/
822 lout[out_y + x] = lin[in_y + x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
823 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
824 /*before split data contiguos - thus if different processor get different amount oN is different*/
831 /*make axis contiguous again (after AllToAll) and also do local transpose*/
832 /*transpose mayor and major dimension
834 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
835 N,M,K local dimensions
837 static void joinAxesTrans13(t_complex* lout,
838 const t_complex* lin,
853 int out_i, in_i, out_x, in_x, out_z, in_z;
856 for (x = startx; x < endx + 1; x++) /*1.j*/
878 for (i = 0; i < P; i++) /*index cube along long axis*/
880 out_i = out_x + oK[i];
881 in_i = in_x + i * maxM * maxN * maxK;
882 for (z = 0; z < K[i]; z++) /*3.l*/
885 in_z = in_i + z * maxM * maxN;
886 for (y = s_y; y < e_y; y++) /*2.k*/
888 lout[out_z + y * KG] = lin[in_z + y * maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
895 /*make axis contiguous again (after AllToAll) and also do local transpose
896 tranpose mayor and middle dimension
898 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
901 static void joinAxesTrans12(t_complex* lout,
902 const t_complex* lin,
917 int out_i, in_i, out_z, in_z, out_x, in_x;
920 for (z = startz; z < endz + 1; z++)
939 in_z = z * maxM * maxN;
941 for (i = 0; i < P; i++) /*index cube along long axis*/
943 out_i = out_z + oM[i];
944 in_i = in_z + i * maxM * maxN * maxK;
945 for (x = s_x; x < e_x; x++)
947 out_x = out_i + x * MG;
949 for (y = 0; y < M[i]; y++)
951 lout[out_x + y] = lin[in_x + y * maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
959 static void rotate_offsets(int x[])
970 /*compute the offset to compare or print transposed local data in original input coordinates
971 xs matrix dimension size, xl dimension length, xc decomposition offset
972 s: step in computation = number of transposes*/
973 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
975 /* int direction = plan->direction;
976 int fftorder = plan->fftorder;*/
980 int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK, *C = plan->C, *rC = plan->rC;
986 if (!(plan->flags & FFT5D_ORDER_YZ))
990 case 0: o = XYZ; break;
991 case 1: o = ZYX; break;
992 case 2: o = YZX; break;
1000 case 0: o = XYZ; break;
1001 case 1: o = YXZ; break;
1002 case 2: o = ZXY; break;
1040 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
1042 /*xs, xl give dimension size and data length in local transposed coordinate system
1043 for 0(/1/2): x(/y/z) in original coordinate system*/
1044 for (i = 0; i < 3; i++)
1059 xs[i] = C[s] * pM[s];
1065 /*input order is different for test program to match FFTW order
1066 (important for complex to real)*/
1067 if (plan->flags & FFT5D_BACKWARD)
1073 if (plan->flags & FFT5D_ORDER_YZ)
1081 if ((plan->flags & FFT5D_REALCOMPLEX)
1082 && ((!(plan->flags & FFT5D_BACKWARD) && s == 0) || ((plan->flags & FFT5D_BACKWARD) && s == 2)))
1088 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
1091 int* coor = plan->coor;
1092 int xs[3], xl[3], xc[3], NG[3];
1093 int ll = (plan->flags & FFT5D_REALCOMPLEX) ? 1 : 2;
1094 compute_offsets(plan, xs, xl, xc, NG, s);
1095 fprintf(debug, txt, coor[0], coor[1]);
1096 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
1097 for (z = 0; z < xl[2]; z++)
1099 for (y = 0; y < xl[1]; y++)
1101 fprintf(debug, "%d %d: ", coor[0], coor[1]);
1102 for (x = 0; x < xl[0]; x++)
1104 for (l = 0; l < ll; l++)
1108 reinterpret_cast<const real*>(
1109 lin)[(z * xs[2] + y * xs[1]) * 2 + (x * xs[0]) * ll + l]);
1111 fprintf(debug, ",");
1113 fprintf(debug, "\n");
1118 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
1120 t_complex* lin = plan->lin;
1121 t_complex* lout = plan->lout;
1122 t_complex* lout2 = plan->lout2;
1123 t_complex* lout3 = plan->lout3;
1124 t_complex *fftout, *joinin;
1126 gmx_fft_t** p1d = plan->p1d;
1127 #ifdef FFT5D_MPI_TRANSPOSE
1128 FFTW(plan)* mpip = plan->mpip;
1131 MPI_Comm* cart = plan->cart;
1134 double time_fft = 0, time_local = 0, time_mpi[2] = { 0 }, time = 0;
1136 int *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
1137 *C = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout,
1138 **oNout = plan->oNout;
1139 int s = 0, tstart, tend, bParallelDim;
1153 FFTW(execute)(plan->p3d);
1157 times->fft += MPI_Wtime() - time;
1168 if ((plan->flags & FFT5D_DEBUG) && thread == 0)
1170 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1173 for (s = 0; s < 2; s++) /*loop over first two FFT steps (corner rotations)*/
1177 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1187 /* ---------- START FFT ------------ */
1189 if (times != 0 && thread == 0)
1195 if (bParallelDim || plan->nthreads == 1)
1211 tstart = (thread * pM[s] * pK[s] / plan->nthreads) * C[s];
1212 if ((plan->flags & FFT5D_REALCOMPLEX) && !(plan->flags & FFT5D_BACKWARD) && s == 0)
1214 gmx_fft_many_1d_real(p1d[s][thread],
1215 (plan->flags & FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL
1216 : GMX_FFT_REAL_TO_COMPLEX,
1222 gmx_fft_many_1d(p1d[s][thread],
1223 (plan->flags & FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,
1229 if (times != NULL && thread == 0)
1231 time_fft += MPI_Wtime() - time;
1234 if ((plan->flags & FFT5D_DEBUG) && thread == 0)
1236 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1238 /* ---------- END FFT ------------ */
1240 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1244 if (times != NULL && thread == 0)
1251 1. (most outer) axes (x) is split into P[s] parts of size N[s]
1255 tend = ((thread + 1) * pM[s] * pK[s] / plan->nthreads);
1272 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1274 if (times != NULL && thread == 0)
1276 time_local += MPI_Wtime() - time;
1280 /* ---------- END SPLIT , START TRANSPOSE------------ */
1290 wallcycle_start(times, WallCycleCounter::PmeFftComm);
1292 #ifdef FFT5D_MPI_TRANSPOSE
1293 FFTW(execute)(mpip[s]);
1296 if ((s == 0 && !(plan->flags & FFT5D_ORDER_YZ))
1297 || (s == 1 && (plan->flags & FFT5D_ORDER_YZ)))
1299 MPI_Alltoall(reinterpret_cast<real*>(lout2),
1300 N[s] * pM[s] * K[s] * sizeof(t_complex) / sizeof(real),
1302 reinterpret_cast<real*>(lout3),
1303 N[s] * pM[s] * K[s] * sizeof(t_complex) / sizeof(real),
1309 MPI_Alltoall(reinterpret_cast<real*>(lout2),
1310 N[s] * M[s] * pK[s] * sizeof(t_complex) / sizeof(real),
1312 reinterpret_cast<real*>(lout3),
1313 N[s] * M[s] * pK[s] * sizeof(t_complex) / sizeof(real),
1318 GMX_RELEASE_ASSERT(false, "Invalid call to fft5d_execute");
1320 #endif /*FFT5D_MPI_TRANSPOSE*/
1324 time_mpi[s] = MPI_Wtime() - time;
1327 wallcycle_stop(times, WallCycleCounter::PmeFftComm);
1331 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1333 /* ---------- END SPLIT + TRANSPOSE------------ */
1335 /* ---------- START JOIN ------------ */
1337 if (times != NULL && thread == 0)
1351 /*bring back in matrix form
1352 thus make new 1. axes contiguos
1353 also local transpose 1 and 2/3
1354 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1356 if ((s == 0 && !(plan->flags & FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags & FFT5D_ORDER_YZ)))
1360 tstart = (thread * pM[s] * pN[s] / plan->nthreads);
1361 tend = ((thread + 1) * pM[s] * pN[s] / plan->nthreads);
1362 joinAxesTrans13(lin,
1382 tstart = (thread * pK[s] * pN[s] / plan->nthreads);
1383 tend = ((thread + 1) * pK[s] * pN[s] / plan->nthreads);
1384 joinAxesTrans12(lin,
1402 if (times != NULL && thread == 0)
1404 time_local += MPI_Wtime() - time;
1407 if ((plan->flags & FFT5D_DEBUG) && thread == 0)
1409 print_localdata(lin, "%d %d: tranposed %d\n", s + 1, plan);
1411 /* ---------- END JOIN ------------ */
1413 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1414 } /* for(s=0;s<2;s++) */
1416 if (times != NULL && thread == 0)
1422 if (plan->flags & FFT5D_INPLACE)
1424 lout = lin; /*in place currently not supported*/
1426 /* ----------- FFT ----------- */
1427 tstart = (thread * pM[s] * pK[s] / plan->nthreads) * C[s];
1428 if ((plan->flags & FFT5D_REALCOMPLEX) && (plan->flags & FFT5D_BACKWARD))
1430 gmx_fft_many_1d_real(p1d[s][thread],
1431 (plan->flags & FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX,
1437 gmx_fft_many_1d(p1d[s][thread],
1438 (plan->flags & FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,
1442 /* ------------ END FFT ---------*/
1445 if (times != NULL && thread == 0)
1447 time_fft += MPI_Wtime() - time;
1449 times->fft += time_fft;
1450 times->local += time_local;
1451 times->mpi2 += time_mpi[1];
1452 times->mpi1 += time_mpi[0];
1456 if ((plan->flags & FFT5D_DEBUG) && thread == 0)
1458 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1462 void fft5d_destroy(fft5d_plan plan)
1466 for (s = 0; s < 3; s++)
1470 for (t = 0; t < plan->nthreads; t++)
1472 gmx_many_fft_destroy(plan->p1d[s][t]);
1478 free(plan->iNin[s]);
1479 plan->iNin[s] = nullptr;
1483 free(plan->oNin[s]);
1484 plan->oNin[s] = nullptr;
1488 free(plan->iNout[s]);
1489 plan->iNout[s] = nullptr;
1493 free(plan->oNout[s]);
1494 plan->oNout[s] = nullptr;
1499 # ifdef FFT5D_MPI_TRANSPOS
1500 for (s = 0; s < 2; s++)
1502 FFTW(destroy_plan)(plan->mpip[s]);
1504 # endif /* FFT5D_MPI_TRANSPOS */
1507 FFTW(destroy_plan)(plan->p3d);
1510 #endif /* GMX_FFT_FFTW3 */
1512 if (!(plan->flags & FFT5D_NOMALLOC))
1514 // only needed for PME GPU mixed mode
1515 if (plan->pinningPolicy == gmx::PinningPolicy::PinnedIfSupported && isHostMemoryPinned(plan->lin))
1517 gmx::unpinBuffer(plan->lin);
1519 sfree_aligned(plan->lin);
1520 sfree_aligned(plan->lout);
1521 if (plan->nthreads > 1)
1523 sfree_aligned(plan->lout2);
1524 sfree_aligned(plan->lout3);
1528 #ifdef FFT5D_THREADS
1529 # ifdef FFT5D_FFTW_THREADS
1530 /*FFTW(cleanup_threads)();*/
1537 /*Is this better than direct access of plan? enough data?
1538 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1539 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1550 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1551 of processor dimensions*/
1552 fft5d_plan fft5d_plan_3d_cart(int NG,
1564 MPI_Comm cart[2] = { MPI_COMM_NULL, MPI_COMM_NULL };
1566 int size = 1, prank = 0;
1569 int wrap[] = { 0, 0 };
1571 int rdim1[] = { 0, 1 }, rdim2[] = { 1, 0 };
1573 MPI_Comm_size(comm, &size);
1574 MPI_Comm_rank(comm, &prank);
1584 printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1590 P[1] = size / P0; /*number of processors in the two dimensions*/
1592 /*Difference between x-y-z regarding 2d decomposition is whether they are
1593 distributed along axis 1, 2 or both*/
1595 MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1596 MPI_Cart_get(gcart, 2, P, wrap, coor);
1597 MPI_Cart_sub(gcart, rdim1, &cart[0]);
1598 MPI_Cart_sub(gcart, rdim2, &cart[1]);
1603 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1607 /*prints in original coordinate system of data (as the input to FFT)*/
1608 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1610 int xs[3], xl[3], xc[3], NG[3];
1612 int* coor = plan->coor;
1613 int ll = 2; /*compare ll values per element (has to be 2 for complex)*/
1614 if ((plan->flags & FFT5D_REALCOMPLEX) && (plan->flags & FFT5D_BACKWARD))
1619 compute_offsets(plan, xs, xl, xc, NG, 2);
1620 if (plan->flags & FFT5D_DEBUG)
1622 printf("Compare2\n");
1624 for (z = 0; z < xl[2]; z++)
1626 for (y = 0; y < xl[1]; y++)
1628 if (plan->flags & FFT5D_DEBUG)
1630 printf("%d %d: ", coor[0], coor[1]);
1632 for (x = 0; x < xl[0]; x++)
1634 for (l = 0; l < ll; l++) /*loop over real/complex parts*/
1637 a = reinterpret_cast<const real*>(lin)[(z * xs[2] + y * xs[1]) * 2 + x * xs[0] * ll + l];
1640 a /= plan->rC[0] * plan->rC[1] * plan->rC[2];
1644 b = reinterpret_cast<const real*>(
1645 in)[((z + xc[2]) * NG[0] * NG[1] + (y + xc[1]) * NG[0]) * 2 + (x + xc[0]) * ll + l];
1649 b = reinterpret_cast<const real*>(
1650 in)[(z * xs[2] + y * xs[1]) * 2 + x * xs[0] * ll + l];
1652 if (plan->flags & FFT5D_DEBUG)
1654 printf("%f %f, ", a, b);
1658 if (std::fabs(a - b) > 2 * NG[0] * NG[1] * NG[2] * GMX_REAL_EPS)
1660 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",
1669 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1672 if (plan->flags & FFT5D_DEBUG)
1677 if (plan->flags & FFT5D_DEBUG)