2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2009-2018, The GROMACS development team.
5 * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
51 #include "gromacs/gpu_utils/gpu_utils.h"
52 #include "gromacs/gpu_utils/hostallocator.h"
53 #include "gromacs/gpu_utils/pinning.h"
54 #include "gromacs/utility/alignedallocator.h"
55 #include "gromacs/utility/exceptions.h"
56 #include "gromacs/utility/fatalerror.h"
57 #include "gromacs/utility/gmxassert.h"
58 #include "gromacs/utility/gmxmpi.h"
59 #include "gromacs/utility/smalloc.h"
62 # define GMX_PARALLEL_ENV_INITIALIZED 1
65 # define GMX_PARALLEL_ENV_INITIALIZED 1
67 # define GMX_PARALLEL_ENV_INITIALIZED 0
72 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
73 # define FFT5D_THREADS
74 /* requires fftw compiled with openmp */
75 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
78 #ifndef __FLT_EPSILON__
79 # define __FLT_EPSILON__ FLT_EPSILON
80 # define __DBL_EPSILON__ DBL_EPSILON
91 # include "gromacs/utility/exceptions.h"
93 /* none of the fftw3 calls, except execute(), are thread-safe, so
94 we need to serialize them with this mutex. */
95 static std::mutex big_fftw_mutex;
99 big_fftw_mutex.lock(); \
101 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
102 # define FFTW_UNLOCK \
105 big_fftw_mutex.unlock(); \
107 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
108 #endif /* GMX_FFT_FFTW3 */
111 /* largest factor smaller than sqrt */
112 static int lfactor(int z)
114 int i = static_cast<int>(sqrt(static_cast<double>(z)));
124 # if HAVE_GETTIMEOFDAY
125 # include <sys/time.h>
129 gettimeofday(&tv, nullptr);
130 return tv.tv_sec + tv.tv_usec * 1e-6;
140 static int vmax(const int* a, int s)
143 for (i = 0; i < s; i++)
154 /* NxMxK the size of the data
155 * comm communicator to use for fft5d
156 * P0 number of processor in 1st axes (can be null for automatic)
157 * lin is allocated by fft5d because size of array is only known after planning phase
158 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
160 fft5d_plan fft5d_plan_3d(int NG,
170 gmx::PinningPolicy realGridAllocationPinningPolicy)
173 int P[2], prank[2], i;
176 int *N0 = nullptr, *N1 = nullptr, *M0 = nullptr, *M1 = nullptr, *K0 = nullptr, *K1 = nullptr,
177 *oN0 = nullptr, *oN1 = nullptr, *oM0 = nullptr, *oM1 = nullptr, *oK0 = nullptr, *oK1 = nullptr;
178 int N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3],
179 *iNin[3] = { nullptr }, *oNin[3] = { nullptr }, *iNout[3] = { nullptr },
180 *oNout[3] = { nullptr };
181 int C[3], rC[3], nP[2];
183 t_complex *lin = nullptr, *lout = nullptr, *lout2 = nullptr, *lout3 = nullptr;
187 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
189 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
191 MPI_Comm_size(comm[0], &P[0]);
192 MPI_Comm_rank(comm[0], &prank[0]);
201 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
203 MPI_Comm_size(comm[1], &P[1]);
204 MPI_Comm_rank(comm[1], &prank[1]);
213 bMaster = prank[0] == 0 && prank[1] == 0;
218 fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n", P[0], P[1], prank[0], prank[1]);
226 "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order "
227 "yz: %d, debug %d\n",
233 int((flags & FFT5D_REALCOMPLEX) > 0),
234 int((flags & FFT5D_BACKWARD) > 0),
235 int((flags & FFT5D_ORDER_YZ) > 0),
236 int((flags & FFT5D_DEBUG) > 0));
238 /* The check below is not correct, one prime factor 11 or 13 is ok.
239 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
240 printf("WARNING: FFT very slow with prime factors larger 7\n");
241 printf("Change FFT size or in case you cannot change it look at\n");
242 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
247 if (NG == 0 || MG == 0 || KG == 0)
251 printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
260 if (flags & FFT5D_REALCOMPLEX)
262 if (!(flags & FFT5D_BACKWARD))
268 if (!(flags & FFT5D_ORDER_YZ))
280 /*for transpose we need to know the size for each processor not only our own size*/
282 N0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
283 N1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
284 M0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
285 M1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
286 K0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
287 K1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
288 oN0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
289 oN1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
290 oM0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
291 oM1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
292 oK0 = static_cast<int*>(malloc(P[0] * sizeof(int)));
293 oK1 = static_cast<int*>(malloc(P[1] * sizeof(int)));
295 for (i = 0; i < P[0]; i++)
299 oN0[i] = i * ceil((double)NG / P[0]);
300 oM0[i] = i * ceil((double)MG / P[0]);
301 oK0[i] = i * ceil((double)KG / P[0]);
303 oN0[i] = (NG * i) / P[0];
304 oM0[i] = (MG * i) / P[0];
305 oK0[i] = (KG * i) / P[0];
308 for (i = 0; i < P[1]; i++)
311 oN1[i] = i * ceil((double)NG / P[1]);
312 oM1[i] = i * ceil((double)MG / P[1]);
313 oK1[i] = i * ceil((double)KG / P[1]);
315 oN1[i] = (NG * i) / P[1];
316 oM1[i] = (MG * i) / P[1];
317 oK1[i] = (KG * i) / P[1];
320 for (i = 0; P[0] > 0 && i < P[0] - 1; i++)
322 N0[i] = oN0[i + 1] - oN0[i];
323 M0[i] = oM0[i + 1] - oM0[i];
324 K0[i] = oK0[i + 1] - oK0[i];
326 N0[P[0] - 1] = NG - oN0[P[0] - 1];
327 M0[P[0] - 1] = MG - oM0[P[0] - 1];
328 K0[P[0] - 1] = KG - oK0[P[0] - 1];
329 for (i = 0; P[1] > 0 && i < P[1] - 1; i++)
331 N1[i] = oN1[i + 1] - oN1[i];
332 M1[i] = oM1[i + 1] - oM1[i];
333 K1[i] = oK1[i + 1] - oK1[i];
335 N1[P[1] - 1] = NG - oN1[P[1] - 1];
336 M1[P[1] - 1] = MG - oM1[P[1] - 1];
337 K1[P[1] - 1] = KG - oK1[P[1] - 1];
339 /* for step 1-3 the local N,M,K sizes of the transposed system
340 C: contiguous dimension, and nP: number of processor in subcommunicator
343 GMX_ASSERT(prank[0] < P[0], "Must have valid rank within communicator size");
344 GMX_ASSERT(prank[1] < P[1], "Must have valid rank within communicator size");
345 pM[0] = M0[prank[0]];
346 oM[0] = oM0[prank[0]];
347 pK[0] = K1[prank[1]];
348 oK[0] = oK1[prank[1]];
351 if (!(flags & FFT5D_ORDER_YZ))
353 N[0] = vmax(N1, P[1]);
355 K[0] = vmax(K1, P[1]);
356 pN[0] = N1[prank[1]];
362 N[1] = vmax(K0, P[0]);
363 pN[1] = K0[prank[0]];
368 M[1] = vmax(M0, P[0]);
369 pM[1] = M0[prank[0]];
370 oM[1] = oM0[prank[0]];
372 pK[1] = N1[prank[1]];
373 oK[1] = oN1[prank[1]];
379 M[2] = vmax(K0, P[0]);
380 pM[2] = K0[prank[0]];
381 oM[2] = oK0[prank[0]];
382 K[2] = vmax(N1, P[1]);
383 pK[2] = N1[prank[1]];
384 oK[2] = oN1[prank[1]];
386 free(oN0); /*these are not used for this order*/
388 free(oM1); /*the rest is freed in destroy*/
392 N[0] = vmax(N0, P[0]);
393 M[0] = vmax(M0, P[0]);
395 pN[0] = N0[prank[0]];
401 N[1] = vmax(M1, P[1]);
402 pN[1] = M1[prank[1]];
408 pM[1] = N0[prank[0]];
409 oM[1] = oN0[prank[0]];
410 K[1] = vmax(K1, P[1]);
411 pK[1] = K1[prank[1]];
412 oK[1] = oK1[prank[1]];
418 M[2] = vmax(N0, P[0]);
419 pM[2] = N0[prank[0]];
420 oM[2] = oN0[prank[0]];
421 K[2] = vmax(M1, P[1]);
422 pK[2] = M1[prank[1]];
423 oK[2] = oM1[prank[1]];
425 free(oN1); /*these are not used for this order*/
427 free(oK0); /*the rest is freed in destroy*/
429 N[2] = pN[2] = -1; /*not used*/
432 Difference between x-y-z regarding 2d decomposition is whether they are
433 distributed along axis 1, 2 or both
436 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
437 lsize = std::max(N[0] * M[0] * K[0] * nP[0], std::max(N[1] * M[1] * K[1] * nP[1], C[2] * M[2] * K[2]));
438 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
439 if (!(flags & FFT5D_NOMALLOC))
441 // only needed for PME GPU mixed mode
442 if (GMX_GPU_CUDA && realGridAllocationPinningPolicy == gmx::PinningPolicy::PinnedIfSupported)
444 const std::size_t numBytes = lsize * sizeof(t_complex);
445 lin = static_cast<t_complex*>(gmx::PageAlignedAllocationPolicy::malloc(numBytes));
446 gmx::pinBuffer(lin, numBytes);
450 snew_aligned(lin, lsize, 32);
452 snew_aligned(lout, lsize, 32);
455 /* We need extra transpose buffers to avoid OpenMP barriers */
456 snew_aligned(lout2, lsize, 32);
457 snew_aligned(lout3, lsize, 32);
461 /* We can reuse the buffers to avoid cache misses */
482 plan = static_cast<fft5d_plan>(calloc(1, sizeof(struct fft5d_plan_t)));
487 fprintf(debug, "Running on %d threads\n", nthreads);
491 /* Don't add more stuff here! We have already had at least one bug because we are reimplementing
492 * the low-level FFT interface instead of using the Gromacs FFT module. If we need more
493 * generic functionality it is far better to extend the interface so we can use it for
494 * all FFT libraries instead of writing FFTW-specific code here.
497 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
498 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be
499 * called from within a parallel region. For now deactivated. If it should be supported it has
500 * to made sure that that the execute of the 3d plan is in a master/serial block (since it
501 * contains it own parallel region) and that the 3d plan is faster than the 1d plan.
503 if ((!(flags & FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1))
504 && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
506 int fftwflags = FFTW_DESTROY_INPUT;
508 int inNG = NG, outMG = MG, outKG = KG;
512 fftwflags |= (flags & FFT5D_NOMEASURE) ? FFTW_ESTIMATE : FFTW_MEASURE;
514 if (flags & FFT5D_REALCOMPLEX)
516 if (!(flags & FFT5D_BACKWARD)) /*input pointer is not complex*/
520 else /*output pointer is not complex*/
522 if (!(flags & FFT5D_ORDER_YZ))
533 if (!(flags & FFT5D_BACKWARD))
539 dims[0].is = inNG * MG; /*N M K*/
542 if (!(flags & FFT5D_ORDER_YZ))
544 dims[0].os = MG; /*M K N*/
546 dims[2].os = MG * KG;
550 dims[0].os = 1; /*K N M*/
551 dims[1].os = KG * NG;
557 if (!(flags & FFT5D_ORDER_YZ))
564 dims[1].is = NG * MG;
567 dims[0].os = outMG * KG;
579 dims[2].is = NG * MG;
581 dims[0].os = outKG * NG;
586 # ifdef FFT5D_THREADS
587 # ifdef FFT5D_FFTW_THREADS
588 FFTW(plan_with_nthreads)(nthreads);
591 if ((flags & FFT5D_REALCOMPLEX) && !(flags & FFT5D_BACKWARD))
593 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3,
596 /*howmany_dims*/ nullptr,
597 reinterpret_cast<real*>(lin),
598 reinterpret_cast<FFTW(complex)*>(lout),
599 /*flags*/ fftwflags);
601 else if ((flags & FFT5D_REALCOMPLEX) && (flags & FFT5D_BACKWARD))
603 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3,
606 /*howmany_dims*/ nullptr,
607 reinterpret_cast<FFTW(complex)*>(lin),
608 reinterpret_cast<real*>(lout),
609 /*flags*/ fftwflags);
613 plan->p3d = FFTW(plan_guru_dft)(
617 /*howmany_dims*/ nullptr,
618 reinterpret_cast<FFTW(complex)*>(lin),
619 reinterpret_cast<FFTW(complex)*>(lout),
620 /*sign*/ (flags & FFT5D_BACKWARD) ? 1 : -1,
621 /*flags*/ fftwflags);
623 # ifdef FFT5D_THREADS
624 # ifdef FFT5D_FFTW_THREADS
625 FFTW(plan_with_nthreads)(1);
630 if (!plan->p3d) /* for decomposition and if 3d plan did not work */
632 #endif /* GMX_FFT_FFTW3 */
633 for (s = 0; s < 3; s++)
637 fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n", s, rC[s], M[s], pK[s], C[s], lsize);
639 plan->p1d[s] = static_cast<gmx_fft_t*>(malloc(sizeof(gmx_fft_t) * nthreads));
641 /* Make sure that the init routines are only called by one thread at a time and in order
642 (later is only important to not confuse valgrind)
644 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
645 for (int t = 0; t < nthreads; t++)
651 int tsize = ((t + 1) * pM[s] * pK[s] / nthreads) - (t * pM[s] * pK[s] / nthreads);
653 if ((flags & FFT5D_REALCOMPLEX)
654 && ((!(flags & FFT5D_BACKWARD) && s == 0)
655 || ((flags & FFT5D_BACKWARD) && s == 2)))
657 gmx_fft_init_many_1d_real(
661 (flags & FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0);
665 gmx_fft_init_many_1d(&plan->p1d[s][t],
668 (flags & FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0);
671 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
679 if ((flags & FFT5D_ORDER_YZ)) /*plan->cart is in the order of transposes */
681 plan->cart[0] = comm[0];
682 plan->cart[1] = comm[1];
686 plan->cart[1] = comm[0];
687 plan->cart[0] = comm[1];
689 #ifdef FFT5D_MPI_TRANSPOSE
691 for (s = 0; s < 2; s++)
693 if ((s == 0 && !(flags & FFT5D_ORDER_YZ)) || (s == 1 && (flags & FFT5D_ORDER_YZ)))
695 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(
696 nP[s], nP[s], N[s] * K[s] * pM[s] * 2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
700 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(
701 nP[s], nP[s], N[s] * pK[s] * M[s] * 2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
717 for (s = 0; s < 3; s++)
729 plan->iNin[s] = iNin[s];
730 plan->oNin[s] = oNin[s];
731 plan->iNout[s] = iNout[s];
732 plan->oNout[s] = oNout[s];
734 for (s = 0; s < 2; s++)
737 plan->coor[s] = prank[s];
740 /* plan->fftorder=fftorder;
741 plan->direction=direction;
742 plan->realcomplex=realcomplex;
745 plan->nthreads = nthreads;
746 plan->pinningPolicy = realGridAllocationPinningPolicy;
766 /*here x,y,z and N,M,K is in rotated coordinate system!!
767 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
768 maxN,maxM,maxK is max size of local data
769 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
770 NG, MG, KG is size of global data*/
771 static void splitaxes(t_complex* lout,
772 const t_complex* lin,
787 int in_i, out_i, in_z, out_z, in_y, out_y;
790 for (z = startz; z < endz + 1; z++) /*3. z l*/
808 out_z = z * maxN * maxM;
811 for (i = 0; i < P; i++) /*index cube along long axis*/
813 out_i = out_z + i * maxN * maxM * maxK;
815 for (y = s_y; y < e_y; y++) /*2. y k*/
817 out_y = out_i + y * maxN;
818 in_y = in_i + y * NG;
819 for (x = 0; x < N[i]; x++) /*1. x j*/
821 lout[out_y + x] = lin[in_y + x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
822 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
823 /*before split data contiguos - thus if different processor get different amount oN is different*/
830 /*make axis contiguous again (after AllToAll) and also do local transpose*/
831 /*transpose mayor and major dimension
833 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
834 N,M,K local dimensions
836 static void joinAxesTrans13(t_complex* lout,
837 const t_complex* lin,
852 int out_i, in_i, out_x, in_x, out_z, in_z;
855 for (x = startx; x < endx + 1; x++) /*1.j*/
877 for (i = 0; i < P; i++) /*index cube along long axis*/
879 out_i = out_x + oK[i];
880 in_i = in_x + i * maxM * maxN * maxK;
881 for (z = 0; z < K[i]; z++) /*3.l*/
884 in_z = in_i + z * maxM * maxN;
885 for (y = s_y; y < e_y; y++) /*2.k*/
887 lout[out_z + y * KG] = lin[in_z + y * maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
894 /*make axis contiguous again (after AllToAll) and also do local transpose
895 tranpose mayor and middle dimension
897 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
900 static void joinAxesTrans12(t_complex* lout,
901 const t_complex* lin,
916 int out_i, in_i, out_z, in_z, out_x, in_x;
919 for (z = startz; z < endz + 1; z++)
938 in_z = z * maxM * maxN;
940 for (i = 0; i < P; i++) /*index cube along long axis*/
942 out_i = out_z + oM[i];
943 in_i = in_z + i * maxM * maxN * maxK;
944 for (x = s_x; x < e_x; x++)
946 out_x = out_i + x * MG;
948 for (y = 0; y < M[i]; y++)
950 lout[out_x + y] = lin[in_x + y * maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
958 static void rotate_offsets(int x[])
969 /*compute the offset to compare or print transposed local data in original input coordinates
970 xs matrix dimension size, xl dimension length, xc decomposition offset
971 s: step in computation = number of transposes*/
972 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
974 /* int direction = plan->direction;
975 int fftorder = plan->fftorder;*/
979 int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK, *C = plan->C, *rC = plan->rC;
985 if (!(plan->flags & FFT5D_ORDER_YZ))
989 case 0: o = XYZ; break;
990 case 1: o = ZYX; break;
991 case 2: o = YZX; break;
999 case 0: o = XYZ; break;
1000 case 1: o = YXZ; break;
1001 case 2: o = ZXY; break;
1039 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
1041 /*xs, xl give dimension size and data length in local transposed coordinate system
1042 for 0(/1/2): x(/y/z) in original coordinate system*/
1043 for (i = 0; i < 3; i++)
1058 xs[i] = C[s] * pM[s];
1064 /*input order is different for test program to match FFTW order
1065 (important for complex to real)*/
1066 if (plan->flags & FFT5D_BACKWARD)
1072 if (plan->flags & FFT5D_ORDER_YZ)
1080 if ((plan->flags & FFT5D_REALCOMPLEX)
1081 && ((!(plan->flags & FFT5D_BACKWARD) && s == 0) || ((plan->flags & FFT5D_BACKWARD) && s == 2)))
1087 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
1090 int* coor = plan->coor;
1091 int xs[3], xl[3], xc[3], NG[3];
1092 int ll = (plan->flags & FFT5D_REALCOMPLEX) ? 1 : 2;
1093 compute_offsets(plan, xs, xl, xc, NG, s);
1094 fprintf(debug, txt, coor[0], coor[1]);
1095 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
1096 for (z = 0; z < xl[2]; z++)
1098 for (y = 0; y < xl[1]; y++)
1100 fprintf(debug, "%d %d: ", coor[0], coor[1]);
1101 for (x = 0; x < xl[0]; x++)
1103 for (l = 0; l < ll; l++)
1107 reinterpret_cast<const real*>(
1108 lin)[(z * xs[2] + y * xs[1]) * 2 + (x * xs[0]) * ll + l]);
1110 fprintf(debug, ",");
1112 fprintf(debug, "\n");
1117 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
1119 t_complex* lin = plan->lin;
1120 t_complex* lout = plan->lout;
1121 t_complex* lout2 = plan->lout2;
1122 t_complex* lout3 = plan->lout3;
1123 t_complex *fftout, *joinin;
1125 gmx_fft_t** p1d = plan->p1d;
1126 #ifdef FFT5D_MPI_TRANSPOSE
1127 FFTW(plan)* mpip = plan->mpip;
1130 MPI_Comm* cart = plan->cart;
1133 double time_fft = 0, time_local = 0, time_mpi[2] = { 0 }, time = 0;
1135 int *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
1136 *C = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout,
1137 **oNout = plan->oNout;
1138 int s = 0, tstart, tend, bParallelDim;
1152 FFTW(execute)(plan->p3d);
1156 times->fft += MPI_Wtime() - time;
1167 if ((plan->flags & FFT5D_DEBUG) && thread == 0)
1169 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1172 for (s = 0; s < 2; s++) /*loop over first two FFT steps (corner rotations)*/
1176 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1186 /* ---------- START FFT ------------ */
1188 if (times != 0 && thread == 0)
1194 if (bParallelDim || plan->nthreads == 1)
1210 tstart = (thread * pM[s] * pK[s] / plan->nthreads) * C[s];
1211 if ((plan->flags & FFT5D_REALCOMPLEX) && !(plan->flags & FFT5D_BACKWARD) && s == 0)
1213 gmx_fft_many_1d_real(p1d[s][thread],
1214 (plan->flags & FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL
1215 : GMX_FFT_REAL_TO_COMPLEX,
1221 gmx_fft_many_1d(p1d[s][thread],
1222 (plan->flags & FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,
1228 if (times != NULL && thread == 0)
1230 time_fft += MPI_Wtime() - time;
1233 if ((plan->flags & FFT5D_DEBUG) && thread == 0)
1235 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1237 /* ---------- END FFT ------------ */
1239 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1243 if (times != NULL && thread == 0)
1250 1. (most outer) axes (x) is split into P[s] parts of size N[s]
1254 tend = ((thread + 1) * pM[s] * pK[s] / plan->nthreads);
1271 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1273 if (times != NULL && thread == 0)
1275 time_local += MPI_Wtime() - time;
1279 /* ---------- END SPLIT , START TRANSPOSE------------ */
1289 wallcycle_start(times, ewcPME_FFTCOMM);
1291 #ifdef FFT5D_MPI_TRANSPOSE
1292 FFTW(execute)(mpip[s]);
1295 if ((s == 0 && !(plan->flags & FFT5D_ORDER_YZ))
1296 || (s == 1 && (plan->flags & FFT5D_ORDER_YZ)))
1298 MPI_Alltoall(reinterpret_cast<real*>(lout2),
1299 N[s] * pM[s] * K[s] * sizeof(t_complex) / sizeof(real),
1301 reinterpret_cast<real*>(lout3),
1302 N[s] * pM[s] * K[s] * sizeof(t_complex) / sizeof(real),
1308 MPI_Alltoall(reinterpret_cast<real*>(lout2),
1309 N[s] * M[s] * pK[s] * sizeof(t_complex) / sizeof(real),
1311 reinterpret_cast<real*>(lout3),
1312 N[s] * M[s] * pK[s] * sizeof(t_complex) / sizeof(real),
1317 GMX_RELEASE_ASSERT(false, "Invalid call to fft5d_execute");
1319 #endif /*FFT5D_MPI_TRANSPOSE*/
1323 time_mpi[s] = MPI_Wtime() - time;
1326 wallcycle_stop(times, ewcPME_FFTCOMM);
1330 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1332 /* ---------- END SPLIT + TRANSPOSE------------ */
1334 /* ---------- START JOIN ------------ */
1336 if (times != NULL && thread == 0)
1350 /*bring back in matrix form
1351 thus make new 1. axes contiguos
1352 also local transpose 1 and 2/3
1353 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1355 if ((s == 0 && !(plan->flags & FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags & FFT5D_ORDER_YZ)))
1359 tstart = (thread * pM[s] * pN[s] / plan->nthreads);
1360 tend = ((thread + 1) * pM[s] * pN[s] / plan->nthreads);
1361 joinAxesTrans13(lin,
1381 tstart = (thread * pK[s] * pN[s] / plan->nthreads);
1382 tend = ((thread + 1) * pK[s] * pN[s] / plan->nthreads);
1383 joinAxesTrans12(lin,
1401 if (times != NULL && thread == 0)
1403 time_local += MPI_Wtime() - time;
1406 if ((plan->flags & FFT5D_DEBUG) && thread == 0)
1408 print_localdata(lin, "%d %d: tranposed %d\n", s + 1, plan);
1410 /* ---------- END JOIN ------------ */
1412 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1413 } /* for(s=0;s<2;s++) */
1415 if (times != NULL && thread == 0)
1421 if (plan->flags & FFT5D_INPLACE)
1423 lout = lin; /*in place currently not supported*/
1425 /* ----------- FFT ----------- */
1426 tstart = (thread * pM[s] * pK[s] / plan->nthreads) * C[s];
1427 if ((plan->flags & FFT5D_REALCOMPLEX) && (plan->flags & FFT5D_BACKWARD))
1429 gmx_fft_many_1d_real(p1d[s][thread],
1430 (plan->flags & FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX,
1436 gmx_fft_many_1d(p1d[s][thread],
1437 (plan->flags & FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,
1441 /* ------------ END FFT ---------*/
1444 if (times != NULL && thread == 0)
1446 time_fft += MPI_Wtime() - time;
1448 times->fft += time_fft;
1449 times->local += time_local;
1450 times->mpi2 += time_mpi[1];
1451 times->mpi1 += time_mpi[0];
1455 if ((plan->flags & FFT5D_DEBUG) && thread == 0)
1457 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1461 void fft5d_destroy(fft5d_plan plan)
1465 for (s = 0; s < 3; s++)
1469 for (t = 0; t < plan->nthreads; t++)
1471 gmx_many_fft_destroy(plan->p1d[s][t]);
1477 free(plan->iNin[s]);
1478 plan->iNin[s] = nullptr;
1482 free(plan->oNin[s]);
1483 plan->oNin[s] = nullptr;
1487 free(plan->iNout[s]);
1488 plan->iNout[s] = nullptr;
1492 free(plan->oNout[s]);
1493 plan->oNout[s] = nullptr;
1498 # ifdef FFT5D_MPI_TRANSPOS
1499 for (s = 0; s < 2; s++)
1501 FFTW(destroy_plan)(plan->mpip[s]);
1503 # endif /* FFT5D_MPI_TRANSPOS */
1506 FFTW(destroy_plan)(plan->p3d);
1509 #endif /* GMX_FFT_FFTW3 */
1511 if (!(plan->flags & FFT5D_NOMALLOC))
1513 // only needed for PME GPU mixed mode
1514 if (plan->pinningPolicy == gmx::PinningPolicy::PinnedIfSupported && isHostMemoryPinned(plan->lin))
1516 gmx::unpinBuffer(plan->lin);
1518 sfree_aligned(plan->lin);
1519 sfree_aligned(plan->lout);
1520 if (plan->nthreads > 1)
1522 sfree_aligned(plan->lout2);
1523 sfree_aligned(plan->lout3);
1527 #ifdef FFT5D_THREADS
1528 # ifdef FFT5D_FFTW_THREADS
1529 /*FFTW(cleanup_threads)();*/
1536 /*Is this better than direct access of plan? enough data?
1537 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1538 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1549 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1550 of processor dimensions*/
1551 fft5d_plan fft5d_plan_3d_cart(int NG,
1563 MPI_Comm cart[2] = { MPI_COMM_NULL, MPI_COMM_NULL };
1565 int size = 1, prank = 0;
1568 int wrap[] = { 0, 0 };
1570 int rdim1[] = { 0, 1 }, rdim2[] = { 1, 0 };
1572 MPI_Comm_size(comm, &size);
1573 MPI_Comm_rank(comm, &prank);
1583 printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1589 P[1] = size / P0; /*number of processors in the two dimensions*/
1591 /*Difference between x-y-z regarding 2d decomposition is whether they are
1592 distributed along axis 1, 2 or both*/
1594 MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1595 MPI_Cart_get(gcart, 2, P, wrap, coor);
1596 MPI_Cart_sub(gcart, rdim1, &cart[0]);
1597 MPI_Cart_sub(gcart, rdim2, &cart[1]);
1602 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1606 /*prints in original coordinate system of data (as the input to FFT)*/
1607 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1609 int xs[3], xl[3], xc[3], NG[3];
1611 int* coor = plan->coor;
1612 int ll = 2; /*compare ll values per element (has to be 2 for complex)*/
1613 if ((plan->flags & FFT5D_REALCOMPLEX) && (plan->flags & FFT5D_BACKWARD))
1618 compute_offsets(plan, xs, xl, xc, NG, 2);
1619 if (plan->flags & FFT5D_DEBUG)
1621 printf("Compare2\n");
1623 for (z = 0; z < xl[2]; z++)
1625 for (y = 0; y < xl[1]; y++)
1627 if (plan->flags & FFT5D_DEBUG)
1629 printf("%d %d: ", coor[0], coor[1]);
1631 for (x = 0; x < xl[0]; x++)
1633 for (l = 0; l < ll; l++) /*loop over real/complex parts*/
1636 a = reinterpret_cast<const real*>(lin)[(z * xs[2] + y * xs[1]) * 2 + x * xs[0] * ll + l];
1639 a /= plan->rC[0] * plan->rC[1] * plan->rC[2];
1643 b = reinterpret_cast<const real*>(
1644 in)[((z + xc[2]) * NG[0] * NG[1] + (y + xc[1]) * NG[0]) * 2 + (x + xc[0]) * ll + l];
1648 b = reinterpret_cast<const real*>(
1649 in)[(z * xs[2] + y * xs[1]) * 2 + x * xs[0] * ll + l];
1651 if (plan->flags & FFT5D_DEBUG)
1653 printf("%f %f, ", a, b);
1657 if (std::fabs(a - b) > 2 * NG[0] * NG[1] * NG[2] * GMX_REAL_EPS)
1659 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",
1668 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1671 if (plan->flags & FFT5D_DEBUG)
1676 if (plan->flags & FFT5D_DEBUG)