2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2009,2010,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
50 #include "gromacs/gpu_utils/gpu_utils.h"
51 #include "gromacs/gpu_utils/hostallocator.h"
52 #include "gromacs/gpu_utils/pinning.h"
53 #include "gromacs/utility/alignedallocator.h"
54 #include "gromacs/utility/exceptions.h"
55 #include "gromacs/utility/fatalerror.h"
56 #include "gromacs/utility/gmxmpi.h"
57 #include "gromacs/utility/smalloc.h"
60 #define GMX_PARALLEL_ENV_INITIALIZED 1
63 #define GMX_PARALLEL_ENV_INITIALIZED 1
65 #define GMX_PARALLEL_ENV_INITIALIZED 0
70 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
72 /* requires fftw compiled with openmp */
73 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
76 #ifndef __FLT_EPSILON__
77 #define __FLT_EPSILON__ FLT_EPSILON
78 #define __DBL_EPSILON__ DBL_EPSILON
87 #include "gromacs/utility/exceptions.h"
88 #include "gromacs/utility/mutex.h"
89 /* none of the fftw3 calls, except execute(), are thread-safe, so
90 we need to serialize them with this mutex. */
91 static gmx::Mutex big_fftw_mutex;
92 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
93 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
94 #endif /* GMX_FFT_FFTW3 */
97 /* largest factor smaller than sqrt */
98 static int lfactor(int z)
100 int i = static_cast<int>(sqrt(static_cast<double>(z)));
110 #if HAVE_GETTIMEOFDAY
111 #include <sys/time.h>
115 gettimeofday(&tv, 0);
116 return tv.tv_sec+tv.tv_usec*1e-6;
126 static int vmax(const int* a, int s)
129 for (i = 0; i < s; i++)
140 /* NxMxK the size of the data
141 * comm communicator to use for fft5d
142 * P0 number of processor in 1st axes (can be null for automatic)
143 * lin is allocated by fft5d because size of array is only known after planning phase
144 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
146 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads, gmx::PinningPolicy realGridAllocationPinningPolicy)
149 int P[2], prank[2], i;
152 int *N0 = nullptr, *N1 = nullptr, *M0 = nullptr, *M1 = nullptr, *K0 = nullptr, *K1 = nullptr, *oN0 = nullptr, *oN1 = nullptr, *oM0 = nullptr, *oM1 = nullptr, *oK0 = nullptr, *oK1 = nullptr;
153 int N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {nullptr}, *oNin[3] = {nullptr}, *iNout[3] = {nullptr}, *oNout[3] = {nullptr};
154 int C[3], rC[3], nP[2];
156 t_complex *lin = nullptr, *lout = nullptr, *lout2 = nullptr, *lout3 = nullptr;
160 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
162 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
164 MPI_Comm_size(comm[0], &P[0]);
165 MPI_Comm_rank(comm[0], &prank[0]);
174 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
176 MPI_Comm_size(comm[1], &P[1]);
177 MPI_Comm_rank(comm[1], &prank[1]);
186 bMaster = prank[0] == 0 && prank[1] == 0;
191 fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n",
192 P[0], P[1], prank[0], prank[1]);
199 fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
200 NG, MG, KG, P[0], P[1], int((flags&FFT5D_REALCOMPLEX) > 0), int((flags&FFT5D_BACKWARD) > 0), int((flags&FFT5D_ORDER_YZ) > 0), int((flags&FFT5D_DEBUG) > 0));
202 /* The check below is not correct, one prime factor 11 or 13 is ok.
203 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
204 printf("WARNING: FFT very slow with prime factors larger 7\n");
205 printf("Change FFT size or in case you cannot change it look at\n");
206 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
211 if (NG == 0 || MG == 0 || KG == 0)
215 printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
220 rNG = NG; rMG = MG; rKG = KG;
222 if (flags&FFT5D_REALCOMPLEX)
224 if (!(flags&FFT5D_BACKWARD))
230 if (!(flags&FFT5D_ORDER_YZ))
242 /*for transpose we need to know the size for each processor not only our own size*/
244 N0 = static_cast<int*>(malloc(P[0]*sizeof(int))); N1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
245 M0 = static_cast<int*>(malloc(P[0]*sizeof(int))); M1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
246 K0 = static_cast<int*>(malloc(P[0]*sizeof(int))); K1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
247 oN0 = static_cast<int*>(malloc(P[0]*sizeof(int))); oN1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
248 oM0 = static_cast<int*>(malloc(P[0]*sizeof(int))); oM1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
249 oK0 = static_cast<int*>(malloc(P[0]*sizeof(int))); oK1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
251 for (i = 0; i < P[0]; i++)
255 oN0[i] = i*ceil((double)NG/P[0]);
256 oM0[i] = i*ceil((double)MG/P[0]);
257 oK0[i] = i*ceil((double)KG/P[0]);
259 oN0[i] = (NG*i)/P[0];
260 oM0[i] = (MG*i)/P[0];
261 oK0[i] = (KG*i)/P[0];
264 for (i = 0; i < P[1]; i++)
267 oN1[i] = i*ceil((double)NG/P[1]);
268 oM1[i] = i*ceil((double)MG/P[1]);
269 oK1[i] = i*ceil((double)KG/P[1]);
271 oN1[i] = (NG*i)/P[1];
272 oM1[i] = (MG*i)/P[1];
273 oK1[i] = (KG*i)/P[1];
276 for (i = 0; P[0] > 0 && i < P[0]-1; i++)
278 N0[i] = oN0[i+1]-oN0[i];
279 M0[i] = oM0[i+1]-oM0[i];
280 K0[i] = oK0[i+1]-oK0[i];
282 N0[P[0]-1] = NG-oN0[P[0]-1];
283 M0[P[0]-1] = MG-oM0[P[0]-1];
284 K0[P[0]-1] = KG-oK0[P[0]-1];
285 for (i = 0; P[1] > 0 && i < P[1]-1; i++)
287 N1[i] = oN1[i+1]-oN1[i];
288 M1[i] = oM1[i+1]-oM1[i];
289 K1[i] = oK1[i+1]-oK1[i];
291 N1[P[1]-1] = NG-oN1[P[1]-1];
292 M1[P[1]-1] = MG-oM1[P[1]-1];
293 K1[P[1]-1] = KG-oK1[P[1]-1];
295 /* for step 1-3 the local N,M,K sizes of the transposed system
296 C: contiguous dimension, and nP: number of processor in subcommunicator
299 GMX_ASSERT(prank[0] < P[0], "Must have valid rank within communicator size");
300 GMX_ASSERT(prank[1] < P[1], "Must have valid rank within communicator size");
301 pM[0] = M0[prank[0]];
302 oM[0] = oM0[prank[0]];
303 pK[0] = K1[prank[1]];
304 oK[0] = oK1[prank[1]];
307 if (!(flags&FFT5D_ORDER_YZ))
309 N[0] = vmax(N1, P[1]);
311 K[0] = vmax(K1, P[1]);
312 pN[0] = N1[prank[1]];
318 N[1] = vmax(K0, P[0]);
319 pN[1] = K0[prank[0]];
324 M[1] = vmax(M0, P[0]);
325 pM[1] = M0[prank[0]];
326 oM[1] = oM0[prank[0]];
328 pK[1] = N1[prank[1]];
329 oK[1] = oN1[prank[1]];
335 M[2] = vmax(K0, P[0]);
336 pM[2] = K0[prank[0]];
337 oM[2] = oK0[prank[0]];
338 K[2] = vmax(N1, P[1]);
339 pK[2] = N1[prank[1]];
340 oK[2] = oN1[prank[1]];
341 free(N0); free(oN0); /*these are not used for this order*/
342 free(M1); free(oM1); /*the rest is freed in destroy*/
346 N[0] = vmax(N0, P[0]);
347 M[0] = vmax(M0, P[0]);
349 pN[0] = N0[prank[0]];
355 N[1] = vmax(M1, P[1]);
356 pN[1] = M1[prank[1]];
362 pM[1] = N0[prank[0]];
363 oM[1] = oN0[prank[0]];
364 K[1] = vmax(K1, P[1]);
365 pK[1] = K1[prank[1]];
366 oK[1] = oK1[prank[1]];
372 M[2] = vmax(N0, P[0]);
373 pM[2] = N0[prank[0]];
374 oM[2] = oN0[prank[0]];
375 K[2] = vmax(M1, P[1]);
376 pK[2] = M1[prank[1]];
377 oK[2] = oM1[prank[1]];
378 free(N1); free(oN1); /*these are not used for this order*/
379 free(K0); free(oK0); /*the rest is freed in destroy*/
381 N[2] = pN[2] = -1; /*not used*/
384 Difference between x-y-z regarding 2d decomposition is whether they are
385 distributed along axis 1, 2 or both
388 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
389 lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
390 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
391 if (!(flags&FFT5D_NOMALLOC))
393 // only needed for PME GPU mixed mode
394 if (realGridAllocationPinningPolicy == gmx::PinningPolicy::PinnedIfSupported &&
395 GMX_GPU == GMX_GPU_CUDA)
397 const std::size_t numBytes = lsize * sizeof(t_complex);
398 lin = static_cast<t_complex *>(gmx::PageAlignedAllocationPolicy::malloc(numBytes));
399 gmx::pinBuffer(lin, numBytes);
403 snew_aligned(lin, lsize, 32);
405 snew_aligned(lout, lsize, 32);
408 /* We need extra transpose buffers to avoid OpenMP barriers */
409 snew_aligned(lout2, lsize, 32);
410 snew_aligned(lout3, lsize, 32);
414 /* We can reuse the buffers to avoid cache misses */
435 plan = static_cast<fft5d_plan>(calloc(1, sizeof(struct fft5d_plan_t)));
440 fprintf(debug, "Running on %d threads\n", nthreads);
444 /* Don't add more stuff here! We have already had at least one bug because we are reimplementing
445 * the low-level FFT interface instead of using the Gromacs FFT module. If we need more
446 * generic functionality it is far better to extend the interface so we can use it for
447 * all FFT libraries instead of writing FFTW-specific code here.
450 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
451 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
452 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
453 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
454 * and that the 3d plan is faster than the 1d plan.
456 if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
458 int fftwflags = FFTW_DESTROY_INPUT;
460 int inNG = NG, outMG = MG, outKG = KG;
464 fftwflags |= (flags & FFT5D_NOMEASURE) ? FFTW_ESTIMATE : FFTW_MEASURE;
466 if (flags&FFT5D_REALCOMPLEX)
468 if (!(flags&FFT5D_BACKWARD)) /*input pointer is not complex*/
472 else /*output pointer is not complex*/
474 if (!(flags&FFT5D_ORDER_YZ))
485 if (!(flags&FFT5D_BACKWARD))
491 dims[0].is = inNG*MG; /*N M K*/
494 if (!(flags&FFT5D_ORDER_YZ))
496 dims[0].os = MG; /*M K N*/
502 dims[0].os = 1; /*K N M*/
509 if (!(flags&FFT5D_ORDER_YZ))
519 dims[0].os = outMG*KG;
533 dims[0].os = outKG*NG;
539 #ifdef FFT5D_FFTW_THREADS
540 FFTW(plan_with_nthreads)(nthreads);
543 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
545 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
546 /*howmany*/ 0, /*howmany_dims*/ nullptr,
547 reinterpret_cast<real*>(lin), reinterpret_cast<FFTW(complex) *>(lout),
548 /*flags*/ fftwflags);
550 else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD))
552 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
553 /*howmany*/ 0, /*howmany_dims*/ nullptr,
554 reinterpret_cast<FFTW(complex) *>(lin), reinterpret_cast<real*>(lout),
555 /*flags*/ fftwflags);
559 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
560 /*howmany*/ 0, /*howmany_dims*/ nullptr,
561 reinterpret_cast<FFTW(complex) *>(lin), reinterpret_cast<FFTW(complex) *>(lout),
562 /*sign*/ (flags&FFT5D_BACKWARD) ? 1 : -1, /*flags*/ fftwflags);
565 #ifdef FFT5D_FFTW_THREADS
566 FFTW(plan_with_nthreads)(1);
571 if (!plan->p3d) /* for decomposition and if 3d plan did not work */
573 #endif /* GMX_FFT_FFTW3 */
574 for (s = 0; s < 3; s++)
578 fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
579 s, rC[s], M[s], pK[s], C[s], lsize);
581 plan->p1d[s] = static_cast<gmx_fft_t*>(malloc(sizeof(gmx_fft_t)*nthreads));
583 /* Make sure that the init routines are only called by one thread at a time and in order
584 (later is only important to not confuse valgrind)
586 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
587 for (int t = 0; t < nthreads; t++)
593 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
595 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s == 0) || ((flags&FFT5D_BACKWARD) && s == 2)))
597 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
601 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
604 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
612 if ((flags&FFT5D_ORDER_YZ)) /*plan->cart is in the order of transposes */
614 plan->cart[0] = comm[0]; plan->cart[1] = comm[1];
618 plan->cart[1] = comm[0]; plan->cart[0] = comm[1];
620 #ifdef FFT5D_MPI_TRANSPOSE
622 for (s = 0; s < 2; s++)
624 if ((s == 0 && !(flags&FFT5D_ORDER_YZ)) || (s == 1 && (flags&FFT5D_ORDER_YZ)))
626 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
630 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
642 plan->NG = NG; plan->MG = MG; plan->KG = KG;
644 for (s = 0; s < 3; s++)
646 plan->N[s] = N[s]; plan->M[s] = M[s]; plan->K[s] = K[s]; plan->pN[s] = pN[s]; plan->pM[s] = pM[s]; plan->pK[s] = pK[s];
647 plan->oM[s] = oM[s]; plan->oK[s] = oK[s];
648 plan->C[s] = C[s]; plan->rC[s] = rC[s];
649 plan->iNin[s] = iNin[s]; plan->oNin[s] = oNin[s]; plan->iNout[s] = iNout[s]; plan->oNout[s] = oNout[s];
651 for (s = 0; s < 2; s++)
653 plan->P[s] = nP[s]; plan->coor[s] = prank[s];
656 /* plan->fftorder=fftorder;
657 plan->direction=direction;
658 plan->realcomplex=realcomplex;
661 plan->nthreads = nthreads;
662 plan->pinningPolicy = realGridAllocationPinningPolicy;
682 /*here x,y,z and N,M,K is in rotated coordinate system!!
683 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
684 maxN,maxM,maxK is max size of local data
685 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
686 NG, MG, KG is size of global data*/
687 static void splitaxes(t_complex* lout, const t_complex* lin,
688 int maxN, int maxM, int maxK, int pM,
689 int P, int NG, const int *N, const int* oN, int starty, int startz, int endy, int endz)
692 int in_i, out_i, in_z, out_z, in_y, out_y;
695 for (z = startz; z < endz+1; z++) /*3. z l*/
716 for (i = 0; i < P; i++) /*index cube along long axis*/
718 out_i = out_z + i*maxN*maxM*maxK;
720 for (y = s_y; y < e_y; y++) /*2. y k*/
722 out_y = out_i + y*maxN;
724 for (x = 0; x < N[i]; x++) /*1. x j*/
726 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
727 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
728 /*before split data contiguos - thus if different processor get different amount oN is different*/
735 /*make axis contiguous again (after AllToAll) and also do local transpose*/
736 /*transpose mayor and major dimension
738 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
739 N,M,K local dimensions
741 static void joinAxesTrans13(t_complex* lout, const t_complex* lin,
742 int maxN, int maxM, int maxK, int pM,
743 int P, int KG, const int* K, const int* oK, int starty, int startx, int endy, int endx)
746 int out_i, in_i, out_x, in_x, out_z, in_z;
749 for (x = startx; x < endx+1; x++) /*1.j*/
771 for (i = 0; i < P; i++) /*index cube along long axis*/
773 out_i = out_x + oK[i];
774 in_i = in_x + i*maxM*maxN*maxK;
775 for (z = 0; z < K[i]; z++) /*3.l*/
778 in_z = in_i + z*maxM*maxN;
779 for (y = s_y; y < e_y; y++) /*2.k*/
781 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
788 /*make axis contiguous again (after AllToAll) and also do local transpose
789 tranpose mayor and middle dimension
791 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
794 static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN,
795 int P, int MG, const int* M, const int* oM, int startx, int startz, int endx, int endz)
798 int out_i, in_i, out_z, in_z, out_x, in_x;
801 for (z = startz; z < endz+1; z++)
822 for (i = 0; i < P; i++) /*index cube along long axis*/
824 out_i = out_z + oM[i];
825 in_i = in_z + i*maxM*maxN*maxK;
826 for (x = s_x; x < e_x; x++)
828 out_x = out_i + x*MG;
830 for (y = 0; y < M[i]; y++)
832 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
840 static void rotate_offsets(int x[])
851 /*compute the offset to compare or print transposed local data in original input coordinates
852 xs matrix dimension size, xl dimension length, xc decomposition offset
853 s: step in computation = number of transposes*/
854 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
856 /* int direction = plan->direction;
857 int fftorder = plan->fftorder;*/
861 int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK,
862 *C = plan->C, *rC = plan->rC;
864 NG[0] = plan->NG; NG[1] = plan->MG; NG[2] = plan->KG;
866 if (!(plan->flags&FFT5D_ORDER_YZ))
870 case 0: o = XYZ; break;
871 case 1: o = ZYX; break;
872 case 2: o = YZX; break;
880 case 0: o = XYZ; break;
881 case 1: o = YXZ; break;
882 case 2: o = ZXY; break;
889 case XYZ: pos[0] = 1; pos[1] = 2; pos[2] = 3; break;
890 case XZY: pos[0] = 1; pos[1] = 3; pos[2] = 2; break;
891 case YXZ: pos[0] = 2; pos[1] = 1; pos[2] = 3; break;
892 case YZX: pos[0] = 3; pos[1] = 1; pos[2] = 2; break;
893 case ZXY: pos[0] = 2; pos[1] = 3; pos[2] = 1; break;
894 case ZYX: pos[0] = 3; pos[1] = 2; pos[2] = 1; break;
896 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
898 /*xs, xl give dimension size and data length in local transposed coordinate system
899 for 0(/1/2): x(/y/z) in original coordinate system*/
900 for (i = 0; i < 3; i++)
904 case 1: xs[i] = 1; xc[i] = 0; xl[i] = C[s]; break;
905 case 2: xs[i] = C[s]; xc[i] = oM[s]; xl[i] = pM[s]; break;
906 case 3: xs[i] = C[s]*pM[s]; xc[i] = oK[s]; xl[i] = pK[s]; break;
909 /*input order is different for test program to match FFTW order
910 (important for complex to real)*/
911 if (plan->flags&FFT5D_BACKWARD)
917 if (plan->flags&FFT5D_ORDER_YZ)
925 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s == 0) || ((plan->flags&FFT5D_BACKWARD) && s == 2)))
931 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
934 int *coor = plan->coor;
935 int xs[3], xl[3], xc[3], NG[3];
936 int ll = (plan->flags&FFT5D_REALCOMPLEX) ? 1 : 2;
937 compute_offsets(plan, xs, xl, xc, NG, s);
938 fprintf(debug, txt, coor[0], coor[1]);
939 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
940 for (z = 0; z < xl[2]; z++)
942 for (y = 0; y < xl[1]; y++)
944 fprintf(debug, "%d %d: ", coor[0], coor[1]);
945 for (x = 0; x < xl[0]; x++)
947 for (l = 0; l < ll; l++)
949 fprintf(debug, "%f ", reinterpret_cast<const real*>(lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
953 fprintf(debug, "\n");
958 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
960 t_complex *lin = plan->lin;
961 t_complex *lout = plan->lout;
962 t_complex *lout2 = plan->lout2;
963 t_complex *lout3 = plan->lout3;
964 t_complex *fftout, *joinin;
966 gmx_fft_t **p1d = plan->p1d;
967 #ifdef FFT5D_MPI_TRANSPOSE
968 FFTW(plan) *mpip = plan->mpip;
971 MPI_Comm *cart = plan->cart;
974 double time_fft = 0, time_local = 0, time_mpi[2] = {0}, time = 0;
976 int *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
977 *C = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout, **oNout = plan->oNout;
978 int s = 0, tstart, tend, bParallelDim;
992 FFTW(execute)(plan->p3d);
996 times->fft += MPI_Wtime()-time;
1007 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1009 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1012 for (s = 0; s < 2; s++) /*loop over first two FFT steps (corner rotations)*/
1016 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1026 /* ---------- START FFT ------------ */
1028 if (times != 0 && thread == 0)
1034 if (bParallelDim || plan->nthreads == 1)
1050 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1051 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s == 0)
1053 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, fftout+tstart);
1057 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, fftout+tstart);
1062 if (times != NULL && thread == 0)
1064 time_fft += MPI_Wtime()-time;
1067 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1069 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1071 /* ---------- END FFT ------------ */
1073 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1077 if (times != NULL && thread == 0)
1084 1. (most outer) axes (x) is split into P[s] parts of size N[s]
1088 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
1090 splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1092 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1094 if (times != NULL && thread == 0)
1096 time_local += MPI_Wtime()-time;
1100 /* ---------- END SPLIT , START TRANSPOSE------------ */
1110 wallcycle_start(times, ewcPME_FFTCOMM);
1112 #ifdef FFT5D_MPI_TRANSPOSE
1113 FFTW(execute)(mpip[s]);
1116 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1118 MPI_Alltoall(reinterpret_cast<real *>(lout2), N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, reinterpret_cast<real *>(lout3), N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1122 MPI_Alltoall(reinterpret_cast<real *>(lout2), N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, reinterpret_cast<real *>(lout3), N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1125 gmx_incons("fft5d MPI call without MPI configuration");
1127 #endif /*FFT5D_MPI_TRANSPOSE*/
1131 time_mpi[s] = MPI_Wtime()-time;
1134 wallcycle_stop(times, ewcPME_FFTCOMM);
1138 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1140 /* ---------- END SPLIT + TRANSPOSE------------ */
1142 /* ---------- START JOIN ------------ */
1144 if (times != NULL && thread == 0)
1158 /*bring back in matrix form
1159 thus make new 1. axes contiguos
1160 also local transpose 1 and 2/3
1161 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1163 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1167 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1168 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1169 joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1176 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1177 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1178 joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
1183 if (times != NULL && thread == 0)
1185 time_local += MPI_Wtime()-time;
1188 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1190 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1192 /* ---------- END JOIN ------------ */
1194 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1195 } /* for(s=0;s<2;s++) */
1197 if (times != NULL && thread == 0)
1203 if (plan->flags&FFT5D_INPLACE)
1205 lout = lin; /*in place currently not supported*/
1208 /* ----------- FFT ----------- */
1209 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1210 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1212 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, lout+tstart);
1216 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, lout+tstart);
1218 /* ------------ END FFT ---------*/
1221 if (times != NULL && thread == 0)
1223 time_fft += MPI_Wtime()-time;
1225 times->fft += time_fft;
1226 times->local += time_local;
1227 times->mpi2 += time_mpi[1];
1228 times->mpi1 += time_mpi[0];
1232 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1234 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1238 void fft5d_destroy(fft5d_plan plan)
1242 for (s = 0; s < 3; s++)
1246 for (t = 0; t < plan->nthreads; t++)
1248 gmx_many_fft_destroy(plan->p1d[s][t]);
1254 free(plan->iNin[s]);
1255 plan->iNin[s] = nullptr;
1259 free(plan->oNin[s]);
1260 plan->oNin[s] = nullptr;
1264 free(plan->iNout[s]);
1265 plan->iNout[s] = nullptr;
1269 free(plan->oNout[s]);
1270 plan->oNout[s] = nullptr;
1275 #ifdef FFT5D_MPI_TRANSPOS
1276 for (s = 0; s < 2; s++)
1278 FFTW(destroy_plan)(plan->mpip[s]);
1280 #endif /* FFT5D_MPI_TRANSPOS */
1283 FFTW(destroy_plan)(plan->p3d);
1286 #endif /* GMX_FFT_FFTW3 */
1288 if (!(plan->flags&FFT5D_NOMALLOC))
1290 // only needed for PME GPU mixed mode
1291 if (plan->pinningPolicy == gmx::PinningPolicy::PinnedIfSupported &&
1292 isHostMemoryPinned(plan->lin))
1294 gmx::unpinBuffer(plan->lin);
1296 sfree_aligned(plan->lin);
1297 sfree_aligned(plan->lout);
1298 if (plan->nthreads > 1)
1300 sfree_aligned(plan->lout2);
1301 sfree_aligned(plan->lout3);
1305 #ifdef FFT5D_THREADS
1306 #ifdef FFT5D_FFTW_THREADS
1307 /*FFTW(cleanup_threads)();*/
1314 /*Is this better than direct access of plan? enough data?
1315 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1316 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1327 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1328 of processor dimensions*/
1329 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
1331 MPI_Comm cart[2] = {MPI_COMM_NULL, MPI_COMM_NULL};
1333 int size = 1, prank = 0;
1336 int wrap[] = {0, 0};
1338 int rdim1[] = {0, 1}, rdim2[] = {1, 0};
1340 MPI_Comm_size(comm, &size);
1341 MPI_Comm_rank(comm, &prank);
1351 printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1356 P[0] = P0; P[1] = size/P0; /*number of processors in the two dimensions*/
1358 /*Difference between x-y-z regarding 2d decomposition is whether they are
1359 distributed along axis 1, 2 or both*/
1361 MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1362 MPI_Cart_get(gcart, 2, P, wrap, coor);
1363 MPI_Cart_sub(gcart, rdim1, &cart[0]);
1364 MPI_Cart_sub(gcart, rdim2, &cart[1]);
1369 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1374 /*prints in original coordinate system of data (as the input to FFT)*/
1375 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1377 int xs[3], xl[3], xc[3], NG[3];
1379 int *coor = plan->coor;
1380 int ll = 2; /*compare ll values per element (has to be 2 for complex)*/
1381 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1386 compute_offsets(plan, xs, xl, xc, NG, 2);
1387 if (plan->flags&FFT5D_DEBUG)
1389 printf("Compare2\n");
1391 for (z = 0; z < xl[2]; z++)
1393 for (y = 0; y < xl[1]; y++)
1395 if (plan->flags&FFT5D_DEBUG)
1397 printf("%d %d: ", coor[0], coor[1]);
1399 for (x = 0; x < xl[0]; x++)
1401 for (l = 0; l < ll; l++) /*loop over real/complex parts*/
1404 a = reinterpret_cast<const real*>(lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1407 a /= plan->rC[0]*plan->rC[1]*plan->rC[2];
1411 b = reinterpret_cast<const real*>(in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1415 b = reinterpret_cast<const real*>(in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1417 if (plan->flags&FFT5D_DEBUG)
1419 printf("%f %f, ", a, b);
1423 if (std::fabs(a-b) > 2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS)
1425 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n", coor[0], coor[1], x, y, z, a, b);
1427 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1430 if (plan->flags&FFT5D_DEBUG)
1435 if (plan->flags&FFT5D_DEBUG)