2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2009,2010,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
51 #include "gromacs/gpu_utils/gpu_utils.h"
52 #include "gromacs/gpu_utils/hostallocator.h"
53 #include "gromacs/gpu_utils/pinning.h"
54 #include "gromacs/utility/alignedallocator.h"
55 #include "gromacs/utility/exceptions.h"
56 #include "gromacs/utility/fatalerror.h"
57 #include "gromacs/utility/gmxmpi.h"
58 #include "gromacs/utility/smalloc.h"
61 #define GMX_PARALLEL_ENV_INITIALIZED 1
64 #define GMX_PARALLEL_ENV_INITIALIZED 1
66 #define GMX_PARALLEL_ENV_INITIALIZED 0
71 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
73 /* requires fftw compiled with openmp */
74 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
77 #ifndef __FLT_EPSILON__
78 #define __FLT_EPSILON__ FLT_EPSILON
79 #define __DBL_EPSILON__ DBL_EPSILON
88 #include "gromacs/utility/exceptions.h"
89 #include "gromacs/utility/mutex.h"
90 /* none of the fftw3 calls, except execute(), are thread-safe, so
91 we need to serialize them with this mutex. */
92 static gmx::Mutex big_fftw_mutex;
93 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
94 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
95 #endif /* GMX_FFT_FFTW3 */
98 /* largest factor smaller than sqrt */
99 static int lfactor(int z)
101 int i = static_cast<int>(sqrt(static_cast<double>(z)));
111 #if HAVE_GETTIMEOFDAY
112 #include <sys/time.h>
116 gettimeofday(&tv, 0);
117 return tv.tv_sec+tv.tv_usec*1e-6;
127 static int vmax(const int* a, int s)
130 for (i = 0; i < s; i++)
141 /* NxMxK the size of the data
142 * comm communicator to use for fft5d
143 * P0 number of processor in 1st axes (can be null for automatic)
144 * lin is allocated by fft5d because size of array is only known after planning phase
145 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
147 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads, gmx::PinningPolicy realGridAllocationPinningPolicy)
150 int P[2], bMaster, prank[2], i, t;
152 int *N0 = nullptr, *N1 = nullptr, *M0 = nullptr, *M1 = nullptr, *K0 = nullptr, *K1 = nullptr, *oN0 = nullptr, *oN1 = nullptr, *oM0 = nullptr, *oM1 = nullptr, *oK0 = nullptr, *oK1 = nullptr;
153 int N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {nullptr}, *oNin[3] = {nullptr}, *iNout[3] = {nullptr}, *oNout[3] = {nullptr};
154 int C[3], rC[3], nP[2];
156 t_complex *lin = nullptr, *lout = nullptr, *lout2 = nullptr, *lout3 = nullptr;
160 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
162 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
164 MPI_Comm_size(comm[0], &P[0]);
165 MPI_Comm_rank(comm[0], &prank[0]);
174 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
176 MPI_Comm_size(comm[1], &P[1]);
177 MPI_Comm_rank(comm[1], &prank[1]);
186 bMaster = (prank[0] == 0 && prank[1] == 0);
191 fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n",
192 P[0], P[1], prank[0], prank[1]);
199 fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
200 NG, MG, KG, P[0], P[1], int((flags&FFT5D_REALCOMPLEX) > 0), int((flags&FFT5D_BACKWARD) > 0), int((flags&FFT5D_ORDER_YZ) > 0), int((flags&FFT5D_DEBUG) > 0));
202 /* The check below is not correct, one prime factor 11 or 13 is ok.
203 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
204 printf("WARNING: FFT very slow with prime factors larger 7\n");
205 printf("Change FFT size or in case you cannot change it look at\n");
206 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
211 if (NG == 0 || MG == 0 || KG == 0)
215 printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
220 rNG = NG; rMG = MG; rKG = KG;
222 if (flags&FFT5D_REALCOMPLEX)
224 if (!(flags&FFT5D_BACKWARD))
230 if (!(flags&FFT5D_ORDER_YZ))
242 /*for transpose we need to know the size for each processor not only our own size*/
244 N0 = static_cast<int*>(malloc(P[0]*sizeof(int))); N1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
245 M0 = static_cast<int*>(malloc(P[0]*sizeof(int))); M1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
246 K0 = static_cast<int*>(malloc(P[0]*sizeof(int))); K1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
247 oN0 = static_cast<int*>(malloc(P[0]*sizeof(int))); oN1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
248 oM0 = static_cast<int*>(malloc(P[0]*sizeof(int))); oM1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
249 oK0 = static_cast<int*>(malloc(P[0]*sizeof(int))); oK1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
251 for (i = 0; i < P[0]; i++)
255 oN0[i] = i*ceil((double)NG/P[0]);
256 oM0[i] = i*ceil((double)MG/P[0]);
257 oK0[i] = i*ceil((double)KG/P[0]);
259 oN0[i] = (NG*i)/P[0];
260 oM0[i] = (MG*i)/P[0];
261 oK0[i] = (KG*i)/P[0];
264 for (i = 0; i < P[1]; i++)
267 oN1[i] = i*ceil((double)NG/P[1]);
268 oM1[i] = i*ceil((double)MG/P[1]);
269 oK1[i] = i*ceil((double)KG/P[1]);
271 oN1[i] = (NG*i)/P[1];
272 oM1[i] = (MG*i)/P[1];
273 oK1[i] = (KG*i)/P[1];
276 for (i = 0; i < P[0]-1; i++)
278 N0[i] = oN0[i+1]-oN0[i];
279 M0[i] = oM0[i+1]-oM0[i];
280 K0[i] = oK0[i+1]-oK0[i];
282 N0[P[0]-1] = NG-oN0[P[0]-1];
283 M0[P[0]-1] = MG-oM0[P[0]-1];
284 K0[P[0]-1] = KG-oK0[P[0]-1];
285 for (i = 0; i < P[1]-1; i++)
287 N1[i] = oN1[i+1]-oN1[i];
288 M1[i] = oM1[i+1]-oM1[i];
289 K1[i] = oK1[i+1]-oK1[i];
291 N1[P[1]-1] = NG-oN1[P[1]-1];
292 M1[P[1]-1] = MG-oM1[P[1]-1];
293 K1[P[1]-1] = KG-oK1[P[1]-1];
295 /* for step 1-3 the local N,M,K sizes of the transposed system
296 C: contiguous dimension, and nP: number of processor in subcommunicator
300 pM[0] = M0[prank[0]];
301 oM[0] = oM0[prank[0]];
302 pK[0] = K1[prank[1]];
303 oK[0] = oK1[prank[1]];
306 if (!(flags&FFT5D_ORDER_YZ))
308 N[0] = vmax(N1, P[1]);
310 K[0] = vmax(K1, P[1]);
311 pN[0] = N1[prank[1]];
317 N[1] = vmax(K0, P[0]);
318 pN[1] = K0[prank[0]];
323 M[1] = vmax(M0, P[0]);
324 pM[1] = M0[prank[0]];
325 oM[1] = oM0[prank[0]];
327 pK[1] = N1[prank[1]];
328 oK[1] = oN1[prank[1]];
334 M[2] = vmax(K0, P[0]);
335 pM[2] = K0[prank[0]];
336 oM[2] = oK0[prank[0]];
337 K[2] = vmax(N1, P[1]);
338 pK[2] = N1[prank[1]];
339 oK[2] = oN1[prank[1]];
340 free(N0); free(oN0); /*these are not used for this order*/
341 free(M1); free(oM1); /*the rest is freed in destroy*/
345 N[0] = vmax(N0, P[0]);
346 M[0] = vmax(M0, P[0]);
348 pN[0] = N0[prank[0]];
354 N[1] = vmax(M1, P[1]);
355 pN[1] = M1[prank[1]];
361 pM[1] = N0[prank[0]];
362 oM[1] = oN0[prank[0]];
363 K[1] = vmax(K1, P[1]);
364 pK[1] = K1[prank[1]];
365 oK[1] = oK1[prank[1]];
371 M[2] = vmax(N0, P[0]);
372 pM[2] = N0[prank[0]];
373 oM[2] = oN0[prank[0]];
374 K[2] = vmax(M1, P[1]);
375 pK[2] = M1[prank[1]];
376 oK[2] = oM1[prank[1]];
377 free(N1); free(oN1); /*these are not used for this order*/
378 free(K0); free(oK0); /*the rest is freed in destroy*/
380 N[2] = pN[2] = -1; /*not used*/
383 Difference between x-y-z regarding 2d decomposition is whether they are
384 distributed along axis 1, 2 or both
387 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
388 lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
389 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
390 if (!(flags&FFT5D_NOMALLOC))
392 // only needed for PME GPU mixed mode
393 if (realGridAllocationPinningPolicy == gmx::PinningPolicy::CanBePinned)
395 const std::size_t numBytes = lsize * sizeof(t_complex);
396 lin = static_cast<t_complex *>(gmx::PageAlignedAllocationPolicy::malloc(numBytes));
397 gmx::pinBuffer(lin, numBytes);
401 snew_aligned(lin, lsize, 32);
403 snew_aligned(lout, lsize, 32);
406 /* We need extra transpose buffers to avoid OpenMP barriers */
407 snew_aligned(lout2, lsize, 32);
408 snew_aligned(lout3, lsize, 32);
412 /* We can reuse the buffers to avoid cache misses */
433 plan = static_cast<fft5d_plan>(calloc(1, sizeof(struct fft5d_plan_t)));
438 fprintf(debug, "Running on %d threads\n", nthreads);
442 /* Don't add more stuff here! We have already had at least one bug because we are reimplementing
443 * the low-level FFT interface instead of using the Gromacs FFT module. If we need more
444 * generic functionality it is far better to extend the interface so we can use it for
445 * all FFT libraries instead of writing FFTW-specific code here.
448 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
449 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
450 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
451 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
452 * and that the 3d plan is faster than the 1d plan.
454 if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
456 int fftwflags = FFTW_DESTROY_INPUT;
458 int inNG = NG, outMG = MG, outKG = KG;
462 fftwflags |= (flags & FFT5D_NOMEASURE) ? FFTW_ESTIMATE : FFTW_MEASURE;
464 if (flags&FFT5D_REALCOMPLEX)
466 if (!(flags&FFT5D_BACKWARD)) /*input pointer is not complex*/
470 else /*output pointer is not complex*/
472 if (!(flags&FFT5D_ORDER_YZ))
483 if (!(flags&FFT5D_BACKWARD))
489 dims[0].is = inNG*MG; /*N M K*/
492 if (!(flags&FFT5D_ORDER_YZ))
494 dims[0].os = MG; /*M K N*/
500 dims[0].os = 1; /*K N M*/
507 if (!(flags&FFT5D_ORDER_YZ))
517 dims[0].os = outMG*KG;
531 dims[0].os = outKG*NG;
537 #ifdef FFT5D_FFTW_THREADS
538 FFTW(plan_with_nthreads)(nthreads);
541 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
543 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
544 /*howmany*/ 0, /*howmany_dims*/ nullptr,
545 reinterpret_cast<real*>(lin), reinterpret_cast<FFTW(complex) *>(lout),
546 /*flags*/ fftwflags);
548 else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD))
550 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
551 /*howmany*/ 0, /*howmany_dims*/ nullptr,
552 reinterpret_cast<FFTW(complex) *>(lin), reinterpret_cast<real*>(lout),
553 /*flags*/ fftwflags);
557 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
558 /*howmany*/ 0, /*howmany_dims*/ nullptr,
559 reinterpret_cast<FFTW(complex) *>(lin), reinterpret_cast<FFTW(complex) *>(lout),
560 /*sign*/ (flags&FFT5D_BACKWARD) ? 1 : -1, /*flags*/ fftwflags);
563 #ifdef FFT5D_FFTW_THREADS
564 FFTW(plan_with_nthreads)(1);
569 if (!plan->p3d) /* for decomposition and if 3d plan did not work */
571 #endif /* GMX_FFT_FFTW3 */
572 for (s = 0; s < 3; s++)
576 fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
577 s, rC[s], M[s], pK[s], C[s], lsize);
579 plan->p1d[s] = static_cast<gmx_fft_t*>(malloc(sizeof(gmx_fft_t)*nthreads));
581 /* Make sure that the init routines are only called by one thread at a time and in order
582 (later is only important to not confuse valgrind)
584 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
585 for (t = 0; t < nthreads; t++)
591 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
593 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s == 0) || ((flags&FFT5D_BACKWARD) && s == 2)))
595 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
599 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
602 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
610 if ((flags&FFT5D_ORDER_YZ)) /*plan->cart is in the order of transposes */
612 plan->cart[0] = comm[0]; plan->cart[1] = comm[1];
616 plan->cart[1] = comm[0]; plan->cart[0] = comm[1];
618 #ifdef FFT5D_MPI_TRANSPOSE
620 for (s = 0; s < 2; s++)
622 if ((s == 0 && !(flags&FFT5D_ORDER_YZ)) || (s == 1 && (flags&FFT5D_ORDER_YZ)))
624 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
628 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
640 plan->NG = NG; plan->MG = MG; plan->KG = KG;
642 for (s = 0; s < 3; s++)
644 plan->N[s] = N[s]; plan->M[s] = M[s]; plan->K[s] = K[s]; plan->pN[s] = pN[s]; plan->pM[s] = pM[s]; plan->pK[s] = pK[s];
645 plan->oM[s] = oM[s]; plan->oK[s] = oK[s];
646 plan->C[s] = C[s]; plan->rC[s] = rC[s];
647 plan->iNin[s] = iNin[s]; plan->oNin[s] = oNin[s]; plan->iNout[s] = iNout[s]; plan->oNout[s] = oNout[s];
649 for (s = 0; s < 2; s++)
651 plan->P[s] = nP[s]; plan->coor[s] = prank[s];
654 /* plan->fftorder=fftorder;
655 plan->direction=direction;
656 plan->realcomplex=realcomplex;
659 plan->nthreads = nthreads;
660 plan->pinningPolicy = realGridAllocationPinningPolicy;
680 /*here x,y,z and N,M,K is in rotated coordinate system!!
681 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
682 maxN,maxM,maxK is max size of local data
683 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
684 NG, MG, KG is size of global data*/
685 static void splitaxes(t_complex* lout, const t_complex* lin,
686 int maxN, int maxM, int maxK, int pM,
687 int P, int NG, const int *N, const int* oN, int starty, int startz, int endy, int endz)
690 int in_i, out_i, in_z, out_z, in_y, out_y;
693 for (z = startz; z < endz+1; z++) /*3. z l*/
714 for (i = 0; i < P; i++) /*index cube along long axis*/
716 out_i = out_z + i*maxN*maxM*maxK;
718 for (y = s_y; y < e_y; y++) /*2. y k*/
720 out_y = out_i + y*maxN;
722 for (x = 0; x < N[i]; x++) /*1. x j*/
724 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
725 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
726 /*before split data contiguos - thus if different processor get different amount oN is different*/
733 /*make axis contiguous again (after AllToAll) and also do local transpose*/
734 /*transpose mayor and major dimension
736 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
737 N,M,K local dimensions
739 static void joinAxesTrans13(t_complex* lout, const t_complex* lin,
740 int maxN, int maxM, int maxK, int pM,
741 int P, int KG, const int* K, const int* oK, int starty, int startx, int endy, int endx)
744 int out_i, in_i, out_x, in_x, out_z, in_z;
747 for (x = startx; x < endx+1; x++) /*1.j*/
769 for (i = 0; i < P; i++) /*index cube along long axis*/
771 out_i = out_x + oK[i];
772 in_i = in_x + i*maxM*maxN*maxK;
773 for (z = 0; z < K[i]; z++) /*3.l*/
776 in_z = in_i + z*maxM*maxN;
777 for (y = s_y; y < e_y; y++) /*2.k*/
779 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
786 /*make axis contiguous again (after AllToAll) and also do local transpose
787 tranpose mayor and middle dimension
789 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
792 static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN,
793 int P, int MG, const int* M, const int* oM, int startx, int startz, int endx, int endz)
796 int out_i, in_i, out_z, in_z, out_x, in_x;
799 for (z = startz; z < endz+1; z++)
820 for (i = 0; i < P; i++) /*index cube along long axis*/
822 out_i = out_z + oM[i];
823 in_i = in_z + i*maxM*maxN*maxK;
824 for (x = s_x; x < e_x; x++)
826 out_x = out_i + x*MG;
828 for (y = 0; y < M[i]; y++)
830 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
838 static void rotate_offsets(int x[])
849 /*compute the offset to compare or print transposed local data in original input coordinates
850 xs matrix dimension size, xl dimension length, xc decomposition offset
851 s: step in computation = number of transposes*/
852 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
854 /* int direction = plan->direction;
855 int fftorder = plan->fftorder;*/
859 int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK,
860 *C = plan->C, *rC = plan->rC;
862 NG[0] = plan->NG; NG[1] = plan->MG; NG[2] = plan->KG;
864 if (!(plan->flags&FFT5D_ORDER_YZ))
868 case 0: o = XYZ; break;
869 case 1: o = ZYX; break;
870 case 2: o = YZX; break;
878 case 0: o = XYZ; break;
879 case 1: o = YXZ; break;
880 case 2: o = ZXY; break;
887 case XYZ: pos[0] = 1; pos[1] = 2; pos[2] = 3; break;
888 case XZY: pos[0] = 1; pos[1] = 3; pos[2] = 2; break;
889 case YXZ: pos[0] = 2; pos[1] = 1; pos[2] = 3; break;
890 case YZX: pos[0] = 3; pos[1] = 1; pos[2] = 2; break;
891 case ZXY: pos[0] = 2; pos[1] = 3; pos[2] = 1; break;
892 case ZYX: pos[0] = 3; pos[1] = 2; pos[2] = 1; break;
894 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
896 /*xs, xl give dimension size and data length in local transposed coordinate system
897 for 0(/1/2): x(/y/z) in original coordinate system*/
898 for (i = 0; i < 3; i++)
902 case 1: xs[i] = 1; xc[i] = 0; xl[i] = C[s]; break;
903 case 2: xs[i] = C[s]; xc[i] = oM[s]; xl[i] = pM[s]; break;
904 case 3: xs[i] = C[s]*pM[s]; xc[i] = oK[s]; xl[i] = pK[s]; break;
907 /*input order is different for test program to match FFTW order
908 (important for complex to real)*/
909 if (plan->flags&FFT5D_BACKWARD)
915 if (plan->flags&FFT5D_ORDER_YZ)
923 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s == 0) || ((plan->flags&FFT5D_BACKWARD) && s == 2)))
929 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
932 int *coor = plan->coor;
933 int xs[3], xl[3], xc[3], NG[3];
934 int ll = (plan->flags&FFT5D_REALCOMPLEX) ? 1 : 2;
935 compute_offsets(plan, xs, xl, xc, NG, s);
936 fprintf(debug, txt, coor[0], coor[1]);
937 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
938 for (z = 0; z < xl[2]; z++)
940 for (y = 0; y < xl[1]; y++)
942 fprintf(debug, "%d %d: ", coor[0], coor[1]);
943 for (x = 0; x < xl[0]; x++)
945 for (l = 0; l < ll; l++)
947 fprintf(debug, "%f ", reinterpret_cast<const real*>(lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
951 fprintf(debug, "\n");
956 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
958 t_complex *lin = plan->lin;
959 t_complex *lout = plan->lout;
960 t_complex *lout2 = plan->lout2;
961 t_complex *lout3 = plan->lout3;
962 t_complex *fftout, *joinin;
964 gmx_fft_t **p1d = plan->p1d;
965 #ifdef FFT5D_MPI_TRANSPOSE
966 FFTW(plan) *mpip = plan->mpip;
969 MPI_Comm *cart = plan->cart;
972 double time_fft = 0, time_local = 0, time_mpi[2] = {0}, time = 0;
974 int *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
975 *C = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout, **oNout = plan->oNout;
976 int s = 0, tstart, tend, bParallelDim;
990 FFTW(execute)(plan->p3d);
994 times->fft += MPI_Wtime()-time;
1005 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1007 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1010 for (s = 0; s < 2; s++) /*loop over first two FFT steps (corner rotations)*/
1014 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1024 /* ---------- START FFT ------------ */
1026 if (times != 0 && thread == 0)
1032 if (bParallelDim || plan->nthreads == 1)
1048 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1049 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s == 0)
1051 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, fftout+tstart);
1055 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, fftout+tstart);
1060 if (times != NULL && thread == 0)
1062 time_fft += MPI_Wtime()-time;
1065 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1067 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1069 /* ---------- END FFT ------------ */
1071 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1075 if (times != NULL && thread == 0)
1082 1. (most outer) axes (x) is split into P[s] parts of size N[s]
1086 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
1088 splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1090 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1092 if (times != NULL && thread == 0)
1094 time_local += MPI_Wtime()-time;
1098 /* ---------- END SPLIT , START TRANSPOSE------------ */
1108 wallcycle_start(times, ewcPME_FFTCOMM);
1110 #ifdef FFT5D_MPI_TRANSPOSE
1111 FFTW(execute)(mpip[s]);
1114 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1116 MPI_Alltoall(reinterpret_cast<real *>(lout2), N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, reinterpret_cast<real *>(lout3), N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1120 MPI_Alltoall(reinterpret_cast<real *>(lout2), N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, reinterpret_cast<real *>(lout3), N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1123 gmx_incons("fft5d MPI call without MPI configuration");
1125 #endif /*FFT5D_MPI_TRANSPOSE*/
1129 time_mpi[s] = MPI_Wtime()-time;
1132 wallcycle_stop(times, ewcPME_FFTCOMM);
1136 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1138 /* ---------- END SPLIT + TRANSPOSE------------ */
1140 /* ---------- START JOIN ------------ */
1142 if (times != NULL && thread == 0)
1156 /*bring back in matrix form
1157 thus make new 1. axes contiguos
1158 also local transpose 1 and 2/3
1159 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1161 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1165 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1166 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1167 joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1174 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1175 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1176 joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
1181 if (times != NULL && thread == 0)
1183 time_local += MPI_Wtime()-time;
1186 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1188 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1190 /* ---------- END JOIN ------------ */
1192 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1193 } /* for(s=0;s<2;s++) */
1195 if (times != NULL && thread == 0)
1201 if (plan->flags&FFT5D_INPLACE)
1203 lout = lin; /*in place currently not supported*/
1206 /* ----------- FFT ----------- */
1207 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1208 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1210 gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, lout+tstart);
1214 gmx_fft_many_1d( p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD, lin+tstart, lout+tstart);
1216 /* ------------ END FFT ---------*/
1219 if (times != NULL && thread == 0)
1221 time_fft += MPI_Wtime()-time;
1223 times->fft += time_fft;
1224 times->local += time_local;
1225 times->mpi2 += time_mpi[1];
1226 times->mpi1 += time_mpi[0];
1230 if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1232 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1236 void fft5d_destroy(fft5d_plan plan)
1240 for (s = 0; s < 3; s++)
1244 for (t = 0; t < plan->nthreads; t++)
1246 gmx_many_fft_destroy(plan->p1d[s][t]);
1252 free(plan->iNin[s]);
1253 plan->iNin[s] = nullptr;
1257 free(plan->oNin[s]);
1258 plan->oNin[s] = nullptr;
1262 free(plan->iNout[s]);
1263 plan->iNout[s] = nullptr;
1267 free(plan->oNout[s]);
1268 plan->oNout[s] = nullptr;
1273 #ifdef FFT5D_MPI_TRANSPOS
1274 for (s = 0; s < 2; s++)
1276 FFTW(destroy_plan)(plan->mpip[s]);
1278 #endif /* FFT5D_MPI_TRANSPOS */
1281 FFTW(destroy_plan)(plan->p3d);
1284 #endif /* GMX_FFT_FFTW3 */
1286 if (!(plan->flags&FFT5D_NOMALLOC))
1288 // only needed for PME GPU mixed mode
1289 if (plan->pinningPolicy == gmx::PinningPolicy::CanBePinned &&
1290 isHostMemoryPinned(plan->lin))
1292 gmx::unpinBuffer(plan->lin);
1294 sfree_aligned(plan->lin);
1295 sfree_aligned(plan->lout);
1296 if (plan->nthreads > 1)
1298 sfree_aligned(plan->lout2);
1299 sfree_aligned(plan->lout3);
1303 #ifdef FFT5D_THREADS
1304 #ifdef FFT5D_FFTW_THREADS
1305 /*FFTW(cleanup_threads)();*/
1312 /*Is this better than direct access of plan? enough data?
1313 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1314 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1325 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1326 of processor dimensions*/
1327 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
1329 MPI_Comm cart[2] = {MPI_COMM_NULL, MPI_COMM_NULL};
1331 int size = 1, prank = 0;
1334 int wrap[] = {0, 0};
1336 int rdim1[] = {0, 1}, rdim2[] = {1, 0};
1338 MPI_Comm_size(comm, &size);
1339 MPI_Comm_rank(comm, &prank);
1349 printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1354 P[0] = P0; P[1] = size/P0; /*number of processors in the two dimensions*/
1356 /*Difference between x-y-z regarding 2d decomposition is whether they are
1357 distributed along axis 1, 2 or both*/
1359 MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1360 MPI_Cart_get(gcart, 2, P, wrap, coor);
1361 MPI_Cart_sub(gcart, rdim1, &cart[0]);
1362 MPI_Cart_sub(gcart, rdim2, &cart[1]);
1367 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1372 /*prints in original coordinate system of data (as the input to FFT)*/
1373 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1375 int xs[3], xl[3], xc[3], NG[3];
1377 int *coor = plan->coor;
1378 int ll = 2; /*compare ll values per element (has to be 2 for complex)*/
1379 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1384 compute_offsets(plan, xs, xl, xc, NG, 2);
1385 if (plan->flags&FFT5D_DEBUG)
1387 printf("Compare2\n");
1389 for (z = 0; z < xl[2]; z++)
1391 for (y = 0; y < xl[1]; y++)
1393 if (plan->flags&FFT5D_DEBUG)
1395 printf("%d %d: ", coor[0], coor[1]);
1397 for (x = 0; x < xl[0]; x++)
1399 for (l = 0; l < ll; l++) /*loop over real/complex parts*/
1402 a = reinterpret_cast<const real*>(lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1405 a /= plan->rC[0]*plan->rC[1]*plan->rC[2];
1409 b = reinterpret_cast<const real*>(in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1413 b = reinterpret_cast<const real*>(in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1415 if (plan->flags&FFT5D_DEBUG)
1417 printf("%f %f, ", a, b);
1421 if (std::fabs(a-b) > 2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS)
1423 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n", coor[0], coor[1], x, y, z, a, b);
1425 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1428 if (plan->flags&FFT5D_DEBUG)
1433 if (plan->flags&FFT5D_DEBUG)