src/gromacs/fft/fft5d.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2009,2010,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #include "gmxpre.h"
  36
  37 #include "fft5d.h"
  38
  39 #include "config.h"
  40
  41 #include <cassert>
  42 #include <cfloat>
  43 #include <cmath>
  44 #include <cstdio>
  45 #include <cstdlib>
  46 #include <cstring>
  47
  48 #include <algorithm>
  49
  50 #include "gromacs/gpu_utils/gpu_utils.h"
  51 #include "gromacs/gpu_utils/hostallocator.h"
  52 #include "gromacs/gpu_utils/pinning.h"
  53 #include "gromacs/utility/alignedallocator.h"
  54 #include "gromacs/utility/exceptions.h"
  55 #include "gromacs/utility/fatalerror.h"
  56 #include "gromacs/utility/gmxmpi.h"
  57 #include "gromacs/utility/smalloc.h"
  58
  59 #ifdef NOGMX
  60 #define GMX_PARALLEL_ENV_INITIALIZED 1
  61 #else
  62 #if GMX_MPI
  63 #define GMX_PARALLEL_ENV_INITIALIZED 1
  64 #else
  65 #define GMX_PARALLEL_ENV_INITIALIZED 0
  66 #endif
  67 #endif
  68
  69 #if GMX_OPENMP
  70 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
  71 #define FFT5D_THREADS
  72 /* requires fftw compiled with openmp */
  73 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
  74 #endif
  75
  76 #ifndef __FLT_EPSILON__
  77 #define __FLT_EPSILON__ FLT_EPSILON
  78 #define __DBL_EPSILON__ DBL_EPSILON
  79 #endif
  80
  81 #ifdef NOGMX
  82 FILE* debug = 0;
  83 #endif
  84
  85 #if GMX_FFT_FFTW3
  86
  87 #include "gromacs/utility/exceptions.h"
  88 #include "gromacs/utility/mutex.h"
  89 /* none of the fftw3 calls, except execute(), are thread-safe, so
  90    we need to serialize them with this mutex. */
  91 static gmx::Mutex big_fftw_mutex;
  92 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  93 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  94 #endif /* GMX_FFT_FFTW3 */
  95
  96 #if GMX_MPI
  97 /* largest factor smaller than sqrt */
  98 static int lfactor(int z)
  99 {
 100     int i = static_cast<int>(sqrt(static_cast<double>(z)));
 101     while (z%i != 0)
 102     {
 103         i--;
 104     }
 105     return i;
 106 }
 107 #endif
 108
 109 #if !GMX_MPI
 110 #if HAVE_GETTIMEOFDAY
 111 #include <sys/time.h>
 112 double MPI_Wtime()
 113 {
 114     struct timeval tv;
 115     gettimeofday(&tv, 0);
 116     return tv.tv_sec+tv.tv_usec*1e-6;
 117 }
 118 #else
 119 double MPI_Wtime()
 120 {
 121     return 0.0;
 122 }
 123 #endif
 124 #endif
 125
 126 static int vmax(const int* a, int s)
 127 {
 128     int i, max = 0;
 129     for (i = 0; i < s; i++)
 130     {
 131         if (a[i] > max)
 132         {
 133             max = a[i];
 134         }
 135     }
 136     return max;
 137 }
 138
 139
 140 /* NxMxK the size of the data
 141  * comm communicator to use for fft5d
 142  * P0 number of processor in 1st axes (can be null for automatic)
 143  * lin is allocated by fft5d because size of array is only known after planning phase
 144  * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
 145  */
 146 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads, gmx::PinningPolicy realGridAllocationPinningPolicy)
 147 {
 148
 149     int        P[2], prank[2], i;
 150     bool       bMaster;
 151     int        rNG, rMG, rKG;
 152     int       *N0 = nullptr, *N1 = nullptr, *M0 = nullptr, *M1 = nullptr, *K0 = nullptr, *K1 = nullptr, *oN0 = nullptr, *oN1 = nullptr, *oM0 = nullptr, *oM1 = nullptr, *oK0 = nullptr, *oK1 = nullptr;
 153     int        N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {nullptr}, *oNin[3] = {nullptr}, *iNout[3] = {nullptr}, *oNout[3] = {nullptr};
 154     int        C[3], rC[3], nP[2];
 155     int        lsize;
 156     t_complex *lin = nullptr, *lout = nullptr, *lout2 = nullptr, *lout3 = nullptr;
 157     fft5d_plan plan;
 158     int        s;
 159
 160     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 161 #if GMX_MPI
 162     if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
 163     {
 164         MPI_Comm_size(comm[0], &P[0]);
 165         MPI_Comm_rank(comm[0], &prank[0]);
 166     }
 167     else
 168 #endif
 169     {
 170         P[0]     = 1;
 171         prank[0] = 0;
 172     }
 173 #if GMX_MPI
 174     if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
 175     {
 176         MPI_Comm_size(comm[1], &P[1]);
 177         MPI_Comm_rank(comm[1], &prank[1]);
 178     }
 179     else
 180 #endif
 181     {
 182         P[1]     = 1;
 183         prank[1] = 0;
 184     }
 185
 186     bMaster = prank[0] == 0 && prank[1] == 0;
 187
 188
 189     if (debug)
 190     {
 191         fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n",
 192                 P[0], P[1], prank[0], prank[1]);
 193     }
 194
 195     if (bMaster)
 196     {
 197         if (debug)
 198         {
 199             fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 200                     NG, MG, KG, P[0], P[1], int((flags&FFT5D_REALCOMPLEX) > 0), int((flags&FFT5D_BACKWARD) > 0), int((flags&FFT5D_ORDER_YZ) > 0), int((flags&FFT5D_DEBUG) > 0));
 201         }
 202         /* The check below is not correct, one prime factor 11 or 13 is ok.
 203            if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 204             printf("WARNING: FFT very slow with prime factors larger 7\n");
 205             printf("Change FFT size or in case you cannot change it look at\n");
 206             printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 207            }
 208          */
 209     }
 210
 211     if (NG == 0 || MG == 0 || KG == 0)
 212     {
 213         if (bMaster)
 214         {
 215             printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 216         }
 217         return nullptr;
 218     }
 219
 220     rNG = NG; rMG = MG; rKG = KG;
 221
 222     if (flags&FFT5D_REALCOMPLEX)
 223     {
 224         if (!(flags&FFT5D_BACKWARD))
 225         {
 226             NG = NG/2+1;
 227         }
 228         else
 229         {
 230             if (!(flags&FFT5D_ORDER_YZ))
 231             {
 232                 MG = MG/2+1;
 233             }
 234             else
 235             {
 236                 KG = KG/2+1;
 237             }
 238         }
 239     }
 240
 241
 242     /*for transpose we need to know the size for each processor not only our own size*/
 243
 244     N0  = static_cast<int*>(malloc(P[0]*sizeof(int))); N1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
 245     M0  = static_cast<int*>(malloc(P[0]*sizeof(int))); M1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
 246     K0  = static_cast<int*>(malloc(P[0]*sizeof(int))); K1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
 247     oN0 = static_cast<int*>(malloc(P[0]*sizeof(int))); oN1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
 248     oM0 = static_cast<int*>(malloc(P[0]*sizeof(int))); oM1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
 249     oK0 = static_cast<int*>(malloc(P[0]*sizeof(int))); oK1 = static_cast<int*>(malloc(P[1]*sizeof(int)));
 250
 251     for (i = 0; i < P[0]; i++)
 252     {
 253         #define EVENDIST
 254         #ifndef EVENDIST
 255         oN0[i] = i*ceil((double)NG/P[0]);
 256         oM0[i] = i*ceil((double)MG/P[0]);
 257         oK0[i] = i*ceil((double)KG/P[0]);
 258         #else
 259         oN0[i] = (NG*i)/P[0];
 260         oM0[i] = (MG*i)/P[0];
 261         oK0[i] = (KG*i)/P[0];
 262         #endif
 263     }
 264     for (i = 0; i < P[1]; i++)
 265     {
 266         #ifndef EVENDIST
 267         oN1[i] = i*ceil((double)NG/P[1]);
 268         oM1[i] = i*ceil((double)MG/P[1]);
 269         oK1[i] = i*ceil((double)KG/P[1]);
 270         #else
 271         oN1[i] = (NG*i)/P[1];
 272         oM1[i] = (MG*i)/P[1];
 273         oK1[i] = (KG*i)/P[1];
 274         #endif
 275     }
 276     for (i = 0; P[0] > 0 && i < P[0]-1; i++)
 277     {
 278         N0[i] = oN0[i+1]-oN0[i];
 279         M0[i] = oM0[i+1]-oM0[i];
 280         K0[i] = oK0[i+1]-oK0[i];
 281     }
 282     N0[P[0]-1] = NG-oN0[P[0]-1];
 283     M0[P[0]-1] = MG-oM0[P[0]-1];
 284     K0[P[0]-1] = KG-oK0[P[0]-1];
 285     for (i = 0; P[1] > 0 && i < P[1]-1; i++)
 286     {
 287         N1[i] = oN1[i+1]-oN1[i];
 288         M1[i] = oM1[i+1]-oM1[i];
 289         K1[i] = oK1[i+1]-oK1[i];
 290     }
 291     N1[P[1]-1] = NG-oN1[P[1]-1];
 292     M1[P[1]-1] = MG-oM1[P[1]-1];
 293     K1[P[1]-1] = KG-oK1[P[1]-1];
 294
 295     /* for step 1-3 the local N,M,K sizes of the transposed system
 296        C: contiguous dimension, and nP: number of processor in subcommunicator
 297        for that step */
 298
 299     GMX_ASSERT(prank[0] < P[0], "Must have valid rank within communicator size");
 300     GMX_ASSERT(prank[1] < P[1], "Must have valid rank within communicator size");
 301     pM[0] = M0[prank[0]];
 302     oM[0] = oM0[prank[0]];
 303     pK[0] = K1[prank[1]];
 304     oK[0] = oK1[prank[1]];
 305     C[0]  = NG;
 306     rC[0] = rNG;
 307     if (!(flags&FFT5D_ORDER_YZ))
 308     {
 309         N[0]     = vmax(N1, P[1]);
 310         M[0]     = M0[prank[0]];
 311         K[0]     = vmax(K1, P[1]);
 312         pN[0]    = N1[prank[1]];
 313         iNout[0] = N1;
 314         oNout[0] = oN1;
 315         nP[0]    = P[1];
 316         C[1]     = KG;
 317         rC[1]    = rKG;
 318         N[1]     = vmax(K0, P[0]);
 319         pN[1]    = K0[prank[0]];
 320         iNin[1]  = K1;
 321         oNin[1]  = oK1;
 322         iNout[1] = K0;
 323         oNout[1] = oK0;
 324         M[1]     = vmax(M0, P[0]);
 325         pM[1]    = M0[prank[0]];
 326         oM[1]    = oM0[prank[0]];
 327         K[1]     = N1[prank[1]];
 328         pK[1]    = N1[prank[1]];
 329         oK[1]    = oN1[prank[1]];
 330         nP[1]    = P[0];
 331         C[2]     = MG;
 332         rC[2]    = rMG;
 333         iNin[2]  = M0;
 334         oNin[2]  = oM0;
 335         M[2]     = vmax(K0, P[0]);
 336         pM[2]    = K0[prank[0]];
 337         oM[2]    = oK0[prank[0]];
 338         K[2]     = vmax(N1, P[1]);
 339         pK[2]    = N1[prank[1]];
 340         oK[2]    = oN1[prank[1]];
 341         free(N0); free(oN0); /*these are not used for this order*/
 342         free(M1); free(oM1); /*the rest is freed in destroy*/
 343     }
 344     else
 345     {
 346         N[0]     = vmax(N0, P[0]);
 347         M[0]     = vmax(M0, P[0]);
 348         K[0]     = K1[prank[1]];
 349         pN[0]    = N0[prank[0]];
 350         iNout[0] = N0;
 351         oNout[0] = oN0;
 352         nP[0]    = P[0];
 353         C[1]     = MG;
 354         rC[1]    = rMG;
 355         N[1]     = vmax(M1, P[1]);
 356         pN[1]    = M1[prank[1]];
 357         iNin[1]  = M0;
 358         oNin[1]  = oM0;
 359         iNout[1] = M1;
 360         oNout[1] = oM1;
 361         M[1]     = N0[prank[0]];
 362         pM[1]    = N0[prank[0]];
 363         oM[1]    = oN0[prank[0]];
 364         K[1]     = vmax(K1, P[1]);
 365         pK[1]    = K1[prank[1]];
 366         oK[1]    = oK1[prank[1]];
 367         nP[1]    = P[1];
 368         C[2]     = KG;
 369         rC[2]    = rKG;
 370         iNin[2]  = K1;
 371         oNin[2]  = oK1;
 372         M[2]     = vmax(N0, P[0]);
 373         pM[2]    = N0[prank[0]];
 374         oM[2]    = oN0[prank[0]];
 375         K[2]     = vmax(M1, P[1]);
 376         pK[2]    = M1[prank[1]];
 377         oK[2]    = oM1[prank[1]];
 378         free(N1); free(oN1); /*these are not used for this order*/
 379         free(K0); free(oK0); /*the rest is freed in destroy*/
 380     }
 381     N[2] = pN[2] = -1;       /*not used*/
 382
 383     /*
 384        Difference between x-y-z regarding 2d decomposition is whether they are
 385        distributed along axis 1, 2 or both
 386      */
 387
 388     /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 389     lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
 390     /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 391     if (!(flags&FFT5D_NOMALLOC))
 392     {
 393         // only needed for PME GPU mixed mode
 394         if (realGridAllocationPinningPolicy == gmx::PinningPolicy::PinnedIfSupported &&
 395             GMX_GPU == GMX_GPU_CUDA)
 396         {
 397             const std::size_t numBytes = lsize * sizeof(t_complex);
 398             lin = static_cast<t_complex *>(gmx::PageAlignedAllocationPolicy::malloc(numBytes));
 399             gmx::pinBuffer(lin, numBytes);
 400         }
 401         else
 402         {
 403             snew_aligned(lin, lsize, 32);
 404         }
 405         snew_aligned(lout, lsize, 32);
 406         if (nthreads > 1)
 407         {
 408             /* We need extra transpose buffers to avoid OpenMP barriers */
 409             snew_aligned(lout2, lsize, 32);
 410             snew_aligned(lout3, lsize, 32);
 411         }
 412         else
 413         {
 414             /* We can reuse the buffers to avoid cache misses */
 415             lout2 = lin;
 416             lout3 = lout;
 417         }
 418     }
 419     else
 420     {
 421         lin  = *rlin;
 422         lout = *rlout;
 423         if (nthreads > 1)
 424         {
 425             lout2 = *rlout2;
 426             lout3 = *rlout3;
 427         }
 428         else
 429         {
 430             lout2 = lin;
 431             lout3 = lout;
 432         }
 433     }
 434
 435     plan = static_cast<fft5d_plan>(calloc(1, sizeof(struct fft5d_plan_t)));
 436
 437
 438     if (debug)
 439     {
 440         fprintf(debug, "Running on %d threads\n", nthreads);
 441     }
 442
 443 #if GMX_FFT_FFTW3
 444     /* Don't add more stuff here! We have already had at least one bug because we are reimplementing
 445      * the low-level FFT interface instead of using the Gromacs FFT module. If we need more
 446      * generic functionality it is far better to extend the interface so we can use it for
 447      * all FFT libraries instead of writing FFTW-specific code here.
 448      */
 449
 450     /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
 451     /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
 452      * within a parallel region. For now deactivated. If it should be supported it has to made sure that
 453      * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
 454      * and that the 3d plan is faster than the 1d plan.
 455      */
 456     if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
 457     {
 458         int fftwflags = FFTW_DESTROY_INPUT;
 459         FFTW(iodim) dims[3];
 460         int inNG = NG, outMG = MG, outKG = KG;
 461
 462         FFTW_LOCK;
 463
 464         fftwflags |= (flags & FFT5D_NOMEASURE) ? FFTW_ESTIMATE : FFTW_MEASURE;
 465
 466         if (flags&FFT5D_REALCOMPLEX)
 467         {
 468             if (!(flags&FFT5D_BACKWARD))        /*input pointer is not complex*/
 469             {
 470                 inNG *= 2;
 471             }
 472             else                                /*output pointer is not complex*/
 473             {
 474                 if (!(flags&FFT5D_ORDER_YZ))
 475                 {
 476                     outMG *= 2;
 477                 }
 478                 else
 479                 {
 480                     outKG *= 2;
 481                 }
 482             }
 483         }
 484
 485         if (!(flags&FFT5D_BACKWARD))
 486         {
 487             dims[0].n  = KG;
 488             dims[1].n  = MG;
 489             dims[2].n  = rNG;
 490
 491             dims[0].is = inNG*MG;         /*N M K*/
 492             dims[1].is = inNG;
 493             dims[2].is = 1;
 494             if (!(flags&FFT5D_ORDER_YZ))
 495             {
 496                 dims[0].os = MG;           /*M K N*/
 497                 dims[1].os = 1;
 498                 dims[2].os = MG*KG;
 499             }
 500             else
 501             {
 502                 dims[0].os = 1;           /*K N M*/
 503                 dims[1].os = KG*NG;
 504                 dims[2].os = KG;
 505             }
 506         }
 507         else
 508         {
 509             if (!(flags&FFT5D_ORDER_YZ))
 510             {
 511                 dims[0].n  = NG;
 512                 dims[1].n  = KG;
 513                 dims[2].n  = rMG;
 514
 515                 dims[0].is = 1;
 516                 dims[1].is = NG*MG;
 517                 dims[2].is = NG;
 518
 519                 dims[0].os = outMG*KG;
 520                 dims[1].os = outMG;
 521                 dims[2].os = 1;
 522             }
 523             else
 524             {
 525                 dims[0].n  = MG;
 526                 dims[1].n  = NG;
 527                 dims[2].n  = rKG;
 528
 529                 dims[0].is = NG;
 530                 dims[1].is = 1;
 531                 dims[2].is = NG*MG;
 532
 533                 dims[0].os = outKG*NG;
 534                 dims[1].os = outKG;
 535                 dims[2].os = 1;
 536             }
 537         }
 538 #ifdef FFT5D_THREADS
 539 #ifdef FFT5D_FFTW_THREADS
 540         FFTW(plan_with_nthreads)(nthreads);
 541 #endif
 542 #endif
 543         if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
 544         {
 545             plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 546                                                          /*howmany*/ 0, /*howmany_dims*/ nullptr,
 547                                                          reinterpret_cast<real*>(lin), reinterpret_cast<FFTW(complex) *>(lout),
 548                                                          /*flags*/ fftwflags);
 549         }
 550         else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD))
 551         {
 552             plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 553                                                          /*howmany*/ 0, /*howmany_dims*/ nullptr,
 554                                                          reinterpret_cast<FFTW(complex) *>(lin), reinterpret_cast<real*>(lout),
 555                                                          /*flags*/ fftwflags);
 556         }
 557         else
 558         {
 559             plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 560                                                      /*howmany*/ 0, /*howmany_dims*/ nullptr,
 561                                                      reinterpret_cast<FFTW(complex) *>(lin), reinterpret_cast<FFTW(complex) *>(lout),
 562                                                      /*sign*/ (flags&FFT5D_BACKWARD) ? 1 : -1, /*flags*/ fftwflags);
 563         }
 564 #ifdef FFT5D_THREADS
 565 #ifdef FFT5D_FFTW_THREADS
 566         FFTW(plan_with_nthreads)(1);
 567 #endif
 568 #endif
 569         FFTW_UNLOCK;
 570     }
 571     if (!plan->p3d) /* for decomposition and if 3d plan did not work */
 572     {
 573 #endif              /* GMX_FFT_FFTW3 */
 574     for (s = 0; s < 3; s++)
 575     {
 576         if (debug)
 577         {
 578             fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 579                     s, rC[s], M[s], pK[s], C[s], lsize);
 580         }
 581         plan->p1d[s] = static_cast<gmx_fft_t*>(malloc(sizeof(gmx_fft_t)*nthreads));
 582
 583         /* Make sure that the init routines are only called by one thread at a time and in order
 584            (later is only important to not confuse valgrind)
 585          */
 586 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
 587         for (int t = 0; t < nthreads; t++)
 588         {
 589 #pragma omp ordered
 590             {
 591                 try
 592                 {
 593                     int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
 594
 595                     if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s == 0) || ((flags&FFT5D_BACKWARD) && s == 2)))
 596                     {
 597                         gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
 598                     }
 599                     else
 600                     {
 601                         gmx_fft_init_many_1d     ( &plan->p1d[s][t],  C[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
 602                     }
 603                 }
 604                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 605             }
 606         }
 607     }
 608
 609 #if GMX_FFT_FFTW3
 610 }
 611 #endif
 612     if ((flags&FFT5D_ORDER_YZ))   /*plan->cart is in the order of transposes */
 613     {
 614         plan->cart[0] = comm[0]; plan->cart[1] = comm[1];
 615     }
 616     else
 617     {
 618         plan->cart[1] = comm[0]; plan->cart[0] = comm[1];
 619     }
 620 #ifdef FFT5D_MPI_TRANSPOSE
 621     FFTW_LOCK;
 622     for (s = 0; s < 2; s++)
 623     {
 624         if ((s == 0 && !(flags&FFT5D_ORDER_YZ)) || (s == 1 && (flags&FFT5D_ORDER_YZ)))
 625         {
 626             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 627         }
 628         else
 629         {
 630             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 631         }
 632     }
 633     FFTW_UNLOCK;
 634 #endif
 635
 636
 637     plan->lin   = lin;
 638     plan->lout  = lout;
 639     plan->lout2 = lout2;
 640     plan->lout3 = lout3;
 641
 642     plan->NG = NG; plan->MG = MG; plan->KG = KG;
 643
 644     for (s = 0; s < 3; s++)
 645     {
 646         plan->N[s]    = N[s]; plan->M[s] = M[s]; plan->K[s] = K[s]; plan->pN[s] = pN[s]; plan->pM[s] = pM[s]; plan->pK[s] = pK[s];
 647         plan->oM[s]   = oM[s]; plan->oK[s] = oK[s];
 648         plan->C[s]    = C[s]; plan->rC[s] = rC[s];
 649         plan->iNin[s] = iNin[s]; plan->oNin[s] = oNin[s]; plan->iNout[s] = iNout[s]; plan->oNout[s] = oNout[s];
 650     }
 651     for (s = 0; s < 2; s++)
 652     {
 653         plan->P[s] = nP[s]; plan->coor[s] = prank[s];
 654     }
 655
 656 /*    plan->fftorder=fftorder;
 657     plan->direction=direction;
 658     plan->realcomplex=realcomplex;
 659  */
 660     plan->flags         = flags;
 661     plan->nthreads      = nthreads;
 662     plan->pinningPolicy = realGridAllocationPinningPolicy;
 663     *rlin               = lin;
 664     *rlout              = lout;
 665     *rlout2             = lout2;
 666     *rlout3             = lout3;
 667     return plan;
 668 }
 669
 670
 671 enum order {
 672     XYZ,
 673     XZY,
 674     YXZ,
 675     YZX,
 676     ZXY,
 677     ZYX
 678 };
 679
 680
 681
 682 /*here x,y,z and N,M,K is in rotated coordinate system!!
 683    x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
 684    maxN,maxM,maxK is max size of local data
 685    pN, pM, pK is local size specific to current processor (only different to max if not divisible)
 686    NG, MG, KG is size of global data*/
 687 static void splitaxes(t_complex* lout, const t_complex* lin,
 688                       int maxN, int maxM, int maxK, int pM,
 689                       int P, int NG, const int *N, const int* oN, int starty, int startz, int endy, int endz)
 690 {
 691     int x, y, z, i;
 692     int in_i, out_i, in_z, out_z, in_y, out_y;
 693     int s_y, e_y;
 694
 695     for (z = startz; z < endz+1; z++) /*3. z l*/
 696     {
 697         if (z == startz)
 698         {
 699             s_y = starty;
 700         }
 701         else
 702         {
 703             s_y = 0;
 704         }
 705         if (z == endz)
 706         {
 707             e_y = endy;
 708         }
 709         else
 710         {
 711             e_y = pM;
 712         }
 713         out_z  = z*maxN*maxM;
 714         in_z   = z*NG*pM;
 715
 716         for (i = 0; i < P; i++) /*index cube along long axis*/
 717         {
 718             out_i  = out_z  + i*maxN*maxM*maxK;
 719             in_i   = in_z + oN[i];
 720             for (y = s_y; y < e_y; y++)   /*2. y k*/
 721             {
 722                 out_y  = out_i  + y*maxN;
 723                 in_y   = in_i + y*NG;
 724                 for (x = 0; x < N[i]; x++)       /*1. x j*/
 725                 {
 726                     lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
 727                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
 728                     /*before split data contiguos - thus if different processor get different amount oN is different*/
 729                 }
 730             }
 731         }
 732     }
 733 }
 734
 735 /*make axis contiguous again (after AllToAll) and also do local transpose*/
 736 /*transpose mayor and major dimension
 737    variables see above
 738    the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
 739    N,M,K local dimensions
 740    KG global size*/
 741 static void joinAxesTrans13(t_complex* lout, const t_complex* lin,
 742                             int maxN, int maxM, int maxK, int pM,
 743                             int P, int KG, const int* K, const int* oK, int starty, int startx, int endy, int endx)
 744 {
 745     int i, x, y, z;
 746     int out_i, in_i, out_x, in_x, out_z, in_z;
 747     int s_y, e_y;
 748
 749     for (x = startx; x < endx+1; x++) /*1.j*/
 750     {
 751         if (x == startx)
 752         {
 753             s_y = starty;
 754         }
 755         else
 756         {
 757             s_y = 0;
 758         }
 759         if (x == endx)
 760         {
 761             e_y = endy;
 762         }
 763         else
 764         {
 765             e_y = pM;
 766         }
 767
 768         out_x  = x*KG*pM;
 769         in_x   = x;
 770
 771         for (i = 0; i < P; i++) /*index cube along long axis*/
 772         {
 773             out_i  = out_x  + oK[i];
 774             in_i   = in_x + i*maxM*maxN*maxK;
 775             for (z = 0; z < K[i]; z++) /*3.l*/
 776             {
 777                 out_z  = out_i  + z;
 778                 in_z   = in_i + z*maxM*maxN;
 779                 for (y = s_y; y < e_y; y++)              /*2.k*/
 780                 {
 781                     lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
 782                 }
 783             }
 784         }
 785     }
 786 }
 787
 788 /*make axis contiguous again (after AllToAll) and also do local transpose
 789    tranpose mayor and middle dimension
 790    variables see above
 791    the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
 792    N,M,K local size
 793    MG, global size*/
 794 static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN,
 795                             int P, int MG, const int* M, const int* oM, int startx, int startz, int endx, int endz)
 796 {
 797     int i, z, y, x;
 798     int out_i, in_i, out_z, in_z, out_x, in_x;
 799     int s_x, e_x;
 800
 801     for (z = startz; z < endz+1; z++)
 802     {
 803         if (z == startz)
 804         {
 805             s_x = startx;
 806         }
 807         else
 808         {
 809             s_x = 0;
 810         }
 811         if (z == endz)
 812         {
 813             e_x = endx;
 814         }
 815         else
 816         {
 817             e_x = pN;
 818         }
 819         out_z  = z*MG*pN;
 820         in_z   = z*maxM*maxN;
 821
 822         for (i = 0; i < P; i++) /*index cube along long axis*/
 823         {
 824             out_i  = out_z  + oM[i];
 825             in_i   = in_z + i*maxM*maxN*maxK;
 826             for (x = s_x; x < e_x; x++)
 827             {
 828                 out_x  = out_i  + x*MG;
 829                 in_x   = in_i + x;
 830                 for (y = 0; y < M[i]; y++)
 831                 {
 832                     lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
 833                 }
 834             }
 835         }
 836     }
 837 }
 838
 839
 840 static void rotate_offsets(int x[])
 841 {
 842     int t = x[0];
 843 /*    x[0]=x[2];
 844     x[2]=x[1];
 845     x[1]=t;*/
 846     x[0] = x[1];
 847     x[1] = x[2];
 848     x[2] = t;
 849 }
 850
 851 /*compute the offset to compare or print transposed local data in original input coordinates
 852    xs matrix dimension size, xl dimension length, xc decomposition offset
 853    s: step in computation = number of transposes*/
 854 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
 855 {
 856 /*    int direction = plan->direction;
 857     int fftorder = plan->fftorder;*/
 858
 859     int  o = 0;
 860     int  pos[3], i;
 861     int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK,
 862     *C      = plan->C, *rC = plan->rC;
 863
 864     NG[0] = plan->NG; NG[1] = plan->MG; NG[2] = plan->KG;
 865
 866     if (!(plan->flags&FFT5D_ORDER_YZ))
 867     {
 868         switch (s)
 869         {
 870             case 0: o = XYZ; break;
 871             case 1: o = ZYX; break;
 872             case 2: o = YZX; break;
 873             default: assert(0);
 874         }
 875     }
 876     else
 877     {
 878         switch (s)
 879         {
 880             case 0: o = XYZ; break;
 881             case 1: o = YXZ; break;
 882             case 2: o = ZXY; break;
 883             default: assert(0);
 884         }
 885     }
 886
 887     switch (o)
 888     {
 889         case XYZ: pos[0] = 1; pos[1] = 2; pos[2] = 3; break;
 890         case XZY: pos[0] = 1; pos[1] = 3; pos[2] = 2; break;
 891         case YXZ: pos[0] = 2; pos[1] = 1; pos[2] = 3; break;
 892         case YZX: pos[0] = 3; pos[1] = 1; pos[2] = 2; break;
 893         case ZXY: pos[0] = 2; pos[1] = 3; pos[2] = 1; break;
 894         case ZYX: pos[0] = 3; pos[1] = 2; pos[2] = 1; break;
 895     }
 896     /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 897
 898     /*xs, xl give dimension size and data length in local transposed coordinate system
 899        for 0(/1/2): x(/y/z) in original coordinate system*/
 900     for (i = 0; i < 3; i++)
 901     {
 902         switch (pos[i])
 903         {
 904             case 1: xs[i] = 1;         xc[i] = 0;     xl[i] = C[s]; break;
 905             case 2: xs[i] = C[s];      xc[i] = oM[s]; xl[i] = pM[s]; break;
 906             case 3: xs[i] = C[s]*pM[s]; xc[i] = oK[s]; xl[i] = pK[s]; break;
 907         }
 908     }
 909     /*input order is different for test program to match FFTW order
 910        (important for complex to real)*/
 911     if (plan->flags&FFT5D_BACKWARD)
 912     {
 913         rotate_offsets(xs);
 914         rotate_offsets(xl);
 915         rotate_offsets(xc);
 916         rotate_offsets(NG);
 917         if (plan->flags&FFT5D_ORDER_YZ)
 918         {
 919             rotate_offsets(xs);
 920             rotate_offsets(xl);
 921             rotate_offsets(xc);
 922             rotate_offsets(NG);
 923         }
 924     }
 925     if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s == 0) || ((plan->flags&FFT5D_BACKWARD) && s == 2)))
 926     {
 927         xl[0] = rC[s];
 928     }
 929 }
 930
 931 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
 932 {
 933     int  x, y, z, l;
 934     int *coor = plan->coor;
 935     int  xs[3], xl[3], xc[3], NG[3];
 936     int  ll = (plan->flags&FFT5D_REALCOMPLEX) ? 1 : 2;
 937     compute_offsets(plan, xs, xl, xc, NG, s);
 938     fprintf(debug, txt, coor[0], coor[1]);
 939     /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 940     for (z = 0; z < xl[2]; z++)
 941     {
 942         for (y = 0; y < xl[1]; y++)
 943         {
 944             fprintf(debug, "%d %d: ", coor[0], coor[1]);
 945             for (x = 0; x < xl[0]; x++)
 946             {
 947                 for (l = 0; l < ll; l++)
 948                 {
 949                     fprintf(debug, "%f ", reinterpret_cast<const real*>(lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 950                 }
 951                 fprintf(debug, ",");
 952             }
 953             fprintf(debug, "\n");
 954         }
 955     }
 956 }
 957
 958 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
 959 {
 960     t_complex  *lin   = plan->lin;
 961     t_complex  *lout  = plan->lout;
 962     t_complex  *lout2 = plan->lout2;
 963     t_complex  *lout3 = plan->lout3;
 964     t_complex  *fftout, *joinin;
 965
 966     gmx_fft_t **p1d = plan->p1d;
 967 #ifdef FFT5D_MPI_TRANSPOSE
 968     FFTW(plan) *mpip = plan->mpip;
 969 #endif
 970 #if GMX_MPI
 971     MPI_Comm *cart = plan->cart;
 972 #endif
 973 #ifdef NOGMX
 974     double time_fft = 0, time_local = 0, time_mpi[2] = {0}, time = 0;
 975 #endif
 976     int   *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
 977     *C       = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout, **oNout = plan->oNout;
 978     int    s = 0, tstart, tend, bParallelDim;
 979
 980
 981 #if GMX_FFT_FFTW3
 982     if (plan->p3d)
 983     {
 984         if (thread == 0)
 985         {
 986 #ifdef NOGMX
 987             if (times != 0)
 988             {
 989                 time = MPI_Wtime();
 990             }
 991 #endif
 992             FFTW(execute)(plan->p3d);
 993 #ifdef NOGMX
 994             if (times != 0)
 995             {
 996                 times->fft += MPI_Wtime()-time;
 997             }
 998 #endif
 999         }
1000         return;
1001     }
1002 #endif
1003
1004     s = 0;
1005
1006     /*lin: x,y,z*/
1007     if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1008     {
1009         print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1010     }
1011
1012     for (s = 0; s < 2; s++)  /*loop over first two FFT steps (corner rotations)*/
1013
1014     {
1015 #if GMX_MPI
1016         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1017         {
1018             bParallelDim = 1;
1019         }
1020         else
1021 #endif
1022         {
1023             bParallelDim = 0;
1024         }
1025
1026         /* ---------- START FFT ------------ */
1027 #ifdef NOGMX
1028         if (times != 0 && thread == 0)
1029         {
1030             time = MPI_Wtime();
1031         }
1032 #endif
1033
1034         if (bParallelDim || plan->nthreads == 1)
1035         {
1036             fftout = lout;
1037         }
1038         else
1039         {
1040             if (s == 0)
1041             {
1042                 fftout = lout3;
1043             }
1044             else
1045             {
1046                 fftout = lout2;
1047             }
1048         }
1049
1050         tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1051         if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s == 0)
1052         {
1053             gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, fftout+tstart);
1054         }
1055         else
1056         {
1057             gmx_fft_many_1d(     p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,               lin+tstart, fftout+tstart);
1058
1059         }
1060
1061 #ifdef NOGMX
1062         if (times != NULL && thread == 0)
1063         {
1064             time_fft += MPI_Wtime()-time;
1065         }
1066 #endif
1067         if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1068         {
1069             print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1070         }
1071         /* ---------- END FFT ------------ */
1072
1073         /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1074         if (bParallelDim)
1075         {
1076 #ifdef NOGMX
1077             if (times != NULL && thread == 0)
1078             {
1079                 time = MPI_Wtime();
1080             }
1081 #endif
1082             /*prepare for A
1083                llToAll
1084                1. (most outer) axes (x) is split into P[s] parts of size N[s]
1085                for sending*/
1086             if (pM[s] > 0)
1087             {
1088                 tend    = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
1089                 tstart /= C[s];
1090                 splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1091             }
1092 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1093 #ifdef NOGMX
1094             if (times != NULL && thread == 0)
1095             {
1096                 time_local += MPI_Wtime()-time;
1097             }
1098 #endif
1099
1100             /* ---------- END SPLIT , START TRANSPOSE------------ */
1101
1102             if (thread == 0)
1103             {
1104 #ifdef NOGMX
1105                 if (times != 0)
1106                 {
1107                     time = MPI_Wtime();
1108                 }
1109 #else
1110                 wallcycle_start(times, ewcPME_FFTCOMM);
1111 #endif
1112 #ifdef FFT5D_MPI_TRANSPOSE
1113                 FFTW(execute)(mpip[s]);
1114 #else
1115 #if GMX_MPI
1116                 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1117                 {
1118                     MPI_Alltoall(reinterpret_cast<real *>(lout2), N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, reinterpret_cast<real *>(lout3), N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1119                 }
1120                 else
1121                 {
1122                     MPI_Alltoall(reinterpret_cast<real *>(lout2), N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, reinterpret_cast<real *>(lout3), N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1123                 }
1124 #else
1125                 gmx_incons("fft5d MPI call without MPI configuration");
1126 #endif /*GMX_MPI*/
1127 #endif /*FFT5D_MPI_TRANSPOSE*/
1128 #ifdef NOGMX
1129                 if (times != 0)
1130                 {
1131                     time_mpi[s] = MPI_Wtime()-time;
1132                 }
1133 #else
1134                 wallcycle_stop(times, ewcPME_FFTCOMM);
1135 #endif
1136             } /*master*/
1137         }     /* bPrallelDim */
1138 #pragma omp barrier  /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1139
1140         /* ---------- END SPLIT + TRANSPOSE------------ */
1141
1142         /* ---------- START JOIN ------------ */
1143 #ifdef NOGMX
1144         if (times != NULL && thread == 0)
1145         {
1146             time = MPI_Wtime();
1147         }
1148 #endif
1149
1150         if (bParallelDim)
1151         {
1152             joinin = lout3;
1153         }
1154         else
1155         {
1156             joinin = fftout;
1157         }
1158         /*bring back in matrix form
1159            thus make  new 1. axes contiguos
1160            also local transpose 1 and 2/3
1161            runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1162          */
1163         if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1164         {
1165             if (pM[s] > 0)
1166             {
1167                 tstart = ( thread   *pM[s]*pN[s]/plan->nthreads);
1168                 tend   = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1169                 joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1170             }
1171         }
1172         else
1173         {
1174             if (pN[s] > 0)
1175             {
1176                 tstart = ( thread   *pK[s]*pN[s]/plan->nthreads);
1177                 tend   = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1178                 joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
1179             }
1180         }
1181
1182 #ifdef NOGMX
1183         if (times != NULL && thread == 0)
1184         {
1185             time_local += MPI_Wtime()-time;
1186         }
1187 #endif
1188         if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1189         {
1190             print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1191         }
1192         /* ---------- END JOIN ------------ */
1193
1194         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1195     }  /* for(s=0;s<2;s++) */
1196 #ifdef NOGMX
1197     if (times != NULL && thread == 0)
1198     {
1199         time = MPI_Wtime();
1200     }
1201 #endif
1202
1203     if (plan->flags&FFT5D_INPLACE)
1204     {
1205         lout = lin;                          /*in place currently not supported*/
1206
1207     }
1208     /*  ----------- FFT ----------- */
1209     tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1210     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1211     {
1212         gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, lout+tstart);
1213     }
1214     else
1215     {
1216         gmx_fft_many_1d(     p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,               lin+tstart, lout+tstart);
1217     }
1218     /* ------------ END FFT ---------*/
1219
1220 #ifdef NOGMX
1221     if (times != NULL && thread == 0)
1222     {
1223         time_fft += MPI_Wtime()-time;
1224
1225         times->fft   += time_fft;
1226         times->local += time_local;
1227         times->mpi2  += time_mpi[1];
1228         times->mpi1  += time_mpi[0];
1229     }
1230 #endif
1231
1232     if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1233     {
1234         print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1235     }
1236 }
1237
1238 void fft5d_destroy(fft5d_plan plan)
1239 {
1240     int s, t;
1241
1242     for (s = 0; s < 3; s++)
1243     {
1244         if (plan->p1d[s])
1245         {
1246             for (t = 0; t < plan->nthreads; t++)
1247             {
1248                 gmx_many_fft_destroy(plan->p1d[s][t]);
1249             }
1250             free(plan->p1d[s]);
1251         }
1252         if (plan->iNin[s])
1253         {
1254             free(plan->iNin[s]);
1255             plan->iNin[s] = nullptr;
1256         }
1257         if (plan->oNin[s])
1258         {
1259             free(plan->oNin[s]);
1260             plan->oNin[s] = nullptr;
1261         }
1262         if (plan->iNout[s])
1263         {
1264             free(plan->iNout[s]);
1265             plan->iNout[s] = nullptr;
1266         }
1267         if (plan->oNout[s])
1268         {
1269             free(plan->oNout[s]);
1270             plan->oNout[s] = nullptr;
1271         }
1272     }
1273 #if GMX_FFT_FFTW3
1274     FFTW_LOCK;
1275 #ifdef FFT5D_MPI_TRANSPOS
1276     for (s = 0; s < 2; s++)
1277     {
1278         FFTW(destroy_plan)(plan->mpip[s]);
1279     }
1280 #endif /* FFT5D_MPI_TRANSPOS */
1281     if (plan->p3d)
1282     {
1283         FFTW(destroy_plan)(plan->p3d);
1284     }
1285     FFTW_UNLOCK;
1286 #endif /* GMX_FFT_FFTW3 */
1287
1288     if (!(plan->flags&FFT5D_NOMALLOC))
1289     {
1290         // only needed for PME GPU mixed mode
1291         if (plan->pinningPolicy == gmx::PinningPolicy::PinnedIfSupported &&
1292             isHostMemoryPinned(plan->lin))
1293         {
1294             gmx::unpinBuffer(plan->lin);
1295         }
1296         sfree_aligned(plan->lin);
1297         sfree_aligned(plan->lout);
1298         if (plan->nthreads > 1)
1299         {
1300             sfree_aligned(plan->lout2);
1301             sfree_aligned(plan->lout3);
1302         }
1303     }
1304
1305 #ifdef FFT5D_THREADS
1306 #ifdef FFT5D_FFTW_THREADS
1307     /*FFTW(cleanup_threads)();*/
1308 #endif
1309 #endif
1310
1311     free(plan);
1312 }
1313
1314 /*Is this better than direct access of plan? enough data?
1315    here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1316 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1317 {
1318     *N1 = plan->N[0];
1319     *M0 = plan->M[0];
1320     *K1 = plan->K[0];
1321     *K0 = plan->N[1];
1322
1323     *coor = plan->coor;
1324 }
1325
1326
1327 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1328    of processor dimensions*/
1329 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
1330 {
1331     MPI_Comm cart[2] = {MPI_COMM_NULL, MPI_COMM_NULL};
1332 #if GMX_MPI
1333     int      size = 1, prank = 0;
1334     int      P[2];
1335     int      coor[2];
1336     int      wrap[] = {0, 0};
1337     MPI_Comm gcart;
1338     int      rdim1[] = {0, 1}, rdim2[] = {1, 0};
1339
1340     MPI_Comm_size(comm, &size);
1341     MPI_Comm_rank(comm, &prank);
1342
1343     if (P0 == 0)
1344     {
1345         P0 = lfactor(size);
1346     }
1347     if (size%P0 != 0)
1348     {
1349         if (prank == 0)
1350         {
1351             printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1352         }
1353         P0 = lfactor(size);
1354     }
1355
1356     P[0] = P0; P[1] = size/P0; /*number of processors in the two dimensions*/
1357
1358     /*Difference between x-y-z regarding 2d decomposition is whether they are
1359        distributed along axis 1, 2 or both*/
1360
1361     MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1362     MPI_Cart_get(gcart, 2, P, wrap, coor);
1363     MPI_Cart_sub(gcart, rdim1, &cart[0]);
1364     MPI_Cart_sub(gcart, rdim2, &cart[1]);
1365 #else
1366     (void)P0;
1367     (void)comm;
1368 #endif
1369     return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1370 }
1371
1372
1373
1374 /*prints in original coordinate system of data (as the input to FFT)*/
1375 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1376 {
1377     int  xs[3], xl[3], xc[3], NG[3];
1378     int  x, y, z, l;
1379     int *coor = plan->coor;
1380     int  ll   = 2; /*compare ll values per element (has to be 2 for complex)*/
1381     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1382     {
1383         ll = 1;
1384     }
1385
1386     compute_offsets(plan, xs, xl, xc, NG, 2);
1387     if (plan->flags&FFT5D_DEBUG)
1388     {
1389         printf("Compare2\n");
1390     }
1391     for (z = 0; z < xl[2]; z++)
1392     {
1393         for (y = 0; y < xl[1]; y++)
1394         {
1395             if (plan->flags&FFT5D_DEBUG)
1396             {
1397                 printf("%d %d: ", coor[0], coor[1]);
1398             }
1399             for (x = 0; x < xl[0]; x++)
1400             {
1401                 for (l = 0; l < ll; l++)   /*loop over real/complex parts*/
1402                 {
1403                     real a, b;
1404                     a = reinterpret_cast<const real*>(lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1405                     if (normalize)
1406                     {
1407                         a /= plan->rC[0]*plan->rC[1]*plan->rC[2];
1408                     }
1409                     if (!bothLocal)
1410                     {
1411                         b = reinterpret_cast<const real*>(in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1412                     }
1413                     else
1414                     {
1415                         b = reinterpret_cast<const real*>(in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1416                     }
1417                     if (plan->flags&FFT5D_DEBUG)
1418                     {
1419                         printf("%f %f, ", a, b);
1420                     }
1421                     else
1422                     {
1423                         if (std::fabs(a-b) > 2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS)
1424                         {
1425                             printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n", coor[0], coor[1], x, y, z, a, b);
1426                         }
1427 /*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1428                     }
1429                 }
1430                 if (plan->flags&FFT5D_DEBUG)
1431                 {
1432                     printf(",");
1433                 }
1434             }
1435             if (plan->flags&FFT5D_DEBUG)
1436             {
1437                 printf("\n");
1438             }
1439         }
1440     }
1441
1442 }