src/gromacs/fft/fft5d.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2009,2010,2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #include "gmxpre.h"
  36
  37 #include "fft5d.h"
  38
  39 #include "config.h"
  40
  41 #include <assert.h>
  42 #include <float.h>
  43 #include <math.h>
  44 #include <stdio.h>
  45 #include <stdlib.h>
  46 #include <string.h>
  47
  48 #include <algorithm>
  49
  50 #include "gromacs/utility/fatalerror.h"
  51 #include "gromacs/utility/gmxmpi.h"
  52 #include "gromacs/utility/smalloc.h"
  53
  54 #ifdef NOGMX
  55 #define GMX_PARALLEL_ENV_INITIALIZED 1
  56 #else
  57 #ifdef GMX_MPI
  58 #define GMX_PARALLEL_ENV_INITIALIZED 1
  59 #else
  60 #define GMX_PARALLEL_ENV_INITIALIZED 0
  61 #endif
  62 #endif
  63
  64 #ifdef GMX_OPENMP
  65 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
  66 #define FFT5D_THREADS
  67 /* requires fftw compiled with openmp */
  68 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
  69 #endif
  70
  71 #ifndef __FLT_EPSILON__
  72 #define __FLT_EPSILON__ FLT_EPSILON
  73 #define __DBL_EPSILON__ DBL_EPSILON
  74 #endif
  75
  76 #ifdef NOGMX
  77 FILE* debug = 0;
  78 #endif
  79
  80 #if GMX_FFT_FFTW3
  81
  82 #include "gromacs/utility/exceptions.h"
  83 #include "gromacs/utility/mutex.h"
  84 /* none of the fftw3 calls, except execute(), are thread-safe, so
  85    we need to serialize them with this mutex. */
  86 static gmx::Mutex big_fftw_mutex;
  87 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  88 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  89 #endif /* GMX_FFT_FFTW3 */
  90
  91 #ifdef GMX_MPI
  92 /* largest factor smaller than sqrt */
  93 static int lfactor(int z)
  94 {
  95     int i = static_cast<int>(sqrt(static_cast<double>(z)));
  96     while (z%i != 0)
  97     {
  98         i--;
  99     }
 100     return i;
 101 }
 102 #endif
 103
 104 /* largest factor */
 105 static int l2factor(int z)
 106 {
 107     int i;
 108     if (z == 1)
 109     {
 110         i = 1;
 111     }
 112     else
 113     {
 114         i = z/2;
 115         while (z%i != 0)
 116         {
 117             i--;
 118         }
 119     }
 120     return i;
 121 }
 122
 123 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
 124 static int lpfactor(int z)
 125 {
 126     int f = l2factor(z);
 127     if (f == 1)
 128     {
 129         return z;
 130     }
 131     return std::max(lpfactor(f), lpfactor(z/f));
 132 }
 133
 134 #ifndef GMX_MPI
 135 #ifdef HAVE_GETTIMEOFDAY
 136 #include <sys/time.h>
 137 double MPI_Wtime()
 138 {
 139     struct timeval tv;
 140     gettimeofday(&tv, 0);
 141     return tv.tv_sec+tv.tv_usec*1e-6;
 142 }
 143 #else
 144 double MPI_Wtime()
 145 {
 146     return 0.0;
 147 }
 148 #endif
 149 #endif
 150
 151 static int vmax(int* a, int s)
 152 {
 153     int i, max = 0;
 154     for (i = 0; i < s; i++)
 155     {
 156         if (a[i] > max)
 157         {
 158             max = a[i];
 159         }
 160     }
 161     return max;
 162 }
 163
 164
 165 /* NxMxK the size of the data
 166  * comm communicator to use for fft5d
 167  * P0 number of processor in 1st axes (can be null for automatic)
 168  * lin is allocated by fft5d because size of array is only known after planning phase
 169  * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
 170  */
 171 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
 172 {
 173
 174     int        P[2], bMaster, prank[2], i, t;
 175     int        rNG, rMG, rKG;
 176     int       *N0 = 0, *N1 = 0, *M0 = 0, *M1 = 0, *K0 = 0, *K1 = 0, *oN0 = 0, *oN1 = 0, *oM0 = 0, *oM1 = 0, *oK0 = 0, *oK1 = 0;
 177     int        N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {0}, *oNin[3] = {0}, *iNout[3] = {0}, *oNout[3] = {0};
 178     int        C[3], rC[3], nP[2];
 179     int        lsize;
 180     t_complex *lin = 0, *lout = 0, *lout2 = 0, *lout3 = 0;
 181     fft5d_plan plan;
 182     int        s;
 183
 184     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 185 #ifdef GMX_MPI
 186     if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
 187     {
 188         MPI_Comm_size(comm[0], &P[0]);
 189         MPI_Comm_rank(comm[0], &prank[0]);
 190     }
 191     else
 192 #endif
 193     {
 194         P[0]     = 1;
 195         prank[0] = 0;
 196     }
 197 #ifdef GMX_MPI
 198     if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
 199     {
 200         MPI_Comm_size(comm[1], &P[1]);
 201         MPI_Comm_rank(comm[1], &prank[1]);
 202     }
 203     else
 204 #endif
 205     {
 206         P[1]     = 1;
 207         prank[1] = 0;
 208     }
 209
 210     bMaster = (prank[0] == 0 && prank[1] == 0);
 211
 212
 213     if (debug)
 214     {
 215         fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n",
 216                 P[0], P[1], prank[0], prank[1]);
 217     }
 218
 219     if (bMaster)
 220     {
 221         if (debug)
 222         {
 223             fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 224                     NG, MG, KG, P[0], P[1], (flags&FFT5D_REALCOMPLEX) > 0, (flags&FFT5D_BACKWARD) > 0, (flags&FFT5D_ORDER_YZ) > 0, (flags&FFT5D_DEBUG) > 0);
 225         }
 226         /* The check below is not correct, one prime factor 11 or 13 is ok.
 227            if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 228             printf("WARNING: FFT very slow with prime factors larger 7\n");
 229             printf("Change FFT size or in case you cannot change it look at\n");
 230             printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 231            }
 232          */
 233     }
 234
 235     if (NG == 0 || MG == 0 || KG == 0)
 236     {
 237         if (bMaster)
 238         {
 239             printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 240         }
 241         return 0;
 242     }
 243
 244     rNG = NG; rMG = MG; rKG = KG;
 245
 246     if (flags&FFT5D_REALCOMPLEX)
 247     {
 248         if (!(flags&FFT5D_BACKWARD))
 249         {
 250             NG = NG/2+1;
 251         }
 252         else
 253         {
 254             if (!(flags&FFT5D_ORDER_YZ))
 255             {
 256                 MG = MG/2+1;
 257             }
 258             else
 259             {
 260                 KG = KG/2+1;
 261             }
 262         }
 263     }
 264
 265
 266     /*for transpose we need to know the size for each processor not only our own size*/
 267
 268     N0  = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
 269     M0  = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
 270     K0  = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
 271     oN0 = (int*)malloc(P[0]*sizeof(int)); oN1 = (int*)malloc(P[1]*sizeof(int));
 272     oM0 = (int*)malloc(P[0]*sizeof(int)); oM1 = (int*)malloc(P[1]*sizeof(int));
 273     oK0 = (int*)malloc(P[0]*sizeof(int)); oK1 = (int*)malloc(P[1]*sizeof(int));
 274
 275     for (i = 0; i < P[0]; i++)
 276     {
 277         #define EVENDIST
 278         #ifndef EVENDIST
 279         oN0[i] = i*ceil((double)NG/P[0]);
 280         oM0[i] = i*ceil((double)MG/P[0]);
 281         oK0[i] = i*ceil((double)KG/P[0]);
 282         #else
 283         oN0[i] = (NG*i)/P[0];
 284         oM0[i] = (MG*i)/P[0];
 285         oK0[i] = (KG*i)/P[0];
 286         #endif
 287     }
 288     for (i = 0; i < P[1]; i++)
 289     {
 290         #ifndef EVENDIST
 291         oN1[i] = i*ceil((double)NG/P[1]);
 292         oM1[i] = i*ceil((double)MG/P[1]);
 293         oK1[i] = i*ceil((double)KG/P[1]);
 294         #else
 295         oN1[i] = (NG*i)/P[1];
 296         oM1[i] = (MG*i)/P[1];
 297         oK1[i] = (KG*i)/P[1];
 298         #endif
 299     }
 300     for (i = 0; i < P[0]-1; i++)
 301     {
 302         N0[i] = oN0[i+1]-oN0[i];
 303         M0[i] = oM0[i+1]-oM0[i];
 304         K0[i] = oK0[i+1]-oK0[i];
 305     }
 306     N0[P[0]-1] = NG-oN0[P[0]-1];
 307     M0[P[0]-1] = MG-oM0[P[0]-1];
 308     K0[P[0]-1] = KG-oK0[P[0]-1];
 309     for (i = 0; i < P[1]-1; i++)
 310     {
 311         N1[i] = oN1[i+1]-oN1[i];
 312         M1[i] = oM1[i+1]-oM1[i];
 313         K1[i] = oK1[i+1]-oK1[i];
 314     }
 315     N1[P[1]-1] = NG-oN1[P[1]-1];
 316     M1[P[1]-1] = MG-oM1[P[1]-1];
 317     K1[P[1]-1] = KG-oK1[P[1]-1];
 318
 319     /* for step 1-3 the local N,M,K sizes of the transposed system
 320        C: contiguous dimension, and nP: number of processor in subcommunicator
 321        for that step */
 322
 323
 324     pM[0] = M0[prank[0]];
 325     oM[0] = oM0[prank[0]];
 326     pK[0] = K1[prank[1]];
 327     oK[0] = oK1[prank[1]];
 328     C[0]  = NG;
 329     rC[0] = rNG;
 330     if (!(flags&FFT5D_ORDER_YZ))
 331     {
 332         N[0]     = vmax(N1, P[1]);
 333         M[0]     = M0[prank[0]];
 334         K[0]     = vmax(K1, P[1]);
 335         pN[0]    = N1[prank[1]];
 336         iNout[0] = N1;
 337         oNout[0] = oN1;
 338         nP[0]    = P[1];
 339         C[1]     = KG;
 340         rC[1]    = rKG;
 341         N[1]     = vmax(K0, P[0]);
 342         pN[1]    = K0[prank[0]];
 343         iNin[1]  = K1;
 344         oNin[1]  = oK1;
 345         iNout[1] = K0;
 346         oNout[1] = oK0;
 347         M[1]     = vmax(M0, P[0]);
 348         pM[1]    = M0[prank[0]];
 349         oM[1]    = oM0[prank[0]];
 350         K[1]     = N1[prank[1]];
 351         pK[1]    = N1[prank[1]];
 352         oK[1]    = oN1[prank[1]];
 353         nP[1]    = P[0];
 354         C[2]     = MG;
 355         rC[2]    = rMG;
 356         iNin[2]  = M0;
 357         oNin[2]  = oM0;
 358         M[2]     = vmax(K0, P[0]);
 359         pM[2]    = K0[prank[0]];
 360         oM[2]    = oK0[prank[0]];
 361         K[2]     = vmax(N1, P[1]);
 362         pK[2]    = N1[prank[1]];
 363         oK[2]    = oN1[prank[1]];
 364         free(N0); free(oN0); /*these are not used for this order*/
 365         free(M1); free(oM1); /*the rest is freed in destroy*/
 366     }
 367     else
 368     {
 369         N[0]     = vmax(N0, P[0]);
 370         M[0]     = vmax(M0, P[0]);
 371         K[0]     = K1[prank[1]];
 372         pN[0]    = N0[prank[0]];
 373         iNout[0] = N0;
 374         oNout[0] = oN0;
 375         nP[0]    = P[0];
 376         C[1]     = MG;
 377         rC[1]    = rMG;
 378         N[1]     = vmax(M1, P[1]);
 379         pN[1]    = M1[prank[1]];
 380         iNin[1]  = M0;
 381         oNin[1]  = oM0;
 382         iNout[1] = M1;
 383         oNout[1] = oM1;
 384         M[1]     = N0[prank[0]];
 385         pM[1]    = N0[prank[0]];
 386         oM[1]    = oN0[prank[0]];
 387         K[1]     = vmax(K1, P[1]);
 388         pK[1]    = K1[prank[1]];
 389         oK[1]    = oK1[prank[1]];
 390         nP[1]    = P[1];
 391         C[2]     = KG;
 392         rC[2]    = rKG;
 393         iNin[2]  = K1;
 394         oNin[2]  = oK1;
 395         M[2]     = vmax(N0, P[0]);
 396         pM[2]    = N0[prank[0]];
 397         oM[2]    = oN0[prank[0]];
 398         K[2]     = vmax(M1, P[1]);
 399         pK[2]    = M1[prank[1]];
 400         oK[2]    = oM1[prank[1]];
 401         free(N1); free(oN1); /*these are not used for this order*/
 402         free(K0); free(oK0); /*the rest is freed in destroy*/
 403     }
 404     N[2] = pN[2] = -1;       /*not used*/
 405
 406     /*
 407        Difference between x-y-z regarding 2d decomposition is whether they are
 408        distributed along axis 1, 2 or both
 409      */
 410
 411     /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 412     lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
 413     /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 414     if (!(flags&FFT5D_NOMALLOC))
 415     {
 416         snew_aligned(lin, lsize, 32);
 417         snew_aligned(lout, lsize, 32);
 418         if (nthreads > 1)
 419         {
 420             /* We need extra transpose buffers to avoid OpenMP barriers */
 421             snew_aligned(lout2, lsize, 32);
 422             snew_aligned(lout3, lsize, 32);
 423         }
 424         else
 425         {
 426             /* We can reuse the buffers to avoid cache misses */
 427             lout2 = lin;
 428             lout3 = lout;
 429         }
 430     }
 431     else
 432     {
 433         lin  = *rlin;
 434         lout = *rlout;
 435         if (nthreads > 1)
 436         {
 437             lout2 = *rlout2;
 438             lout3 = *rlout3;
 439         }
 440         else
 441         {
 442             lout2 = lin;
 443             lout3 = lout;
 444         }
 445     }
 446
 447     plan = (fft5d_plan)calloc(1, sizeof(struct fft5d_plan_t));
 448
 449
 450     if (debug)
 451     {
 452         fprintf(debug, "Running on %d threads\n", nthreads);
 453     }
 454
 455 #if GMX_FFT_FFTW3
 456     /* Don't add more stuff here! We have already had at least one bug because we are reimplementing
 457      * the low-level FFT interface instead of using the Gromacs FFT module. If we need more
 458      * generic functionality it is far better to extend the interface so we can use it for
 459      * all FFT libraries instead of writing FFTW-specific code here.
 460      */
 461
 462     /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
 463     /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
 464      * within a parallel region. For now deactivated. If it should be supported it has to made sure that
 465      * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
 466      * and that the 3d plan is faster than the 1d plan.
 467      */
 468     if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
 469     {
 470         int fftwflags = FFTW_DESTROY_INPUT;
 471         FFTW(iodim) dims[3];
 472         int inNG = NG, outMG = MG, outKG = KG;
 473
 474         FFTW_LOCK;
 475
 476         fftwflags |= (flags & FFT5D_NOMEASURE) ? FFTW_ESTIMATE : FFTW_MEASURE;
 477
 478         if (flags&FFT5D_REALCOMPLEX)
 479         {
 480             if (!(flags&FFT5D_BACKWARD))        /*input pointer is not complex*/
 481             {
 482                 inNG *= 2;
 483             }
 484             else                                /*output pointer is not complex*/
 485             {
 486                 if (!(flags&FFT5D_ORDER_YZ))
 487                 {
 488                     outMG *= 2;
 489                 }
 490                 else
 491                 {
 492                     outKG *= 2;
 493                 }
 494             }
 495         }
 496
 497         if (!(flags&FFT5D_BACKWARD))
 498         {
 499             dims[0].n  = KG;
 500             dims[1].n  = MG;
 501             dims[2].n  = rNG;
 502
 503             dims[0].is = inNG*MG;         /*N M K*/
 504             dims[1].is = inNG;
 505             dims[2].is = 1;
 506             if (!(flags&FFT5D_ORDER_YZ))
 507             {
 508                 dims[0].os = MG;           /*M K N*/
 509                 dims[1].os = 1;
 510                 dims[2].os = MG*KG;
 511             }
 512             else
 513             {
 514                 dims[0].os = 1;           /*K N M*/
 515                 dims[1].os = KG*NG;
 516                 dims[2].os = KG;
 517             }
 518         }
 519         else
 520         {
 521             if (!(flags&FFT5D_ORDER_YZ))
 522             {
 523                 dims[0].n  = NG;
 524                 dims[1].n  = KG;
 525                 dims[2].n  = rMG;
 526
 527                 dims[0].is = 1;
 528                 dims[1].is = NG*MG;
 529                 dims[2].is = NG;
 530
 531                 dims[0].os = outMG*KG;
 532                 dims[1].os = outMG;
 533                 dims[2].os = 1;
 534             }
 535             else
 536             {
 537                 dims[0].n  = MG;
 538                 dims[1].n  = NG;
 539                 dims[2].n  = rKG;
 540
 541                 dims[0].is = NG;
 542                 dims[1].is = 1;
 543                 dims[2].is = NG*MG;
 544
 545                 dims[0].os = outKG*NG;
 546                 dims[1].os = outKG;
 547                 dims[2].os = 1;
 548             }
 549         }
 550 #ifdef FFT5D_THREADS
 551 #ifdef FFT5D_FFTW_THREADS
 552         FFTW(plan_with_nthreads)(nthreads);
 553 #endif
 554 #endif
 555         if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
 556         {
 557             plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 558                                                          /*howmany*/ 0, /*howmany_dims*/ 0,
 559                                                          (real*)lin, (FFTW(complex) *) lout,
 560                                                          /*flags*/ fftwflags);
 561         }
 562         else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD))
 563         {
 564             plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 565                                                          /*howmany*/ 0, /*howmany_dims*/ 0,
 566                                                          (FFTW(complex) *) lin, (real*)lout,
 567                                                          /*flags*/ fftwflags);
 568         }
 569         else
 570         {
 571             plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 572                                                      /*howmany*/ 0, /*howmany_dims*/ 0,
 573                                                      (FFTW(complex) *) lin, (FFTW(complex) *) lout,
 574                                                      /*sign*/ (flags&FFT5D_BACKWARD) ? 1 : -1, /*flags*/ fftwflags);
 575         }
 576 #ifdef FFT5D_THREADS
 577 #ifdef FFT5D_FFTW_THREADS
 578         FFTW(plan_with_nthreads)(1);
 579 #endif
 580 #endif
 581         FFTW_UNLOCK;
 582     }
 583     if (!plan->p3d) /* for decomposition and if 3d plan did not work */
 584     {
 585 #endif              /* GMX_FFT_FFTW3 */
 586     for (s = 0; s < 3; s++)
 587     {
 588         if (debug)
 589         {
 590             fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 591                     s, rC[s], M[s], pK[s], C[s], lsize);
 592         }
 593         plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
 594
 595         /* Make sure that the init routines are only called by one thread at a time and in order
 596            (later is only important to not confuse valgrind)
 597          */
 598 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
 599         for (t = 0; t < nthreads; t++)
 600         {
 601 #pragma omp ordered
 602             {
 603                 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
 604
 605                 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s == 0) || ((flags&FFT5D_BACKWARD) && s == 2)))
 606                 {
 607                     gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
 608                 }
 609                 else
 610                 {
 611                     gmx_fft_init_many_1d     ( &plan->p1d[s][t],  C[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
 612                 }
 613             }
 614         }
 615     }
 616
 617 #if GMX_FFT_FFTW3
 618 }
 619 #endif
 620     if ((flags&FFT5D_ORDER_YZ))   /*plan->cart is in the order of transposes */
 621     {
 622         plan->cart[0] = comm[0]; plan->cart[1] = comm[1];
 623     }
 624     else
 625     {
 626         plan->cart[1] = comm[0]; plan->cart[0] = comm[1];
 627     }
 628 #ifdef FFT5D_MPI_TRANSPOSE
 629     FFTW_LOCK;
 630     for (s = 0; s < 2; s++)
 631     {
 632         if ((s == 0 && !(flags&FFT5D_ORDER_YZ)) || (s == 1 && (flags&FFT5D_ORDER_YZ)))
 633         {
 634             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 635         }
 636         else
 637         {
 638             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 639         }
 640     }
 641     FFTW_UNLOCK;
 642 #endif
 643
 644
 645     plan->lin   = lin;
 646     plan->lout  = lout;
 647     plan->lout2 = lout2;
 648     plan->lout3 = lout3;
 649
 650     plan->NG = NG; plan->MG = MG; plan->KG = KG;
 651
 652     for (s = 0; s < 3; s++)
 653     {
 654         plan->N[s]    = N[s]; plan->M[s] = M[s]; plan->K[s] = K[s]; plan->pN[s] = pN[s]; plan->pM[s] = pM[s]; plan->pK[s] = pK[s];
 655         plan->oM[s]   = oM[s]; plan->oK[s] = oK[s];
 656         plan->C[s]    = C[s]; plan->rC[s] = rC[s];
 657         plan->iNin[s] = iNin[s]; plan->oNin[s] = oNin[s]; plan->iNout[s] = iNout[s]; plan->oNout[s] = oNout[s];
 658     }
 659     for (s = 0; s < 2; s++)
 660     {
 661         plan->P[s] = nP[s]; plan->coor[s] = prank[s];
 662     }
 663
 664 /*    plan->fftorder=fftorder;
 665     plan->direction=direction;
 666     plan->realcomplex=realcomplex;
 667  */
 668     plan->flags    = flags;
 669     plan->nthreads = nthreads;
 670     *rlin          = lin;
 671     *rlout         = lout;
 672     *rlout2        = lout2;
 673     *rlout3        = lout3;
 674     return plan;
 675 }
 676
 677
 678 enum order {
 679     XYZ,
 680     XZY,
 681     YXZ,
 682     YZX,
 683     ZXY,
 684     ZYX
 685 };
 686
 687
 688
 689 /*here x,y,z and N,M,K is in rotated coordinate system!!
 690    x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
 691    maxN,maxM,maxK is max size of local data
 692    pN, pM, pK is local size specific to current processor (only different to max if not divisible)
 693    NG, MG, KG is size of global data*/
 694 static void splitaxes(t_complex* lout, const t_complex* lin,
 695                       int maxN, int maxM, int maxK, int pM,
 696                       int P, int NG, int *N, int* oN, int starty, int startz, int endy, int endz)
 697 {
 698     int x, y, z, i;
 699     int in_i, out_i, in_z, out_z, in_y, out_y;
 700     int s_y, e_y;
 701
 702     for (z = startz; z < endz+1; z++) /*3. z l*/
 703     {
 704         if (z == startz)
 705         {
 706             s_y = starty;
 707         }
 708         else
 709         {
 710             s_y = 0;
 711         }
 712         if (z == endz)
 713         {
 714             e_y = endy;
 715         }
 716         else
 717         {
 718             e_y = pM;
 719         }
 720         out_z  = z*maxN*maxM;
 721         in_z   = z*NG*pM;
 722
 723         for (i = 0; i < P; i++) /*index cube along long axis*/
 724         {
 725             out_i  = out_z  + i*maxN*maxM*maxK;
 726             in_i   = in_z + oN[i];
 727             for (y = s_y; y < e_y; y++)   /*2. y k*/
 728             {
 729                 out_y  = out_i  + y*maxN;
 730                 in_y   = in_i + y*NG;
 731                 for (x = 0; x < N[i]; x++)       /*1. x j*/
 732                 {
 733                     lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
 734                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
 735                     /*before split data contiguos - thus if different processor get different amount oN is different*/
 736                 }
 737             }
 738         }
 739     }
 740 }
 741
 742 /*make axis contiguous again (after AllToAll) and also do local transpose*/
 743 /*transpose mayor and major dimension
 744    variables see above
 745    the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
 746    N,M,K local dimensions
 747    KG global size*/
 748 static void joinAxesTrans13(t_complex* lout, const t_complex* lin,
 749                             int maxN, int maxM, int maxK, int pM,
 750                             int P, int KG, int* K, int* oK, int starty, int startx, int endy, int endx)
 751 {
 752     int i, x, y, z;
 753     int out_i, in_i, out_x, in_x, out_z, in_z;
 754     int s_y, e_y;
 755
 756     for (x = startx; x < endx+1; x++) /*1.j*/
 757     {
 758         if (x == startx)
 759         {
 760             s_y = starty;
 761         }
 762         else
 763         {
 764             s_y = 0;
 765         }
 766         if (x == endx)
 767         {
 768             e_y = endy;
 769         }
 770         else
 771         {
 772             e_y = pM;
 773         }
 774
 775         out_x  = x*KG*pM;
 776         in_x   = x;
 777
 778         for (i = 0; i < P; i++) /*index cube along long axis*/
 779         {
 780             out_i  = out_x  + oK[i];
 781             in_i   = in_x + i*maxM*maxN*maxK;
 782             for (z = 0; z < K[i]; z++) /*3.l*/
 783             {
 784                 out_z  = out_i  + z;
 785                 in_z   = in_i + z*maxM*maxN;
 786                 for (y = s_y; y < e_y; y++)              /*2.k*/
 787                 {
 788                     lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
 789                 }
 790             }
 791         }
 792     }
 793 }
 794
 795 /*make axis contiguous again (after AllToAll) and also do local transpose
 796    tranpose mayor and middle dimension
 797    variables see above
 798    the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
 799    N,M,K local size
 800    MG, global size*/
 801 static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN,
 802                             int P, int MG, int* M, int* oM, int startx, int startz, int endx, int endz)
 803 {
 804     int i, z, y, x;
 805     int out_i, in_i, out_z, in_z, out_x, in_x;
 806     int s_x, e_x;
 807
 808     for (z = startz; z < endz+1; z++)
 809     {
 810         if (z == startz)
 811         {
 812             s_x = startx;
 813         }
 814         else
 815         {
 816             s_x = 0;
 817         }
 818         if (z == endz)
 819         {
 820             e_x = endx;
 821         }
 822         else
 823         {
 824             e_x = pN;
 825         }
 826         out_z  = z*MG*pN;
 827         in_z   = z*maxM*maxN;
 828
 829         for (i = 0; i < P; i++) /*index cube along long axis*/
 830         {
 831             out_i  = out_z  + oM[i];
 832             in_i   = in_z + i*maxM*maxN*maxK;
 833             for (x = s_x; x < e_x; x++)
 834             {
 835                 out_x  = out_i  + x*MG;
 836                 in_x   = in_i + x;
 837                 for (y = 0; y < M[i]; y++)
 838                 {
 839                     lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
 840                 }
 841             }
 842         }
 843     }
 844 }
 845
 846
 847 static void rotate_offsets(int x[])
 848 {
 849     int t = x[0];
 850 /*    x[0]=x[2];
 851     x[2]=x[1];
 852     x[1]=t;*/
 853     x[0] = x[1];
 854     x[1] = x[2];
 855     x[2] = t;
 856 }
 857
 858 /*compute the offset to compare or print transposed local data in original input coordinates
 859    xs matrix dimension size, xl dimension length, xc decomposition offset
 860    s: step in computation = number of transposes*/
 861 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
 862 {
 863 /*    int direction = plan->direction;
 864     int fftorder = plan->fftorder;*/
 865
 866     int  o = 0;
 867     int  pos[3], i;
 868     int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK,
 869     *C      = plan->C, *rC = plan->rC;
 870
 871     NG[0] = plan->NG; NG[1] = plan->MG; NG[2] = plan->KG;
 872
 873     if (!(plan->flags&FFT5D_ORDER_YZ))
 874     {
 875         switch (s)
 876         {
 877             case 0: o = XYZ; break;
 878             case 1: o = ZYX; break;
 879             case 2: o = YZX; break;
 880             default: assert(0);
 881         }
 882     }
 883     else
 884     {
 885         switch (s)
 886         {
 887             case 0: o = XYZ; break;
 888             case 1: o = YXZ; break;
 889             case 2: o = ZXY; break;
 890             default: assert(0);
 891         }
 892     }
 893
 894     switch (o)
 895     {
 896         case XYZ: pos[0] = 1; pos[1] = 2; pos[2] = 3; break;
 897         case XZY: pos[0] = 1; pos[1] = 3; pos[2] = 2; break;
 898         case YXZ: pos[0] = 2; pos[1] = 1; pos[2] = 3; break;
 899         case YZX: pos[0] = 3; pos[1] = 1; pos[2] = 2; break;
 900         case ZXY: pos[0] = 2; pos[1] = 3; pos[2] = 1; break;
 901         case ZYX: pos[0] = 3; pos[1] = 2; pos[2] = 1; break;
 902     }
 903     /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 904
 905     /*xs, xl give dimension size and data length in local transposed coordinate system
 906        for 0(/1/2): x(/y/z) in original coordinate system*/
 907     for (i = 0; i < 3; i++)
 908     {
 909         switch (pos[i])
 910         {
 911             case 1: xs[i] = 1;         xc[i] = 0;     xl[i] = C[s]; break;
 912             case 2: xs[i] = C[s];      xc[i] = oM[s]; xl[i] = pM[s]; break;
 913             case 3: xs[i] = C[s]*pM[s]; xc[i] = oK[s]; xl[i] = pK[s]; break;
 914         }
 915     }
 916     /*input order is different for test program to match FFTW order
 917        (important for complex to real)*/
 918     if (plan->flags&FFT5D_BACKWARD)
 919     {
 920         rotate_offsets(xs);
 921         rotate_offsets(xl);
 922         rotate_offsets(xc);
 923         rotate_offsets(NG);
 924         if (plan->flags&FFT5D_ORDER_YZ)
 925         {
 926             rotate_offsets(xs);
 927             rotate_offsets(xl);
 928             rotate_offsets(xc);
 929             rotate_offsets(NG);
 930         }
 931     }
 932     if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s == 0) || ((plan->flags&FFT5D_BACKWARD) && s == 2)))
 933     {
 934         xl[0] = rC[s];
 935     }
 936 }
 937
 938 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
 939 {
 940     int  x, y, z, l;
 941     int *coor = plan->coor;
 942     int  xs[3], xl[3], xc[3], NG[3];
 943     int  ll = (plan->flags&FFT5D_REALCOMPLEX) ? 1 : 2;
 944     compute_offsets(plan, xs, xl, xc, NG, s);
 945     fprintf(debug, txt, coor[0], coor[1], s);
 946     /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 947     for (z = 0; z < xl[2]; z++)
 948     {
 949         for (y = 0; y < xl[1]; y++)
 950         {
 951             fprintf(debug, "%d %d: ", coor[0], coor[1]);
 952             for (x = 0; x < xl[0]; x++)
 953             {
 954                 for (l = 0; l < ll; l++)
 955                 {
 956                     fprintf(debug, "%f ", ((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 957                 }
 958                 fprintf(debug, ",");
 959             }
 960             fprintf(debug, "\n");
 961         }
 962     }
 963 }
 964
 965 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
 966 {
 967     t_complex  *lin   = plan->lin;
 968     t_complex  *lout  = plan->lout;
 969     t_complex  *lout2 = plan->lout2;
 970     t_complex  *lout3 = plan->lout3;
 971     t_complex  *fftout, *joinin;
 972
 973     gmx_fft_t **p1d = plan->p1d;
 974 #ifdef FFT5D_MPI_TRANSPOSE
 975     FFTW(plan) *mpip = plan->mpip;
 976 #endif
 977 #ifdef GMX_MPI
 978     MPI_Comm *cart = plan->cart;
 979 #endif
 980 #ifdef NOGMX
 981     double time_fft = 0, time_local = 0, time_mpi[2] = {0}, time = 0;
 982 #endif
 983     int   *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
 984     *C       = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout, **oNout = plan->oNout;
 985     int    s = 0, tstart, tend, bParallelDim;
 986
 987
 988 #if GMX_FFT_FFTW3
 989     if (plan->p3d)
 990     {
 991         if (thread == 0)
 992         {
 993 #ifdef NOGMX
 994             if (times != 0)
 995             {
 996                 time = MPI_Wtime();
 997             }
 998 #endif
 999             FFTW(execute)(plan->p3d);
1000 #ifdef NOGMX
1001             if (times != 0)
1002             {
1003                 times->fft += MPI_Wtime()-time;
1004             }
1005 #endif
1006         }
1007         return;
1008     }
1009 #endif
1010
1011     s = 0;
1012
1013     /*lin: x,y,z*/
1014     if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1015     {
1016         print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1017     }
1018
1019     for (s = 0; s < 2; s++)  /*loop over first two FFT steps (corner rotations)*/
1020
1021     {
1022 #ifdef GMX_MPI
1023         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1024         {
1025             bParallelDim = 1;
1026         }
1027         else
1028 #endif
1029         {
1030             bParallelDim = 0;
1031         }
1032
1033         /* ---------- START FFT ------------ */
1034 #ifdef NOGMX
1035         if (times != 0 && thread == 0)
1036         {
1037             time = MPI_Wtime();
1038         }
1039 #endif
1040
1041         if (bParallelDim || plan->nthreads == 1)
1042         {
1043             fftout = lout;
1044         }
1045         else
1046         {
1047             if (s == 0)
1048             {
1049                 fftout = lout3;
1050             }
1051             else
1052             {
1053                 fftout = lout2;
1054             }
1055         }
1056
1057         tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1058         if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s == 0)
1059         {
1060             gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, fftout+tstart);
1061         }
1062         else
1063         {
1064             gmx_fft_many_1d(     p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,               lin+tstart, fftout+tstart);
1065
1066         }
1067
1068 #ifdef NOGMX
1069         if (times != NULL && thread == 0)
1070         {
1071             time_fft += MPI_Wtime()-time;
1072         }
1073 #endif
1074         if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1075         {
1076             print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1077         }
1078         /* ---------- END FFT ------------ */
1079
1080         /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1081         if (bParallelDim)
1082         {
1083 #ifdef NOGMX
1084             if (times != NULL && thread == 0)
1085             {
1086                 time = MPI_Wtime();
1087             }
1088 #endif
1089             /*prepare for A
1090                llToAll
1091                1. (most outer) axes (x) is split into P[s] parts of size N[s]
1092                for sending*/
1093             if (pM[s] > 0)
1094             {
1095                 tend    = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
1096                 tstart /= C[s];
1097                 splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1098             }
1099 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1100 #ifdef NOGMX
1101             if (times != NULL && thread == 0)
1102             {
1103                 time_local += MPI_Wtime()-time;
1104             }
1105 #endif
1106
1107             /* ---------- END SPLIT , START TRANSPOSE------------ */
1108
1109             if (thread == 0)
1110             {
1111 #ifdef NOGMX
1112                 if (times != 0)
1113                 {
1114                     time = MPI_Wtime();
1115                 }
1116 #else
1117                 wallcycle_start(times, ewcPME_FFTCOMM);
1118 #endif
1119 #ifdef FFT5D_MPI_TRANSPOSE
1120                 FFTW(execute)(mpip[s]);
1121 #else
1122 #ifdef GMX_MPI
1123                 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1124                 {
1125                     MPI_Alltoall((real *)lout2, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1126                 }
1127                 else
1128                 {
1129                     MPI_Alltoall((real *)lout2, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1130                 }
1131 #else
1132                 gmx_incons("fft5d MPI call without MPI configuration");
1133 #endif /*GMX_MPI*/
1134 #endif /*FFT5D_MPI_TRANSPOSE*/
1135 #ifdef NOGMX
1136                 if (times != 0)
1137                 {
1138                     time_mpi[s] = MPI_Wtime()-time;
1139                 }
1140 #else
1141                 wallcycle_stop(times, ewcPME_FFTCOMM);
1142 #endif
1143             } /*master*/
1144         }     /* bPrallelDim */
1145 #pragma omp barrier  /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1146
1147         /* ---------- END SPLIT + TRANSPOSE------------ */
1148
1149         /* ---------- START JOIN ------------ */
1150 #ifdef NOGMX
1151         if (times != NULL && thread == 0)
1152         {
1153             time = MPI_Wtime();
1154         }
1155 #endif
1156
1157         if (bParallelDim)
1158         {
1159             joinin = lout3;
1160         }
1161         else
1162         {
1163             joinin = fftout;
1164         }
1165         /*bring back in matrix form
1166            thus make  new 1. axes contiguos
1167            also local transpose 1 and 2/3
1168            runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1169          */
1170         if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1171         {
1172             if (pM[s] > 0)
1173             {
1174                 tstart = ( thread   *pM[s]*pN[s]/plan->nthreads);
1175                 tend   = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1176                 joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1177             }
1178         }
1179         else
1180         {
1181             if (pN[s] > 0)
1182             {
1183                 tstart = ( thread   *pK[s]*pN[s]/plan->nthreads);
1184                 tend   = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1185                 joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
1186             }
1187         }
1188
1189 #ifdef NOGMX
1190         if (times != NULL && thread == 0)
1191         {
1192             time_local += MPI_Wtime()-time;
1193         }
1194 #endif
1195         if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1196         {
1197             print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1198         }
1199         /* ---------- END JOIN ------------ */
1200
1201         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1202     }  /* for(s=0;s<2;s++) */
1203 #ifdef NOGMX
1204     if (times != NULL && thread == 0)
1205     {
1206         time = MPI_Wtime();
1207     }
1208 #endif
1209
1210     if (plan->flags&FFT5D_INPLACE)
1211     {
1212         lout = lin;                          /*in place currently not supported*/
1213
1214     }
1215     /*  ----------- FFT ----------- */
1216     tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1217     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1218     {
1219         gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, lout+tstart);
1220     }
1221     else
1222     {
1223         gmx_fft_many_1d(     p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,               lin+tstart, lout+tstart);
1224     }
1225     /* ------------ END FFT ---------*/
1226
1227 #ifdef NOGMX
1228     if (times != NULL && thread == 0)
1229     {
1230         time_fft += MPI_Wtime()-time;
1231
1232         times->fft   += time_fft;
1233         times->local += time_local;
1234         times->mpi2  += time_mpi[1];
1235         times->mpi1  += time_mpi[0];
1236     }
1237 #endif
1238
1239     if ((plan->flags&FFT5D_DEBUG) && thread == 0)
1240     {
1241         print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1242     }
1243 }
1244
1245 void fft5d_destroy(fft5d_plan plan)
1246 {
1247     int s, t;
1248
1249     for (s = 0; s < 3; s++)
1250     {
1251         if (plan->p1d[s])
1252         {
1253             for (t = 0; t < plan->nthreads; t++)
1254             {
1255                 gmx_many_fft_destroy(plan->p1d[s][t]);
1256             }
1257             free(plan->p1d[s]);
1258         }
1259         if (plan->iNin[s])
1260         {
1261             free(plan->iNin[s]);
1262             plan->iNin[s] = 0;
1263         }
1264         if (plan->oNin[s])
1265         {
1266             free(plan->oNin[s]);
1267             plan->oNin[s] = 0;
1268         }
1269         if (plan->iNout[s])
1270         {
1271             free(plan->iNout[s]);
1272             plan->iNout[s] = 0;
1273         }
1274         if (plan->oNout[s])
1275         {
1276             free(plan->oNout[s]);
1277             plan->oNout[s] = 0;
1278         }
1279     }
1280 #if GMX_FFT_FFTW3
1281     FFTW_LOCK;
1282 #ifdef FFT5D_MPI_TRANSPOS
1283     for (s = 0; s < 2; s++)
1284     {
1285         FFTW(destroy_plan)(plan->mpip[s]);
1286     }
1287 #endif /* FFT5D_MPI_TRANSPOS */
1288     if (plan->p3d)
1289     {
1290         FFTW(destroy_plan)(plan->p3d);
1291     }
1292     FFTW_UNLOCK;
1293 #endif /* GMX_FFT_FFTW3 */
1294
1295     if (!(plan->flags&FFT5D_NOMALLOC))
1296     {
1297         sfree_aligned(plan->lin);
1298         sfree_aligned(plan->lout);
1299         if (plan->nthreads > 1)
1300         {
1301             sfree_aligned(plan->lout2);
1302             sfree_aligned(plan->lout3);
1303         }
1304     }
1305
1306 #ifdef FFT5D_THREADS
1307 #ifdef FFT5D_FFTW_THREADS
1308     /*FFTW(cleanup_threads)();*/
1309 #endif
1310 #endif
1311
1312     free(plan);
1313 }
1314
1315 /*Is this better than direct access of plan? enough data?
1316    here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1317 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1318 {
1319     *N1 = plan->N[0];
1320     *M0 = plan->M[0];
1321     *K1 = plan->K[0];
1322     *K0 = plan->N[1];
1323
1324     *coor = plan->coor;
1325 }
1326
1327
1328 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1329    of processor dimensions*/
1330 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
1331 {
1332     MPI_Comm cart[2] = {0};
1333 #ifdef GMX_MPI
1334     int      size = 1, prank = 0;
1335     int      P[2];
1336     int      coor[2];
1337     int      wrap[] = {0, 0};
1338     MPI_Comm gcart;
1339     int      rdim1[] = {0, 1}, rdim2[] = {1, 0};
1340
1341     MPI_Comm_size(comm, &size);
1342     MPI_Comm_rank(comm, &prank);
1343
1344     if (P0 == 0)
1345     {
1346         P0 = lfactor(size);
1347     }
1348     if (size%P0 != 0)
1349     {
1350         if (prank == 0)
1351         {
1352             printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1353         }
1354         P0 = lfactor(size);
1355     }
1356
1357     P[0] = P0; P[1] = size/P0; /*number of processors in the two dimensions*/
1358
1359     /*Difference between x-y-z regarding 2d decomposition is whether they are
1360        distributed along axis 1, 2 or both*/
1361
1362     MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1363     MPI_Cart_get(gcart, 2, P, wrap, coor);
1364     MPI_Cart_sub(gcart, rdim1, &cart[0]);
1365     MPI_Cart_sub(gcart, rdim2, &cart[1]);
1366 #else
1367     (void)P0;
1368     (void)comm;
1369 #endif
1370     return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1371 }
1372
1373
1374
1375 /*prints in original coordinate system of data (as the input to FFT)*/
1376 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1377 {
1378     int  xs[3], xl[3], xc[3], NG[3];
1379     int  x, y, z, l;
1380     int *coor = plan->coor;
1381     int  ll   = 2; /*compare ll values per element (has to be 2 for complex)*/
1382     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1383     {
1384         ll = 1;
1385     }
1386
1387     compute_offsets(plan, xs, xl, xc, NG, 2);
1388     if (plan->flags&FFT5D_DEBUG)
1389     {
1390         printf("Compare2\n");
1391     }
1392     for (z = 0; z < xl[2]; z++)
1393     {
1394         for (y = 0; y < xl[1]; y++)
1395         {
1396             if (plan->flags&FFT5D_DEBUG)
1397             {
1398                 printf("%d %d: ", coor[0], coor[1]);
1399             }
1400             for (x = 0; x < xl[0]; x++)
1401             {
1402                 for (l = 0; l < ll; l++)   /*loop over real/complex parts*/
1403                 {
1404                     real a, b;
1405                     a = ((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1406                     if (normalize)
1407                     {
1408                         a /= plan->rC[0]*plan->rC[1]*plan->rC[2];
1409                     }
1410                     if (!bothLocal)
1411                     {
1412                         b = ((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1413                     }
1414                     else
1415                     {
1416                         b = ((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1417                     }
1418                     if (plan->flags&FFT5D_DEBUG)
1419                     {
1420                         printf("%f %f, ", a, b);
1421                     }
1422                     else
1423                     {
1424                         if (fabs(a-b) > 2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS)
1425                         {
1426                             printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n", coor[0], coor[1], x, y, z, a, b);
1427                         }
1428 /*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1429                     }
1430                 }
1431                 if (plan->flags&FFT5D_DEBUG)
1432                 {
1433                     printf(",");
1434                 }
1435             }
1436             if (plan->flags&FFT5D_DEBUG)
1437             {
1438                 printf("\n");
1439             }
1440         }
1441     }
1442
1443 }