src/gromacs/fft/fft5d.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2009,2010,2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #include "gmxpre.h"
  36
  37 #include "fft5d.h"
  38
  39 #include "config.h"
  40
  41 #include <assert.h>
  42 #include <float.h>
  43 #include <math.h>
  44 #include <stdio.h>
  45 #include <stdlib.h>
  46 #include <string.h>
  47
  48 #include <algorithm>
  49
  50 #include "gromacs/utility/fatalerror.h"
  51 #include "gromacs/utility/gmxmpi.h"
  52 #include "gromacs/utility/smalloc.h"
  53
  54 #ifdef NOGMX
  55 #define GMX_PARALLEL_ENV_INITIALIZED 1
  56 #else
  57 #ifdef GMX_MPI
  58 #define GMX_PARALLEL_ENV_INITIALIZED 1
  59 #else
  60 #define GMX_PARALLEL_ENV_INITIALIZED 0
  61 #endif
  62 #endif
  63
  64 #ifdef GMX_OPENMP
  65 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
  66 #define FFT5D_THREADS
  67 /* requires fftw compiled with openmp */
  68 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
  69 #endif
  70
  71 #ifndef __FLT_EPSILON__
  72 #define __FLT_EPSILON__ FLT_EPSILON
  73 #define __DBL_EPSILON__ DBL_EPSILON
  74 #endif
  75
  76 #ifdef NOGMX
  77 FILE* debug = 0;
  78 #endif
  79
  80 #ifdef GMX_FFT_FFTW3
  81 #include "thread_mpi/mutex.h"
  82
  83 #include "gromacs/utility/exceptions.h"
  84 /* none of the fftw3 calls, except execute(), are thread-safe, so
  85    we need to serialize them with this mutex. */
  86 static tMPI::mutex big_fftw_mutex;
  87 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  88 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  89 #endif /* GMX_FFT_FFTW3 */
  90
  91 #ifdef GMX_MPI
  92 /* largest factor smaller than sqrt */
  93 static int lfactor(int z)
  94 {
  95     int i = static_cast<int>(sqrt(static_cast<double>(z)));
  96     while (z%i != 0)
  97     {
  98         i--;
  99     }
 100     return i;
 101 }
 102 #endif
 103
 104 /* largest factor */
 105 static int l2factor(int z)
 106 {
 107     int i;
 108     if (z == 1)
 109     {
 110         i = 1;
 111     }
 112     else
 113     {
 114         i = z/2;
 115         while (z%i != 0)
 116         {
 117             i--;
 118         }
 119     }
 120     return i;
 121 }
 122
 123 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
 124 static int lpfactor(int z)
 125 {
 126     int f = l2factor(z);
 127     if (f == 1)
 128     {
 129         return z;
 130     }
 131     return std::max(lpfactor(f), lpfactor(z/f));
 132 }
 133
 134 #ifndef GMX_MPI
 135 #ifdef HAVE_GETTIMEOFDAY
 136 #include <sys/time.h>
 137 double MPI_Wtime()
 138 {
 139     struct timeval tv;
 140     gettimeofday(&tv, 0);
 141     return tv.tv_sec+tv.tv_usec*1e-6;
 142 }
 143 #else
 144 double MPI_Wtime()
 145 {
 146     return 0.0;
 147 }
 148 #endif
 149 #endif
 150
 151 static int vmax(int* a, int s)
 152 {
 153     int i, max = 0;
 154     for (i = 0; i < s; i++)
 155     {
 156         if (a[i] > max)
 157         {
 158             max = a[i];
 159         }
 160     }
 161     return max;
 162 }
 163
 164
 165 /* NxMxK the size of the data
 166  * comm communicator to use for fft5d
 167  * P0 number of processor in 1st axes (can be null for automatic)
 168  * lin is allocated by fft5d because size of array is only known after planning phase
 169  * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
 170  */
 171 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
 172 {
 173
 174     int        P[2], bMaster, prank[2], i, t;
 175     int        rNG, rMG, rKG;
 176     int       *N0 = 0, *N1 = 0, *M0 = 0, *M1 = 0, *K0 = 0, *K1 = 0, *oN0 = 0, *oN1 = 0, *oM0 = 0, *oM1 = 0, *oK0 = 0, *oK1 = 0;
 177     int        N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {0}, *oNin[3] = {0}, *iNout[3] = {0}, *oNout[3] = {0};
 178     int        C[3], rC[3], nP[2];
 179     int        lsize;
 180     t_complex *lin = 0, *lout = 0, *lout2 = 0, *lout3 = 0;
 181     fft5d_plan plan;
 182     int        s;
 183
 184     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 185 #ifdef GMX_MPI
 186     if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
 187     {
 188         MPI_Comm_size(comm[0], &P[0]);
 189         MPI_Comm_rank(comm[0], &prank[0]);
 190     }
 191     else
 192 #endif
 193     {
 194         P[0]     = 1;
 195         prank[0] = 0;
 196     }
 197 #ifdef GMX_MPI
 198     if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
 199     {
 200         MPI_Comm_size(comm[1], &P[1]);
 201         MPI_Comm_rank(comm[1], &prank[1]);
 202     }
 203     else
 204 #endif
 205     {
 206         P[1]     = 1;
 207         prank[1] = 0;
 208     }
 209
 210     bMaster = (prank[0] == 0 && prank[1] == 0);
 211
 212
 213     if (debug)
 214     {
 215         fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n",
 216                 P[0], P[1], prank[0], prank[1]);
 217     }
 218
 219     if (bMaster)
 220     {
 221         if (debug)
 222         {
 223             fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 224                     NG, MG, KG, P[0], P[1], (flags&FFT5D_REALCOMPLEX) > 0, (flags&FFT5D_BACKWARD) > 0, (flags&FFT5D_ORDER_YZ) > 0, (flags&FFT5D_DEBUG) > 0);
 225         }
 226         /* The check below is not correct, one prime factor 11 or 13 is ok.
 227            if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 228             printf("WARNING: FFT very slow with prime factors larger 7\n");
 229             printf("Change FFT size or in case you cannot change it look at\n");
 230             printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 231            }
 232          */
 233     }
 234
 235     if (NG == 0 || MG == 0 || KG == 0)
 236     {
 237         if (bMaster)
 238         {
 239             printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 240         }
 241         return 0;
 242     }
 243
 244     rNG = NG; rMG = MG; rKG = KG;
 245
 246     if (flags&FFT5D_REALCOMPLEX)
 247     {
 248         if (!(flags&FFT5D_BACKWARD))
 249         {
 250             NG = NG/2+1;
 251         }
 252         else
 253         {
 254             if (!(flags&FFT5D_ORDER_YZ))
 255             {
 256                 MG = MG/2+1;
 257             }
 258             else
 259             {
 260                 KG = KG/2+1;
 261             }
 262         }
 263     }
 264
 265
 266     /*for transpose we need to know the size for each processor not only our own size*/
 267
 268     N0  = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
 269     M0  = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
 270     K0  = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
 271     oN0 = (int*)malloc(P[0]*sizeof(int)); oN1 = (int*)malloc(P[1]*sizeof(int));
 272     oM0 = (int*)malloc(P[0]*sizeof(int)); oM1 = (int*)malloc(P[1]*sizeof(int));
 273     oK0 = (int*)malloc(P[0]*sizeof(int)); oK1 = (int*)malloc(P[1]*sizeof(int));
 274
 275     for (i = 0; i < P[0]; i++)
 276     {
 277         #define EVENDIST
 278         #ifndef EVENDIST
 279         oN0[i] = i*ceil((double)NG/P[0]);
 280         oM0[i] = i*ceil((double)MG/P[0]);
 281         oK0[i] = i*ceil((double)KG/P[0]);
 282         #else
 283         oN0[i] = (NG*i)/P[0];
 284         oM0[i] = (MG*i)/P[0];
 285         oK0[i] = (KG*i)/P[0];
 286         #endif
 287     }
 288     for (i = 0; i < P[1]; i++)
 289     {
 290         #ifndef EVENDIST
 291         oN1[i] = i*ceil((double)NG/P[1]);
 292         oM1[i] = i*ceil((double)MG/P[1]);
 293         oK1[i] = i*ceil((double)KG/P[1]);
 294         #else
 295         oN1[i] = (NG*i)/P[1];
 296         oM1[i] = (MG*i)/P[1];
 297         oK1[i] = (KG*i)/P[1];
 298         #endif
 299     }
 300     for (i = 0; i < P[0]-1; i++)
 301     {
 302         N0[i] = oN0[i+1]-oN0[i];
 303         M0[i] = oM0[i+1]-oM0[i];
 304         K0[i] = oK0[i+1]-oK0[i];
 305     }
 306     N0[P[0]-1] = NG-oN0[P[0]-1];
 307     M0[P[0]-1] = MG-oM0[P[0]-1];
 308     K0[P[0]-1] = KG-oK0[P[0]-1];
 309     for (i = 0; i < P[1]-1; i++)
 310     {
 311         N1[i] = oN1[i+1]-oN1[i];
 312         M1[i] = oM1[i+1]-oM1[i];
 313         K1[i] = oK1[i+1]-oK1[i];
 314     }
 315     N1[P[1]-1] = NG-oN1[P[1]-1];
 316     M1[P[1]-1] = MG-oM1[P[1]-1];
 317     K1[P[1]-1] = KG-oK1[P[1]-1];
 318
 319     /* for step 1-3 the local N,M,K sizes of the transposed system
 320        C: contiguous dimension, and nP: number of processor in subcommunicator
 321        for that step */
 322
 323
 324     pM[0] = M0[prank[0]];
 325     oM[0] = oM0[prank[0]];
 326     pK[0] = K1[prank[1]];
 327     oK[0] = oK1[prank[1]];
 328     C[0]  = NG;
 329     rC[0] = rNG;
 330     if (!(flags&FFT5D_ORDER_YZ))
 331     {
 332         N[0]     = vmax(N1, P[1]);
 333         M[0]     = M0[prank[0]];
 334         K[0]     = vmax(K1, P[1]);
 335         pN[0]    = N1[prank[1]];
 336         iNout[0] = N1;
 337         oNout[0] = oN1;
 338         nP[0]    = P[1];
 339         C[1]     = KG;
 340         rC[1]    = rKG;
 341         N[1]     = vmax(K0, P[0]);
 342         pN[1]    = K0[prank[0]];
 343         iNin[1]  = K1;
 344         oNin[1]  = oK1;
 345         iNout[1] = K0;
 346         oNout[1] = oK0;
 347         M[1]     = vmax(M0, P[0]);
 348         pM[1]    = M0[prank[0]];
 349         oM[1]    = oM0[prank[0]];
 350         K[1]     = N1[prank[1]];
 351         pK[1]    = N1[prank[1]];
 352         oK[1]    = oN1[prank[1]];
 353         nP[1]    = P[0];
 354         C[2]     = MG;
 355         rC[2]    = rMG;
 356         iNin[2]  = M0;
 357         oNin[2]  = oM0;
 358         M[2]     = vmax(K0, P[0]);
 359         pM[2]    = K0[prank[0]];
 360         oM[2]    = oK0[prank[0]];
 361         K[2]     = vmax(N1, P[1]);
 362         pK[2]    = N1[prank[1]];
 363         oK[2]    = oN1[prank[1]];
 364         free(N0); free(oN0); /*these are not used for this order*/
 365         free(M1); free(oM1); /*the rest is freed in destroy*/
 366     }
 367     else
 368     {
 369         N[0]     = vmax(N0, P[0]);
 370         M[0]     = vmax(M0, P[0]);
 371         K[0]     = K1[prank[1]];
 372         pN[0]    = N0[prank[0]];
 373         iNout[0] = N0;
 374         oNout[0] = oN0;
 375         nP[0]    = P[0];
 376         C[1]     = MG;
 377         rC[1]    = rMG;
 378         N[1]     = vmax(M1, P[1]);
 379         pN[1]    = M1[prank[1]];
 380         iNin[1]  = M0;
 381         oNin[1]  = oM0;
 382         iNout[1] = M1;
 383         oNout[1] = oM1;
 384         M[1]     = N0[prank[0]];
 385         pM[1]    = N0[prank[0]];
 386         oM[1]    = oN0[prank[0]];
 387         K[1]     = vmax(K1, P[1]);
 388         pK[1]    = K1[prank[1]];
 389         oK[1]    = oK1[prank[1]];
 390         nP[1]    = P[1];
 391         C[2]     = KG;
 392         rC[2]    = rKG;
 393         iNin[2]  = K1;
 394         oNin[2]  = oK1;
 395         M[2]     = vmax(N0, P[0]);
 396         pM[2]    = N0[prank[0]];
 397         oM[2]    = oN0[prank[0]];
 398         K[2]     = vmax(M1, P[1]);
 399         pK[2]    = M1[prank[1]];
 400         oK[2]    = oM1[prank[1]];
 401         free(N1); free(oN1); /*these are not used for this order*/
 402         free(K0); free(oK0); /*the rest is freed in destroy*/
 403     }
 404     N[2] = pN[2] = -1;       /*not used*/
 405
 406     /*
 407        Difference between x-y-z regarding 2d decomposition is whether they are
 408        distributed along axis 1, 2 or both
 409      */
 410
 411     /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 412     lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
 413     /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 414     if (!(flags&FFT5D_NOMALLOC))
 415     {
 416         snew_aligned(lin, lsize, 32);
 417         snew_aligned(lout, lsize, 32);
 418         if (nthreads > 1)
 419         {
 420             /* We need extra transpose buffers to avoid OpenMP barriers */
 421             snew_aligned(lout2, lsize, 32);
 422             snew_aligned(lout3, lsize, 32);
 423         }
 424         else
 425         {
 426             /* We can reuse the buffers to avoid cache misses */
 427             lout2 = lin;
 428             lout3 = lout;
 429         }
 430     }
 431     else
 432     {
 433         lin  = *rlin;
 434         lout = *rlout;
 435         if (nthreads > 1)
 436         {
 437             lout2 = *rlout2;
 438             lout3 = *rlout3;
 439         }
 440         else
 441         {
 442             lout2 = lin;
 443             lout3 = lout;
 444         }
 445     }
 446
 447     plan = (fft5d_plan)calloc(1, sizeof(struct fft5d_plan_t));
 448
 449
 450     if (debug)
 451     {
 452         fprintf(debug, "Running on %d threads\n", nthreads);
 453     }
 454
 455 #ifdef GMX_FFT_FFTW3                                                            /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
 456     /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
 457      * within a parallel region. For now deactivated. If it should be supported it has to made sure that
 458      * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
 459      * and that the 3d plan is faster than the 1d plan.
 460      */
 461     if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
 462     {
 463         int fftwflags = FFTW_DESTROY_INPUT;
 464         FFTW(iodim) dims[3];
 465         int inNG = NG, outMG = MG, outKG = KG;
 466
 467         FFTW_LOCK;
 468         if (!(flags&FFT5D_NOMEASURE))
 469         {
 470             fftwflags |= FFTW_MEASURE;
 471         }
 472         if (flags&FFT5D_REALCOMPLEX)
 473         {
 474             if (!(flags&FFT5D_BACKWARD))        /*input pointer is not complex*/
 475             {
 476                 inNG *= 2;
 477             }
 478             else                                /*output pointer is not complex*/
 479             {
 480                 if (!(flags&FFT5D_ORDER_YZ))
 481                 {
 482                     outMG *= 2;
 483                 }
 484                 else
 485                 {
 486                     outKG *= 2;
 487                 }
 488             }
 489         }
 490
 491         if (!(flags&FFT5D_BACKWARD))
 492         {
 493             dims[0].n  = KG;
 494             dims[1].n  = MG;
 495             dims[2].n  = rNG;
 496
 497             dims[0].is = inNG*MG;         /*N M K*/
 498             dims[1].is = inNG;
 499             dims[2].is = 1;
 500             if (!(flags&FFT5D_ORDER_YZ))
 501             {
 502                 dims[0].os = MG;           /*M K N*/
 503                 dims[1].os = 1;
 504                 dims[2].os = MG*KG;
 505             }
 506             else
 507             {
 508                 dims[0].os = 1;           /*K N M*/
 509                 dims[1].os = KG*NG;
 510                 dims[2].os = KG;
 511             }
 512         }
 513         else
 514         {
 515             if (!(flags&FFT5D_ORDER_YZ))
 516             {
 517                 dims[0].n  = NG;
 518                 dims[1].n  = KG;
 519                 dims[2].n  = rMG;
 520
 521                 dims[0].is = 1;
 522                 dims[1].is = NG*MG;
 523                 dims[2].is = NG;
 524
 525                 dims[0].os = outMG*KG;
 526                 dims[1].os = outMG;
 527                 dims[2].os = 1;
 528             }
 529             else
 530             {
 531                 dims[0].n  = MG;
 532                 dims[1].n  = NG;
 533                 dims[2].n  = rKG;
 534
 535                 dims[0].is = NG;
 536                 dims[1].is = 1;
 537                 dims[2].is = NG*MG;
 538
 539                 dims[0].os = outKG*NG;
 540                 dims[1].os = outKG;
 541                 dims[2].os = 1;
 542             }
 543         }
 544 #ifdef FFT5D_THREADS
 545 #ifdef FFT5D_FFTW_THREADS
 546         FFTW(plan_with_nthreads)(nthreads);
 547 #endif
 548 #endif
 549         if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
 550         {
 551             plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 552                                                          /*howmany*/ 0, /*howmany_dims*/ 0,
 553                                                          (real*)lin, (FFTW(complex) *) lout,
 554                                                          /*flags*/ fftwflags);
 555         }
 556         else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD))
 557         {
 558             plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 559                                                          /*howmany*/ 0, /*howmany_dims*/ 0,
 560                                                          (FFTW(complex) *) lin, (real*)lout,
 561                                                          /*flags*/ fftwflags);
 562         }
 563         else
 564         {
 565             plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 566                                                      /*howmany*/ 0, /*howmany_dims*/ 0,
 567                                                      (FFTW(complex) *) lin, (FFTW(complex) *) lout,
 568                                                      /*sign*/ (flags&FFT5D_BACKWARD) ? 1 : -1, /*flags*/ fftwflags);
 569         }
 570 #ifdef FFT5D_THREADS
 571 #ifdef FFT5D_FFTW_THREADS
 572         FFTW(plan_with_nthreads)(1);
 573 #endif
 574 #endif
 575         FFTW_UNLOCK;
 576     }
 577     if (!plan->p3d) /* for decomposition and if 3d plan did not work */
 578     {
 579 #endif              /* GMX_FFT_FFTW3 */
 580     for (s = 0; s < 3; s++)
 581     {
 582         if (debug)
 583         {
 584             fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 585                     s, rC[s], M[s], pK[s], C[s], lsize);
 586         }
 587         plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
 588
 589         /* Make sure that the init routines are only called by one thread at a time and in order
 590            (later is only important to not confuse valgrind)
 591          */
 592 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
 593         for (t = 0; t < nthreads; t++)
 594         {
 595 #pragma omp ordered
 596             {
 597                 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
 598
 599                 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s == 0) || ((flags&FFT5D_BACKWARD) && s == 2)))
 600                 {
 601                     gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
 602                 }
 603                 else
 604                 {
 605                     gmx_fft_init_many_1d     ( &plan->p1d[s][t],  C[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
 606                 }
 607             }
 608         }
 609     }
 610
 611 #ifdef GMX_FFT_FFTW3
 612 }
 613 #endif
 614     if ((flags&FFT5D_ORDER_YZ))   /*plan->cart is in the order of transposes */
 615     {
 616         plan->cart[0] = comm[0]; plan->cart[1] = comm[1];
 617     }
 618     else
 619     {
 620         plan->cart[1] = comm[0]; plan->cart[0] = comm[1];
 621     }
 622 #ifdef FFT5D_MPI_TRANSPOSE
 623     FFTW_LOCK;
 624     for (s = 0; s < 2; s++)
 625     {
 626         if ((s == 0 && !(flags&FFT5D_ORDER_YZ)) || (s == 1 && (flags&FFT5D_ORDER_YZ)))
 627         {
 628             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 629         }
 630         else
 631         {
 632             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 633         }
 634     }
 635     FFTW_UNLOCK;
 636 #endif
 637
 638
 639     plan->lin   = lin;
 640     plan->lout  = lout;
 641     plan->lout2 = lout2;
 642     plan->lout3 = lout3;
 643
 644     plan->NG = NG; plan->MG = MG; plan->KG = KG;
 645
 646     for (s = 0; s < 3; s++)
 647     {
 648         plan->N[s]    = N[s]; plan->M[s] = M[s]; plan->K[s] = K[s]; plan->pN[s] = pN[s]; plan->pM[s] = pM[s]; plan->pK[s] = pK[s];
 649         plan->oM[s]   = oM[s]; plan->oK[s] = oK[s];
 650         plan->C[s]    = C[s]; plan->rC[s] = rC[s];
 651         plan->iNin[s] = iNin[s]; plan->oNin[s] = oNin[s]; plan->iNout[s] = iNout[s]; plan->oNout[s] = oNout[s];
 652     }
 653     for (s = 0; s < 2; s++)
 654     {
 655         plan->P[s] = nP[s]; plan->coor[s] = prank[s];
 656     }
 657
 658 /*    plan->fftorder=fftorder;
 659     plan->direction=direction;
 660     plan->realcomplex=realcomplex;
 661  */
 662     plan->flags    = flags;
 663     plan->nthreads = nthreads;
 664     *rlin          = lin;
 665     *rlout         = lout;
 666     *rlout2        = lout2;
 667     *rlout3        = lout3;
 668     return plan;
 669 }
 670
 671
 672 enum order {
 673     XYZ,
 674     XZY,
 675     YXZ,
 676     YZX,
 677     ZXY,
 678     ZYX
 679 };
 680
 681
 682
 683 /*here x,y,z and N,M,K is in rotated coordinate system!!
 684    x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
 685    maxN,maxM,maxK is max size of local data
 686    pN, pM, pK is local size specific to current processor (only different to max if not divisible)
 687    NG, MG, KG is size of global data*/
 688 static void splitaxes(t_complex* lout, const t_complex* lin,
 689                       int maxN, int maxM, int maxK, int pM,
 690                       int P, int NG, int *N, int* oN, int starty, int startz, int endy, int endz)
 691 {
 692     int x, y, z, i;
 693     int in_i, out_i, in_z, out_z, in_y, out_y;
 694     int s_y, e_y;
 695
 696     for (z = startz; z < endz+1; z++) /*3. z l*/
 697     {
 698         if (z == startz)
 699         {
 700             s_y = starty;
 701         }
 702         else
 703         {
 704             s_y = 0;
 705         }
 706         if (z == endz)
 707         {
 708             e_y = endy;
 709         }
 710         else
 711         {
 712             e_y = pM;
 713         }
 714         out_z  = z*maxN*maxM;
 715         in_z   = z*NG*pM;
 716
 717         for (i = 0; i < P; i++) /*index cube along long axis*/
 718         {
 719             out_i  = out_z  + i*maxN*maxM*maxK;
 720             in_i   = in_z + oN[i];
 721             for (y = s_y; y < e_y; y++)   /*2. y k*/
 722             {
 723                 out_y  = out_i  + y*maxN;
 724                 in_y   = in_i + y*NG;
 725                 for (x = 0; x < N[i]; x++)       /*1. x j*/
 726                 {
 727                     lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
 728                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
 729                     /*before split data contiguos - thus if different processor get different amount oN is different*/
 730                 }
 731             }
 732         }
 733     }
 734 }
 735
 736 /*make axis contiguous again (after AllToAll) and also do local transpose*/
 737 /*transpose mayor and major dimension
 738    variables see above
 739    the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
 740    N,M,K local dimensions
 741    KG global size*/
 742 static void joinAxesTrans13(t_complex* lout, const t_complex* lin,
 743                             int maxN, int maxM, int maxK, int pM,
 744                             int P, int KG, int* K, int* oK, int starty, int startx, int endy, int endx)
 745 {
 746     int i, x, y, z;
 747     int out_i, in_i, out_x, in_x, out_z, in_z;
 748     int s_y, e_y;
 749
 750     for (x = startx; x < endx+1; x++) /*1.j*/
 751     {
 752         if (x == startx)
 753         {
 754             s_y = starty;
 755         }
 756         else
 757         {
 758             s_y = 0;
 759         }
 760         if (x == endx)
 761         {
 762             e_y = endy;
 763         }
 764         else
 765         {
 766             e_y = pM;
 767         }
 768
 769         out_x  = x*KG*pM;
 770         in_x   = x;
 771
 772         for (i = 0; i < P; i++) /*index cube along long axis*/
 773         {
 774             out_i  = out_x  + oK[i];
 775             in_i   = in_x + i*maxM*maxN*maxK;
 776             for (z = 0; z < K[i]; z++) /*3.l*/
 777             {
 778                 out_z  = out_i  + z;
 779                 in_z   = in_i + z*maxM*maxN;
 780                 for (y = s_y; y < e_y; y++)              /*2.k*/
 781                 {
 782                     lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
 783                 }
 784             }
 785         }
 786     }
 787 }
 788
 789 /*make axis contiguous again (after AllToAll) and also do local transpose
 790    tranpose mayor and middle dimension
 791    variables see above
 792    the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
 793    N,M,K local size
 794    MG, global size*/
 795 static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN,
 796                             int P, int MG, int* M, int* oM, int startx, int startz, int endx, int endz)
 797 {
 798     int i, z, y, x;
 799     int out_i, in_i, out_z, in_z, out_x, in_x;
 800     int s_x, e_x;
 801
 802     for (z = startz; z < endz+1; z++)
 803     {
 804         if (z == startz)
 805         {
 806             s_x = startx;
 807         }
 808         else
 809         {
 810             s_x = 0;
 811         }
 812         if (z == endz)
 813         {
 814             e_x = endx;
 815         }
 816         else
 817         {
 818             e_x = pN;
 819         }
 820         out_z  = z*MG*pN;
 821         in_z   = z*maxM*maxN;
 822
 823         for (i = 0; i < P; i++) /*index cube along long axis*/
 824         {
 825             out_i  = out_z  + oM[i];
 826             in_i   = in_z + i*maxM*maxN*maxK;
 827             for (x = s_x; x < e_x; x++)
 828             {
 829                 out_x  = out_i  + x*MG;
 830                 in_x   = in_i + x;
 831                 for (y = 0; y < M[i]; y++)
 832                 {
 833                     lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
 834                 }
 835             }
 836         }
 837     }
 838 }
 839
 840
 841 static void rotate_offsets(int x[])
 842 {
 843     int t = x[0];
 844 /*    x[0]=x[2];
 845     x[2]=x[1];
 846     x[1]=t;*/
 847     x[0] = x[1];
 848     x[1] = x[2];
 849     x[2] = t;
 850 }
 851
 852 /*compute the offset to compare or print transposed local data in original input coordinates
 853    xs matrix dimension size, xl dimension length, xc decomposition offset
 854    s: step in computation = number of transposes*/
 855 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
 856 {
 857 /*    int direction = plan->direction;
 858     int fftorder = plan->fftorder;*/
 859
 860     int  o = 0;
 861     int  pos[3], i;
 862     int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK,
 863     *C      = plan->C, *rC = plan->rC;
 864
 865     NG[0] = plan->NG; NG[1] = plan->MG; NG[2] = plan->KG;
 866
 867     if (!(plan->flags&FFT5D_ORDER_YZ))
 868     {
 869         switch (s)
 870         {
 871             case 0: o = XYZ; break;
 872             case 1: o = ZYX; break;
 873             case 2: o = YZX; break;
 874             default: assert(0);
 875         }
 876     }
 877     else
 878     {
 879         switch (s)
 880         {
 881             case 0: o = XYZ; break;
 882             case 1: o = YXZ; break;
 883             case 2: o = ZXY; break;
 884             default: assert(0);
 885         }
 886     }
 887
 888     switch (o)
 889     {
 890         case XYZ: pos[0] = 1; pos[1] = 2; pos[2] = 3; break;
 891         case XZY: pos[0] = 1; pos[1] = 3; pos[2] = 2; break;
 892         case YXZ: pos[0] = 2; pos[1] = 1; pos[2] = 3; break;
 893         case YZX: pos[0] = 3; pos[1] = 1; pos[2] = 2; break;
 894         case ZXY: pos[0] = 2; pos[1] = 3; pos[2] = 1; break;
 895         case ZYX: pos[0] = 3; pos[1] = 2; pos[2] = 1; break;
 896     }
 897     /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 898
 899     /*xs, xl give dimension size and data length in local transposed coordinate system
 900        for 0(/1/2): x(/y/z) in original coordinate system*/
 901     for (i = 0; i < 3; i++)
 902     {
 903         switch (pos[i])
 904         {
 905             case 1: xs[i] = 1;         xc[i] = 0;     xl[i] = C[s]; break;
 906             case 2: xs[i] = C[s];      xc[i] = oM[s]; xl[i] = pM[s]; break;
 907             case 3: xs[i] = C[s]*pM[s]; xc[i] = oK[s]; xl[i] = pK[s]; break;
 908         }
 909     }
 910     /*input order is different for test program to match FFTW order
 911        (important for complex to real)*/
 912     if (plan->flags&FFT5D_BACKWARD)
 913     {
 914         rotate_offsets(xs);
 915         rotate_offsets(xl);
 916         rotate_offsets(xc);
 917         rotate_offsets(NG);
 918         if (plan->flags&FFT5D_ORDER_YZ)
 919         {
 920             rotate_offsets(xs);
 921             rotate_offsets(xl);
 922             rotate_offsets(xc);
 923             rotate_offsets(NG);
 924         }
 925     }
 926     if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s == 0) || ((plan->flags&FFT5D_BACKWARD) && s == 2)))
 927     {
 928         xl[0] = rC[s];
 929     }
 930 }
 931
 932 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
 933 {
 934     int  x, y, z, l;
 935     int *coor = plan->coor;
 936     int  xs[3], xl[3], xc[3], NG[3];
 937     int  ll = (plan->flags&FFT5D_REALCOMPLEX) ? 1 : 2;
 938     compute_offsets(plan, xs, xl, xc, NG, s);
 939     fprintf(debug, txt, coor[0], coor[1], s);
 940     /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 941     for (z = 0; z < xl[2]; z++)
 942     {
 943         for (y = 0; y < xl[1]; y++)
 944         {
 945             fprintf(debug, "%d %d: ", coor[0], coor[1]);
 946             for (x = 0; x < xl[0]; x++)
 947             {
 948                 for (l = 0; l < ll; l++)
 949                 {
 950                     fprintf(debug, "%f ", ((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 951                 }
 952                 fprintf(debug, ",");
 953             }
 954             fprintf(debug, "\n");
 955         }
 956     }
 957 }
 958
 959 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
 960 {
 961     t_complex  *lin   = plan->lin;
 962     t_complex  *lout  = plan->lout;
 963     t_complex  *lout2 = plan->lout2;
 964     t_complex  *lout3 = plan->lout3;
 965     t_complex  *fftout, *joinin;
 966
 967     gmx_fft_t **p1d = plan->p1d;
 968 #ifdef FFT5D_MPI_TRANSPOSE
 969     FFTW(plan) *mpip = plan->mpip;
 970 #endif
 971 #ifdef GMX_MPI
 972     MPI_Comm *cart = plan->cart;
 973 #endif
 974 #ifdef NOGMX
 975     double time_fft = 0, time_local = 0, time_mpi[2] = {0}, time = 0;
 976 #endif
 977     int   *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
 978     *C       = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout, **oNout = plan->oNout;
 979     int    s = 0, tstart, tend, bParallelDim;
 980
 981
 982 #ifdef GMX_FFT_FFTW3
 983     if (plan->p3d)
 984     {
 985         if (thread == 0)
 986         {
 987 #ifdef NOGMX
 988             if (times != 0)
 989             {
 990                 time = MPI_Wtime();
 991             }
 992 #endif
 993             FFTW(execute)(plan->p3d);
 994 #ifdef NOGMX
 995             if (times != 0)
 996             {
 997                 times->fft += MPI_Wtime()-time;
 998             }
 999 #endif
1000         }
1001         return;
1002     }
1003 #endif
1004
1005     s = 0;
1006
1007     /*lin: x,y,z*/
1008     if (plan->flags&FFT5D_DEBUG && thread == 0)
1009     {
1010         print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1011     }
1012
1013     for (s = 0; s < 2; s++)  /*loop over first two FFT steps (corner rotations)*/
1014
1015     {
1016 #ifdef GMX_MPI
1017         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1018         {
1019             bParallelDim = 1;
1020         }
1021         else
1022 #endif
1023         {
1024             bParallelDim = 0;
1025         }
1026
1027         /* ---------- START FFT ------------ */
1028 #ifdef NOGMX
1029         if (times != 0 && thread == 0)
1030         {
1031             time = MPI_Wtime();
1032         }
1033 #endif
1034
1035         if (bParallelDim || plan->nthreads == 1)
1036         {
1037             fftout = lout;
1038         }
1039         else
1040         {
1041             if (s == 0)
1042             {
1043                 fftout = lout3;
1044             }
1045             else
1046             {
1047                 fftout = lout2;
1048             }
1049         }
1050
1051         tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1052         if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s == 0)
1053         {
1054             gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, fftout+tstart);
1055         }
1056         else
1057         {
1058             gmx_fft_many_1d(     p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,               lin+tstart, fftout+tstart);
1059
1060         }
1061
1062 #ifdef NOGMX
1063         if (times != NULL && thread == 0)
1064         {
1065             time_fft += MPI_Wtime()-time;
1066         }
1067 #endif
1068         if (plan->flags&FFT5D_DEBUG && thread == 0)
1069         {
1070             print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1071         }
1072         /* ---------- END FFT ------------ */
1073
1074         /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1075         if (bParallelDim)
1076         {
1077 #ifdef NOGMX
1078             if (times != NULL && thread == 0)
1079             {
1080                 time = MPI_Wtime();
1081             }
1082 #endif
1083             /*prepare for A
1084                llToAll
1085                1. (most outer) axes (x) is split into P[s] parts of size N[s]
1086                for sending*/
1087             if (pM[s] > 0)
1088             {
1089                 tend    = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
1090                 tstart /= C[s];
1091                 splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1092             }
1093 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1094 #ifdef NOGMX
1095             if (times != NULL && thread == 0)
1096             {
1097                 time_local += MPI_Wtime()-time;
1098             }
1099 #endif
1100
1101             /* ---------- END SPLIT , START TRANSPOSE------------ */
1102
1103             if (thread == 0)
1104             {
1105 #ifdef NOGMX
1106                 if (times != 0)
1107                 {
1108                     time = MPI_Wtime();
1109                 }
1110 #else
1111                 wallcycle_start(times, ewcPME_FFTCOMM);
1112 #endif
1113 #ifdef FFT5D_MPI_TRANSPOSE
1114                 FFTW(execute)(mpip[s]);
1115 #else
1116 #ifdef GMX_MPI
1117                 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1118                 {
1119                     MPI_Alltoall((real *)lout2, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1120                 }
1121                 else
1122                 {
1123                     MPI_Alltoall((real *)lout2, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1124                 }
1125 #else
1126                 gmx_incons("fft5d MPI call without MPI configuration");
1127 #endif /*GMX_MPI*/
1128 #endif /*FFT5D_MPI_TRANSPOSE*/
1129 #ifdef NOGMX
1130                 if (times != 0)
1131                 {
1132                     time_mpi[s] = MPI_Wtime()-time;
1133                 }
1134 #else
1135                 wallcycle_stop(times, ewcPME_FFTCOMM);
1136 #endif
1137             } /*master*/
1138         }     /* bPrallelDim */
1139 #pragma omp barrier  /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1140
1141         /* ---------- END SPLIT + TRANSPOSE------------ */
1142
1143         /* ---------- START JOIN ------------ */
1144 #ifdef NOGMX
1145         if (times != NULL && thread == 0)
1146         {
1147             time = MPI_Wtime();
1148         }
1149 #endif
1150
1151         if (bParallelDim)
1152         {
1153             joinin = lout3;
1154         }
1155         else
1156         {
1157             joinin = fftout;
1158         }
1159         /*bring back in matrix form
1160            thus make  new 1. axes contiguos
1161            also local transpose 1 and 2/3
1162            runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1163          */
1164         if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1165         {
1166             if (pM[s] > 0)
1167             {
1168                 tstart = ( thread   *pM[s]*pN[s]/plan->nthreads);
1169                 tend   = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1170                 joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1171             }
1172         }
1173         else
1174         {
1175             if (pN[s] > 0)
1176             {
1177                 tstart = ( thread   *pK[s]*pN[s]/plan->nthreads);
1178                 tend   = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1179                 joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
1180             }
1181         }
1182
1183 #ifdef NOGMX
1184         if (times != NULL && thread == 0)
1185         {
1186             time_local += MPI_Wtime()-time;
1187         }
1188 #endif
1189         if (plan->flags&FFT5D_DEBUG && thread == 0)
1190         {
1191             print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1192         }
1193         /* ---------- END JOIN ------------ */
1194
1195         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1196     }  /* for(s=0;s<2;s++) */
1197 #ifdef NOGMX
1198     if (times != NULL && thread == 0)
1199     {
1200         time = MPI_Wtime();
1201     }
1202 #endif
1203
1204     if (plan->flags&FFT5D_INPLACE)
1205     {
1206         lout = lin;                          /*in place currently not supported*/
1207
1208     }
1209     /*  ----------- FFT ----------- */
1210     tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1211     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1212     {
1213         gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, lout+tstart);
1214     }
1215     else
1216     {
1217         gmx_fft_many_1d(     p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,               lin+tstart, lout+tstart);
1218     }
1219     /* ------------ END FFT ---------*/
1220
1221 #ifdef NOGMX
1222     if (times != NULL && thread == 0)
1223     {
1224         time_fft += MPI_Wtime()-time;
1225
1226         times->fft   += time_fft;
1227         times->local += time_local;
1228         times->mpi2  += time_mpi[1];
1229         times->mpi1  += time_mpi[0];
1230     }
1231 #endif
1232
1233     if (plan->flags&FFT5D_DEBUG && thread == 0)
1234     {
1235         print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1236     }
1237 }
1238
1239 void fft5d_destroy(fft5d_plan plan)
1240 {
1241     int s, t;
1242
1243     for (s = 0; s < 3; s++)
1244     {
1245         if (plan->p1d[s])
1246         {
1247             for (t = 0; t < plan->nthreads; t++)
1248             {
1249                 gmx_many_fft_destroy(plan->p1d[s][t]);
1250             }
1251             free(plan->p1d[s]);
1252         }
1253         if (plan->iNin[s])
1254         {
1255             free(plan->iNin[s]);
1256             plan->iNin[s] = 0;
1257         }
1258         if (plan->oNin[s])
1259         {
1260             free(plan->oNin[s]);
1261             plan->oNin[s] = 0;
1262         }
1263         if (plan->iNout[s])
1264         {
1265             free(plan->iNout[s]);
1266             plan->iNout[s] = 0;
1267         }
1268         if (plan->oNout[s])
1269         {
1270             free(plan->oNout[s]);
1271             plan->oNout[s] = 0;
1272         }
1273     }
1274 #ifdef GMX_FFT_FFTW3
1275     FFTW_LOCK;
1276 #ifdef FFT5D_MPI_TRANSPOS
1277     for (s = 0; s < 2; s++)
1278     {
1279         FFTW(destroy_plan)(plan->mpip[s]);
1280     }
1281 #endif /* FFT5D_MPI_TRANSPOS */
1282     if (plan->p3d)
1283     {
1284         FFTW(destroy_plan)(plan->p3d);
1285     }
1286     FFTW_UNLOCK;
1287 #endif /* GMX_FFT_FFTW3 */
1288
1289     if (!(plan->flags&FFT5D_NOMALLOC))
1290     {
1291         sfree_aligned(plan->lin);
1292         sfree_aligned(plan->lout);
1293         if (plan->nthreads > 1)
1294         {
1295             sfree_aligned(plan->lout2);
1296             sfree_aligned(plan->lout3);
1297         }
1298     }
1299
1300 #ifdef FFT5D_THREADS
1301 #ifdef FFT5D_FFTW_THREADS
1302     /*FFTW(cleanup_threads)();*/
1303 #endif
1304 #endif
1305
1306     free(plan);
1307 }
1308
1309 /*Is this better than direct access of plan? enough data?
1310    here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1311 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1312 {
1313     *N1 = plan->N[0];
1314     *M0 = plan->M[0];
1315     *K1 = plan->K[0];
1316     *K0 = plan->N[1];
1317
1318     *coor = plan->coor;
1319 }
1320
1321
1322 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1323    of processor dimensions*/
1324 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
1325 {
1326     MPI_Comm cart[2] = {0};
1327 #ifdef GMX_MPI
1328     int      size = 1, prank = 0;
1329     int      P[2];
1330     int      coor[2];
1331     int      wrap[] = {0, 0};
1332     MPI_Comm gcart;
1333     int      rdim1[] = {0, 1}, rdim2[] = {1, 0};
1334
1335     MPI_Comm_size(comm, &size);
1336     MPI_Comm_rank(comm, &prank);
1337
1338     if (P0 == 0)
1339     {
1340         P0 = lfactor(size);
1341     }
1342     if (size%P0 != 0)
1343     {
1344         if (prank == 0)
1345         {
1346             printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1347         }
1348         P0 = lfactor(size);
1349     }
1350
1351     P[0] = P0; P[1] = size/P0; /*number of processors in the two dimensions*/
1352
1353     /*Difference between x-y-z regarding 2d decomposition is whether they are
1354        distributed along axis 1, 2 or both*/
1355
1356     MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1357     MPI_Cart_get(gcart, 2, P, wrap, coor);
1358     MPI_Cart_sub(gcart, rdim1, &cart[0]);
1359     MPI_Cart_sub(gcart, rdim2, &cart[1]);
1360 #else
1361     (void)P0;
1362     (void)comm;
1363 #endif
1364     return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1365 }
1366
1367
1368
1369 /*prints in original coordinate system of data (as the input to FFT)*/
1370 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1371 {
1372     int  xs[3], xl[3], xc[3], NG[3];
1373     int  x, y, z, l;
1374     int *coor = plan->coor;
1375     int  ll   = 2; /*compare ll values per element (has to be 2 for complex)*/
1376     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1377     {
1378         ll = 1;
1379     }
1380
1381     compute_offsets(plan, xs, xl, xc, NG, 2);
1382     if (plan->flags&FFT5D_DEBUG)
1383     {
1384         printf("Compare2\n");
1385     }
1386     for (z = 0; z < xl[2]; z++)
1387     {
1388         for (y = 0; y < xl[1]; y++)
1389         {
1390             if (plan->flags&FFT5D_DEBUG)
1391             {
1392                 printf("%d %d: ", coor[0], coor[1]);
1393             }
1394             for (x = 0; x < xl[0]; x++)
1395             {
1396                 for (l = 0; l < ll; l++)   /*loop over real/complex parts*/
1397                 {
1398                     real a, b;
1399                     a = ((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1400                     if (normalize)
1401                     {
1402                         a /= plan->rC[0]*plan->rC[1]*plan->rC[2];
1403                     }
1404                     if (!bothLocal)
1405                     {
1406                         b = ((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1407                     }
1408                     else
1409                     {
1410                         b = ((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1411                     }
1412                     if (plan->flags&FFT5D_DEBUG)
1413                     {
1414                         printf("%f %f, ", a, b);
1415                     }
1416                     else
1417                     {
1418                         if (fabs(a-b) > 2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS)
1419                         {
1420                             printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n", coor[0], coor[1], x, y, z, a, b);
1421                         }
1422 /*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1423                     }
1424                 }
1425                 if (plan->flags&FFT5D_DEBUG)
1426                 {
1427                     printf(",");
1428                 }
1429             }
1430             if (plan->flags&FFT5D_DEBUG)
1431             {
1432                 printf("\n");
1433             }
1434         }
1435     }
1436
1437 }