src/gromacs/fft/fft5d.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2009,2010,2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #include "gmxpre.h"
  36
  37 #include "config.h"
  38
  39 #include <algorithm>
  40
  41 #include <stdio.h>
  42 #include <stdlib.h>
  43 #include <string.h>
  44
  45 #ifdef NOGMX
  46 #define GMX_PARALLEL_ENV_INITIALIZED 1
  47 #else
  48 #ifdef GMX_MPI
  49 #define GMX_PARALLEL_ENV_INITIALIZED 1
  50 #else
  51 #define GMX_PARALLEL_ENV_INITIALIZED 0
  52 #endif
  53 #endif
  54
  55 #include "gromacs/utility/gmxmpi.h"
  56
  57 #ifdef GMX_OPENMP
  58 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
  59 #define FFT5D_THREADS
  60 /* requires fftw compiled with openmp */
  61 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
  62 #endif
  63
  64 #include "fft5d.h"
  65 #include <float.h>
  66 #include <math.h>
  67 #include <assert.h>
  68 #include "gromacs/utility/smalloc.h"
  69
  70 #ifndef __FLT_EPSILON__
  71 #define __FLT_EPSILON__ FLT_EPSILON
  72 #define __DBL_EPSILON__ DBL_EPSILON
  73 #endif
  74
  75 #ifdef NOGMX
  76 FILE* debug = 0;
  77 #endif
  78
  79 #include "gromacs/utility/fatalerror.h"
  80
  81
  82 #ifdef GMX_FFT_FFTW3
  83 #include "thread_mpi/mutex.h"
  84 #include "gromacs/utility/exceptions.h"
  85 /* none of the fftw3 calls, except execute(), are thread-safe, so
  86    we need to serialize them with this mutex. */
  87 static tMPI::mutex big_fftw_mutex;
  88 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  89 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  90 #endif /* GMX_FFT_FFTW3 */
  91
  92 #ifdef GMX_MPI
  93 /* largest factor smaller than sqrt */
  94 static int lfactor(int z)
  95 {
  96     int i = static_cast<int>(sqrt(static_cast<double>(z)));
  97     while (z%i != 0)
  98     {
  99         i--;
 100     }
 101     return i;
 102 }
 103 #endif
 104
 105 /* largest factor */
 106 static int l2factor(int z)
 107 {
 108     int i;
 109     if (z == 1)
 110     {
 111         i = 1;
 112     }
 113     else
 114     {
 115         i = z/2;
 116         while (z%i != 0)
 117         {
 118             i--;
 119         }
 120     }
 121     return i;
 122 }
 123
 124 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
 125 static int lpfactor(int z)
 126 {
 127     int f = l2factor(z);
 128     if (f == 1)
 129     {
 130         return z;
 131     }
 132     return std::max(lpfactor(f), lpfactor(z/f));
 133 }
 134
 135 #ifndef GMX_MPI
 136 #ifdef HAVE_GETTIMEOFDAY
 137 #include <sys/time.h>
 138 double MPI_Wtime()
 139 {
 140     struct timeval tv;
 141     gettimeofday(&tv, 0);
 142     return tv.tv_sec+tv.tv_usec*1e-6;
 143 }
 144 #else
 145 double MPI_Wtime()
 146 {
 147     return 0.0;
 148 }
 149 #endif
 150 #endif
 151
 152 static int vmax(int* a, int s)
 153 {
 154     int i, max = 0;
 155     for (i = 0; i < s; i++)
 156     {
 157         if (a[i] > max)
 158         {
 159             max = a[i];
 160         }
 161     }
 162     return max;
 163 }
 164
 165
 166 /* NxMxK the size of the data
 167  * comm communicator to use for fft5d
 168  * P0 number of processor in 1st axes (can be null for automatic)
 169  * lin is allocated by fft5d because size of array is only known after planning phase
 170  * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
 171  */
 172 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
 173 {
 174
 175     int        P[2], bMaster, prank[2], i, t;
 176     int        rNG, rMG, rKG;
 177     int       *N0 = 0, *N1 = 0, *M0 = 0, *M1 = 0, *K0 = 0, *K1 = 0, *oN0 = 0, *oN1 = 0, *oM0 = 0, *oM1 = 0, *oK0 = 0, *oK1 = 0;
 178     int        N[3], M[3], K[3], pN[3], pM[3], pK[3], oM[3], oK[3], *iNin[3] = {0}, *oNin[3] = {0}, *iNout[3] = {0}, *oNout[3] = {0};
 179     int        C[3], rC[3], nP[2];
 180     int        lsize;
 181     t_complex *lin = 0, *lout = 0, *lout2 = 0, *lout3 = 0;
 182     fft5d_plan plan;
 183     int        s;
 184
 185     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 186 #ifdef GMX_MPI
 187     if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
 188     {
 189         MPI_Comm_size(comm[0], &P[0]);
 190         MPI_Comm_rank(comm[0], &prank[0]);
 191     }
 192     else
 193 #endif
 194     {
 195         P[0]     = 1;
 196         prank[0] = 0;
 197     }
 198 #ifdef GMX_MPI
 199     if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
 200     {
 201         MPI_Comm_size(comm[1], &P[1]);
 202         MPI_Comm_rank(comm[1], &prank[1]);
 203     }
 204     else
 205 #endif
 206     {
 207         P[1]     = 1;
 208         prank[1] = 0;
 209     }
 210
 211     bMaster = (prank[0] == 0 && prank[1] == 0);
 212
 213
 214     if (debug)
 215     {
 216         fprintf(debug, "FFT5D: Using %dx%d rank grid, rank %d,%d\n",
 217                 P[0], P[1], prank[0], prank[1]);
 218     }
 219
 220     if (bMaster)
 221     {
 222         if (debug)
 223         {
 224             fprintf(debug, "FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 225                     NG, MG, KG, P[0], P[1], (flags&FFT5D_REALCOMPLEX) > 0, (flags&FFT5D_BACKWARD) > 0, (flags&FFT5D_ORDER_YZ) > 0, (flags&FFT5D_DEBUG) > 0);
 226         }
 227         /* The check below is not correct, one prime factor 11 or 13 is ok.
 228            if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 229             printf("WARNING: FFT very slow with prime factors larger 7\n");
 230             printf("Change FFT size or in case you cannot change it look at\n");
 231             printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 232            }
 233          */
 234     }
 235
 236     if (NG == 0 || MG == 0 || KG == 0)
 237     {
 238         if (bMaster)
 239         {
 240             printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 241         }
 242         return 0;
 243     }
 244
 245     rNG = NG; rMG = MG; rKG = KG;
 246
 247     if (flags&FFT5D_REALCOMPLEX)
 248     {
 249         if (!(flags&FFT5D_BACKWARD))
 250         {
 251             NG = NG/2+1;
 252         }
 253         else
 254         {
 255             if (!(flags&FFT5D_ORDER_YZ))
 256             {
 257                 MG = MG/2+1;
 258             }
 259             else
 260             {
 261                 KG = KG/2+1;
 262             }
 263         }
 264     }
 265
 266
 267     /*for transpose we need to know the size for each processor not only our own size*/
 268
 269     N0  = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
 270     M0  = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
 271     K0  = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
 272     oN0 = (int*)malloc(P[0]*sizeof(int)); oN1 = (int*)malloc(P[1]*sizeof(int));
 273     oM0 = (int*)malloc(P[0]*sizeof(int)); oM1 = (int*)malloc(P[1]*sizeof(int));
 274     oK0 = (int*)malloc(P[0]*sizeof(int)); oK1 = (int*)malloc(P[1]*sizeof(int));
 275
 276     for (i = 0; i < P[0]; i++)
 277     {
 278         #define EVENDIST
 279         #ifndef EVENDIST
 280         oN0[i] = i*ceil((double)NG/P[0]);
 281         oM0[i] = i*ceil((double)MG/P[0]);
 282         oK0[i] = i*ceil((double)KG/P[0]);
 283         #else
 284         oN0[i] = (NG*i)/P[0];
 285         oM0[i] = (MG*i)/P[0];
 286         oK0[i] = (KG*i)/P[0];
 287         #endif
 288     }
 289     for (i = 0; i < P[1]; i++)
 290     {
 291         #ifndef EVENDIST
 292         oN1[i] = i*ceil((double)NG/P[1]);
 293         oM1[i] = i*ceil((double)MG/P[1]);
 294         oK1[i] = i*ceil((double)KG/P[1]);
 295         #else
 296         oN1[i] = (NG*i)/P[1];
 297         oM1[i] = (MG*i)/P[1];
 298         oK1[i] = (KG*i)/P[1];
 299         #endif
 300     }
 301     for (i = 0; i < P[0]-1; i++)
 302     {
 303         N0[i] = oN0[i+1]-oN0[i];
 304         M0[i] = oM0[i+1]-oM0[i];
 305         K0[i] = oK0[i+1]-oK0[i];
 306     }
 307     N0[P[0]-1] = NG-oN0[P[0]-1];
 308     M0[P[0]-1] = MG-oM0[P[0]-1];
 309     K0[P[0]-1] = KG-oK0[P[0]-1];
 310     for (i = 0; i < P[1]-1; i++)
 311     {
 312         N1[i] = oN1[i+1]-oN1[i];
 313         M1[i] = oM1[i+1]-oM1[i];
 314         K1[i] = oK1[i+1]-oK1[i];
 315     }
 316     N1[P[1]-1] = NG-oN1[P[1]-1];
 317     M1[P[1]-1] = MG-oM1[P[1]-1];
 318     K1[P[1]-1] = KG-oK1[P[1]-1];
 319
 320     /* for step 1-3 the local N,M,K sizes of the transposed system
 321        C: contiguous dimension, and nP: number of processor in subcommunicator
 322        for that step */
 323
 324
 325     pM[0] = M0[prank[0]];
 326     oM[0] = oM0[prank[0]];
 327     pK[0] = K1[prank[1]];
 328     oK[0] = oK1[prank[1]];
 329     C[0]  = NG;
 330     rC[0] = rNG;
 331     if (!(flags&FFT5D_ORDER_YZ))
 332     {
 333         N[0]     = vmax(N1, P[1]);
 334         M[0]     = M0[prank[0]];
 335         K[0]     = vmax(K1, P[1]);
 336         pN[0]    = N1[prank[1]];
 337         iNout[0] = N1;
 338         oNout[0] = oN1;
 339         nP[0]    = P[1];
 340         C[1]     = KG;
 341         rC[1]    = rKG;
 342         N[1]     = vmax(K0, P[0]);
 343         pN[1]    = K0[prank[0]];
 344         iNin[1]  = K1;
 345         oNin[1]  = oK1;
 346         iNout[1] = K0;
 347         oNout[1] = oK0;
 348         M[1]     = vmax(M0, P[0]);
 349         pM[1]    = M0[prank[0]];
 350         oM[1]    = oM0[prank[0]];
 351         K[1]     = N1[prank[1]];
 352         pK[1]    = N1[prank[1]];
 353         oK[1]    = oN1[prank[1]];
 354         nP[1]    = P[0];
 355         C[2]     = MG;
 356         rC[2]    = rMG;
 357         iNin[2]  = M0;
 358         oNin[2]  = oM0;
 359         M[2]     = vmax(K0, P[0]);
 360         pM[2]    = K0[prank[0]];
 361         oM[2]    = oK0[prank[0]];
 362         K[2]     = vmax(N1, P[1]);
 363         pK[2]    = N1[prank[1]];
 364         oK[2]    = oN1[prank[1]];
 365         free(N0); free(oN0); /*these are not used for this order*/
 366         free(M1); free(oM1); /*the rest is freed in destroy*/
 367     }
 368     else
 369     {
 370         N[0]     = vmax(N0, P[0]);
 371         M[0]     = vmax(M0, P[0]);
 372         K[0]     = K1[prank[1]];
 373         pN[0]    = N0[prank[0]];
 374         iNout[0] = N0;
 375         oNout[0] = oN0;
 376         nP[0]    = P[0];
 377         C[1]     = MG;
 378         rC[1]    = rMG;
 379         N[1]     = vmax(M1, P[1]);
 380         pN[1]    = M1[prank[1]];
 381         iNin[1]  = M0;
 382         oNin[1]  = oM0;
 383         iNout[1] = M1;
 384         oNout[1] = oM1;
 385         M[1]     = N0[prank[0]];
 386         pM[1]    = N0[prank[0]];
 387         oM[1]    = oN0[prank[0]];
 388         K[1]     = vmax(K1, P[1]);
 389         pK[1]    = K1[prank[1]];
 390         oK[1]    = oK1[prank[1]];
 391         nP[1]    = P[1];
 392         C[2]     = KG;
 393         rC[2]    = rKG;
 394         iNin[2]  = K1;
 395         oNin[2]  = oK1;
 396         M[2]     = vmax(N0, P[0]);
 397         pM[2]    = N0[prank[0]];
 398         oM[2]    = oN0[prank[0]];
 399         K[2]     = vmax(M1, P[1]);
 400         pK[2]    = M1[prank[1]];
 401         oK[2]    = oM1[prank[1]];
 402         free(N1); free(oN1); /*these are not used for this order*/
 403         free(K0); free(oK0); /*the rest is freed in destroy*/
 404     }
 405     N[2] = pN[2] = -1;       /*not used*/
 406
 407     /*
 408        Difference between x-y-z regarding 2d decomposition is whether they are
 409        distributed along axis 1, 2 or both
 410      */
 411
 412     /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 413     lsize = std::max(N[0]*M[0]*K[0]*nP[0], std::max(N[1]*M[1]*K[1]*nP[1], C[2]*M[2]*K[2]));
 414     /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 415     if (!(flags&FFT5D_NOMALLOC))
 416     {
 417         snew_aligned(lin, lsize, 32);
 418         snew_aligned(lout, lsize, 32);
 419         if (nthreads > 1)
 420         {
 421             /* We need extra transpose buffers to avoid OpenMP barriers */
 422             snew_aligned(lout2, lsize, 32);
 423             snew_aligned(lout3, lsize, 32);
 424         }
 425         else
 426         {
 427             /* We can reuse the buffers to avoid cache misses */
 428             lout2 = lin;
 429             lout3 = lout;
 430         }
 431     }
 432     else
 433     {
 434         lin  = *rlin;
 435         lout = *rlout;
 436         if (nthreads > 1)
 437         {
 438             lout2 = *rlout2;
 439             lout3 = *rlout3;
 440         }
 441         else
 442         {
 443             lout2 = lin;
 444             lout3 = lout;
 445         }
 446     }
 447
 448     plan = (fft5d_plan)calloc(1, sizeof(struct fft5d_plan_t));
 449
 450
 451     if (debug)
 452     {
 453         fprintf(debug, "Running on %d threads\n", nthreads);
 454     }
 455
 456 #ifdef GMX_FFT_FFTW3                                                            /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
 457     /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
 458      * within a parallel region. For now deactivated. If it should be supported it has to made sure that
 459      * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
 460      * and that the 3d plan is faster than the 1d plan.
 461      */
 462     if ((!(flags&FFT5D_INPLACE)) && (!(P[0] > 1 || P[1] > 1)) && nthreads == 1) /*don't do 3d plan in parallel or if in_place requested */
 463     {
 464         int fftwflags = FFTW_DESTROY_INPUT;
 465         FFTW(iodim) dims[3];
 466         int inNG = NG, outMG = MG, outKG = KG;
 467
 468         FFTW_LOCK;
 469         if (!(flags&FFT5D_NOMEASURE))
 470         {
 471             fftwflags |= FFTW_MEASURE;
 472         }
 473         if (flags&FFT5D_REALCOMPLEX)
 474         {
 475             if (!(flags&FFT5D_BACKWARD))        /*input pointer is not complex*/
 476             {
 477                 inNG *= 2;
 478             }
 479             else                                /*output pointer is not complex*/
 480             {
 481                 if (!(flags&FFT5D_ORDER_YZ))
 482                 {
 483                     outMG *= 2;
 484                 }
 485                 else
 486                 {
 487                     outKG *= 2;
 488                 }
 489             }
 490         }
 491
 492         if (!(flags&FFT5D_BACKWARD))
 493         {
 494             dims[0].n  = KG;
 495             dims[1].n  = MG;
 496             dims[2].n  = rNG;
 497
 498             dims[0].is = inNG*MG;         /*N M K*/
 499             dims[1].is = inNG;
 500             dims[2].is = 1;
 501             if (!(flags&FFT5D_ORDER_YZ))
 502             {
 503                 dims[0].os = MG;           /*M K N*/
 504                 dims[1].os = 1;
 505                 dims[2].os = MG*KG;
 506             }
 507             else
 508             {
 509                 dims[0].os = 1;           /*K N M*/
 510                 dims[1].os = KG*NG;
 511                 dims[2].os = KG;
 512             }
 513         }
 514         else
 515         {
 516             if (!(flags&FFT5D_ORDER_YZ))
 517             {
 518                 dims[0].n  = NG;
 519                 dims[1].n  = KG;
 520                 dims[2].n  = rMG;
 521
 522                 dims[0].is = 1;
 523                 dims[1].is = NG*MG;
 524                 dims[2].is = NG;
 525
 526                 dims[0].os = outMG*KG;
 527                 dims[1].os = outMG;
 528                 dims[2].os = 1;
 529             }
 530             else
 531             {
 532                 dims[0].n  = MG;
 533                 dims[1].n  = NG;
 534                 dims[2].n  = rKG;
 535
 536                 dims[0].is = NG;
 537                 dims[1].is = 1;
 538                 dims[2].is = NG*MG;
 539
 540                 dims[0].os = outKG*NG;
 541                 dims[1].os = outKG;
 542                 dims[2].os = 1;
 543             }
 544         }
 545 #ifdef FFT5D_THREADS
 546 #ifdef FFT5D_FFTW_THREADS
 547         FFTW(plan_with_nthreads)(nthreads);
 548 #endif
 549 #endif
 550         if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD))
 551         {
 552             plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 553                                                          /*howmany*/ 0, /*howmany_dims*/ 0,
 554                                                          (real*)lin, (FFTW(complex) *) lout,
 555                                                          /*flags*/ fftwflags);
 556         }
 557         else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD))
 558         {
 559             plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 560                                                          /*howmany*/ 0, /*howmany_dims*/ 0,
 561                                                          (FFTW(complex) *) lin, (real*)lout,
 562                                                          /*flags*/ fftwflags);
 563         }
 564         else
 565         {
 566             plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 567                                                      /*howmany*/ 0, /*howmany_dims*/ 0,
 568                                                      (FFTW(complex) *) lin, (FFTW(complex) *) lout,
 569                                                      /*sign*/ (flags&FFT5D_BACKWARD) ? 1 : -1, /*flags*/ fftwflags);
 570         }
 571 #ifdef FFT5D_THREADS
 572 #ifdef FFT5D_FFTW_THREADS
 573         FFTW(plan_with_nthreads)(1);
 574 #endif
 575 #endif
 576         FFTW_UNLOCK;
 577     }
 578     if (!plan->p3d) /* for decomposition and if 3d plan did not work */
 579     {
 580 #endif              /* GMX_FFT_FFTW3 */
 581     for (s = 0; s < 3; s++)
 582     {
 583         if (debug)
 584         {
 585             fprintf(debug, "FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 586                     s, rC[s], M[s], pK[s], C[s], lsize);
 587         }
 588         plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
 589
 590         /* Make sure that the init routines are only called by one thread at a time and in order
 591            (later is only important to not confuse valgrind)
 592          */
 593 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
 594         for (t = 0; t < nthreads; t++)
 595         {
 596 #pragma omp ordered
 597             {
 598                 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
 599
 600                 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s == 0) || ((flags&FFT5D_BACKWARD) && s == 2)))
 601                 {
 602                     gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
 603                 }
 604                 else
 605                 {
 606                     gmx_fft_init_many_1d     ( &plan->p1d[s][t],  C[s], tsize, (flags&FFT5D_NOMEASURE) ? GMX_FFT_FLAG_CONSERVATIVE : 0 );
 607                 }
 608             }
 609         }
 610     }
 611
 612 #ifdef GMX_FFT_FFTW3
 613 }
 614 #endif
 615     if ((flags&FFT5D_ORDER_YZ))   /*plan->cart is in the order of transposes */
 616     {
 617         plan->cart[0] = comm[0]; plan->cart[1] = comm[1];
 618     }
 619     else
 620     {
 621         plan->cart[1] = comm[0]; plan->cart[0] = comm[1];
 622     }
 623 #ifdef FFT5D_MPI_TRANSPOSE
 624     FFTW_LOCK;
 625     for (s = 0; s < 2; s++)
 626     {
 627         if ((s == 0 && !(flags&FFT5D_ORDER_YZ)) || (s == 1 && (flags&FFT5D_ORDER_YZ)))
 628         {
 629             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 630         }
 631         else
 632         {
 633             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 634         }
 635     }
 636     FFTW_UNLOCK;
 637 #endif
 638
 639
 640     plan->lin   = lin;
 641     plan->lout  = lout;
 642     plan->lout2 = lout2;
 643     plan->lout3 = lout3;
 644
 645     plan->NG = NG; plan->MG = MG; plan->KG = KG;
 646
 647     for (s = 0; s < 3; s++)
 648     {
 649         plan->N[s]    = N[s]; plan->M[s] = M[s]; plan->K[s] = K[s]; plan->pN[s] = pN[s]; plan->pM[s] = pM[s]; plan->pK[s] = pK[s];
 650         plan->oM[s]   = oM[s]; plan->oK[s] = oK[s];
 651         plan->C[s]    = C[s]; plan->rC[s] = rC[s];
 652         plan->iNin[s] = iNin[s]; plan->oNin[s] = oNin[s]; plan->iNout[s] = iNout[s]; plan->oNout[s] = oNout[s];
 653     }
 654     for (s = 0; s < 2; s++)
 655     {
 656         plan->P[s] = nP[s]; plan->coor[s] = prank[s];
 657     }
 658
 659 /*    plan->fftorder=fftorder;
 660     plan->direction=direction;
 661     plan->realcomplex=realcomplex;
 662  */
 663     plan->flags    = flags;
 664     plan->nthreads = nthreads;
 665     *rlin          = lin;
 666     *rlout         = lout;
 667     *rlout2        = lout2;
 668     *rlout3        = lout3;
 669     return plan;
 670 }
 671
 672
 673 enum order {
 674     XYZ,
 675     XZY,
 676     YXZ,
 677     YZX,
 678     ZXY,
 679     ZYX
 680 };
 681
 682
 683
 684 /*here x,y,z and N,M,K is in rotated coordinate system!!
 685    x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
 686    maxN,maxM,maxK is max size of local data
 687    pN, pM, pK is local size specific to current processor (only different to max if not divisible)
 688    NG, MG, KG is size of global data*/
 689 static void splitaxes(t_complex* lout, const t_complex* lin,
 690                       int maxN, int maxM, int maxK, int pM,
 691                       int P, int NG, int *N, int* oN, int starty, int startz, int endy, int endz)
 692 {
 693     int x, y, z, i;
 694     int in_i, out_i, in_z, out_z, in_y, out_y;
 695     int s_y, e_y;
 696
 697     for (z = startz; z < endz+1; z++) /*3. z l*/
 698     {
 699         if (z == startz)
 700         {
 701             s_y = starty;
 702         }
 703         else
 704         {
 705             s_y = 0;
 706         }
 707         if (z == endz)
 708         {
 709             e_y = endy;
 710         }
 711         else
 712         {
 713             e_y = pM;
 714         }
 715         out_z  = z*maxN*maxM;
 716         in_z   = z*NG*pM;
 717
 718         for (i = 0; i < P; i++) /*index cube along long axis*/
 719         {
 720             out_i  = out_z  + i*maxN*maxM*maxK;
 721             in_i   = in_z + oN[i];
 722             for (y = s_y; y < e_y; y++)   /*2. y k*/
 723             {
 724                 out_y  = out_i  + y*maxN;
 725                 in_y   = in_i + y*NG;
 726                 for (x = 0; x < N[i]; x++)       /*1. x j*/
 727                 {
 728                     lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
 729                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
 730                     /*before split data contiguos - thus if different processor get different amount oN is different*/
 731                 }
 732             }
 733         }
 734     }
 735 }
 736
 737 /*make axis contiguous again (after AllToAll) and also do local transpose*/
 738 /*transpose mayor and major dimension
 739    variables see above
 740    the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
 741    N,M,K local dimensions
 742    KG global size*/
 743 static void joinAxesTrans13(t_complex* lout, const t_complex* lin,
 744                             int maxN, int maxM, int maxK, int pM,
 745                             int P, int KG, int* K, int* oK, int starty, int startx, int endy, int endx)
 746 {
 747     int i, x, y, z;
 748     int out_i, in_i, out_x, in_x, out_z, in_z;
 749     int s_y, e_y;
 750
 751     for (x = startx; x < endx+1; x++) /*1.j*/
 752     {
 753         if (x == startx)
 754         {
 755             s_y = starty;
 756         }
 757         else
 758         {
 759             s_y = 0;
 760         }
 761         if (x == endx)
 762         {
 763             e_y = endy;
 764         }
 765         else
 766         {
 767             e_y = pM;
 768         }
 769
 770         out_x  = x*KG*pM;
 771         in_x   = x;
 772
 773         for (i = 0; i < P; i++) /*index cube along long axis*/
 774         {
 775             out_i  = out_x  + oK[i];
 776             in_i   = in_x + i*maxM*maxN*maxK;
 777             for (z = 0; z < K[i]; z++) /*3.l*/
 778             {
 779                 out_z  = out_i  + z;
 780                 in_z   = in_i + z*maxM*maxN;
 781                 for (y = s_y; y < e_y; y++)              /*2.k*/
 782                 {
 783                     lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
 784                 }
 785             }
 786         }
 787     }
 788 }
 789
 790 /*make axis contiguous again (after AllToAll) and also do local transpose
 791    tranpose mayor and middle dimension
 792    variables see above
 793    the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
 794    N,M,K local size
 795    MG, global size*/
 796 static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN,
 797                             int P, int MG, int* M, int* oM, int startx, int startz, int endx, int endz)
 798 {
 799     int i, z, y, x;
 800     int out_i, in_i, out_z, in_z, out_x, in_x;
 801     int s_x, e_x;
 802
 803     for (z = startz; z < endz+1; z++)
 804     {
 805         if (z == startz)
 806         {
 807             s_x = startx;
 808         }
 809         else
 810         {
 811             s_x = 0;
 812         }
 813         if (z == endz)
 814         {
 815             e_x = endx;
 816         }
 817         else
 818         {
 819             e_x = pN;
 820         }
 821         out_z  = z*MG*pN;
 822         in_z   = z*maxM*maxN;
 823
 824         for (i = 0; i < P; i++) /*index cube along long axis*/
 825         {
 826             out_i  = out_z  + oM[i];
 827             in_i   = in_z + i*maxM*maxN*maxK;
 828             for (x = s_x; x < e_x; x++)
 829             {
 830                 out_x  = out_i  + x*MG;
 831                 in_x   = in_i + x;
 832                 for (y = 0; y < M[i]; y++)
 833                 {
 834                     lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
 835                 }
 836             }
 837         }
 838     }
 839 }
 840
 841
 842 static void rotate_offsets(int x[])
 843 {
 844     int t = x[0];
 845 /*    x[0]=x[2];
 846     x[2]=x[1];
 847     x[1]=t;*/
 848     x[0] = x[1];
 849     x[1] = x[2];
 850     x[2] = t;
 851 }
 852
 853 /*compute the offset to compare or print transposed local data in original input coordinates
 854    xs matrix dimension size, xl dimension length, xc decomposition offset
 855    s: step in computation = number of transposes*/
 856 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s)
 857 {
 858 /*    int direction = plan->direction;
 859     int fftorder = plan->fftorder;*/
 860
 861     int  o = 0;
 862     int  pos[3], i;
 863     int *pM = plan->pM, *pK = plan->pK, *oM = plan->oM, *oK = plan->oK,
 864     *C      = plan->C, *rC = plan->rC;
 865
 866     NG[0] = plan->NG; NG[1] = plan->MG; NG[2] = plan->KG;
 867
 868     if (!(plan->flags&FFT5D_ORDER_YZ))
 869     {
 870         switch (s)
 871         {
 872             case 0: o = XYZ; break;
 873             case 1: o = ZYX; break;
 874             case 2: o = YZX; break;
 875             default: assert(0);
 876         }
 877     }
 878     else
 879     {
 880         switch (s)
 881         {
 882             case 0: o = XYZ; break;
 883             case 1: o = YXZ; break;
 884             case 2: o = ZXY; break;
 885             default: assert(0);
 886         }
 887     }
 888
 889     switch (o)
 890     {
 891         case XYZ: pos[0] = 1; pos[1] = 2; pos[2] = 3; break;
 892         case XZY: pos[0] = 1; pos[1] = 3; pos[2] = 2; break;
 893         case YXZ: pos[0] = 2; pos[1] = 1; pos[2] = 3; break;
 894         case YZX: pos[0] = 3; pos[1] = 1; pos[2] = 2; break;
 895         case ZXY: pos[0] = 2; pos[1] = 3; pos[2] = 1; break;
 896         case ZYX: pos[0] = 3; pos[1] = 2; pos[2] = 1; break;
 897     }
 898     /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 899
 900     /*xs, xl give dimension size and data length in local transposed coordinate system
 901        for 0(/1/2): x(/y/z) in original coordinate system*/
 902     for (i = 0; i < 3; i++)
 903     {
 904         switch (pos[i])
 905         {
 906             case 1: xs[i] = 1;         xc[i] = 0;     xl[i] = C[s]; break;
 907             case 2: xs[i] = C[s];      xc[i] = oM[s]; xl[i] = pM[s]; break;
 908             case 3: xs[i] = C[s]*pM[s]; xc[i] = oK[s]; xl[i] = pK[s]; break;
 909         }
 910     }
 911     /*input order is different for test program to match FFTW order
 912        (important for complex to real)*/
 913     if (plan->flags&FFT5D_BACKWARD)
 914     {
 915         rotate_offsets(xs);
 916         rotate_offsets(xl);
 917         rotate_offsets(xc);
 918         rotate_offsets(NG);
 919         if (plan->flags&FFT5D_ORDER_YZ)
 920         {
 921             rotate_offsets(xs);
 922             rotate_offsets(xl);
 923             rotate_offsets(xc);
 924             rotate_offsets(NG);
 925         }
 926     }
 927     if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s == 0) || ((plan->flags&FFT5D_BACKWARD) && s == 2)))
 928     {
 929         xl[0] = rC[s];
 930     }
 931 }
 932
 933 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan)
 934 {
 935     int  x, y, z, l;
 936     int *coor = plan->coor;
 937     int  xs[3], xl[3], xc[3], NG[3];
 938     int  ll = (plan->flags&FFT5D_REALCOMPLEX) ? 1 : 2;
 939     compute_offsets(plan, xs, xl, xc, NG, s);
 940     fprintf(debug, txt, coor[0], coor[1], s);
 941     /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 942     for (z = 0; z < xl[2]; z++)
 943     {
 944         for (y = 0; y < xl[1]; y++)
 945         {
 946             fprintf(debug, "%d %d: ", coor[0], coor[1]);
 947             for (x = 0; x < xl[0]; x++)
 948             {
 949                 for (l = 0; l < ll; l++)
 950                 {
 951                     fprintf(debug, "%f ", ((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 952                 }
 953                 fprintf(debug, ",");
 954             }
 955             fprintf(debug, "\n");
 956         }
 957     }
 958 }
 959
 960 void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
 961 {
 962     t_complex  *lin   = plan->lin;
 963     t_complex  *lout  = plan->lout;
 964     t_complex  *lout2 = plan->lout2;
 965     t_complex  *lout3 = plan->lout3;
 966     t_complex  *fftout, *joinin;
 967
 968     gmx_fft_t **p1d = plan->p1d;
 969 #ifdef FFT5D_MPI_TRANSPOSE
 970     FFTW(plan) *mpip = plan->mpip;
 971 #endif
 972 #ifdef GMX_MPI
 973     MPI_Comm *cart = plan->cart;
 974 #endif
 975 #ifdef NOGMX
 976     double time_fft = 0, time_local = 0, time_mpi[2] = {0}, time = 0;
 977 #endif
 978     int   *N = plan->N, *M = plan->M, *K = plan->K, *pN = plan->pN, *pM = plan->pM, *pK = plan->pK,
 979     *C       = plan->C, *P = plan->P, **iNin = plan->iNin, **oNin = plan->oNin, **iNout = plan->iNout, **oNout = plan->oNout;
 980     int    s = 0, tstart, tend, bParallelDim;
 981
 982
 983 #ifdef GMX_FFT_FFTW3
 984     if (plan->p3d)
 985     {
 986         if (thread == 0)
 987         {
 988 #ifdef NOGMX
 989             if (times != 0)
 990             {
 991                 time = MPI_Wtime();
 992             }
 993 #endif
 994             FFTW(execute)(plan->p3d);
 995 #ifdef NOGMX
 996             if (times != 0)
 997             {
 998                 times->fft += MPI_Wtime()-time;
 999             }
1000 #endif
1001         }
1002         return;
1003     }
1004 #endif
1005
1006     s = 0;
1007
1008     /*lin: x,y,z*/
1009     if (plan->flags&FFT5D_DEBUG && thread == 0)
1010     {
1011         print_localdata(lin, "%d %d: copy in lin\n", s, plan);
1012     }
1013
1014     for (s = 0; s < 2; s++)  /*loop over first two FFT steps (corner rotations)*/
1015
1016     {
1017 #ifdef GMX_MPI
1018         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] != MPI_COMM_NULL && P[s] > 1)
1019         {
1020             bParallelDim = 1;
1021         }
1022         else
1023 #endif
1024         {
1025             bParallelDim = 0;
1026         }
1027
1028         /* ---------- START FFT ------------ */
1029 #ifdef NOGMX
1030         if (times != 0 && thread == 0)
1031         {
1032             time = MPI_Wtime();
1033         }
1034 #endif
1035
1036         if (bParallelDim || plan->nthreads == 1)
1037         {
1038             fftout = lout;
1039         }
1040         else
1041         {
1042             if (s == 0)
1043             {
1044                 fftout = lout3;
1045             }
1046             else
1047             {
1048                 fftout = lout2;
1049             }
1050         }
1051
1052         tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1053         if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s == 0)
1054         {
1055             gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, fftout+tstart);
1056         }
1057         else
1058         {
1059             gmx_fft_many_1d(     p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,               lin+tstart, fftout+tstart);
1060
1061         }
1062
1063 #ifdef NOGMX
1064         if (times != NULL && thread == 0)
1065         {
1066             time_fft += MPI_Wtime()-time;
1067         }
1068 #endif
1069         if (plan->flags&FFT5D_DEBUG && thread == 0)
1070         {
1071             print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1072         }
1073         /* ---------- END FFT ------------ */
1074
1075         /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
1076         if (bParallelDim)
1077         {
1078 #ifdef NOGMX
1079             if (times != NULL && thread == 0)
1080             {
1081                 time = MPI_Wtime();
1082             }
1083 #endif
1084             /*prepare for A
1085                llToAll
1086                1. (most outer) axes (x) is split into P[s] parts of size N[s]
1087                for sending*/
1088             if (pM[s] > 0)
1089             {
1090                 tend    = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
1091                 tstart /= C[s];
1092                 splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1093             }
1094 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
1095 #ifdef NOGMX
1096             if (times != NULL && thread == 0)
1097             {
1098                 time_local += MPI_Wtime()-time;
1099             }
1100 #endif
1101
1102             /* ---------- END SPLIT , START TRANSPOSE------------ */
1103
1104             if (thread == 0)
1105             {
1106 #ifdef NOGMX
1107                 if (times != 0)
1108                 {
1109                     time = MPI_Wtime();
1110                 }
1111 #else
1112                 wallcycle_start(times, ewcPME_FFTCOMM);
1113 #endif
1114 #ifdef FFT5D_MPI_TRANSPOSE
1115                 FFTW(execute)(mpip[s]);
1116 #else
1117 #ifdef GMX_MPI
1118                 if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1119                 {
1120                     MPI_Alltoall((real *)lout2, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1121                 }
1122                 else
1123                 {
1124                     MPI_Alltoall((real *)lout2, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, (real *)lout3, N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real), GMX_MPI_REAL, cart[s]);
1125                 }
1126 #else
1127                 gmx_incons("fft5d MPI call without MPI configuration");
1128 #endif /*GMX_MPI*/
1129 #endif /*FFT5D_MPI_TRANSPOSE*/
1130 #ifdef NOGMX
1131                 if (times != 0)
1132                 {
1133                     time_mpi[s] = MPI_Wtime()-time;
1134                 }
1135 #else
1136                 wallcycle_stop(times, ewcPME_FFTCOMM);
1137 #endif
1138             } /*master*/
1139         }     /* bPrallelDim */
1140 #pragma omp barrier  /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1141
1142         /* ---------- END SPLIT + TRANSPOSE------------ */
1143
1144         /* ---------- START JOIN ------------ */
1145 #ifdef NOGMX
1146         if (times != NULL && thread == 0)
1147         {
1148             time = MPI_Wtime();
1149         }
1150 #endif
1151
1152         if (bParallelDim)
1153         {
1154             joinin = lout3;
1155         }
1156         else
1157         {
1158             joinin = fftout;
1159         }
1160         /*bring back in matrix form
1161            thus make  new 1. axes contiguos
1162            also local transpose 1 and 2/3
1163            runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1164          */
1165         if ((s == 0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s == 1 && (plan->flags&FFT5D_ORDER_YZ)))
1166         {
1167             if (pM[s] > 0)
1168             {
1169                 tstart = ( thread   *pM[s]*pN[s]/plan->nthreads);
1170                 tend   = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1171                 joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
1172             }
1173         }
1174         else
1175         {
1176             if (pN[s] > 0)
1177             {
1178                 tstart = ( thread   *pK[s]*pN[s]/plan->nthreads);
1179                 tend   = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1180                 joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
1181             }
1182         }
1183
1184 #ifdef NOGMX
1185         if (times != NULL && thread == 0)
1186         {
1187             time_local += MPI_Wtime()-time;
1188         }
1189 #endif
1190         if (plan->flags&FFT5D_DEBUG && thread == 0)
1191         {
1192             print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1193         }
1194         /* ---------- END JOIN ------------ */
1195
1196         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1197     }  /* for(s=0;s<2;s++) */
1198 #ifdef NOGMX
1199     if (times != NULL && thread == 0)
1200     {
1201         time = MPI_Wtime();
1202     }
1203 #endif
1204
1205     if (plan->flags&FFT5D_INPLACE)
1206     {
1207         lout = lin;                          /*in place currently not supported*/
1208
1209     }
1210     /*  ----------- FFT ----------- */
1211     tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1212     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1213     {
1214         gmx_fft_many_1d_real(p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_COMPLEX_TO_REAL : GMX_FFT_REAL_TO_COMPLEX, lin+tstart, lout+tstart);
1215     }
1216     else
1217     {
1218         gmx_fft_many_1d(     p1d[s][thread], (plan->flags&FFT5D_BACKWARD) ? GMX_FFT_BACKWARD : GMX_FFT_FORWARD,               lin+tstart, lout+tstart);
1219     }
1220     /* ------------ END FFT ---------*/
1221
1222 #ifdef NOGMX
1223     if (times != NULL && thread == 0)
1224     {
1225         time_fft += MPI_Wtime()-time;
1226
1227         times->fft   += time_fft;
1228         times->local += time_local;
1229         times->mpi2  += time_mpi[1];
1230         times->mpi1  += time_mpi[0];
1231     }
1232 #endif
1233
1234     if (plan->flags&FFT5D_DEBUG && thread == 0)
1235     {
1236         print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1237     }
1238 }
1239
1240 void fft5d_destroy(fft5d_plan plan)
1241 {
1242     int s, t;
1243
1244     for (s = 0; s < 3; s++)
1245     {
1246         if (plan->p1d[s])
1247         {
1248             for (t = 0; t < plan->nthreads; t++)
1249             {
1250                 gmx_many_fft_destroy(plan->p1d[s][t]);
1251             }
1252             free(plan->p1d[s]);
1253         }
1254         if (plan->iNin[s])
1255         {
1256             free(plan->iNin[s]);
1257             plan->iNin[s] = 0;
1258         }
1259         if (plan->oNin[s])
1260         {
1261             free(plan->oNin[s]);
1262             plan->oNin[s] = 0;
1263         }
1264         if (plan->iNout[s])
1265         {
1266             free(plan->iNout[s]);
1267             plan->iNout[s] = 0;
1268         }
1269         if (plan->oNout[s])
1270         {
1271             free(plan->oNout[s]);
1272             plan->oNout[s] = 0;
1273         }
1274     }
1275 #ifdef GMX_FFT_FFTW3
1276     FFTW_LOCK;
1277 #ifdef FFT5D_MPI_TRANSPOS
1278     for (s = 0; s < 2; s++)
1279     {
1280         FFTW(destroy_plan)(plan->mpip[s]);
1281     }
1282 #endif /* FFT5D_MPI_TRANSPOS */
1283     if (plan->p3d)
1284     {
1285         FFTW(destroy_plan)(plan->p3d);
1286     }
1287     FFTW_UNLOCK;
1288 #endif /* GMX_FFT_FFTW3 */
1289
1290     if (!(plan->flags&FFT5D_NOMALLOC))
1291     {
1292         sfree_aligned(plan->lin);
1293         sfree_aligned(plan->lout);
1294         if (plan->nthreads > 1)
1295         {
1296             sfree_aligned(plan->lout2);
1297             sfree_aligned(plan->lout3);
1298         }
1299     }
1300
1301 #ifdef FFT5D_THREADS
1302 #ifdef FFT5D_FFTW_THREADS
1303     /*FFTW(cleanup_threads)();*/
1304 #endif
1305 #endif
1306
1307     free(plan);
1308 }
1309
1310 /*Is this better than direct access of plan? enough data?
1311    here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1312 void fft5d_local_size(fft5d_plan plan, int* N1, int* M0, int* K0, int* K1, int** coor)
1313 {
1314     *N1 = plan->N[0];
1315     *M0 = plan->M[0];
1316     *K1 = plan->K[0];
1317     *K0 = plan->N[1];
1318
1319     *coor = plan->coor;
1320 }
1321
1322
1323 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1324    of processor dimensions*/
1325 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
1326 {
1327     MPI_Comm cart[2] = {0};
1328 #ifdef GMX_MPI
1329     int      size = 1, prank = 0;
1330     int      P[2];
1331     int      coor[2];
1332     int      wrap[] = {0, 0};
1333     MPI_Comm gcart;
1334     int      rdim1[] = {0, 1}, rdim2[] = {1, 0};
1335
1336     MPI_Comm_size(comm, &size);
1337     MPI_Comm_rank(comm, &prank);
1338
1339     if (P0 == 0)
1340     {
1341         P0 = lfactor(size);
1342     }
1343     if (size%P0 != 0)
1344     {
1345         if (prank == 0)
1346         {
1347             printf("FFT5D: WARNING: Number of ranks %d not evenly divisible by %d\n", size, P0);
1348         }
1349         P0 = lfactor(size);
1350     }
1351
1352     P[0] = P0; P[1] = size/P0; /*number of processors in the two dimensions*/
1353
1354     /*Difference between x-y-z regarding 2d decomposition is whether they are
1355        distributed along axis 1, 2 or both*/
1356
1357     MPI_Cart_create(comm, 2, P, wrap, 1, &gcart); /*parameter 4: value 1: reorder*/
1358     MPI_Cart_get(gcart, 2, P, wrap, coor);
1359     MPI_Cart_sub(gcart, rdim1, &cart[0]);
1360     MPI_Cart_sub(gcart, rdim2, &cart[1]);
1361 #else
1362     (void)P0;
1363     (void)comm;
1364 #endif
1365     return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout, rlout2, rlout3, nthreads);
1366 }
1367
1368
1369
1370 /*prints in original coordinate system of data (as the input to FFT)*/
1371 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize)
1372 {
1373     int  xs[3], xl[3], xc[3], NG[3];
1374     int  x, y, z, l;
1375     int *coor = plan->coor;
1376     int  ll   = 2; /*compare ll values per element (has to be 2 for complex)*/
1377     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1378     {
1379         ll = 1;
1380     }
1381
1382     compute_offsets(plan, xs, xl, xc, NG, 2);
1383     if (plan->flags&FFT5D_DEBUG)
1384     {
1385         printf("Compare2\n");
1386     }
1387     for (z = 0; z < xl[2]; z++)
1388     {
1389         for (y = 0; y < xl[1]; y++)
1390         {
1391             if (plan->flags&FFT5D_DEBUG)
1392             {
1393                 printf("%d %d: ", coor[0], coor[1]);
1394             }
1395             for (x = 0; x < xl[0]; x++)
1396             {
1397                 for (l = 0; l < ll; l++)   /*loop over real/complex parts*/
1398                 {
1399                     real a, b;
1400                     a = ((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1401                     if (normalize)
1402                     {
1403                         a /= plan->rC[0]*plan->rC[1]*plan->rC[2];
1404                     }
1405                     if (!bothLocal)
1406                     {
1407                         b = ((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1408                     }
1409                     else
1410                     {
1411                         b = ((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1412                     }
1413                     if (plan->flags&FFT5D_DEBUG)
1414                     {
1415                         printf("%f %f, ", a, b);
1416                     }
1417                     else
1418                     {
1419                         if (fabs(a-b) > 2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS)
1420                         {
1421                             printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n", coor[0], coor[1], x, y, z, a, b);
1422                         }
1423 /*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1424                     }
1425                 }
1426                 if (plan->flags&FFT5D_DEBUG)
1427                 {
1428                     printf(",");
1429                 }
1430             }
1431             if (plan->flags&FFT5D_DEBUG)
1432             {
1433                 printf("\n");
1434             }
1435         }
1436     }
1437
1438 }