src/mdlib/fft5d.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #ifdef HAVE_CONFIG_H
  39 #include <config.h>
  40 #endif
  41
  42
  43 #include <stdio.h>
  44 #include <stdlib.h>
  45 #include <string.h>
  46
  47 #ifdef NOGMX
  48 #define GMX_PARALLEL_ENV_INITIALIZED 1
  49 #else
  50 #ifdef GMX_MPI
  51 #define GMX_PARALLEL_ENV_INITIALIZED 1
  52 #else
  53 #define GMX_PARALLEL_ENV_INITIALIZED 0
  54 #endif
  55 #endif
  56
  57 #ifdef GMX_LIB_MPI
  58 #include <mpi.h>
  59 #endif
  60 #ifdef GMX_THREAD_MPI
  61 #include "tmpi.h"
  62 #endif
  63
  64 #ifdef GMX_OPENMP
  65 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
  66 #define FFT5D_THREADS
  67 /* requires fftw compiled with openmp */
  68 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
  69 #endif
  70
  71 #include "fft5d.h"
  72 #include <float.h>
  73 #include <math.h>
  74 #include <assert.h>
  75 #include "smalloc.h"
  76
  77 #ifndef __FLT_EPSILON__
  78 #define __FLT_EPSILON__ FLT_EPSILON
  79 #define __DBL_EPSILON__ DBL_EPSILON
  80 #endif
  81
  82 #ifdef NOGMX
  83 FILE* debug=0;
  84 #endif
  85
  86 #include "gmx_fatal.h"
  87
  88
  89 #ifdef GMX_FFT_FFTW3
  90 #ifdef GMX_THREAD_MPI
  91 /* none of the fftw3 calls, except execute(), are thread-safe, so
  92    we need to serialize them with this mutex. */
  93 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
  94
  95 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex)
  96 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex)
  97 #else /* GMX_THREAD_MPI */
  98 #define FFTW_LOCK
  99 #define FFTW_UNLOCK
 100 #endif /* GMX_THREAD_MPI */
 101 #endif /* GMX_FFT_FFTW3 */
 102
 103 static double fft5d_fmax(double a, double b){
 104         return (a>b)?a:b;
 105 }
 106
 107 /* largest factor smaller than sqrt */
 108 static int lfactor(int z) {
 109         int i;
 110         for (i=sqrt(z);;i--)
 111                 if (z%i==0) return i;
 112         return 1;
 113 }
 114
 115 /* largest factor */
 116 static int l2factor(int z) {
 117         int i;
 118         if (z==1) return 1;
 119         for (i=z/2;;i--)
 120                 if (z%i==0) return i;
 121         return 1;
 122 }
 123
 124 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
 125 static int lpfactor(int z) {
 126         int f = l2factor(z);
 127         if (f==1) return z;
 128         return fft5d_fmax(lpfactor(f),lpfactor(z/f));
 129 }
 130
 131 #ifndef GMX_MPI
 132 #ifdef HAVE_GETTIMEOFDAY
 133 #include <sys/time.h>
 134 double MPI_Wtime() {
 135     struct timeval tv;
 136     gettimeofday(&tv,0);
 137     return tv.tv_sec+tv.tv_usec*1e-6;
 138 }
 139 #else
 140 double MPI_Wtime() {
 141     return 0.0;
 142 }
 143 #endif
 144 #endif
 145
 146 static int vmax(int* a, int s) {
 147     int i,max=0;
 148     for (i=0;i<s;i++)
 149     {
 150         if (a[i]>max) max=a[i];
 151     }
 152     return max;
 153 }
 154
 155
 156 /* NxMxK the size of the data
 157  * comm communicator to use for fft5d
 158  * P0 number of processor in 1st axes (can be null for automatic)
 159  * lin is allocated by fft5d because size of array is only known after planning phase
 160  * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
 161 */
 162 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
 163 {
 164
 165     int P[2],bMaster,prank[2],i,t;
 166     int rNG,rMG,rKG;
 167     int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
 168     int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
 169     int C[3],rC[3],nP[2];
 170     int lsize;
 171     t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
 172     fft5d_plan plan;
 173     int s;
 174
 175     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 176 #ifdef GMX_MPI
 177     if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
 178     {
 179         MPI_Comm_size(comm[0],&P[0]);
 180         MPI_Comm_rank(comm[0],&prank[0]);
 181     }
 182     else
 183 #endif
 184     {
 185         P[0] = 1;
 186         prank[0] = 0;
 187     }
 188 #ifdef GMX_MPI
 189     if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
 190     {
 191         MPI_Comm_size(comm[1],&P[1]);
 192         MPI_Comm_rank(comm[1],&prank[1]);
 193     }
 194     else
 195 #endif
 196     {
 197         P[1] = 1;
 198         prank[1] = 0;
 199     }
 200
 201     bMaster=(prank[0]==0&&prank[1]==0);
 202
 203
 204     if (debug)
 205     {
 206         fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
 207                 P[0],P[1],prank[0],prank[1]);
 208     }
 209
 210     if (bMaster) {
 211         if (debug)
 212             fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 213                 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
 214         /* The check below is not correct, one prime factor 11 or 13 is ok.
 215         if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 216             printf("WARNING: FFT very slow with prime factors larger 7\n");
 217             printf("Change FFT size or in case you cannot change it look at\n");
 218             printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 219         }
 220         */
 221     }
 222
 223     if (NG==0 || MG==0 || KG==0) {
 224         if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 225         return 0;
 226     }
 227
 228     rNG=NG;rMG=MG;rKG=KG;
 229
 230     if (flags&FFT5D_REALCOMPLEX) {
 231         if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
 232         else {
 233             if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
 234             else KG=KG/2+1;
 235         }
 236     }
 237
 238
 239     /*for transpose we need to know the size for each processor not only our own size*/
 240
 241     N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
 242     M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
 243     K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
 244     oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
 245     oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
 246     oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
 247
 248     for (i=0;i<P[0];i++)
 249     {
 250         #define EVENDIST
 251         #ifndef EVENDIST
 252         oN0[i]=i*ceil((double)NG/P[0]);
 253         oM0[i]=i*ceil((double)MG/P[0]);
 254         oK0[i]=i*ceil((double)KG/P[0]);
 255         #else
 256         oN0[i]=(NG*i)/P[0];
 257         oM0[i]=(MG*i)/P[0];
 258         oK0[i]=(KG*i)/P[0];
 259         #endif
 260     }
 261     for (i=0;i<P[1];i++)
 262     {
 263         #ifndef EVENDIST
 264         oN1[i]=i*ceil((double)NG/P[1]);
 265         oM1[i]=i*ceil((double)MG/P[1]);
 266         oK1[i]=i*ceil((double)KG/P[1]);
 267         #else
 268         oN1[i]=(NG*i)/P[1];
 269         oM1[i]=(MG*i)/P[1];
 270         oK1[i]=(KG*i)/P[1];
 271         #endif
 272     }
 273     for (i=0;i<P[0]-1;i++)
 274     {
 275         N0[i]=oN0[i+1]-oN0[i];
 276         M0[i]=oM0[i+1]-oM0[i];
 277         K0[i]=oK0[i+1]-oK0[i];
 278     }
 279     N0[P[0]-1]=NG-oN0[P[0]-1];
 280     M0[P[0]-1]=MG-oM0[P[0]-1];
 281     K0[P[0]-1]=KG-oK0[P[0]-1];
 282     for (i=0;i<P[1]-1;i++)
 283     {
 284         N1[i]=oN1[i+1]-oN1[i];
 285         M1[i]=oM1[i+1]-oM1[i];
 286         K1[i]=oK1[i+1]-oK1[i];
 287     }
 288     N1[P[1]-1]=NG-oN1[P[1]-1];
 289     M1[P[1]-1]=MG-oM1[P[1]-1];
 290     K1[P[1]-1]=KG-oK1[P[1]-1];
 291
 292     /* for step 1-3 the local N,M,K sizes of the transposed system
 293        C: contiguous dimension, and nP: number of processor in subcommunicator
 294        for that step */
 295
 296
 297     pM[0] = M0[prank[0]];
 298     oM[0] = oM0[prank[0]];
 299     pK[0] = K1[prank[1]];
 300     oK[0] = oK1[prank[1]];
 301     C[0] = NG;
 302     rC[0] = rNG;
 303     if (!(flags&FFT5D_ORDER_YZ)) {
 304         N[0] = vmax(N1,P[1]);
 305         M[0] = M0[prank[0]];
 306         K[0] = vmax(K1,P[1]);
 307         pN[0] = N1[prank[1]];
 308         iNout[0] = N1;
 309         oNout[0] = oN1;
 310         nP[0] = P[1];
 311         C[1] = KG;
 312         rC[1] =rKG;
 313         N[1] = vmax(K0,P[0]);
 314         pN[1] = K0[prank[0]];
 315         iNin[1] = K1;
 316         oNin[1] = oK1;
 317         iNout[1] = K0;
 318         oNout[1] = oK0;
 319         M[1] = vmax(M0,P[0]);
 320         pM[1] = M0[prank[0]];
 321         oM[1] = oM0[prank[0]];
 322         K[1] = N1[prank[1]];
 323         pK[1] = N1[prank[1]];
 324         oK[1] = oN1[prank[1]];
 325         nP[1] = P[0];
 326         C[2] = MG;
 327         rC[2] = rMG;
 328         iNin[2] = M0;
 329         oNin[2] = oM0;
 330         M[2] = vmax(K0,P[0]);
 331         pM[2] = K0[prank[0]];
 332         oM[2] = oK0[prank[0]];
 333         K[2] = vmax(N1,P[1]);
 334         pK[2] = N1[prank[1]];
 335         oK[2] = oN1[prank[1]];
 336         free(N0); free(oN0); /*these are not used for this order*/
 337         free(M1); free(oM1); /*the rest is freed in destroy*/
 338     } else {
 339         N[0] = vmax(N0,P[0]);
 340         M[0] = vmax(M0,P[0]);
 341         K[0] = K1[prank[1]];
 342         pN[0] = N0[prank[0]];
 343         iNout[0] = N0;
 344         oNout[0] = oN0;
 345         nP[0] = P[0];
 346         C[1] = MG;
 347         rC[1] =rMG;
 348         N[1] = vmax(M1,P[1]);
 349         pN[1] = M1[prank[1]];
 350         iNin[1] = M0;
 351         oNin[1] = oM0;
 352         iNout[1] = M1;
 353         oNout[1] = oM1;
 354         M[1] = N0[prank[0]];
 355         pM[1] = N0[prank[0]];
 356         oM[1] = oN0[prank[0]];
 357         K[1] = vmax(K1,P[1]);
 358         pK[1] = K1[prank[1]];
 359         oK[1] = oK1[prank[1]];
 360         nP[1] = P[1];
 361         C[2] = KG;
 362         rC[2] = rKG;
 363         iNin[2] = K1;
 364         oNin[2] = oK1;
 365         M[2] = vmax(N0,P[0]);
 366         pM[2] = N0[prank[0]];
 367         oM[2] = oN0[prank[0]];
 368         K[2] = vmax(M1,P[1]);
 369         pK[2] = M1[prank[1]];
 370         oK[2] = oM1[prank[1]];
 371         free(N1); free(oN1); /*these are not used for this order*/
 372         free(K0); free(oK0); /*the rest is freed in destroy*/
 373     }
 374     N[2]=pN[2]=-1; /*not used*/
 375
 376     /*
 377       Difference between x-y-z regarding 2d decomposition is whether they are
 378       distributed along axis 1, 2 or both
 379     */
 380
 381     /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 382     lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
 383     /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 384     if (!(flags&FFT5D_NOMALLOC)) {
 385         snew_aligned(lin, lsize, 32);
 386         snew_aligned(lout, lsize, 32);
 387         if (nthreads > 1)
 388         {
 389             /* We need extra transpose buffers to avoid OpenMP barriers */
 390             snew_aligned(lout2, lsize, 32);
 391             snew_aligned(lout3, lsize, 32);
 392         }
 393         else
 394         {
 395             /* We can reuse the buffers to avoid cache misses */
 396             lout2 = lin;
 397             lout3 = lout;
 398         }
 399     } else {
 400         lin = *rlin;
 401         lout = *rlout;
 402         if (nthreads > 1)
 403         {
 404             lout2 = *rlout2;
 405             lout3 = *rlout3;
 406         }
 407         else
 408         {
 409             lout2 = lin;
 410             lout3 = lout;
 411         }
 412     }
 413
 414     plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
 415
 416
 417     if (debug)
 418     {
 419         fprintf(debug, "Running on %d threads\n",nthreads);
 420     }
 421
 422 #ifdef GMX_FFT_FFTW3  /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
 423     /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
 424      * within a parallel region. For now deactivated. If it should be supported it has to made sure that
 425      * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
 426      * and that the 3d plan is faster than the 1d plan.
 427      */
 428     if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) {  /*don't do 3d plan in parallel or if in_place requested */
 429             int fftwflags=FFTW_DESTROY_INPUT;
 430             FFTW(iodim) dims[3];
 431             int inNG=NG,outMG=MG,outKG=KG;
 432
 433             FFTW_LOCK;
 434             if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
 435             if (flags&FFT5D_REALCOMPLEX) {
 436                 if (!(flags&FFT5D_BACKWARD)) {  /*input pointer is not complex*/
 437                     inNG*=2;
 438                 } else {                        /*output pointer is not complex*/
 439                     if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
 440                     else outKG*=2;
 441                 }
 442             }
 443
 444             if (!(flags&FFT5D_BACKWARD)) {
 445                 dims[0].n  = KG;
 446                 dims[1].n  = MG;
 447                 dims[2].n  = rNG;
 448
 449                 dims[0].is = inNG*MG;     /*N M K*/
 450                 dims[1].is = inNG;
 451                 dims[2].is = 1;
 452                 if (!(flags&FFT5D_ORDER_YZ)) {
 453                     dims[0].os = MG;       /*M K N*/
 454                     dims[1].os = 1;
 455                     dims[2].os = MG*KG;
 456                 } else  {
 457                     dims[0].os = 1;       /*K N M*/
 458                     dims[1].os = KG*NG;
 459                     dims[2].os = KG;
 460                 }
 461             } else {
 462                 if (!(flags&FFT5D_ORDER_YZ)) {
 463                     dims[0].n  = NG;
 464                     dims[1].n  = KG;
 465                     dims[2].n  = rMG;
 466
 467                     dims[0].is = 1;
 468                     dims[1].is = NG*MG;
 469                     dims[2].is = NG;
 470
 471                     dims[0].os = outMG*KG;
 472                     dims[1].os = outMG;
 473                     dims[2].os = 1;
 474                 } else {
 475                     dims[0].n  = MG;
 476                     dims[1].n  = NG;
 477                     dims[2].n  = rKG;
 478
 479                     dims[0].is = NG;
 480                     dims[1].is = 1;
 481                     dims[2].is = NG*MG;
 482
 483                     dims[0].os = outKG*NG;
 484                     dims[1].os = outKG;
 485                     dims[2].os = 1;
 486                 }
 487             }
 488 #ifdef FFT5D_THREADS
 489 #ifdef FFT5D_FFTW_THREADS
 490             FFTW(plan_with_nthreads)(nthreads);
 491 #endif
 492 #endif
 493             if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
 494                 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 495                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 496                                      (real*)lin, (FFTW(complex) *)lout,
 497                                      /*flags*/ fftwflags);
 498             } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
 499                 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 500                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 501                                      (FFTW(complex) *)lin, (real*)lout,
 502                                      /*flags*/ fftwflags);
 503             } else {
 504                 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 505                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 506                                      (FFTW(complex) *)lin, (FFTW(complex) *)lout,
 507                                      /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
 508             }
 509 #ifdef FFT5D_THREADS
 510 #ifdef FFT5D_FFTW_THREADS
 511             FFTW(plan_with_nthreads)(1);
 512 #endif
 513 #endif
 514             FFTW_UNLOCK;
 515     }
 516     if (!plan->p3d) {  /* for decomposition and if 3d plan did not work */
 517 #endif /* GMX_FFT_FFTW3 */
 518         for (s=0;s<3;s++) {
 519             if (debug)
 520             {
 521                 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 522                         s,rC[s],M[s],pK[s],C[s],lsize);
 523             }
 524             plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
 525
 526             /* Make sure that the init routines are only called by one thread at a time and in order
 527                (later is only important to not confuse valgrind)
 528              */
 529 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
 530             for(t=0; t<nthreads; t++)
 531 #pragma omp ordered
 532             {
 533                 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
 534
 535                 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
 536                     gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 537                 } else {
 538                     gmx_fft_init_many_1d     ( &plan->p1d[s][t],  C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 539                 }
 540             }
 541         }
 542
 543 #ifdef GMX_FFT_FFTW3
 544     }
 545 #endif
 546     if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
 547         plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
 548     } else {
 549         plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
 550     }
 551 #ifdef FFT5D_MPI_TRANSPOSE
 552     FFTW_LOCK;
 553     for (s=0;s<2;s++) {
 554         if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
 555             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 556         else
 557             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 558     }
 559     FFTW_UNLOCK;
 560 #endif
 561
 562
 563     plan->lin=lin;
 564     plan->lout=lout;
 565     plan->lout2=lout2;
 566     plan->lout3=lout3;
 567
 568     plan->NG=NG;plan->MG=MG;plan->KG=KG;
 569
 570     for (s=0;s<3;s++) {
 571         plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
 572         plan->oM[s]=oM[s];plan->oK[s]=oK[s];
 573         plan->C[s]=C[s];plan->rC[s]=rC[s];
 574         plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
 575     }
 576     for (s=0;s<2;s++) {
 577         plan->P[s]=nP[s];plan->coor[s]=prank[s];
 578     }
 579
 580 /*    plan->fftorder=fftorder;
 581     plan->direction=direction;
 582     plan->realcomplex=realcomplex;
 583 */
 584     plan->flags=flags;
 585     plan->nthreads=nthreads;
 586     *rlin=lin;
 587     *rlout=lout;
 588     *rlout2=lout2;
 589     *rlout3=lout3;
 590     return plan;
 591 }
 592
 593
 594 enum order {
 595     XYZ,
 596     XZY,
 597     YXZ,
 598     YZX,
 599     ZXY,
 600     ZYX
 601 };
 602
 603
 604
 605 /*here x,y,z and N,M,K is in rotated coordinate system!!
 606   x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
 607   maxN,maxM,maxK is max size of local data
 608   pN, pM, pK is local size specific to current processor (only different to max if not divisible)
 609   NG, MG, KG is size of global data*/
 610 static void splitaxes(t_complex* lout,const t_complex* lin,
 611                       int maxN,int maxM,int maxK, int pN, int pM, int pK,
 612                       int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
 613 {
 614     int x,y,z,i;
 615     int in_i,out_i,in_z,out_z,in_y,out_y;
 616     int s_y,e_y;
 617
 618     for (z=startz; z<endz+1; z++) /*3. z l*/
 619     {
 620         if (z==startz) {
 621             s_y=starty;
 622         } else {
 623             s_y=0;
 624         }
 625         if (z==endz) {
 626             e_y=endy;
 627         } else {
 628             e_y=pM;
 629         }
 630         out_z  = z*maxN*maxM;
 631         in_z = z*NG*pM;
 632
 633         for (i=0; i<P; i++) /*index cube along long axis*/
 634         {
 635             out_i  = out_z  + i*maxN*maxM*maxK;
 636             in_i = in_z + oN[i];
 637             for (y=s_y;y<e_y;y++) { /*2. y k*/
 638                 out_y  = out_i  + y*maxN;
 639                 in_y = in_i + y*NG;
 640                 for (x=0;x<N[i];x++) { /*1. x j*/
 641                     lout[out_y+x] = lin[in_y+x];    /*in=z*NG*pM+oN[i]+y*NG+x*/
 642                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
 643                     /*before split data contiguos - thus if different processor get different amount oN is different*/
 644                 }
 645             }
 646         }
 647     }
 648 }
 649
 650 /*make axis contiguous again (after AllToAll) and also do local transpose*/
 651 /*transpose mayor and major dimension
 652   variables see above
 653   the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
 654   N,M,K local dimensions
 655   KG global size*/
 656 static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
 657                             int maxN,int maxM,int maxK,int pN, int pM, int pK,
 658                             int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
 659 {
 660     int i,x,y,z;
 661     int out_i,in_i,out_x,in_x,out_z,in_z;
 662     int s_y,e_y;
 663
 664     for (x=startx;x<endx+1;x++) /*1.j*/
 665     {
 666         if (x==startx)
 667         {
 668             s_y=starty;
 669         }
 670         else
 671         {
 672             s_y=0;
 673         }
 674         if (x==endx)
 675         {
 676             e_y=endy;
 677         }
 678         else
 679         {
 680             e_y=pM;
 681         }
 682
 683         out_x  = x*KG*pM;
 684         in_x = x;
 685
 686         for (i=0;i<P;i++) /*index cube along long axis*/
 687         {
 688             out_i  = out_x  + oK[i];
 689             in_i = in_x + i*maxM*maxN*maxK;
 690             for (z=0;z<K[i];z++) /*3.l*/
 691             {
 692                 out_z  = out_i  + z;
 693                 in_z = in_i + z*maxM*maxN;
 694                 for (y=s_y;y<e_y;y++) /*2.k*/
 695                 {
 696                     lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
 697                 }
 698             }
 699         }
 700     }
 701 }
 702
 703 /*make axis contiguous again (after AllToAll) and also do local transpose
 704   tranpose mayor and middle dimension
 705   variables see above
 706   the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
 707   N,M,K local size
 708   MG, global size*/
 709 static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
 710                             int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
 711     int i,z,y,x;
 712     int out_i,in_i,out_z,in_z,out_x,in_x;
 713     int s_x,e_x;
 714
 715     for (z=startz; z<endz+1; z++)
 716     {
 717         if (z==startz)
 718         {
 719             s_x=startx;
 720         }
 721         else
 722         {
 723             s_x=0;
 724         }
 725         if (z==endz)
 726         {
 727             e_x=endx;
 728         }
 729         else
 730         {
 731             e_x=pN;
 732         }
 733         out_z  = z*MG*pN;
 734         in_z = z*maxM*maxN;
 735
 736         for (i=0; i<P; i++) /*index cube along long axis*/
 737         {
 738             out_i  = out_z  + oM[i];
 739             in_i = in_z + i*maxM*maxN*maxK;
 740             for (x=s_x;x<e_x;x++)
 741             {
 742                 out_x  = out_i  + x*MG;
 743                 in_x = in_i + x;
 744                 for (y=0;y<M[i];y++)
 745                 {
 746                     lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
 747                 }
 748             }
 749         }
 750     }
 751 }
 752
 753
 754 static void rotate_offsets(int x[]) {
 755     int t=x[0];
 756 /*    x[0]=x[2];
 757     x[2]=x[1];
 758     x[1]=t;*/
 759     x[0]=x[1];
 760     x[1]=x[2];
 761     x[2]=t;
 762 }
 763
 764 /*compute the offset to compare or print transposed local data in original input coordinates
 765   xs matrix dimension size, xl dimension length, xc decomposition offset
 766   s: step in computation = number of transposes*/
 767 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
 768 /*    int direction = plan->direction;
 769     int fftorder = plan->fftorder;*/
 770
 771     int o;
 772     int pos[3],i;
 773     int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
 774         *C=plan->C, *rC=plan->rC;
 775
 776     NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
 777
 778     if (!(plan->flags&FFT5D_ORDER_YZ)) {
 779         switch (s) {
 780         case 0: o=XYZ; break;
 781         case 1: o=ZYX; break;
 782         case 2: o=YZX; break;
 783         default: assert(0);
 784         }
 785     } else {
 786         switch (s) {
 787         case 0: o=XYZ; break;
 788         case 1: o=YXZ; break;
 789         case 2: o=ZXY; break;
 790         default: assert(0);
 791         }
 792     }
 793
 794     switch (o) {
 795         case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
 796         case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
 797         case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
 798         case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
 799         case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
 800         case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
 801     }
 802     /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 803
 804     /*xs, xl give dimension size and data length in local transposed coordinate system
 805       for 0(/1/2): x(/y/z) in original coordinate system*/
 806     for (i=0;i<3;i++) {
 807         switch (pos[i]) {
 808         case 1: xs[i]=1;         xc[i]=0;     xl[i]=C[s];break;
 809         case 2: xs[i]=C[s];      xc[i]=oM[s]; xl[i]=pM[s];break;
 810         case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
 811         }
 812     }
 813     /*input order is different for test program to match FFTW order
 814       (important for complex to real)*/
 815     if (plan->flags&FFT5D_BACKWARD) {
 816         rotate_offsets(xs);
 817         rotate_offsets(xl);
 818         rotate_offsets(xc);
 819         rotate_offsets(NG);
 820         if (plan->flags&FFT5D_ORDER_YZ) {
 821             rotate_offsets(xs);
 822             rotate_offsets(xl);
 823             rotate_offsets(xc);
 824             rotate_offsets(NG);
 825         }
 826     }
 827     if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
 828         xl[0] = rC[s];
 829     }
 830 }
 831
 832 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
 833     int x,y,z,l,t;
 834     int *coor = plan->coor;
 835     int xs[3],xl[3],xc[3],NG[3];
 836     int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
 837     compute_offsets(plan,xs,xl,xc,NG,s);
 838     fprintf(debug,txt,coor[0],coor[1],s);
 839     /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 840     for(z=0;z<xl[2];z++) {
 841         for(y=0;y<xl[1];y++) {
 842             fprintf(debug,"%d %d: ",coor[0],coor[1]);
 843             for (x=0;x<xl[0];x++) {
 844                 for (l=0;l<ll;l++) {
 845                     fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 846                 }
 847                 fprintf(debug,",");
 848             }
 849             fprintf(debug,"\n");
 850         }
 851     }
 852 }
 853
 854 void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
 855     t_complex *lin = plan->lin;
 856     t_complex *lout = plan->lout;
 857     t_complex *lout2 = plan->lout2;
 858     t_complex *lout3 = plan->lout3;
 859     t_complex *fftout,*joinin;
 860
 861     gmx_fft_t **p1d=plan->p1d;
 862 #ifdef FFT5D_MPI_TRANSPOSE
 863     FFTW(plan) *mpip=plan->mpip;
 864 #endif
 865 #ifdef GMX_MPI
 866     MPI_Comm *cart=plan->cart;
 867 #endif
 868
 869     double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
 870     int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
 871         *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
 872     int s=0,tstart,tend,bParallelDim;
 873
 874
 875 #ifdef GMX_FFT_FFTW3
 876     if (plan->p3d)
 877     {
 878         if (thread == 0)
 879         {
 880 #ifdef NOGMX
 881             if (times!=0)
 882             {
 883                 time=MPI_Wtime();
 884             }
 885 #endif
 886             FFTW(execute)(plan->p3d);
 887 #ifdef NOGMX
 888             if (times!=0)
 889             {
 890                 times->fft+=MPI_Wtime()-time;
 891             }
 892 #endif
 893         }
 894         return;
 895     }
 896 #endif
 897
 898         s=0;
 899
 900     /*lin: x,y,z*/
 901         if (plan->flags&FFT5D_DEBUG && thread == 0)
 902         {
 903             print_localdata(lin, "%d %d: copy in lin\n", s, plan);
 904         }
 905
 906         for (s=0;s<2;s++) {  /*loop over first two FFT steps (corner rotations)*/
 907
 908 #ifdef GMX_MPI
 909         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s]!=MPI_COMM_NULL && P[s]>1)
 910         {
 911             bParallelDim = 1;
 912         }
 913         else
 914 #endif
 915         {
 916             bParallelDim = 0;
 917         }
 918
 919         /* ---------- START FFT ------------ */
 920 #ifdef NOGMX
 921         if (times!=0 && thread == 0)
 922         {
 923             time=MPI_Wtime();
 924         }
 925 #endif
 926
 927         if (bParallelDim || plan->nthreads == 1) {
 928             fftout = lout;
 929         }
 930         else
 931         {
 932             if (s==0)
 933             {
 934                 fftout = lout3;
 935             } else
 936             {
 937                 fftout = lout2;
 938             }
 939         }
 940
 941         tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
 942         if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
 943         {
 944             gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
 945         } else
 946         {
 947             gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,fftout+tstart);
 948
 949         }
 950
 951 #ifdef NOGMX
 952         if (times != NULL && thread == 0)
 953         {
 954             time_fft+=MPI_Wtime()-time;
 955         }
 956 #endif
 957         if (plan->flags&FFT5D_DEBUG && thread == 0)
 958         {
 959             print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 960         }
 961         /* ---------- END FFT ------------ */
 962
 963         /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
 964         if (bParallelDim) {
 965 #ifdef NOGMX
 966             if (times != NULL && thread == 0)
 967             {
 968                 time=MPI_Wtime();
 969             }
 970 #endif
 971             /*prepare for A
 972 llToAll
 973               1. (most outer) axes (x) is split into P[s] parts of size N[s]
 974               for sending*/
 975             if (pM[s]>0)
 976             {
 977                 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
 978                 tstart/=C[s];
 979                 splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
 980             }
 981 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
 982 #ifdef NOGMX
 983             if (times != NULL && thread == 0)
 984             {
 985                 time_local+=MPI_Wtime()-time;
 986             }
 987 #endif
 988
 989         /* ---------- END SPLIT , START TRANSPOSE------------ */
 990
 991             if (thread == 0)
 992             {
 993 #ifdef NOGMX
 994                 if (times!=0)
 995                 {
 996                     time=MPI_Wtime();
 997                 }
 998 #else
 999                 wallcycle_start(times,ewcPME_FFTCOMM);
1000 #endif
1001 #ifdef FFT5D_MPI_TRANSPOSE
1002                 FFTW(execute)(mpip[s]);
1003 #else
1004 #ifdef GMX_MPI
1005                 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
1006                     MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
1007                 else
1008                     MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
1009 #else
1010                 gmx_incons("fft5d MPI call without MPI configuration");
1011 #endif /*GMX_MPI*/
1012 #endif /*FFT5D_MPI_TRANSPOSE*/
1013 #ifdef NOGMX
1014                 if (times!=0)
1015                 {
1016                     time_mpi[s]=MPI_Wtime()-time;
1017                 }
1018 #else
1019                 wallcycle_stop(times,ewcPME_FFTCOMM);
1020 #endif
1021             }  /*master*/
1022         }  /* bPrallelDim */
1023 #pragma omp barrier  /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1024
1025         /* ---------- END SPLIT + TRANSPOSE------------ */
1026
1027         /* ---------- START JOIN ------------ */
1028 #ifdef NOGMX
1029         if (times != NULL && thread == 0)
1030         {
1031             time=MPI_Wtime();
1032         }
1033 #endif
1034
1035         if (bParallelDim) {
1036             joinin = lout3;
1037         } else {
1038             joinin = fftout;
1039         }
1040         /*bring back in matrix form
1041           thus make  new 1. axes contiguos
1042           also local transpose 1 and 2/3
1043           runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1044         */
1045         if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
1046             if (pM[s]>0)
1047             {
1048                 tstart = ( thread   *pM[s]*pN[s]/plan->nthreads);
1049                 tend   = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1050                 joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
1051             }
1052         }
1053         else {
1054             if (pN[s]>0)
1055             {
1056                 tstart = ( thread   *pK[s]*pN[s]/plan->nthreads);
1057                 tend   = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1058                 joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
1059             }
1060         }
1061
1062 #ifdef NOGMX
1063         if (times != NULL && thread == 0)
1064         {
1065             time_local+=MPI_Wtime()-time;
1066         }
1067 #endif
1068         if (plan->flags&FFT5D_DEBUG && thread == 0)
1069         {
1070             print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1071         }
1072         /* ---------- END JOIN ------------ */
1073
1074         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1075     }  /* for(s=0;s<2;s++) */
1076 #ifdef NOGMX
1077         if (times != NULL && thread == 0)
1078         {
1079             time=MPI_Wtime();
1080         }
1081 #endif
1082
1083     if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
1084
1085     /*  ----------- FFT ----------- */
1086     tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1087     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
1088         gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
1089     } else {
1090         gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,lout+tstart);
1091     }
1092     /* ------------ END FFT ---------*/
1093
1094 #ifdef NOGMX
1095     if (times != NULL && thread == 0)
1096     {
1097         time_fft+=MPI_Wtime()-time;
1098
1099         times->fft+=time_fft;
1100         times->local+=time_local;
1101         times->mpi2+=time_mpi[1];
1102         times->mpi1+=time_mpi[0];
1103     }
1104 #endif
1105
1106     if (plan->flags&FFT5D_DEBUG && thread == 0)
1107     {
1108         print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1109     }
1110 }
1111
1112 void fft5d_destroy(fft5d_plan plan) {
1113     int s,t;
1114
1115     for (s=0;s<3;s++)
1116     {
1117         if (plan->p1d[s])
1118         {
1119             for (t=0;t<plan->nthreads;t++)
1120             {
1121                 gmx_many_fft_destroy(plan->p1d[s][t]);
1122             }
1123             free(plan->p1d[s]);
1124         }
1125         if (plan->iNin[s])
1126         {
1127             free(plan->iNin[s]);
1128             plan->iNin[s]=0;
1129         }
1130         if (plan->oNin[s])
1131         {
1132             free(plan->oNin[s]);
1133             plan->oNin[s]=0;
1134         }
1135         if (plan->iNout[s])
1136         {
1137             free(plan->iNout[s]);
1138             plan->iNout[s]=0;
1139         }
1140         if (plan->oNout[s])
1141         {
1142             free(plan->oNout[s]);
1143             plan->oNout[s]=0;
1144         }
1145     }
1146 #ifdef GMX_FFT_FFTW3
1147     FFTW_LOCK;
1148 #ifdef FFT5D_MPI_TRANSPOS
1149     for (s=0;s<2;s++)
1150     {
1151         FFTW(destroy_plan)(plan->mpip[s]);
1152     }
1153     if (plan->p3d)
1154     {
1155         FFTW(destroy_plan)(plan->p3d);
1156     }
1157 #endif /* FFT5D_MPI_TRANSPOS */
1158 #endif /* GMX_FFT_FFTW3 */
1159
1160     if (!(plan->flags&FFT5D_NOMALLOC))
1161     {
1162         sfree_aligned(plan->lin);
1163         sfree_aligned(plan->lout);
1164         if (plan->nthreads > 1)
1165         {
1166             sfree_aligned(plan->lout2);
1167             sfree_aligned(plan->lout3);
1168         }
1169     }
1170
1171 #ifdef FFT5D_THREADS
1172 #ifdef FFT5D_FFTW_THREADS
1173     /*FFTW(cleanup_threads)();*/
1174 #endif
1175 #endif
1176
1177     free(plan);
1178 }
1179
1180 /*Is this better than direct access of plan? enough data?
1181   here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1182 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
1183     *N1=plan->N[0];
1184     *M0=plan->M[0];
1185     *K1=plan->K[0];
1186     *K0=plan->N[1];
1187
1188     *coor=plan->coor;
1189 }
1190
1191
1192 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1193   of processor dimensions*/
1194 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
1195     MPI_Comm cart[2]={0};
1196 #ifdef GMX_MPI
1197     int size=1,prank=0;
1198     int P[2];
1199     int coor[2];
1200     int wrap[]={0,0};
1201     MPI_Comm gcart;
1202     int rdim1[] = {0,1}, rdim2[] = {1,0};
1203
1204     MPI_Comm_size(comm,&size);
1205     MPI_Comm_rank(comm,&prank);
1206
1207     if (P0==0) P0 = lfactor(size);
1208     if (size%P0!=0)
1209     {
1210         if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1211         P0 = lfactor(size);
1212     }
1213
1214     P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1215
1216     /*Difference between x-y-z regarding 2d decomposition is whether they are
1217       distributed along axis 1, 2 or both*/
1218
1219     MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1220     MPI_Cart_get(gcart,2,P,wrap,coor);
1221     MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1222     MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1223 #endif
1224     return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
1225 }
1226
1227
1228
1229 /*prints in original coordinate system of data (as the input to FFT)*/
1230 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1231     int xs[3],xl[3],xc[3],NG[3];
1232     int x,y,z,l;
1233     int *coor = plan->coor;
1234     int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1235     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1236     {
1237         ll=1;
1238     }
1239
1240     compute_offsets(plan,xs,xl,xc,NG,2);
1241     if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1242     for (z=0;z<xl[2];z++) {
1243         for(y=0;y<xl[1];y++) {
1244             if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1245             for (x=0;x<xl[0];x++) {
1246                 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1247                     real a,b;
1248                     a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1249                     if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1250                     if (!bothLocal)
1251                         b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1252                     else
1253                         b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1254                     if (plan->flags&FFT5D_DEBUG) {
1255                         printf("%f %f, ",a,b);
1256                     } else {
1257                         if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1258                             printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1259                         }
1260 /*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1261                     }
1262                 }
1263                 if (plan->flags&FFT5D_DEBUG) printf(",");
1264             }
1265             if (plan->flags&FFT5D_DEBUG) printf("\n");
1266         }
1267     }
1268
1269 }
1270