src/mdlib/fft5d.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *                This source code is part of
   4  *
   5  *                 G   R   O   M   A   C   S
   6  *
   7  *          GROningen MAchine for Chemical Simulations
   8  *
   9  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  10  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  11  * Copyright (c) 2001-2012, The GROMACS development team,
  12  * check out http://www.gromacs.org for more information.
  13
  14  * This program is free software; you can redistribute it and/or
  15  * modify it under the terms of the GNU General Public License
  16  * as published by the Free Software Foundation; either version 2
  17  * of the License, or (at your option) any later version.
  18  *
  19  * If you want to redistribute modifications, please consider that
  20  * scientific software is very special. Version control is crucial -
  21  * bugs must be traceable. We will be happy to consider code for
  22  * inclusion in the official distribution, but derived work must not
  23  * be called official GROMACS. Details are found in the README & COPYING
  24  * files - if they are missing, get the official version at www.gromacs.org.
  25  *
  26  * To help us fund GROMACS development, we humbly ask that you cite
  27  * the papers on the package - you can find them in the top README file.
  28  *
  29  * For more info, check our website at http://www.gromacs.org
  30  *
  31  * And Hey:
  32  * Groningen Machine for Chemical Simulation
  33  */
  34 #ifdef HAVE_CONFIG_H
  35 #include <config.h>
  36 #endif
  37
  38
  39 #include <stdio.h>
  40 #include <stdlib.h>
  41 #include <string.h>
  42
  43 #ifdef NOGMX
  44 #define GMX_PARALLEL_ENV_INITIALIZED 1
  45 #else
  46 #ifdef GMX_MPI
  47 #define GMX_PARALLEL_ENV_INITIALIZED 1
  48 #else
  49 #define GMX_PARALLEL_ENV_INITIALIZED 0
  50 #endif
  51 #endif
  52
  53 #ifdef GMX_LIB_MPI
  54 #include <mpi.h>
  55 #endif
  56 #ifdef GMX_THREAD_MPI
  57 #include "tmpi.h"
  58 #endif
  59
  60 #ifdef GMX_OPENMP
  61 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
  62 #define FFT5D_THREADS
  63 /* requires fftw compiled with openmp */
  64 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
  65 #endif
  66
  67 #include "fft5d.h"
  68 #include <float.h>
  69 #include <math.h>
  70 #include <assert.h>
  71 #include "smalloc.h"
  72
  73 #ifndef __FLT_EPSILON__
  74 #define __FLT_EPSILON__ FLT_EPSILON
  75 #define __DBL_EPSILON__ DBL_EPSILON
  76 #endif
  77
  78 #ifdef NOGMX
  79 FILE* debug=0;
  80 #endif
  81
  82 #include "gmx_fatal.h"
  83
  84
  85 #ifdef GMX_FFT_FFTW3
  86 #ifdef GMX_THREAD_MPI
  87 /* none of the fftw3 calls, except execute(), are thread-safe, so
  88    we need to serialize them with this mutex. */
  89 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
  90
  91 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex)
  92 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex)
  93 #else /* GMX_THREAD_MPI */
  94 #define FFTW_LOCK
  95 #define FFTW_UNLOCK
  96 #endif /* GMX_THREAD_MPI */
  97 #endif /* GMX_FFT_FFTW3 */
  98
  99 static double fft5d_fmax(double a, double b){
 100         return (a>b)?a:b;
 101 }
 102
 103 /* largest factor smaller than sqrt */
 104 static int lfactor(int z) {
 105         int i;
 106         for (i=sqrt(z);;i--)
 107                 if (z%i==0) return i;
 108         return 1;
 109 }
 110
 111 /* largest factor */
 112 static int l2factor(int z) {
 113         int i;
 114         if (z==1) return 1;
 115         for (i=z/2;;i--)
 116                 if (z%i==0) return i;
 117         return 1;
 118 }
 119
 120 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
 121 static int lpfactor(int z) {
 122         int f = l2factor(z);
 123         if (f==1) return z;
 124         return fft5d_fmax(lpfactor(f),lpfactor(z/f));
 125 }
 126
 127 #ifndef GMX_MPI
 128 #ifdef HAVE_GETTIMEOFDAY
 129 #include <sys/time.h>
 130 double MPI_Wtime() {
 131     struct timeval tv;
 132     gettimeofday(&tv,0);
 133     return tv.tv_sec+tv.tv_usec*1e-6;
 134 }
 135 #else
 136 double MPI_Wtime() {
 137     return 0.0;
 138 }
 139 #endif
 140 #endif
 141
 142 static int vmax(int* a, int s) {
 143     int i,max=0;
 144     for (i=0;i<s;i++)
 145     {
 146         if (a[i]>max) max=a[i];
 147     }
 148     return max;
 149 }
 150
 151
 152 /* NxMxK the size of the data
 153  * comm communicator to use for fft5d
 154  * P0 number of processor in 1st axes (can be null for automatic)
 155  * lin is allocated by fft5d because size of array is only known after planning phase
 156  * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
 157 */
 158 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
 159 {
 160
 161     int P[2],bMaster,prank[2],i,t;
 162     int rNG,rMG,rKG;
 163     int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
 164     int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
 165     int C[3],rC[3],nP[2];
 166     int lsize;
 167     t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
 168     fft5d_plan plan;
 169     int s;
 170
 171     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 172 #ifdef GMX_MPI
 173     if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
 174     {
 175         MPI_Comm_size(comm[0],&P[0]);
 176         MPI_Comm_rank(comm[0],&prank[0]);
 177     }
 178     else
 179 #endif
 180     {
 181         P[0] = 1;
 182         prank[0] = 0;
 183     }
 184 #ifdef GMX_MPI
 185     if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
 186     {
 187         MPI_Comm_size(comm[1],&P[1]);
 188         MPI_Comm_rank(comm[1],&prank[1]);
 189     }
 190     else
 191 #endif
 192     {
 193         P[1] = 1;
 194         prank[1] = 0;
 195     }
 196
 197     bMaster=(prank[0]==0&&prank[1]==0);
 198
 199
 200     if (debug)
 201     {
 202         fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
 203                 P[0],P[1],prank[0],prank[1]);
 204     }
 205
 206     if (bMaster) {
 207         if (debug)
 208             fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 209                 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
 210         /* The check below is not correct, one prime factor 11 or 13 is ok.
 211         if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 212             printf("WARNING: FFT very slow with prime factors larger 7\n");
 213             printf("Change FFT size or in case you cannot change it look at\n");
 214             printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 215         }
 216         */
 217     }
 218
 219     if (NG==0 || MG==0 || KG==0) {
 220         if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 221         return 0;
 222     }
 223
 224     rNG=NG;rMG=MG;rKG=KG;
 225
 226     if (flags&FFT5D_REALCOMPLEX) {
 227         if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
 228         else {
 229             if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
 230             else KG=KG/2+1;
 231         }
 232     }
 233
 234
 235     /*for transpose we need to know the size for each processor not only our own size*/
 236
 237     N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
 238     M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
 239     K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
 240     oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
 241     oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
 242     oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
 243
 244     for (i=0;i<P[0];i++)
 245     {
 246         #define EVENDIST
 247         #ifndef EVENDIST
 248         oN0[i]=i*ceil((double)NG/P[0]);
 249         oM0[i]=i*ceil((double)MG/P[0]);
 250         oK0[i]=i*ceil((double)KG/P[0]);
 251         #else
 252         oN0[i]=(NG*i)/P[0];
 253         oM0[i]=(MG*i)/P[0];
 254         oK0[i]=(KG*i)/P[0];
 255         #endif
 256     }
 257     for (i=0;i<P[1];i++)
 258     {
 259         #ifndef EVENDIST
 260         oN1[i]=i*ceil((double)NG/P[1]);
 261         oM1[i]=i*ceil((double)MG/P[1]);
 262         oK1[i]=i*ceil((double)KG/P[1]);
 263         #else
 264         oN1[i]=(NG*i)/P[1];
 265         oM1[i]=(MG*i)/P[1];
 266         oK1[i]=(KG*i)/P[1];
 267         #endif
 268     }
 269     for (i=0;i<P[0]-1;i++)
 270     {
 271         N0[i]=oN0[i+1]-oN0[i];
 272         M0[i]=oM0[i+1]-oM0[i];
 273         K0[i]=oK0[i+1]-oK0[i];
 274     }
 275     N0[P[0]-1]=NG-oN0[P[0]-1];
 276     M0[P[0]-1]=MG-oM0[P[0]-1];
 277     K0[P[0]-1]=KG-oK0[P[0]-1];
 278     for (i=0;i<P[1]-1;i++)
 279     {
 280         N1[i]=oN1[i+1]-oN1[i];
 281         M1[i]=oM1[i+1]-oM1[i];
 282         K1[i]=oK1[i+1]-oK1[i];
 283     }
 284     N1[P[1]-1]=NG-oN1[P[1]-1];
 285     M1[P[1]-1]=MG-oM1[P[1]-1];
 286     K1[P[1]-1]=KG-oK1[P[1]-1];
 287
 288     /* for step 1-3 the local N,M,K sizes of the transposed system
 289        C: contiguous dimension, and nP: number of processor in subcommunicator
 290        for that step */
 291
 292
 293     pM[0] = M0[prank[0]];
 294     oM[0] = oM0[prank[0]];
 295     pK[0] = K1[prank[1]];
 296     oK[0] = oK1[prank[1]];
 297     C[0] = NG;
 298     rC[0] = rNG;
 299     if (!(flags&FFT5D_ORDER_YZ)) {
 300         N[0] = vmax(N1,P[1]);
 301         M[0] = M0[prank[0]];
 302         K[0] = vmax(K1,P[1]);
 303         pN[0] = N1[prank[1]];
 304         iNout[0] = N1;
 305         oNout[0] = oN1;
 306         nP[0] = P[1];
 307         C[1] = KG;
 308         rC[1] =rKG;
 309         N[1] = vmax(K0,P[0]);
 310         pN[1] = K0[prank[0]];
 311         iNin[1] = K1;
 312         oNin[1] = oK1;
 313         iNout[1] = K0;
 314         oNout[1] = oK0;
 315         M[1] = vmax(M0,P[0]);
 316         pM[1] = M0[prank[0]];
 317         oM[1] = oM0[prank[0]];
 318         K[1] = N1[prank[1]];
 319         pK[1] = N1[prank[1]];
 320         oK[1] = oN1[prank[1]];
 321         nP[1] = P[0];
 322         C[2] = MG;
 323         rC[2] = rMG;
 324         iNin[2] = M0;
 325         oNin[2] = oM0;
 326         M[2] = vmax(K0,P[0]);
 327         pM[2] = K0[prank[0]];
 328         oM[2] = oK0[prank[0]];
 329         K[2] = vmax(N1,P[1]);
 330         pK[2] = N1[prank[1]];
 331         oK[2] = oN1[prank[1]];
 332         free(N0); free(oN0); /*these are not used for this order*/
 333         free(M1); free(oM1); /*the rest is freed in destroy*/
 334     } else {
 335         N[0] = vmax(N0,P[0]);
 336         M[0] = vmax(M0,P[0]);
 337         K[0] = K1[prank[1]];
 338         pN[0] = N0[prank[0]];
 339         iNout[0] = N0;
 340         oNout[0] = oN0;
 341         nP[0] = P[0];
 342         C[1] = MG;
 343         rC[1] =rMG;
 344         N[1] = vmax(M1,P[1]);
 345         pN[1] = M1[prank[1]];
 346         iNin[1] = M0;
 347         oNin[1] = oM0;
 348         iNout[1] = M1;
 349         oNout[1] = oM1;
 350         M[1] = N0[prank[0]];
 351         pM[1] = N0[prank[0]];
 352         oM[1] = oN0[prank[0]];
 353         K[1] = vmax(K1,P[1]);
 354         pK[1] = K1[prank[1]];
 355         oK[1] = oK1[prank[1]];
 356         nP[1] = P[1];
 357         C[2] = KG;
 358         rC[2] = rKG;
 359         iNin[2] = K1;
 360         oNin[2] = oK1;
 361         M[2] = vmax(N0,P[0]);
 362         pM[2] = N0[prank[0]];
 363         oM[2] = oN0[prank[0]];
 364         K[2] = vmax(M1,P[1]);
 365         pK[2] = M1[prank[1]];
 366         oK[2] = oM1[prank[1]];
 367         free(N1); free(oN1); /*these are not used for this order*/
 368         free(K0); free(oK0); /*the rest is freed in destroy*/
 369     }
 370     N[2]=pN[2]=-1; /*not used*/
 371
 372     /*
 373       Difference between x-y-z regarding 2d decomposition is whether they are
 374       distributed along axis 1, 2 or both
 375     */
 376
 377     /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 378     lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
 379     /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 380     if (!(flags&FFT5D_NOMALLOC)) {
 381         snew_aligned(lin, lsize, 32);
 382         snew_aligned(lout, lsize, 32);
 383         if (nthreads > 1)
 384         {
 385             /* We need extra transpose buffers to avoid OpenMP barriers */
 386             snew_aligned(lout2, lsize, 32);
 387             snew_aligned(lout3, lsize, 32);
 388         }
 389         else
 390         {
 391             /* We can reuse the buffers to avoid cache misses */
 392             lout2 = lin;
 393             lout3 = lout;
 394         }
 395     } else {
 396         lin = *rlin;
 397         lout = *rlout;
 398         if (nthreads > 1)
 399         {
 400             lout2 = *rlout2;
 401             lout3 = *rlout3;
 402         }
 403         else
 404         {
 405             lout2 = lin;
 406             lout3 = lout;
 407         }
 408     }
 409
 410     plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
 411
 412
 413     if (debug)
 414     {
 415         fprintf(debug, "Running on %d threads\n",nthreads);
 416     }
 417
 418 #ifdef GMX_FFT_FFTW3  /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
 419     /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
 420      * within a parallel region. For now deactivated. If it should be supported it has to made sure that
 421      * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
 422      * and that the 3d plan is faster than the 1d plan.
 423      */
 424     if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) {  /*don't do 3d plan in parallel or if in_place requested */
 425             int fftwflags=FFTW_DESTROY_INPUT;
 426             FFTW(iodim) dims[3];
 427             int inNG=NG,outMG=MG,outKG=KG;
 428
 429             FFTW_LOCK;
 430             if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
 431             if (flags&FFT5D_REALCOMPLEX) {
 432                 if (!(flags&FFT5D_BACKWARD)) {  /*input pointer is not complex*/
 433                     inNG*=2;
 434                 } else {                        /*output pointer is not complex*/
 435                     if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
 436                     else outKG*=2;
 437                 }
 438             }
 439
 440             if (!(flags&FFT5D_BACKWARD)) {
 441                 dims[0].n  = KG;
 442                 dims[1].n  = MG;
 443                 dims[2].n  = rNG;
 444
 445                 dims[0].is = inNG*MG;     /*N M K*/
 446                 dims[1].is = inNG;
 447                 dims[2].is = 1;
 448                 if (!(flags&FFT5D_ORDER_YZ)) {
 449                     dims[0].os = MG;       /*M K N*/
 450                     dims[1].os = 1;
 451                     dims[2].os = MG*KG;
 452                 } else  {
 453                     dims[0].os = 1;       /*K N M*/
 454                     dims[1].os = KG*NG;
 455                     dims[2].os = KG;
 456                 }
 457             } else {
 458                 if (!(flags&FFT5D_ORDER_YZ)) {
 459                     dims[0].n  = NG;
 460                     dims[1].n  = KG;
 461                     dims[2].n  = rMG;
 462
 463                     dims[0].is = 1;
 464                     dims[1].is = NG*MG;
 465                     dims[2].is = NG;
 466
 467                     dims[0].os = outMG*KG;
 468                     dims[1].os = outMG;
 469                     dims[2].os = 1;
 470                 } else {
 471                     dims[0].n  = MG;
 472                     dims[1].n  = NG;
 473                     dims[2].n  = rKG;
 474
 475                     dims[0].is = NG;
 476                     dims[1].is = 1;
 477                     dims[2].is = NG*MG;
 478
 479                     dims[0].os = outKG*NG;
 480                     dims[1].os = outKG;
 481                     dims[2].os = 1;
 482                 }
 483             }
 484 #ifdef FFT5D_THREADS
 485 #ifdef FFT5D_FFTW_THREADS
 486             FFTW(plan_with_nthreads)(nthreads);
 487 #endif
 488 #endif
 489             if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
 490                 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 491                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 492                                      (real*)lin, (FFTW(complex) *)lout,
 493                                      /*flags*/ fftwflags);
 494             } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
 495                 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 496                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 497                                      (FFTW(complex) *)lin, (real*)lout,
 498                                      /*flags*/ fftwflags);
 499             } else {
 500                 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 501                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 502                                      (FFTW(complex) *)lin, (FFTW(complex) *)lout,
 503                                      /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
 504             }
 505 #ifdef FFT5D_THREADS
 506 #ifdef FFT5D_FFTW_THREADS
 507             FFTW(plan_with_nthreads)(1);
 508 #endif
 509 #endif
 510             FFTW_UNLOCK;
 511     }
 512     if (!plan->p3d) {  /* for decomposition and if 3d plan did not work */
 513 #endif /* GMX_FFT_FFTW3 */
 514         for (s=0;s<3;s++) {
 515             if (debug)
 516             {
 517                 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 518                         s,rC[s],M[s],pK[s],C[s],lsize);
 519             }
 520             plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
 521
 522             /* Make sure that the init routines are only called by one thread at a time and in order
 523                (later is only important to not confuse valgrind)
 524              */
 525 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
 526             for(t=0; t<nthreads; t++)
 527 #pragma omp ordered
 528             {
 529                 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
 530
 531                 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
 532                     gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 533                 } else {
 534                     gmx_fft_init_many_1d     ( &plan->p1d[s][t],  C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 535                 }
 536             }
 537         }
 538
 539 #ifdef GMX_FFT_FFTW3
 540     }
 541 #endif
 542     if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
 543         plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
 544     } else {
 545         plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
 546     }
 547 #ifdef FFT5D_MPI_TRANSPOSE
 548     FFTW_LOCK;
 549     for (s=0;s<2;s++) {
 550         if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
 551             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 552         else
 553             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 554     }
 555     FFTW_UNLOCK;
 556 #endif
 557
 558
 559     plan->lin=lin;
 560     plan->lout=lout;
 561     plan->lout2=lout2;
 562     plan->lout3=lout3;
 563
 564     plan->NG=NG;plan->MG=MG;plan->KG=KG;
 565
 566     for (s=0;s<3;s++) {
 567         plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
 568         plan->oM[s]=oM[s];plan->oK[s]=oK[s];
 569         plan->C[s]=C[s];plan->rC[s]=rC[s];
 570         plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
 571     }
 572     for (s=0;s<2;s++) {
 573         plan->P[s]=nP[s];plan->coor[s]=prank[s];
 574     }
 575
 576 /*    plan->fftorder=fftorder;
 577     plan->direction=direction;
 578     plan->realcomplex=realcomplex;
 579 */
 580     plan->flags=flags;
 581     plan->nthreads=nthreads;
 582     *rlin=lin;
 583     *rlout=lout;
 584     *rlout2=lout2;
 585     *rlout3=lout3;
 586     return plan;
 587 }
 588
 589
 590 enum order {
 591     XYZ,
 592     XZY,
 593     YXZ,
 594     YZX,
 595     ZXY,
 596     ZYX
 597 };
 598
 599
 600
 601 /*here x,y,z and N,M,K is in rotated coordinate system!!
 602   x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
 603   maxN,maxM,maxK is max size of local data
 604   pN, pM, pK is local size specific to current processor (only different to max if not divisible)
 605   NG, MG, KG is size of global data*/
 606 static void splitaxes(t_complex* lout,const t_complex* lin,
 607                       int maxN,int maxM,int maxK, int pN, int pM, int pK,
 608                       int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
 609 {
 610     int x,y,z,i;
 611     int in_i,out_i,in_z,out_z,in_y,out_y;
 612     int s_y,e_y;
 613
 614     for (z=startz; z<endz+1; z++) /*3. z l*/
 615     {
 616         if (z==startz) {
 617             s_y=starty;
 618         } else {
 619             s_y=0;
 620         }
 621         if (z==endz) {
 622             e_y=endy;
 623         } else {
 624             e_y=pM;
 625         }
 626         out_z  = z*maxN*maxM;
 627         in_z = z*NG*pM;
 628
 629         for (i=0; i<P; i++) /*index cube along long axis*/
 630         {
 631             out_i  = out_z  + i*maxN*maxM*maxK;
 632             in_i = in_z + oN[i];
 633             for (y=s_y;y<e_y;y++) { /*2. y k*/
 634                 out_y  = out_i  + y*maxN;
 635                 in_y = in_i + y*NG;
 636                 for (x=0;x<N[i];x++) { /*1. x j*/
 637                     lout[out_y+x] = lin[in_y+x];    /*in=z*NG*pM+oN[i]+y*NG+x*/
 638                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
 639                     /*before split data contiguos - thus if different processor get different amount oN is different*/
 640                 }
 641             }
 642         }
 643     }
 644 }
 645
 646 /*make axis contiguous again (after AllToAll) and also do local transpose*/
 647 /*transpose mayor and major dimension
 648   variables see above
 649   the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
 650   N,M,K local dimensions
 651   KG global size*/
 652 static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
 653                             int maxN,int maxM,int maxK,int pN, int pM, int pK,
 654                             int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
 655 {
 656     int i,x,y,z;
 657     int out_i,in_i,out_x,in_x,out_z,in_z;
 658     int s_y,e_y;
 659
 660     for (x=startx;x<endx+1;x++) /*1.j*/
 661     {
 662         if (x==startx)
 663         {
 664             s_y=starty;
 665         }
 666         else
 667         {
 668             s_y=0;
 669         }
 670         if (x==endx)
 671         {
 672             e_y=endy;
 673         }
 674         else
 675         {
 676             e_y=pM;
 677         }
 678
 679         out_x  = x*KG*pM;
 680         in_x = x;
 681
 682         for (i=0;i<P;i++) /*index cube along long axis*/
 683         {
 684             out_i  = out_x  + oK[i];
 685             in_i = in_x + i*maxM*maxN*maxK;
 686             for (z=0;z<K[i];z++) /*3.l*/
 687             {
 688                 out_z  = out_i  + z;
 689                 in_z = in_i + z*maxM*maxN;
 690                 for (y=s_y;y<e_y;y++) /*2.k*/
 691                 {
 692                     lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
 693                 }
 694             }
 695         }
 696     }
 697 }
 698
 699 /*make axis contiguous again (after AllToAll) and also do local transpose
 700   tranpose mayor and middle dimension
 701   variables see above
 702   the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
 703   N,M,K local size
 704   MG, global size*/
 705 static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
 706                             int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
 707     int i,z,y,x;
 708     int out_i,in_i,out_z,in_z,out_x,in_x;
 709     int s_x,e_x;
 710
 711     for (z=startz; z<endz+1; z++)
 712     {
 713         if (z==startz)
 714         {
 715             s_x=startx;
 716         }
 717         else
 718         {
 719             s_x=0;
 720         }
 721         if (z==endz)
 722         {
 723             e_x=endx;
 724         }
 725         else
 726         {
 727             e_x=pN;
 728         }
 729         out_z  = z*MG*pN;
 730         in_z = z*maxM*maxN;
 731
 732         for (i=0; i<P; i++) /*index cube along long axis*/
 733         {
 734             out_i  = out_z  + oM[i];
 735             in_i = in_z + i*maxM*maxN*maxK;
 736             for (x=s_x;x<e_x;x++)
 737             {
 738                 out_x  = out_i  + x*MG;
 739                 in_x = in_i + x;
 740                 for (y=0;y<M[i];y++)
 741                 {
 742                     lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
 743                 }
 744             }
 745         }
 746     }
 747 }
 748
 749
 750 static void rotate_offsets(int x[]) {
 751     int t=x[0];
 752 /*    x[0]=x[2];
 753     x[2]=x[1];
 754     x[1]=t;*/
 755     x[0]=x[1];
 756     x[1]=x[2];
 757     x[2]=t;
 758 }
 759
 760 /*compute the offset to compare or print transposed local data in original input coordinates
 761   xs matrix dimension size, xl dimension length, xc decomposition offset
 762   s: step in computation = number of transposes*/
 763 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
 764 /*    int direction = plan->direction;
 765     int fftorder = plan->fftorder;*/
 766
 767     int o;
 768     int pos[3],i;
 769     int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
 770         *C=plan->C, *rC=plan->rC;
 771
 772     NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
 773
 774     if (!(plan->flags&FFT5D_ORDER_YZ)) {
 775         switch (s) {
 776         case 0: o=XYZ; break;
 777         case 1: o=ZYX; break;
 778         case 2: o=YZX; break;
 779         default: assert(0);
 780         }
 781     } else {
 782         switch (s) {
 783         case 0: o=XYZ; break;
 784         case 1: o=YXZ; break;
 785         case 2: o=ZXY; break;
 786         default: assert(0);
 787         }
 788     }
 789
 790     switch (o) {
 791         case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
 792         case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
 793         case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
 794         case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
 795         case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
 796         case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
 797     }
 798     /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 799
 800     /*xs, xl give dimension size and data length in local transposed coordinate system
 801       for 0(/1/2): x(/y/z) in original coordinate system*/
 802     for (i=0;i<3;i++) {
 803         switch (pos[i]) {
 804         case 1: xs[i]=1;         xc[i]=0;     xl[i]=C[s];break;
 805         case 2: xs[i]=C[s];      xc[i]=oM[s]; xl[i]=pM[s];break;
 806         case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
 807         }
 808     }
 809     /*input order is different for test program to match FFTW order
 810       (important for complex to real)*/
 811     if (plan->flags&FFT5D_BACKWARD) {
 812         rotate_offsets(xs);
 813         rotate_offsets(xl);
 814         rotate_offsets(xc);
 815         rotate_offsets(NG);
 816         if (plan->flags&FFT5D_ORDER_YZ) {
 817             rotate_offsets(xs);
 818             rotate_offsets(xl);
 819             rotate_offsets(xc);
 820             rotate_offsets(NG);
 821         }
 822     }
 823     if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
 824         xl[0] = rC[s];
 825     }
 826 }
 827
 828 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
 829     int x,y,z,l,t;
 830     int *coor = plan->coor;
 831     int xs[3],xl[3],xc[3],NG[3];
 832     int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
 833     compute_offsets(plan,xs,xl,xc,NG,s);
 834     fprintf(debug,txt,coor[0],coor[1],s);
 835     /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 836     for(z=0;z<xl[2];z++) {
 837         for(y=0;y<xl[1];y++) {
 838             fprintf(debug,"%d %d: ",coor[0],coor[1]);
 839             for (x=0;x<xl[0];x++) {
 840                 for (l=0;l<ll;l++) {
 841                     fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 842                 }
 843                 fprintf(debug,",");
 844             }
 845             fprintf(debug,"\n");
 846         }
 847     }
 848 }
 849
 850 void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
 851     t_complex *lin = plan->lin;
 852     t_complex *lout = plan->lout;
 853     t_complex *lout2 = plan->lout2;
 854     t_complex *lout3 = plan->lout3;
 855     t_complex *fftout,*joinin;
 856
 857     gmx_fft_t **p1d=plan->p1d;
 858 #ifdef FFT5D_MPI_TRANSPOSE
 859     FFTW(plan) *mpip=plan->mpip;
 860 #endif
 861 #ifdef GMX_MPI
 862     MPI_Comm *cart=plan->cart;
 863 #endif
 864
 865     double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
 866     int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
 867         *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
 868     int s=0,tstart,tend,bParallelDim;
 869
 870
 871 #ifdef GMX_FFT_FFTW3
 872     if (plan->p3d)
 873     {
 874         if (thread == 0)
 875         {
 876 #ifdef NOGMX
 877             if (times!=0)
 878             {
 879                 time=MPI_Wtime();
 880             }
 881 #endif
 882             FFTW(execute)(plan->p3d);
 883 #ifdef NOGMX
 884             if (times!=0)
 885             {
 886                 times->fft+=MPI_Wtime()-time;
 887             }
 888 #endif
 889         }
 890         return;
 891     }
 892 #endif
 893
 894         s=0;
 895
 896     /*lin: x,y,z*/
 897         if (plan->flags&FFT5D_DEBUG && thread == 0)
 898         {
 899             print_localdata(lin, "%d %d: copy in lin\n", s, plan);
 900         }
 901
 902         for (s=0;s<2;s++) {  /*loop over first two FFT steps (corner rotations)*/
 903
 904 #ifdef GMX_MPI
 905         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s]!=MPI_COMM_NULL && P[s]>1)
 906         {
 907             bParallelDim = 1;
 908         }
 909         else
 910 #endif
 911         {
 912             bParallelDim = 0;
 913         }
 914
 915         /* ---------- START FFT ------------ */
 916 #ifdef NOGMX
 917         if (times!=0 && thread == 0)
 918         {
 919             time=MPI_Wtime();
 920         }
 921 #endif
 922
 923         if (bParallelDim || plan->nthreads == 1) {
 924             fftout = lout;
 925         }
 926         else
 927         {
 928             if (s==0)
 929             {
 930                 fftout = lout3;
 931             } else
 932             {
 933                 fftout = lout2;
 934             }
 935         }
 936
 937         tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
 938         if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
 939         {
 940             gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
 941         } else
 942         {
 943             gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,fftout+tstart);
 944
 945         }
 946
 947 #ifdef NOGMX
 948         if (times != NULL && thread == 0)
 949         {
 950             time_fft+=MPI_Wtime()-time;
 951         }
 952 #endif
 953         if (plan->flags&FFT5D_DEBUG && thread == 0)
 954         {
 955             print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 956         }
 957         /* ---------- END FFT ------------ */
 958
 959         /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
 960         if (bParallelDim) {
 961 #ifdef NOGMX
 962             if (times != NULL && thread == 0)
 963             {
 964                 time=MPI_Wtime();
 965             }
 966 #endif
 967             /*prepare for A
 968 llToAll
 969               1. (most outer) axes (x) is split into P[s] parts of size N[s]
 970               for sending*/
 971             if (pM[s]>0)
 972             {
 973                 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
 974                 tstart/=C[s];
 975                 splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
 976             }
 977 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
 978 #ifdef NOGMX
 979             if (times != NULL && thread == 0)
 980             {
 981                 time_local+=MPI_Wtime()-time;
 982             }
 983 #endif
 984
 985         /* ---------- END SPLIT , START TRANSPOSE------------ */
 986
 987             if (thread == 0)
 988             {
 989 #ifdef NOGMX
 990                 if (times!=0)
 991                 {
 992                     time=MPI_Wtime();
 993                 }
 994 #else
 995                 wallcycle_start(times,ewcPME_FFTCOMM);
 996 #endif
 997 #ifdef FFT5D_MPI_TRANSPOSE
 998                 FFTW(execute)(mpip[s]);
 999 #else
1000 #ifdef GMX_MPI
1001                 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
1002                     MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
1003                 else
1004                     MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
1005 #else
1006                 gmx_incons("fft5d MPI call without MPI configuration");
1007 #endif /*GMX_MPI*/
1008 #endif /*FFT5D_MPI_TRANSPOSE*/
1009 #ifdef NOGMX
1010                 if (times!=0)
1011                 {
1012                     time_mpi[s]=MPI_Wtime()-time;
1013                 }
1014 #else
1015                 wallcycle_stop(times,ewcPME_FFTCOMM);
1016 #endif
1017             }  /*master*/
1018         }  /* bPrallelDim */
1019 #pragma omp barrier  /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1020
1021         /* ---------- END SPLIT + TRANSPOSE------------ */
1022
1023         /* ---------- START JOIN ------------ */
1024 #ifdef NOGMX
1025         if (times != NULL && thread == 0)
1026         {
1027             time=MPI_Wtime();
1028         }
1029 #endif
1030
1031         if (bParallelDim) {
1032             joinin = lout3;
1033         } else {
1034             joinin = fftout;
1035         }
1036         /*bring back in matrix form
1037           thus make  new 1. axes contiguos
1038           also local transpose 1 and 2/3
1039           runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1040         */
1041         if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
1042             if (pM[s]>0)
1043             {
1044                 tstart = ( thread   *pM[s]*pN[s]/plan->nthreads);
1045                 tend   = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1046                 joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
1047             }
1048         }
1049         else {
1050             if (pN[s]>0)
1051             {
1052                 tstart = ( thread   *pK[s]*pN[s]/plan->nthreads);
1053                 tend   = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1054                 joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
1055             }
1056         }
1057
1058 #ifdef NOGMX
1059         if (times != NULL && thread == 0)
1060         {
1061             time_local+=MPI_Wtime()-time;
1062         }
1063 #endif
1064         if (plan->flags&FFT5D_DEBUG && thread == 0)
1065         {
1066             print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1067         }
1068         /* ---------- END JOIN ------------ */
1069
1070         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1071     }  /* for(s=0;s<2;s++) */
1072 #ifdef NOGMX
1073         if (times != NULL && thread == 0)
1074         {
1075             time=MPI_Wtime();
1076         }
1077 #endif
1078
1079     if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
1080
1081     /*  ----------- FFT ----------- */
1082     tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1083     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
1084         gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
1085     } else {
1086         gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,lout+tstart);
1087     }
1088     /* ------------ END FFT ---------*/
1089
1090 #ifdef NOGMX
1091     if (times != NULL && thread == 0)
1092     {
1093         time_fft+=MPI_Wtime()-time;
1094
1095         times->fft+=time_fft;
1096         times->local+=time_local;
1097         times->mpi2+=time_mpi[1];
1098         times->mpi1+=time_mpi[0];
1099     }
1100 #endif
1101
1102     if (plan->flags&FFT5D_DEBUG && thread == 0)
1103     {
1104         print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1105     }
1106 }
1107
1108 void fft5d_destroy(fft5d_plan plan) {
1109     int s,t;
1110
1111     /* Note that we expect plan->lin and plan->lout to be freed elsewhere */
1112     if (plan->nthreads > 1)
1113     {
1114         free(plan->lout2);
1115         free(plan->lout3);
1116     }
1117
1118     for (s=0;s<3;s++)
1119     {
1120         if (plan->p1d[s])
1121         {
1122             for (t=0;t<plan->nthreads;t++)
1123             {
1124                 gmx_many_fft_destroy(plan->p1d[s][t]);
1125             }
1126             free(plan->p1d[s]);
1127         }
1128         if (plan->iNin[s])
1129         {
1130             free(plan->iNin[s]);
1131             plan->iNin[s]=0;
1132         }
1133         if (plan->oNin[s])
1134         {
1135             free(plan->oNin[s]);
1136             plan->oNin[s]=0;
1137         }
1138         if (plan->iNout[s])
1139         {
1140             free(plan->iNout[s]);
1141             plan->iNout[s]=0;
1142         }
1143         if (plan->oNout[s])
1144         {
1145             free(plan->oNout[s]);
1146             plan->oNout[s]=0;
1147         }
1148     }
1149 #ifdef GMX_FFT_FFTW3
1150     FFTW_LOCK;
1151 #ifdef FFT5D_MPI_TRANSPOS
1152     for (s=0;s<2;s++)
1153     {
1154         FFTW(destroy_plan)(plan->mpip[s]);
1155     }
1156     if (plan->p3d)
1157     {
1158         FFTW(destroy_plan)(plan->p3d);
1159     }
1160 #endif /* FFT5D_MPI_TRANSPOS */
1161 #endif /* GMX_FFT_FFTW3 */
1162
1163     if (!(plan->flags&FFT5D_NOMALLOC))
1164     {
1165         sfree_aligned(plan->lin);
1166         sfree_aligned(plan->lout);
1167         sfree_aligned(plan->lout2);
1168         sfree_aligned(plan->lout3);
1169     }
1170
1171 #ifdef FFT5D_THREADS
1172 #ifdef FFT5D_FFTW_THREADS
1173     /*FFTW(cleanup_threads)();*/
1174 #endif
1175 #endif
1176
1177     free(plan);
1178 }
1179
1180 /*Is this better than direct access of plan? enough data?
1181   here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1182 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
1183     *N1=plan->N[0];
1184     *M0=plan->M[0];
1185     *K1=plan->K[0];
1186     *K0=plan->N[1];
1187
1188     *coor=plan->coor;
1189 }
1190
1191
1192 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1193   of processor dimensions*/
1194 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
1195     MPI_Comm cart[2]={0};
1196 #ifdef GMX_MPI
1197     int size=1,prank=0;
1198     int P[2];
1199     int coor[2];
1200     int wrap[]={0,0};
1201     MPI_Comm gcart;
1202     int rdim1[] = {0,1}, rdim2[] = {1,0};
1203
1204     MPI_Comm_size(comm,&size);
1205     MPI_Comm_rank(comm,&prank);
1206
1207     if (P0==0) P0 = lfactor(size);
1208     if (size%P0!=0)
1209     {
1210         if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1211         P0 = lfactor(size);
1212     }
1213
1214     P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1215
1216     /*Difference between x-y-z regarding 2d decomposition is whether they are
1217       distributed along axis 1, 2 or both*/
1218
1219     MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1220     MPI_Cart_get(gcart,2,P,wrap,coor);
1221     MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1222     MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1223 #endif
1224     return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
1225 }
1226
1227
1228
1229 /*prints in original coordinate system of data (as the input to FFT)*/
1230 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1231     int xs[3],xl[3],xc[3],NG[3];
1232     int x,y,z,l;
1233     int *coor = plan->coor;
1234     int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1235     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1236     {
1237         ll=1;
1238     }
1239
1240     compute_offsets(plan,xs,xl,xc,NG,2);
1241     if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1242     for (z=0;z<xl[2];z++) {
1243         for(y=0;y<xl[1];y++) {
1244             if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1245             for (x=0;x<xl[0];x++) {
1246                 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1247                     real a,b;
1248                     a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1249                     if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1250                     if (!bothLocal)
1251                         b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1252                     else
1253                         b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1254                     if (plan->flags&FFT5D_DEBUG) {
1255                         printf("%f %f, ",a,b);
1256                     } else {
1257                         if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1258                             printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1259                         }
1260 /*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1261                     }
1262                 }
1263                 if (plan->flags&FFT5D_DEBUG) printf(",");
1264             }
1265             if (plan->flags&FFT5D_DEBUG) printf("\n");
1266         }
1267     }
1268
1269 }
1270