1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
9 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11 * Copyright (c) 2001-2012, The GROMACS development team,
12 * check out http://www.gromacs.org for more information.
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * If you want to redistribute modifications, please consider that
20 * scientific software is very special. Version control is crucial -
21 * bugs must be traceable. We will be happy to consider code for
22 * inclusion in the official distribution, but derived work must not
23 * be called official GROMACS. Details are found in the README & COPYING
24 * files - if they are missing, get the official version at www.gromacs.org.
26 * To help us fund GROMACS development, we humbly ask that you cite
27 * the papers on the package - you can find them in the top README file.
29 * For more info, check our website at http://www.gromacs.org
32 * Groningen Machine for Chemical Simulation
/* Preprocessor setup: parallel-environment probe, epsilon fallbacks, and the
 * mutex used to serialize non-thread-safe FFTW3 calls.
 * NOTE(review): this listing omits interior lines (the embedded original line
 * numbers jump), so the #if/#else/#endif fragments below belong to
 * conditional blocks whose opening directives are not visible here. */
44 #define GMX_PARALLEL_ENV_INITIALIZED 1
47 #define GMX_PARALLEL_ENV_INITIALIZED gmx_parallel_env_initialized()
62 /* requires fftw compiled with openmp */
63 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
/* Some compilers do not provide the double-underscore epsilon macros;
   fall back to the standard <float.h> names. */
71 #ifndef __FLT_EPSILON__
72 #define __FLT_EPSILON__ FLT_EPSILON
73 #define __DBL_EPSILON__ DBL_EPSILON
80 #include "gmx_fatal.h"
/* All FFTW planner/destroy calls in this file must be wrapped in
   FFTW_LOCK/FFTW_UNLOCK; only FFTW(execute) may run concurrently. */
85 /* none of the fftw3 calls, except execute(), are thread-safe, so
86 we need to serialize them with this mutex. */
87 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
89 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex)
90 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex)
91 #else /* GMX_THREAD_MPI */
94 #endif /* GMX_THREAD_MPI */
95 #endif /* GMX_FFT_FFTW3 */
/* Presumably returns the larger of a and b (a local double-valued max,
   avoiding a dependence on C99 fmax) -- body not visible in this excerpt;
   TODO confirm against the full source. */
97 static double fft5d_fmax(double a, double b){
101 /* largest factor smaller than sqrt */
/* Returns the first divisor i of z found by the (omitted) loop; per the
   comment above, the scan starts near sqrt(z), so this is the largest
   factor of z not exceeding sqrt(z). Loop header not visible here. */
102 static int lfactor(int z) {
105 if (z%i==0) return i;
/* Returns a divisor of z found by a scan whose bounds are omitted from this
   excerpt; by analogy with lfactor() presumably the largest factor within
   some limit -- TODO confirm against the full source. */
110 static int l2factor(int z) {
114 if (z%i==0) return i;
118 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
119 static int lpfactor(int z) {
/* Splits z into a factor f and its cofactor z/f (computation of f omitted
   from this excerpt) and keeps the larger prime factor of the two parts. */
122 return fft5d_fmax(lpfactor(f),lpfactor(z/f));
126 #ifdef HAVE_GETTIMEOFDAY
127 #include <sys/time.h>
/* Wall-clock time in seconds as a double, built from a gettimeofday()
   timeval; the enclosing function's signature and the gettimeofday() call
   itself are omitted from this excerpt. */
131 return tv.tv_sec+tv.tv_usec*1e-6;
/* Returns the maximum value in the integer array a of length s. */
140 static int vmax(int* a, int s) {
144 if (a[i]>max) max=a[i];
/* Allocate 'size' bytes, zero-initialized and aligned to a 32-byte boundary.
   The raw pointer is intentionally leaked (no way to recover it from the
   aligned pointer), so this must only be used for allocations that live for
   the whole run. */
150 copied here from fftgrid, because:
151 1. function there not publicly available
152 2. not sure whether we keep fftgrid
153 3. less dependencies for fft5d
155 Only used for non-fftw case
158 gmx_calloc_aligned(size_t size)
162 /*We initialize by zero for Valgrind
163 For non-divisible case we communicate more than the data.
164 If we don't initialize the data we communicate uninitialized data*/
165 p0 = calloc(size+32,1);
/* NOTE(review): the message below reads "Failed to allocated" -- grammar
   typo in a runtime string; fix upstream, not in this comments-only pass. */
169 gmx_fatal(FARGS,"Failed to allocated %u bytes of aligned memory.",size+32);
/* Round the raw pointer up to the next 32-byte boundary. */
172 p = (void *) (((size_t) p0 + 32) & (~((size_t) 31)));
174 /* Yeah, yeah, we cannot free this pointer, but who cares... */
/* Build an fft5d plan for a 3D FFT of global size NG x MG x KG, decomposed
 * over a 2D processor grid described by comm[2]. Computes the per-step
 * local/transposed sizes and offsets, allocates the four work buffers
 * (returned via rlin/rlout/rlout2/rlout3 unless FFT5D_NOMALLOC), creates
 * either a single FFTW 3D plan (serial, out-of-place case) or per-step,
 * per-thread 1D plans, and optionally FFTW-MPI transpose plans.
 * NOTE(review): this listing omits interior lines (embedded numbering
 * jumps), so several braces/branches are fragments of omitted constructs. */
179 /* NxMxK the size of the data
180 * comm communicator to use for fft5d
181 * P0 number of processor in 1st axes (can be null for automatic)
182 * lin is allocated by fft5d because size of array is only known after planning phase
183 * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
185 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
188 int P[2],bMaster,prank[2],i,t;
190 int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
191 int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
192 int C[3],rC[3],nP[2];
194 t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
/* --- Query the processor grid: size/rank in each of the two
       decomposition dimensions (defaults apply when not parallel). --- */
198 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
200 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
202 MPI_Comm_size(comm[0],&P[0]);
203 MPI_Comm_rank(comm[0],&prank[0]);
212 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
214 MPI_Comm_size(comm[1],&P[1]);
215 MPI_Comm_rank(comm[1],&prank[1]);
224 bMaster=(prank[0]==0&&prank[1]==0);
229 fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
230 P[0],P[1],prank[0],prank[1]);
235 fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
236 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
237 /* The check below is not correct, one prime factor 11 or 13 is ok.
238 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
239 printf("WARNING: FFT very slow with prime factors larger 7\n");
240 printf("Change FFT size or in case you cannot change it look at\n");
241 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
246 if (NG==0 || MG==0 || KG==0) {
247 if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
/* Keep the real (untrimmed) global sizes; for r2c the complex data in the
   transformed dimension shrinks to n/2+1. */
251 rNG=NG;rMG=MG;rKG=KG;
253 if (flags&FFT5D_REALCOMPLEX) {
254 if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
256 if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
/* --- Per-rank slab sizes and offsets along each dimension, for both
       processor-grid directions (suffix 0/1). Needed globally because the
       transposes must know every rank's share, not just our own. --- */
262 /*for transpose we need to know the size for each processor not only our own size*/
264 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
265 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
266 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
267 oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
268 oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
269 oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
/* Offsets: rank i starts at i*ceil(global/P); the last rank may get less. */
275 oN0[i]=i*ceil((double)NG/P[0]);
276 oM0[i]=i*ceil((double)MG/P[0]);
277 oK0[i]=i*ceil((double)KG/P[0]);
287 oN1[i]=i*ceil((double)NG/P[1]);
288 oM1[i]=i*ceil((double)MG/P[1]);
289 oK1[i]=i*ceil((double)KG/P[1]);
/* Per-rank lengths derived from consecutive offsets. */
296 for (i=0;i<P[0]-1;i++)
298 N0[i]=oN0[i+1]-oN0[i];
299 M0[i]=oM0[i+1]-oM0[i];
300 K0[i]=oK0[i+1]-oK0[i];
302 N0[P[0]-1]=NG-oN0[P[0]-1];
303 M0[P[0]-1]=MG-oM0[P[0]-1];
304 K0[P[0]-1]=KG-oK0[P[0]-1];
305 for (i=0;i<P[1]-1;i++)
307 N1[i]=oN1[i+1]-oN1[i];
308 M1[i]=oM1[i+1]-oM1[i];
309 K1[i]=oK1[i+1]-oK1[i];
311 N1[P[1]-1]=NG-oN1[P[1]-1];
312 M1[P[1]-1]=MG-oM1[P[1]-1];
313 K1[P[1]-1]=KG-oK1[P[1]-1];
/* --- Local sizes per FFT step s=0..2 in the rotated coordinate system;
       which arrays feed each step depends on FFT5D_ORDER_YZ. --- */
315 /* for step 1-3 the local N,M,K sizes of the transposed system
316 C: contiguous dimension, and nP: number of processor in subcommunicator
320 pM[0] = M0[prank[0]];
321 oM[0] = oM0[prank[0]];
322 pK[0] = K1[prank[1]];
323 oK[0] = oK1[prank[1]];
326 if (!(flags&FFT5D_ORDER_YZ)) {
327 N[0] = vmax(N1,P[1]);
329 K[0] = vmax(K1,P[1]);
330 pN[0] = N1[prank[1]];
336 N[1] = vmax(K0,P[0]);
337 pN[1] = K0[prank[0]];
342 M[1] = vmax(M0,P[0]);
343 pM[1] = M0[prank[0]];
344 oM[1] = oM0[prank[0]];
346 pK[1] = N1[prank[1]];
347 oK[1] = oN1[prank[1]];
353 M[2] = vmax(K0,P[0]);
354 pM[2] = K0[prank[0]];
355 oM[2] = oK0[prank[0]];
356 K[2] = vmax(N1,P[1]);
357 pK[2] = N1[prank[1]];
358 oK[2] = oN1[prank[1]];
359 free(N0); free(oN0); /*these are not used for this order*/
360 free(M1); free(oM1); /*the rest is freed in destroy*/
362 N[0] = vmax(N0,P[0]);
363 M[0] = vmax(M0,P[0]);
365 pN[0] = N0[prank[0]];
371 N[1] = vmax(M1,P[1]);
372 pN[1] = M1[prank[1]];
378 pM[1] = N0[prank[0]];
379 oM[1] = oN0[prank[0]];
380 K[1] = vmax(K1,P[1]);
381 pK[1] = K1[prank[1]];
382 oK[1] = oK1[prank[1]];
388 M[2] = vmax(N0,P[0]);
389 pM[2] = N0[prank[0]];
390 oM[2] = oN0[prank[0]];
391 K[2] = vmax(M1,P[1]);
392 pK[2] = M1[prank[1]];
393 oK[2] = oM1[prank[1]];
394 free(N1); free(oN1); /*these are not used for this order*/
395 free(K0); free(oK0); /*the rest is freed in destroy*/
399 Difference between x-y-z regarding 2d decomposition is whether they are
400 distributed along axis 1, 2 or both
/* --- Work-buffer size: large enough for the biggest intermediate of any
       step (padded to maximum per-rank slabs so AllToAll chunks match). --- */
403 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
404 lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
405 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
406 if (!(flags&FFT5D_NOMALLOC)) {
407 lin = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
408 lout = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
409 lout2 = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
410 lout3 = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
418 plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
423 fprintf(debug, "Running on %d threads\n",nthreads);
/* --- Serial, out-of-place, single-threaded case: try one FFTW 3D plan
       via the guru interface instead of three 1D passes. --- */
426 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
427 /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
428 * within a parallel region. For now deactivated. If it should be supported it has to made sure that
429 * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
430 * and that the 3d plan is faster than the 1d plan.
432 if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) { /*don't do 3d plan in parallel or if in_place requested */
433 int fftwflags=FFTW_DESTROY_INPUT;
435 int inNG=NG,outMG=MG,outKG=KG;
438 if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
439 if (flags&FFT5D_REALCOMPLEX) {
440 if (!(flags&FFT5D_BACKWARD)) { /*input pointer is not complex*/
442 } else { /*output pointer is not complex*/
443 if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
/* Guru-interface strides: input/output element strides (is/os) encode the
   storage order of the three dimensions for each direction/order variant. */
448 if (!(flags&FFT5D_BACKWARD)) {
453 dims[0].is = inNG*MG; /*N M K*/
456 if (!(flags&FFT5D_ORDER_YZ)) {
457 dims[0].os = MG; /*M K N*/
461 dims[0].os = 1; /*K N M*/
466 if (!(flags&FFT5D_ORDER_YZ)) {
475 dims[0].os = outMG*KG;
487 dims[0].os = outKG*NG;
493 #ifdef FFT5D_FFTW_THREADS
494 FFTW(plan_with_nthreads)(nthreads);
497 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
498 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
499 /*howmany*/ 0, /*howmany_dims*/0 ,
500 (real*)lin, (FFTW(complex) *)lout,
501 /*flags*/ fftwflags);
502 } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
503 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
504 /*howmany*/ 0, /*howmany_dims*/0 ,
505 (FFTW(complex) *)lin, (real*)lout,
506 /*flags*/ fftwflags);
508 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
509 /*howmany*/ 0, /*howmany_dims*/0 ,
510 (FFTW(complex) *)lin, (FFTW(complex) *)lout,
511 /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
514 #ifdef FFT5D_FFTW_THREADS
515 FFTW(plan_with_nthreads)(1);
/* --- Fallback / decomposed case: per-step, per-thread 1D many-FFT plans. --- */
520 if (!plan->p3d) { /* for decomposition and if 3d plan did not work */
521 #endif /* GMX_FFT_FFTW3 */
525 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
526 s,rC[s],M[s],pK[s],C[s],lsize);
528 plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
530 /* Make sure that the init routines are only called by one thread at a time and in order
531 (later is only important to not confuse valgrind)
533 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
534 for(t=0; t<nthreads; t++)
/* Each thread transforms its contiguous share of the pM*pK lines. */
536 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
538 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
539 gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
541 gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
549 if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
550 plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
552 plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
/* Optional FFTW-MPI transpose plans instead of plain MPI_Alltoall. */
554 #ifdef FFT5D_MPI_TRANSPOSE
557 if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
558 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
560 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
/* --- Fill the plan struct with everything computed above. --- */
571 plan->NG=NG;plan->MG=MG;plan->KG=KG;
574 plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
575 plan->oM[s]=oM[s];plan->oK[s]=oK[s];
576 plan->C[s]=C[s];plan->rC[s]=rC[s];
577 plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
580 plan->P[s]=nP[s];plan->coor[s]=prank[s];
583 /* plan->fftorder=fftorder;
584 plan->direction=direction;
585 plan->realcomplex=realcomplex;
588 plan->nthreads=nthreads;
/* Reorder the local block from contiguous layout into P equal-sized chunks
   (one per destination rank, each padded to maxN*maxM*maxK) in preparation
   for MPI_Alltoall. The [starty..endy]/[startz..endz] window bounds the
   slice handled by the calling thread. */
608 /*here x,y,z and N,M,K is in rotated coordinate system!!
609 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
610 maxN,maxM,maxK is max size of local data
611 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
612 NG, MG, KG is size of global data*/
613 static void splitaxes(t_complex* lout,const t_complex* lin,
614 int maxN,int maxM,int maxK, int pN, int pM, int pK,
615 int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
618 int in_i,out_i,in_z,out_z,in_y,out_y;
621 for (z=startz; z<endz+1; z++) /*3. z l*/
636 for (i=0; i<P; i++) /*index cube along long axis*/
638 out_i = out_z + i*maxN*maxM*maxK;
640 for (y=s_y;y<e_y;y++) { /*2. y k*/
641 out_y = out_i + y*maxN;
643 for (x=0;x<N[i];x++) { /*1. x j*/
644 lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
645 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
646 /*before split data contiguous - thus if different processor get different amount oN is different*/
/* Inverse of splitaxes after the AllToAll: gather the P per-rank chunks
   back into one contiguous array while transposing the first and third
   local dimensions. The [startx..endx]/[starty..endy] window bounds the
   slice handled by the calling thread. */
653 /*make axis contiguous again (after AllToAll) and also do local transpose*/
654 /*transpose mayor and major dimension
656 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
657 N,M,K local dimensions
659 static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
660 int maxN,int maxM,int maxK,int pN, int pM, int pK,
661 int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
664 int out_i,in_i,out_x,in_x,out_z,in_z;
667 for (x=startx;x<endx+1;x++) /*1.j*/
689 for (i=0;i<P;i++) /*index cube along long axis*/
691 out_i = out_x + oK[i];
692 in_i = in_x + i*maxM*maxN*maxK;
693 for (z=0;z<K[i];z++) /*3.l*/
696 in_z = in_i + z*maxM*maxN;
697 for (y=s_y;y<e_y;y++) /*2.k*/
699 lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
/* Inverse of splitaxes after the AllToAll, transposing the first and second
   local dimensions instead (see joinAxesTrans13 for the 1<->3 variant).
   The [startx..endx]/[startz..endz] window bounds the slice handled by the
   calling thread. */
706 /*make axis contiguous again (after AllToAll) and also do local transpose
707 transpose mayor and middle dimension
709 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
712 static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
713 int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
715 int out_i,in_i,out_z,in_z,out_x,in_x;
718 for (z=startz; z<endz+1; z++)
739 for (i=0; i<P; i++) /*index cube along long axis*/
741 out_i = out_z + oM[i];
742 in_i = in_z + i*maxM*maxN*maxK;
743 for (x=s_x;x<e_x;x++)
745 out_x = out_i + x*MG;
749 lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
/* Presumably rotates the entries of the 3-element array x cyclically (used
   when stepping through the transposed coordinate systems) -- body not
   visible in this excerpt; TODO confirm against the full source. */
757 static void rotate(int x[]) {
/* Map the local, transposed data layout at step s back to original input
   coordinates: for each original axis i, write its stride (xs), local length
   (xl) and decomposition offset (xc); NG receives the three global sizes.
   Used only by the debug print/compare helpers. */
767 /*compute the offset to compare or print transposed local data in original input coordinates
768 xs matrix dimension size, xl dimension length, xc decomposition offset
769 s: step in computation = number of transposes*/
770 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
771 /* int direction = plan->direction;
772 int fftorder = plan->fftorder;*/
776 int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
777 *C=plan->C, *rC=plan->rC;
779 NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
/* Which axis permutation is in effect after s transposes depends on the
   transpose order (FFT5D_ORDER_YZ or not). */
781 if (!(plan->flags&FFT5D_ORDER_YZ)) {
783 case 0: o=XYZ; break;
784 case 1: o=ZYX; break;
785 case 2: o=YZX; break;
790 case 0: o=XYZ; break;
791 case 1: o=YXZ; break;
792 case 2: o=ZXY; break;
/* pos[i]: position (1=contiguous, 2=middle, 3=outer) of original axis i in
   the current permuted layout. */
798 case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
799 case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
800 case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
801 case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
802 case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
803 case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
805 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
807 /*xs, xl give dimension size and data length in local transposed coordinate system
808 for 0(/1/2): x(/y/z) in original coordinate system*/
811 case 1: xs[i]=1; xc[i]=0; xl[i]=C[s];break;
812 case 2: xs[i]=C[s]; xc[i]=oM[s]; xl[i]=pM[s];break;
813 case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
816 /*input order is different for test program to match FFTW order
817 (important for complex to real)*/
818 if (plan->flags&FFT5D_BACKWARD) {
823 if (plan->flags&FFT5D_ORDER_YZ) {
/* r2c/c2r: the transformed dimension holds rC (real) rather than C
   (complex) valid entries at the relevant step. */
830 if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
/* Debug helper: print the local data at step s to the debug log, addressed
   in original input coordinates via compute_offsets(). 'txt' is a format
   string receiving the two processor-grid coordinates and s. */
835 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
837 int *coor = plan->coor;
838 int xs[3],xl[3],xc[3],NG[3];
/* ll: number of real values printed per element (1 for real data, 2 for
   complex). */
839 int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
840 compute_offsets(plan,xs,xl,xc,NG,s);
841 fprintf(debug,txt,coor[0],coor[1],s);
842 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
843 for(z=0;z<xl[2];z++) {
844 for(y=0;y<xl[1];y++) {
845 fprintf(debug,"%d %d: ",coor[0],coor[1]);
846 for (x=0;x<xl[0];x++) {
848 fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
/* Execute the planned 3D FFT for one thread: either the single FFTW 3D plan
 * (serial case), or three 1D FFT passes with split + AllToAll + join
 * (local transpose) between them. 'thread' selects this thread's share of
 * each pass; timing is accumulated into 'times' by thread 0 only. */
857 void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
858 t_complex *lin = plan->lin;
859 t_complex *lout = plan->lout;
860 t_complex *lout2 = plan->lout2;
861 t_complex *lout3 = plan->lout3;
862 t_complex *fftout,*joinin;
864 gmx_fft_t **p1d=plan->p1d;
865 #ifdef FFT5D_MPI_TRANSPOSE
866 FFTW(plan) *mpip=plan->mpip;
869 MPI_Comm *cart=plan->cart;
872 double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
873 int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
874 *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
875 int s=0,tstart,tend,bParallelDim;
/* Fast path: a 3D FFTW plan exists (serial, out-of-place case). */
889 FFTW(execute)(plan->p3d);
893 times->fft+=MPI_Wtime()-time;
904 if (plan->flags&FFT5D_DEBUG && thread == 0)
906 print_localdata(lin, "%d %d: copy in lin\n", s, plan);
909 for (s=0;s<2;s++) { /*loop over first two FFT steps (corner rotations)*/
912 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] !=0 && P[s]>1 )
922 /* ---------- START FFT ------------ */
924 if (times!=0 && thread == 0)
/* Each thread transforms its contiguous share of the pM*pK FFT lines. */
944 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
945 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
947 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
950 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,fftout+tstart);
955 if (times != NULL && thread == 0)
957 time_fft+=MPI_Wtime()-time;
960 if (plan->flags&FFT5D_DEBUG && thread == 0)
962 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
964 /* ---------- END FFT ------------ */
966 /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in this dimension)*/
969 if (times != NULL && thread == 0)
976 1. (most outer) axes (x) is split into P[s] parts of size N[s]
980 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
982 splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
984 #pragma omp barrier /*barrier required before AllToAll (all input has to be there) - before timing to make timing more accurate*/
986 if (times != NULL && thread == 0)
988 time_local+=MPI_Wtime()-time;
992 /* ---------- END SPLIT , START TRANSPOSE------------ */
1002 wallcycle_start(times,ewcPME_FFTCOMM);
/* Exchange the per-rank chunks: FFTW-MPI transpose plan if configured,
   otherwise plain MPI_Alltoall with the step-dependent chunk size. */
1004 #ifdef FFT5D_MPI_TRANSPOSE
1005 FFTW(execute)(mpip[s]);
1008 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
1009 MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
1011 MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
1013 gmx_incons("fft5d MPI call without MPI configuration");
1015 #endif /*FFT5D_MPI_TRANSPOSE*/
1019 time_mpi[s]=MPI_Wtime()-time;
1022 wallcycle_stop(times,ewcPME_FFTCOMM);
1026 #pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1028 /* ---------- END SPLIT + TRANSPOSE------------ */
1030 /* ---------- START JOIN ------------ */
1032 if (times != NULL && thread == 0)
1043 /*bring back in matrix form
1044 thus make new 1. axes contiguous
1045 also local transpose 1 and 2/3
1046 runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1048 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
1051 tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
1052 tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1053 joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
1059 tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
1060 tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1061 joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
1066 if (times != NULL && thread == 0)
1068 time_local+=MPI_Wtime()-time;
1071 if (plan->flags&FFT5D_DEBUG && thread == 0)
1073 print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1075 /* ---------- END JOIN ------------ */
1077 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1078 } /* for(s=0;s<2;s++) */
1080 if (times != NULL && thread == 0)
/* Third and final FFT pass (s==2), no transpose afterwards. */
1086 if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
1088 /* ----------- FFT ----------- */
1089 tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1090 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
1091 gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
1093 gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,lout+tstart);
1095 /* ------------ END FFT ---------*/
1098 if (times != NULL && thread == 0)
1100 time_fft+=MPI_Wtime()-time;
1102 times->fft+=time_fft;
1103 times->local+=time_local;
1104 times->mpi2+=time_mpi[1];
1105 times->mpi1+=time_mpi[0];
1109 if (plan->flags&FFT5D_DEBUG && thread == 0)
1111 print_localdata(lout, "%d %d: FFT %d\n", s, plan);
/* Release the resources owned by a plan: per-step/per-thread 1D FFT plans,
   the per-step index arrays, and (for FFTW builds) the MPI-transpose and 3D
   plans. The aligned lin/lout buffers are intentionally not freed (see
   gmx_calloc_aligned). */
1115 void fft5d_destroy(fft5d_plan plan) {
1121 for (t=0;t<plan->nthreads;t++)
1123 gmx_many_fft_destroy(plan->p1d[s][t]);
1129 free(plan->iNin[s]);
1134 free(plan->oNin[s]);
1139 free(plan->iNout[s]);
1144 free(plan->oNout[s]);
1148 #ifdef GMX_FFT_FFTW3
/* NOTE(review): "FFT5D_MPI_TRANSPOS" (here and at the matching #endif) does
   not match the "FFT5D_MPI_TRANSPOSE" macro used everywhere else in this
   file. If that is a typo, this branch never compiles in and the mpip[]
   transpose plans are leaked -- verify against the full source and build. */
1150 #ifdef FFT5D_MPI_TRANSPOS
1153 FFTW(destroy_plan)(plan->mpip[s]);
1157 FFTW(destroy_plan)(plan->p3d);
1159 #endif /* FFT5D_MPI_TRANSPOS */
1160 #endif /* GMX_FFT_FFTW3 */
1162 /*We can't free lin/lout here - is allocated by gmx_calloc_aligned which can't be freed*/
1165 #ifdef FFT5D_THREADS
1166 #ifdef FFT5D_FFTW_THREADS
1167 /*FFTW(cleanup_threads)();*/
/* Accessor exposing selected local decomposition sizes and the processor
   coordinates from the plan; body not visible in this excerpt. The 0/1
   suffixes refer to the processor-grid dimension the quantity is divided
   by, not the FFT step. */
1174 /*Is this better than direct access of plan? enough data?
1175 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1176 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
/* Convenience wrapper around fft5d_plan_3d(): builds a 2D cartesian
   processor grid of P0 x (size/P0) ranks from a single communicator
   (P0==0 picks a factorization automatically via lfactor) and creates the
   two row/column sub-communicators the planner needs. */
1186 /*same as fft5d_plan_3d but with cartesian communicator and automatic splitting
1187 of processor dimensions*/
1188 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
1189 MPI_Comm cart[2]={0};
/* rdim1/rdim2 select which cartesian dimension each sub-communicator keeps. */
1196 int rdim1[] = {0,1}, rdim2[] = {1,0};
1198 MPI_Comm_size(comm,&size);
1199 MPI_Comm_rank(comm,&prank);
1201 if (P0==0) P0 = lfactor(size);
1204 if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1208 P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1210 /*Difference between x-y-z regarding 2d decomposition is whether they are
1211 distributed along axis 1, 2 or both*/
1213 MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1214 MPI_Cart_get(gcart,2,P,wrap,coor);
1215 MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1216 MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1218 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
1223 /*prints in original coordinate system of data (as the input to FFT)*/
1224 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1225 int xs[3],xl[3],xc[3],NG[3];
1227 int *coor = plan->coor;
1228 int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1229 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1234 compute_offsets(plan,xs,xl,xc,NG,2);
1235 if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1236 for (z=0;z<xl[2];z++) {
1237 for(y=0;y<xl[1];y++) {
1238 if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1239 for (x=0;x<xl[0];x++) {
1240 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1242 a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1243 if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1245 b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1247 b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1248 if (plan->flags&FFT5D_DEBUG) {
1249 printf("%f %f, ",a,b);
1251 if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1252 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1254 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1257 if (plan->flags&FFT5D_DEBUG) printf(",");
1259 if (plan->flags&FFT5D_DEBUG) printf("\n");