src/mdlib/fft5d.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *                This source code is part of
   4  *
   5  *                 G   R   O   M   A   C   S
   6  *
   7  *          GROningen MAchine for Chemical Simulations
   8  *
   9  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  10  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  11  * Copyright (c) 2001-2012, The GROMACS development team,
  12  * check out http://www.gromacs.org for more information.
  13
  14  * This program is free software; you can redistribute it and/or
  15  * modify it under the terms of the GNU General Public License
  16  * as published by the Free Software Foundation; either version 2
  17  * of the License, or (at your option) any later version.
  18  *
  19  * If you want to redistribute modifications, please consider that
  20  * scientific software is very special. Version control is crucial -
  21  * bugs must be traceable. We will be happy to consider code for
  22  * inclusion in the official distribution, but derived work must not
  23  * be called official GROMACS. Details are found in the README & COPYING
  24  * files - if they are missing, get the official version at www.gromacs.org.
  25  *
  26  * To help us fund GROMACS development, we humbly ask that you cite
  27  * the papers on the package - you can find them in the top README file.
  28  *
  29  * For more info, check our website at http://www.gromacs.org
  30  *
  31  * And Hey:
  32  * Groningen Machine for Chemical Simulation
  33  */
  34 #ifdef HAVE_CONFIG_H
  35 #include <config.h>
  36 #endif
  37
  38
  39 #include <stdio.h>
  40 #include <stdlib.h>
  41 #include <string.h>
  42
  43 #ifdef NOGMX
  44 #define GMX_PARALLEL_ENV_INITIALIZED 1
  45 #else
  46 #ifdef GMX_MPI
  47 #define GMX_PARALLEL_ENV_INITIALIZED 1
  48 #else
  49 #define GMX_PARALLEL_ENV_INITIALIZED 0
  50 #endif
  51 #endif
  52
  53 #ifdef GMX_LIB_MPI
  54 #include <mpi.h>
  55 #endif
  56 #ifdef GMX_THREAD_MPI
  57 #include "tmpi.h"
  58 #endif
  59
  60 #ifdef GMX_OPENMP
  61 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
  62 #define FFT5D_THREADS
  63 #endif
  64 #ifdef FFT5D_THREADS
  65 #include "gmx_omp.h"
  66 /* requires fftw compiled with openmp */
  67 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
  68 #endif
  69
  70 #include "fft5d.h"
  71 #include <float.h>
  72 #include <math.h>
  73 #include <assert.h>
  74 #include "smalloc.h"
  75
  76 #ifndef __FLT_EPSILON__
  77 #define __FLT_EPSILON__ FLT_EPSILON
  78 #define __DBL_EPSILON__ DBL_EPSILON
  79 #endif
  80
  81 #ifdef NOGMX
  82 FILE* debug=0;
  83 #endif
  84
  85 #include "gmx_fatal.h"
  86
  87
  88 #ifdef GMX_FFT_FFTW3
  89 #ifdef GMX_THREAD_MPI
  90 /* none of the fftw3 calls, except execute(), are thread-safe, so
  91    we need to serialize them with this mutex. */
  92 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
  93
  94 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex)
  95 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex)
  96 #else /* GMX_THREAD_MPI */
  97 #define FFTW_LOCK
  98 #define FFTW_UNLOCK
  99 #endif /* GMX_THREAD_MPI */
 100 #endif /* GMX_FFT_FFTW3 */
 101
 102 static double fft5d_fmax(double a, double b){
 103         return (a>b)?a:b;
 104 }
 105
 106 /* largest factor smaller than sqrt */
 107 static int lfactor(int z) {
 108         int i;
 109         for (i=sqrt(z);;i--)
 110                 if (z%i==0) return i;
 111         return 1;
 112 }
 113
 114 /* largest factor */
 115 static int l2factor(int z) {
 116         int i;
 117         if (z==1) return 1;
 118         for (i=z/2;;i--)
 119                 if (z%i==0) return i;
 120         return 1;
 121 }
 122
 123 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
 124 static int lpfactor(int z) {
 125         int f = l2factor(z);
 126         if (f==1) return z;
 127         return fft5d_fmax(lpfactor(f),lpfactor(z/f));
 128 }
 129
 130 #ifndef GMX_MPI
 131 #ifdef HAVE_GETTIMEOFDAY
 132 #include <sys/time.h>
 133 double MPI_Wtime() {
 134     struct timeval tv;
 135     gettimeofday(&tv,0);
 136     return tv.tv_sec+tv.tv_usec*1e-6;
 137 }
 138 #else
 139 double MPI_Wtime() {
 140     return 0.0;
 141 }
 142 #endif
 143 #endif
 144
 145 static int vmax(int* a, int s) {
 146     int i,max=0;
 147     for (i=0;i<s;i++)
 148     {
 149         if (a[i]>max) max=a[i];
 150     }
 151     return max;
 152 }
 153
 154
 155 /* NxMxK the size of the data
 156  * comm communicator to use for fft5d
 157  * P0 number of processor in 1st axes (can be null for automatic)
 158  * lin is allocated by fft5d because size of array is only known after planning phase
 159  * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
 160 */
 161 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
 162 {
 163
 164     int P[2],bMaster,prank[2],i,t;
 165     int rNG,rMG,rKG;
 166     int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
 167     int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
 168     int C[3],rC[3],nP[2];
 169     int lsize;
 170     t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
 171     fft5d_plan plan;
 172     int s;
 173
 174     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 175 #ifdef GMX_MPI
 176     if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
 177     {
 178         MPI_Comm_size(comm[0],&P[0]);
 179         MPI_Comm_rank(comm[0],&prank[0]);
 180     }
 181     else
 182 #endif
 183     {
 184         P[0] = 1;
 185         prank[0] = 0;
 186     }
 187 #ifdef GMX_MPI
 188     if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
 189     {
 190         MPI_Comm_size(comm[1],&P[1]);
 191         MPI_Comm_rank(comm[1],&prank[1]);
 192     }
 193     else
 194 #endif
 195     {
 196         P[1] = 1;
 197         prank[1] = 0;
 198     }
 199
 200     bMaster=(prank[0]==0&&prank[1]==0);
 201
 202
 203     if (debug)
 204     {
 205         fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
 206                 P[0],P[1],prank[0],prank[1]);
 207     }
 208
 209     if (bMaster) {
 210         if (debug)
 211             fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 212                 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
 213         /* The check below is not correct, one prime factor 11 or 13 is ok.
 214         if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 215             printf("WARNING: FFT very slow with prime factors larger 7\n");
 216             printf("Change FFT size or in case you cannot change it look at\n");
 217             printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 218         }
 219         */
 220     }
 221
 222     if (NG==0 || MG==0 || KG==0) {
 223         if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 224         return 0;
 225     }
 226
 227     rNG=NG;rMG=MG;rKG=KG;
 228
 229     if (flags&FFT5D_REALCOMPLEX) {
 230         if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
 231         else {
 232             if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
 233             else KG=KG/2+1;
 234         }
 235     }
 236
 237
 238     /*for transpose we need to know the size for each processor not only our own size*/
 239
 240     N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
 241     M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
 242     K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
 243     oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
 244     oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
 245     oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
 246
 247     for (i=0;i<P[0];i++)
 248     {
 249         #define EVENDIST
 250         #ifndef EVENDIST
 251         oN0[i]=i*ceil((double)NG/P[0]);
 252         oM0[i]=i*ceil((double)MG/P[0]);
 253         oK0[i]=i*ceil((double)KG/P[0]);
 254         #else
 255         oN0[i]=(NG*i)/P[0];
 256         oM0[i]=(MG*i)/P[0];
 257         oK0[i]=(KG*i)/P[0];
 258         #endif
 259     }
 260     for (i=0;i<P[1];i++)
 261     {
 262         #ifndef EVENDIST
 263         oN1[i]=i*ceil((double)NG/P[1]);
 264         oM1[i]=i*ceil((double)MG/P[1]);
 265         oK1[i]=i*ceil((double)KG/P[1]);
 266         #else
 267         oN1[i]=(NG*i)/P[1];
 268         oM1[i]=(MG*i)/P[1];
 269         oK1[i]=(KG*i)/P[1];
 270         #endif
 271     }
 272     for (i=0;i<P[0]-1;i++)
 273     {
 274         N0[i]=oN0[i+1]-oN0[i];
 275         M0[i]=oM0[i+1]-oM0[i];
 276         K0[i]=oK0[i+1]-oK0[i];
 277     }
 278     N0[P[0]-1]=NG-oN0[P[0]-1];
 279     M0[P[0]-1]=MG-oM0[P[0]-1];
 280     K0[P[0]-1]=KG-oK0[P[0]-1];
 281     for (i=0;i<P[1]-1;i++)
 282     {
 283         N1[i]=oN1[i+1]-oN1[i];
 284         M1[i]=oM1[i+1]-oM1[i];
 285         K1[i]=oK1[i+1]-oK1[i];
 286     }
 287     N1[P[1]-1]=NG-oN1[P[1]-1];
 288     M1[P[1]-1]=MG-oM1[P[1]-1];
 289     K1[P[1]-1]=KG-oK1[P[1]-1];
 290
 291     /* for step 1-3 the local N,M,K sizes of the transposed system
 292        C: contiguous dimension, and nP: number of processor in subcommunicator
 293        for that step */
 294
 295
 296     pM[0] = M0[prank[0]];
 297     oM[0] = oM0[prank[0]];
 298     pK[0] = K1[prank[1]];
 299     oK[0] = oK1[prank[1]];
 300     C[0] = NG;
 301     rC[0] = rNG;
 302     if (!(flags&FFT5D_ORDER_YZ)) {
 303         N[0] = vmax(N1,P[1]);
 304         M[0] = M0[prank[0]];
 305         K[0] = vmax(K1,P[1]);
 306         pN[0] = N1[prank[1]];
 307         iNout[0] = N1;
 308         oNout[0] = oN1;
 309         nP[0] = P[1];
 310         C[1] = KG;
 311         rC[1] =rKG;
 312         N[1] = vmax(K0,P[0]);
 313         pN[1] = K0[prank[0]];
 314         iNin[1] = K1;
 315         oNin[1] = oK1;
 316         iNout[1] = K0;
 317         oNout[1] = oK0;
 318         M[1] = vmax(M0,P[0]);
 319         pM[1] = M0[prank[0]];
 320         oM[1] = oM0[prank[0]];
 321         K[1] = N1[prank[1]];
 322         pK[1] = N1[prank[1]];
 323         oK[1] = oN1[prank[1]];
 324         nP[1] = P[0];
 325         C[2] = MG;
 326         rC[2] = rMG;
 327         iNin[2] = M0;
 328         oNin[2] = oM0;
 329         M[2] = vmax(K0,P[0]);
 330         pM[2] = K0[prank[0]];
 331         oM[2] = oK0[prank[0]];
 332         K[2] = vmax(N1,P[1]);
 333         pK[2] = N1[prank[1]];
 334         oK[2] = oN1[prank[1]];
 335         free(N0); free(oN0); /*these are not used for this order*/
 336         free(M1); free(oM1); /*the rest is freed in destroy*/
 337     } else {
 338         N[0] = vmax(N0,P[0]);
 339         M[0] = vmax(M0,P[0]);
 340         K[0] = K1[prank[1]];
 341         pN[0] = N0[prank[0]];
 342         iNout[0] = N0;
 343         oNout[0] = oN0;
 344         nP[0] = P[0];
 345         C[1] = MG;
 346         rC[1] =rMG;
 347         N[1] = vmax(M1,P[1]);
 348         pN[1] = M1[prank[1]];
 349         iNin[1] = M0;
 350         oNin[1] = oM0;
 351         iNout[1] = M1;
 352         oNout[1] = oM1;
 353         M[1] = N0[prank[0]];
 354         pM[1] = N0[prank[0]];
 355         oM[1] = oN0[prank[0]];
 356         K[1] = vmax(K1,P[1]);
 357         pK[1] = K1[prank[1]];
 358         oK[1] = oK1[prank[1]];
 359         nP[1] = P[1];
 360         C[2] = KG;
 361         rC[2] = rKG;
 362         iNin[2] = K1;
 363         oNin[2] = oK1;
 364         M[2] = vmax(N0,P[0]);
 365         pM[2] = N0[prank[0]];
 366         oM[2] = oN0[prank[0]];
 367         K[2] = vmax(M1,P[1]);
 368         pK[2] = M1[prank[1]];
 369         oK[2] = oM1[prank[1]];
 370         free(N1); free(oN1); /*these are not used for this order*/
 371         free(K0); free(oK0); /*the rest is freed in destroy*/
 372     }
 373     N[2]=pN[2]=-1; /*not used*/
 374
 375     /*
 376       Difference between x-y-z regarding 2d decomposition is whether they are
 377       distributed along axis 1, 2 or both
 378     */
 379
 380     /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 381     lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
 382     /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 383     if (!(flags&FFT5D_NOMALLOC)) {
 384         snew_aligned(lin, lsize, 32);
 385         snew_aligned(lout, lsize, 32);
 386         snew_aligned(lout2, lsize, 32);
 387         snew_aligned(lout3, lsize, 32);
 388     } else {
 389         lin = *rlin;
 390         lout = *rlout;
 391         lout2 = *rlout2;
 392         lout3 = *rlout3;
 393     }
 394
 395     plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
 396
 397
 398     if (debug)
 399     {
 400         fprintf(debug, "Running on %d threads\n",nthreads);
 401     }
 402
 403 #ifdef GMX_FFT_FFTW3  /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
 404     /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
 405      * within a parallel region. For now deactivated. If it should be supported it has to made sure that
 406      * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
 407      * and that the 3d plan is faster than the 1d plan.
 408      */
 409     if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) {  /*don't do 3d plan in parallel or if in_place requested */
 410             int fftwflags=FFTW_DESTROY_INPUT;
 411             FFTW(iodim) dims[3];
 412             int inNG=NG,outMG=MG,outKG=KG;
 413
 414             FFTW_LOCK;
 415             if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
 416             if (flags&FFT5D_REALCOMPLEX) {
 417                 if (!(flags&FFT5D_BACKWARD)) {  /*input pointer is not complex*/
 418                     inNG*=2;
 419                 } else {                        /*output pointer is not complex*/
 420                     if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
 421                     else outKG*=2;
 422                 }
 423             }
 424
 425             if (!(flags&FFT5D_BACKWARD)) {
 426                 dims[0].n  = KG;
 427                 dims[1].n  = MG;
 428                 dims[2].n  = rNG;
 429
 430                 dims[0].is = inNG*MG;     /*N M K*/
 431                 dims[1].is = inNG;
 432                 dims[2].is = 1;
 433                 if (!(flags&FFT5D_ORDER_YZ)) {
 434                     dims[0].os = MG;       /*M K N*/
 435                     dims[1].os = 1;
 436                     dims[2].os = MG*KG;
 437                 } else  {
 438                     dims[0].os = 1;       /*K N M*/
 439                     dims[1].os = KG*NG;
 440                     dims[2].os = KG;
 441                 }
 442             } else {
 443                 if (!(flags&FFT5D_ORDER_YZ)) {
 444                     dims[0].n  = NG;
 445                     dims[1].n  = KG;
 446                     dims[2].n  = rMG;
 447
 448                     dims[0].is = 1;
 449                     dims[1].is = NG*MG;
 450                     dims[2].is = NG;
 451
 452                     dims[0].os = outMG*KG;
 453                     dims[1].os = outMG;
 454                     dims[2].os = 1;
 455                 } else {
 456                     dims[0].n  = MG;
 457                     dims[1].n  = NG;
 458                     dims[2].n  = rKG;
 459
 460                     dims[0].is = NG;
 461                     dims[1].is = 1;
 462                     dims[2].is = NG*MG;
 463
 464                     dims[0].os = outKG*NG;
 465                     dims[1].os = outKG;
 466                     dims[2].os = 1;
 467                 }
 468             }
 469 #ifdef FFT5D_THREADS
 470 #ifdef FFT5D_FFTW_THREADS
 471             FFTW(plan_with_nthreads)(nthreads);
 472 #endif
 473 #endif
 474             if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
 475                 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 476                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 477                                      (real*)lin, (FFTW(complex) *)lout,
 478                                      /*flags*/ fftwflags);
 479             } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
 480                 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 481                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 482                                      (FFTW(complex) *)lin, (real*)lout,
 483                                      /*flags*/ fftwflags);
 484             } else {
 485                 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 486                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 487                                      (FFTW(complex) *)lin, (FFTW(complex) *)lout,
 488                                      /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
 489             }
 490 #ifdef FFT5D_THREADS
 491 #ifdef FFT5D_FFTW_THREADS
 492             FFTW(plan_with_nthreads)(1);
 493 #endif
 494 #endif
 495             FFTW_UNLOCK;
 496     }
 497     if (!plan->p3d) {  /* for decomposition and if 3d plan did not work */
 498 #endif /* GMX_FFT_FFTW3 */
 499         for (s=0;s<3;s++) {
 500             if (debug)
 501             {
 502                 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 503                         s,rC[s],M[s],pK[s],C[s],lsize);
 504             }
 505             plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
 506
 507             /* Make sure that the init routines are only called by one thread at a time and in order
 508                (later is only important to not confuse valgrind)
 509              */
 510 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
 511             for(t=0; t<nthreads; t++)
 512             {
 513                 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
 514
 515                 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
 516                     gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 517                 } else {
 518                     gmx_fft_init_many_1d     ( &plan->p1d[s][t],  C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 519                 }
 520             }
 521         }
 522
 523 #ifdef GMX_FFT_FFTW3
 524     }
 525 #endif
 526     if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
 527         plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
 528     } else {
 529         plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
 530     }
 531 #ifdef FFT5D_MPI_TRANSPOSE
 532     FFTW_LOCK;
 533     for (s=0;s<2;s++) {
 534         if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
 535             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 536         else
 537             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 538     }
 539     FFTW_UNLOCK;
 540 #endif
 541
 542
 543     plan->lin=lin;
 544     plan->lout=lout;
 545     plan->lout2=lout2;
 546     plan->lout3=lout3;
 547
 548     plan->NG=NG;plan->MG=MG;plan->KG=KG;
 549
 550     for (s=0;s<3;s++) {
 551         plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
 552         plan->oM[s]=oM[s];plan->oK[s]=oK[s];
 553         plan->C[s]=C[s];plan->rC[s]=rC[s];
 554         plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
 555     }
 556     for (s=0;s<2;s++) {
 557         plan->P[s]=nP[s];plan->coor[s]=prank[s];
 558     }
 559
 560 /*    plan->fftorder=fftorder;
 561     plan->direction=direction;
 562     plan->realcomplex=realcomplex;
 563 */
 564     plan->flags=flags;
 565     plan->nthreads=nthreads;
 566     *rlin=lin;
 567     *rlout=lout;
 568     *rlout2=lout2;
 569     *rlout3=lout3;
 570     return plan;
 571 }
 572
 573
 574 enum order {
 575     XYZ,
 576     XZY,
 577     YXZ,
 578     YZX,
 579     ZXY,
 580     ZYX
 581 };
 582
 583
 584
 585 /*here x,y,z and N,M,K is in rotated coordinate system!!
 586   x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
 587   maxN,maxM,maxK is max size of local data
 588   pN, pM, pK is local size specific to current processor (only different to max if not divisible)
 589   NG, MG, KG is size of global data*/
 590 static void splitaxes(t_complex* lout,const t_complex* lin,
 591                       int maxN,int maxM,int maxK, int pN, int pM, int pK,
 592                       int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
 593 {
 594     int x,y,z,i;
 595     int in_i,out_i,in_z,out_z,in_y,out_y;
 596     int s_y,e_y;
 597
 598     for (z=startz; z<endz+1; z++) /*3. z l*/
 599     {
 600         if (z==startz) {
 601             s_y=starty;
 602         } else {
 603             s_y=0;
 604         }
 605         if (z==endz) {
 606             e_y=endy;
 607         } else {
 608             e_y=pM;
 609         }
 610         out_z  = z*maxN*maxM;
 611         in_z = z*NG*pM;
 612
 613         for (i=0; i<P; i++) /*index cube along long axis*/
 614         {
 615             out_i  = out_z  + i*maxN*maxM*maxK;
 616             in_i = in_z + oN[i];
 617             for (y=s_y;y<e_y;y++) { /*2. y k*/
 618                 out_y  = out_i  + y*maxN;
 619                 in_y = in_i + y*NG;
 620                 for (x=0;x<N[i];x++) { /*1. x j*/
 621                     lout[out_y+x] = lin[in_y+x];    /*in=z*NG*pM+oN[i]+y*NG+x*/
 622                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
 623                     /*before split data contiguos - thus if different processor get different amount oN is different*/
 624                 }
 625             }
 626         }
 627     }
 628 }
 629
 630 /*make axis contiguous again (after AllToAll) and also do local transpose*/
 631 /*transpose mayor and major dimension
 632   variables see above
 633   the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
 634   N,M,K local dimensions
 635   KG global size*/
 636 static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
 637                             int maxN,int maxM,int maxK,int pN, int pM, int pK,
 638                             int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
 639 {
 640     int i,x,y,z;
 641     int out_i,in_i,out_x,in_x,out_z,in_z;
 642     int s_y,e_y;
 643
 644     for (x=startx;x<endx+1;x++) /*1.j*/
 645     {
 646         if (x==startx)
 647         {
 648             s_y=starty;
 649         }
 650         else
 651         {
 652             s_y=0;
 653         }
 654         if (x==endx)
 655         {
 656             e_y=endy;
 657         }
 658         else
 659         {
 660             e_y=pM;
 661         }
 662
 663         out_x  = x*KG*pM;
 664         in_x = x;
 665
 666         for (i=0;i<P;i++) /*index cube along long axis*/
 667         {
 668             out_i  = out_x  + oK[i];
 669             in_i = in_x + i*maxM*maxN*maxK;
 670             for (z=0;z<K[i];z++) /*3.l*/
 671             {
 672                 out_z  = out_i  + z;
 673                 in_z = in_i + z*maxM*maxN;
 674                 for (y=s_y;y<e_y;y++) /*2.k*/
 675                 {
 676                     lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
 677                 }
 678             }
 679         }
 680     }
 681 }
 682
 683 /*make axis contiguous again (after AllToAll) and also do local transpose
 684   tranpose mayor and middle dimension
 685   variables see above
 686   the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
 687   N,M,K local size
 688   MG, global size*/
 689 static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
 690                             int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
 691     int i,z,y,x;
 692     int out_i,in_i,out_z,in_z,out_x,in_x;
 693     int s_x,e_x;
 694
 695     for (z=startz; z<endz+1; z++)
 696     {
 697         if (z==startz)
 698         {
 699             s_x=startx;
 700         }
 701         else
 702         {
 703             s_x=0;
 704         }
 705         if (z==endz)
 706         {
 707             e_x=endx;
 708         }
 709         else
 710         {
 711             e_x=pN;
 712         }
 713         out_z  = z*MG*pN;
 714         in_z = z*maxM*maxN;
 715
 716         for (i=0; i<P; i++) /*index cube along long axis*/
 717         {
 718             out_i  = out_z  + oM[i];
 719             in_i = in_z + i*maxM*maxN*maxK;
 720             for (x=s_x;x<e_x;x++)
 721             {
 722                 out_x  = out_i  + x*MG;
 723                 in_x = in_i + x;
 724                 for (y=0;y<M[i];y++)
 725                 {
 726                     lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
 727                 }
 728             }
 729         }
 730     }
 731 }
 732
 733
 734 static void rotate_offsets(int x[]) {
 735     int t=x[0];
 736 /*    x[0]=x[2];
 737     x[2]=x[1];
 738     x[1]=t;*/
 739     x[0]=x[1];
 740     x[1]=x[2];
 741     x[2]=t;
 742 }
 743
 744 /*compute the offset to compare or print transposed local data in original input coordinates
 745   xs matrix dimension size, xl dimension length, xc decomposition offset
 746   s: step in computation = number of transposes*/
 747 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
 748 /*    int direction = plan->direction;
 749     int fftorder = plan->fftorder;*/
 750
 751     int o;
 752     int pos[3],i;
 753     int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
 754         *C=plan->C, *rC=plan->rC;
 755
 756     NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
 757
 758     if (!(plan->flags&FFT5D_ORDER_YZ)) {
 759         switch (s) {
 760         case 0: o=XYZ; break;
 761         case 1: o=ZYX; break;
 762         case 2: o=YZX; break;
 763         default: assert(0);
 764         }
 765     } else {
 766         switch (s) {
 767         case 0: o=XYZ; break;
 768         case 1: o=YXZ; break;
 769         case 2: o=ZXY; break;
 770         default: assert(0);
 771         }
 772     }
 773
 774     switch (o) {
 775         case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
 776         case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
 777         case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
 778         case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
 779         case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
 780         case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
 781     }
 782     /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 783
 784     /*xs, xl give dimension size and data length in local transposed coordinate system
 785       for 0(/1/2): x(/y/z) in original coordinate system*/
 786     for (i=0;i<3;i++) {
 787         switch (pos[i]) {
 788         case 1: xs[i]=1;         xc[i]=0;     xl[i]=C[s];break;
 789         case 2: xs[i]=C[s];      xc[i]=oM[s]; xl[i]=pM[s];break;
 790         case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
 791         }
 792     }
 793     /*input order is different for test program to match FFTW order
 794       (important for complex to real)*/
 795     if (plan->flags&FFT5D_BACKWARD) {
 796         rotate_offsets(xs);
 797         rotate_offsets(xl);
 798         rotate_offsets(xc);
 799         rotate_offsets(NG);
 800         if (plan->flags&FFT5D_ORDER_YZ) {
 801             rotate_offsets(xs);
 802             rotate_offsets(xl);
 803             rotate_offsets(xc);
 804             rotate_offsets(NG);
 805         }
 806     }
 807     if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
 808         xl[0] = rC[s];
 809     }
 810 }
 811
 812 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
 813     int x,y,z,l,t;
 814     int *coor = plan->coor;
 815     int xs[3],xl[3],xc[3],NG[3];
 816     int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
 817     compute_offsets(plan,xs,xl,xc,NG,s);
 818     fprintf(debug,txt,coor[0],coor[1],s);
 819     /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 820     for(z=0;z<xl[2];z++) {
 821         for(y=0;y<xl[1];y++) {
 822             fprintf(debug,"%d %d: ",coor[0],coor[1]);
 823             for (x=0;x<xl[0];x++) {
 824                 for (l=0;l<ll;l++) {
 825                     fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 826                 }
 827                 fprintf(debug,",");
 828             }
 829             fprintf(debug,"\n");
 830         }
 831     }
 832 }
 833
 834 void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
 835     t_complex *lin = plan->lin;
 836     t_complex *lout = plan->lout;
 837     t_complex *lout2 = plan->lout2;
 838     t_complex *lout3 = plan->lout3;
 839     t_complex *fftout,*joinin;
 840
 841     gmx_fft_t **p1d=plan->p1d;
 842 #ifdef FFT5D_MPI_TRANSPOSE
 843     FFTW(plan) *mpip=plan->mpip;
 844 #endif
 845 #ifdef GMX_MPI
 846     MPI_Comm *cart=plan->cart;
 847 #endif
 848
 849     double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
 850     int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
 851         *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
 852     int s=0,tstart,tend,bParallelDim;
 853
 854
 855 #ifdef GMX_FFT_FFTW3
 856     if (plan->p3d)
 857     {
 858         if (thread == 0)
 859         {
 860 #ifdef NOGMX
 861             if (times!=0)
 862             {
 863                 time=MPI_Wtime();
 864             }
 865 #endif
 866             FFTW(execute)(plan->p3d);
 867 #ifdef NOGMX
 868             if (times!=0)
 869             {
 870                 times->fft+=MPI_Wtime()-time;
 871             }
 872 #endif
 873         }
 874         return;
 875     }
 876 #endif
 877
 878         s=0;
 879
 880     /*lin: x,y,z*/
 881         if (plan->flags&FFT5D_DEBUG && thread == 0)
 882         {
 883             print_localdata(lin, "%d %d: copy in lin\n", s, plan);
 884         }
 885
 886         for (s=0;s<2;s++) {  /*loop over first two FFT steps (corner rotations)*/
 887
 888 #ifdef GMX_MPI
 889         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s]!=MPI_COMM_NULL && P[s]>1)
 890         {
 891             bParallelDim = 1;
 892         }
 893         else
 894 #endif
 895         {
 896             bParallelDim = 0;
 897         }
 898
 899         /* ---------- START FFT ------------ */
 900 #ifdef NOGMX
 901         if (times!=0 && thread == 0)
 902         {
 903             time=MPI_Wtime();
 904         }
 905 #endif
 906
 907         if (bParallelDim) {
 908             fftout = lout;
 909         }
 910         else
 911         {
 912             if (s==0)
 913             {
 914                 fftout = lout3;
 915             } else
 916             {
 917                 fftout = lout2;
 918             }
 919         }
 920
 921         tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
 922         if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
 923         {
 924             gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
 925         } else
 926         {
 927             gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,fftout+tstart);
 928
 929         }
 930
 931 #ifdef NOGMX
 932         if (times != NULL && thread == 0)
 933         {
 934             time_fft+=MPI_Wtime()-time;
 935         }
 936 #endif
 937         if (plan->flags&FFT5D_DEBUG && thread == 0)
 938         {
 939             print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 940         }
 941         /* ---------- END FFT ------------ */
 942
 943         /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
 944         if (bParallelDim) {
 945 #ifdef NOGMX
 946             if (times != NULL && thread == 0)
 947             {
 948                 time=MPI_Wtime();
 949             }
 950 #endif
 951             /*prepare for A
 952 llToAll
 953               1. (most outer) axes (x) is split into P[s] parts of size N[s]
 954               for sending*/
 955             if (pM[s]>0)
 956             {
 957                 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
 958                 tstart/=C[s];
 959                 splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
 960             }
 961 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
 962 #ifdef NOGMX
 963             if (times != NULL && thread == 0)
 964             {
 965                 time_local+=MPI_Wtime()-time;
 966             }
 967 #endif
 968
 969         /* ---------- END SPLIT , START TRANSPOSE------------ */
 970
 971             if (thread == 0)
 972             {
 973 #ifdef NOGMX
 974                 if (times!=0)
 975                 {
 976                     time=MPI_Wtime();
 977                 }
 978 #else
 979                 wallcycle_start(times,ewcPME_FFTCOMM);
 980 #endif
 981 #ifdef FFT5D_MPI_TRANSPOSE
 982                 FFTW(execute)(mpip[s]);
 983 #else
 984 #ifdef GMX_MPI
 985                 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
 986                     MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 987                 else
 988                     MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 989 #else
 990                 gmx_incons("fft5d MPI call without MPI configuration");
 991 #endif /*GMX_MPI*/
 992 #endif /*FFT5D_MPI_TRANSPOSE*/
 993 #ifdef NOGMX
 994                 if (times!=0)
 995                 {
 996                     time_mpi[s]=MPI_Wtime()-time;
 997                 }
 998 #else
 999                 wallcycle_stop(times,ewcPME_FFTCOMM);
1000 #endif
1001             }  /*master*/
1002         }  /* bPrallelDim */
1003 #pragma omp barrier  /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1004
1005         /* ---------- END SPLIT + TRANSPOSE------------ */
1006
1007         /* ---------- START JOIN ------------ */
1008 #ifdef NOGMX
1009         if (times != NULL && thread == 0)
1010         {
1011             time=MPI_Wtime();
1012         }
1013 #endif
1014
1015         if (bParallelDim) {
1016             joinin = lout3;
1017         } else {
1018             joinin = fftout;
1019         }
1020         /*bring back in matrix form
1021           thus make  new 1. axes contiguos
1022           also local transpose 1 and 2/3
1023           runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1024         */
1025         if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
1026             if (pM[s]>0)
1027             {
1028                 tstart = ( thread   *pM[s]*pN[s]/plan->nthreads);
1029                 tend   = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1030                 joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
1031             }
1032         }
1033         else {
1034             if (pN[s]>0)
1035             {
1036                 tstart = ( thread   *pK[s]*pN[s]/plan->nthreads);
1037                 tend   = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1038                 joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
1039             }
1040         }
1041
1042 #ifdef NOGMX
1043         if (times != NULL && thread == 0)
1044         {
1045             time_local+=MPI_Wtime()-time;
1046         }
1047 #endif
1048         if (plan->flags&FFT5D_DEBUG && thread == 0)
1049         {
1050             print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1051         }
1052         /* ---------- END JOIN ------------ */
1053
1054         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1055     }  /* for(s=0;s<2;s++) */
1056 #ifdef NOGMX
1057         if (times != NULL && thread == 0)
1058         {
1059             time=MPI_Wtime();
1060         }
1061 #endif
1062
1063     if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
1064
1065     /*  ----------- FFT ----------- */
1066     tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1067     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
1068         gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
1069     } else {
1070         gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,lout+tstart);
1071     }
1072     /* ------------ END FFT ---------*/
1073
1074 #ifdef NOGMX
1075     if (times != NULL && thread == 0)
1076     {
1077         time_fft+=MPI_Wtime()-time;
1078
1079         times->fft+=time_fft;
1080         times->local+=time_local;
1081         times->mpi2+=time_mpi[1];
1082         times->mpi1+=time_mpi[0];
1083     }
1084 #endif
1085
1086     if (plan->flags&FFT5D_DEBUG && thread == 0)
1087     {
1088         print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1089     }
1090 }
1091
1092 void fft5d_destroy(fft5d_plan plan) {
1093     int s,t;
1094     for (s=0;s<3;s++)
1095     {
1096         if (plan->p1d[s])
1097         {
1098             for (t=0;t<plan->nthreads;t++)
1099             {
1100                 gmx_many_fft_destroy(plan->p1d[s][t]);
1101             }
1102             free(plan->p1d[s]);
1103         }
1104         if (plan->iNin[s])
1105         {
1106             free(plan->iNin[s]);
1107             plan->iNin[s]=0;
1108         }
1109         if (plan->oNin[s])
1110         {
1111             free(plan->oNin[s]);
1112             plan->oNin[s]=0;
1113         }
1114         if (plan->iNout[s])
1115         {
1116             free(plan->iNout[s]);
1117             plan->iNout[s]=0;
1118         }
1119         if (plan->oNout[s])
1120         {
1121             free(plan->oNout[s]);
1122             plan->oNout[s]=0;
1123         }
1124     }
1125 #ifdef GMX_FFT_FFTW3
1126     FFTW_LOCK;
1127 #ifdef FFT5D_MPI_TRANSPOS
1128     for (s=0;s<2;s++)
1129     {
1130         FFTW(destroy_plan)(plan->mpip[s]);
1131     }
1132     if (plan->p3d)
1133     {
1134         FFTW(destroy_plan)(plan->p3d);
1135     }
1136 #endif /* FFT5D_MPI_TRANSPOS */
1137 #endif /* GMX_FFT_FFTW3 */
1138
1139     if (!(plan->flags&FFT5D_NOMALLOC))
1140     {
1141         sfree_aligned(plan->lin);
1142         sfree_aligned(plan->lout);
1143         sfree_aligned(plan->lout2);
1144         sfree_aligned(plan->lout3);
1145     }
1146
1147 #ifdef FFT5D_THREADS
1148 #ifdef FFT5D_FFTW_THREADS
1149     /*FFTW(cleanup_threads)();*/
1150 #endif
1151 #endif
1152
1153     free(plan);
1154 }
1155
1156 /*Is this better than direct access of plan? enough data?
1157   here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1158 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
1159     *N1=plan->N[0];
1160     *M0=plan->M[0];
1161     *K1=plan->K[0];
1162     *K0=plan->N[1];
1163
1164     *coor=plan->coor;
1165 }
1166
1167
1168 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1169   of processor dimensions*/
1170 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
1171     MPI_Comm cart[2]={0};
1172 #ifdef GMX_MPI
1173     int size=1,prank=0;
1174     int P[2];
1175     int coor[2];
1176     int wrap[]={0,0};
1177     MPI_Comm gcart;
1178     int rdim1[] = {0,1}, rdim2[] = {1,0};
1179
1180     MPI_Comm_size(comm,&size);
1181     MPI_Comm_rank(comm,&prank);
1182
1183     if (P0==0) P0 = lfactor(size);
1184     if (size%P0!=0)
1185     {
1186         if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1187         P0 = lfactor(size);
1188     }
1189
1190     P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1191
1192     /*Difference between x-y-z regarding 2d decomposition is whether they are
1193       distributed along axis 1, 2 or both*/
1194
1195     MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1196     MPI_Cart_get(gcart,2,P,wrap,coor);
1197     MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1198     MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1199 #endif
1200     return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
1201 }
1202
1203
1204
1205 /*prints in original coordinate system of data (as the input to FFT)*/
1206 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1207     int xs[3],xl[3],xc[3],NG[3];
1208     int x,y,z,l;
1209     int *coor = plan->coor;
1210     int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1211     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1212     {
1213         ll=1;
1214     }
1215
1216     compute_offsets(plan,xs,xl,xc,NG,2);
1217     if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1218     for (z=0;z<xl[2];z++) {
1219         for(y=0;y<xl[1];y++) {
1220             if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1221             for (x=0;x<xl[0];x++) {
1222                 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1223                     real a,b;
1224                     a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1225                     if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1226                     if (!bothLocal)
1227                         b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1228                     else
1229                         b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1230                     if (plan->flags&FFT5D_DEBUG) {
1231                         printf("%f %f, ",a,b);
1232                     } else {
1233                         if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1234                             printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1235                         }
1236 /*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1237                     }
1238                 }
1239                 if (plan->flags&FFT5D_DEBUG) printf(",");
1240             }
1241             if (plan->flags&FFT5D_DEBUG) printf("\n");
1242         }
1243     }
1244
1245 }
1246