src/mdlib/fft5d.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *                This source code is part of
   4  *
   5  *                 G   R   O   M   A   C   S
   6  *
   7  *          GROningen MAchine for Chemical Simulations
   8  *
   9  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  10  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  11  * Copyright (c) 2001-2012, The GROMACS development team,
  12  * check out http://www.gromacs.org for more information.
  13
  14  * This program is free software; you can redistribute it and/or
  15  * modify it under the terms of the GNU General Public License
  16  * as published by the Free Software Foundation; either version 2
  17  * of the License, or (at your option) any later version.
  18  *
  19  * If you want to redistribute modifications, please consider that
  20  * scientific software is very special. Version control is crucial -
  21  * bugs must be traceable. We will be happy to consider code for
  22  * inclusion in the official distribution, but derived work must not
  23  * be called official GROMACS. Details are found in the README & COPYING
  24  * files - if they are missing, get the official version at www.gromacs.org.
  25  *
  26  * To help us fund GROMACS development, we humbly ask that you cite
  27  * the papers on the package - you can find them in the top README file.
  28  *
  29  * For more info, check our website at http://www.gromacs.org
  30  *
  31  * And Hey:
  32  * Groningen Machine for Chemical Simulation
  33  */
  34 #ifdef HAVE_CONFIG_H
  35 #include <config.h>
  36 #endif
  37
  38
  39 #include <stdio.h>
  40 #include <stdlib.h>
  41 #include <string.h>
  42
  43 #ifdef NOGMX
  44 #define GMX_PARALLEL_ENV_INITIALIZED 1
  45 #else
  46 #ifdef GMX_MPI
  47 #define GMX_PARALLEL_ENV_INITIALIZED 1
  48 #else
  49 #define GMX_PARALLEL_ENV_INITIALIZED 0
  50 #endif
  51 #endif
  52
  53 #ifdef GMX_LIB_MPI
  54 #include <mpi.h>
  55 #endif
  56 #ifdef GMX_THREAD_MPI
  57 #include "tmpi.h"
  58 #endif
  59
  60 #ifdef GMX_OPENMP
  61 #define FFT5D_THREADS
  62 #endif
  63 #ifdef FFT5D_THREADS
  64 #include <omp.h>
  65 /* requires fftw compiled with openmp */
  66 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
  67 #endif
  68
  69 #include "fft5d.h"
  70 #include <float.h>
  71 #include <math.h>
  72 #include <assert.h>
  73 #include "smalloc.h"
  74
  75 #ifndef __FLT_EPSILON__
  76 #define __FLT_EPSILON__ FLT_EPSILON
  77 #define __DBL_EPSILON__ DBL_EPSILON
  78 #endif
  79
  80 #ifdef NOGMX
  81 FILE* debug=0;
  82 #endif
  83
  84 #include "gmx_fatal.h"
  85
  86
  87 #ifdef GMX_FFT_FFTW3
  88 #ifdef GMX_THREAD_MPI
  89 /* none of the fftw3 calls, except execute(), are thread-safe, so
  90    we need to serialize them with this mutex. */
  91 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
  92
  93 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex)
  94 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex)
  95 #else /* GMX_THREAD_MPI */
  96 #define FFTW_LOCK
  97 #define FFTW_UNLOCK
  98 #endif /* GMX_THREAD_MPI */
  99 #endif /* GMX_FFT_FFTW3 */
 100
 101 static double fft5d_fmax(double a, double b){
 102         return (a>b)?a:b;
 103 }
 104
 105 /* largest factor smaller than sqrt */
 106 static int lfactor(int z) {
 107         int i;
 108         for (i=sqrt(z);;i--)
 109                 if (z%i==0) return i;
 110         return 1;
 111 }
 112
 113 /* largest factor */
 114 static int l2factor(int z) {
 115         int i;
 116         if (z==1) return 1;
 117         for (i=z/2;;i--)
 118                 if (z%i==0) return i;
 119         return 1;
 120 }
 121
 122 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
 123 static int lpfactor(int z) {
 124         int f = l2factor(z);
 125         if (f==1) return z;
 126         return fft5d_fmax(lpfactor(f),lpfactor(z/f));
 127 }
 128
 129 #ifndef GMX_MPI
 130 #ifdef HAVE_GETTIMEOFDAY
 131 #include <sys/time.h>
 132 double MPI_Wtime() {
 133     struct timeval tv;
 134     gettimeofday(&tv,0);
 135     return tv.tv_sec+tv.tv_usec*1e-6;
 136 }
 137 #else
 138 double MPI_Wtime() {
 139     return 0.0;
 140 }
 141 #endif
 142 #endif
 143
 144 static int vmax(int* a, int s) {
 145     int i,max=0;
 146     for (i=0;i<s;i++)
 147     {
 148         if (a[i]>max) max=a[i];
 149     }
 150     return max;
 151 }
 152
 153
 154 /* NxMxK the size of the data
 155  * comm communicator to use for fft5d
 156  * P0 number of processor in 1st axes (can be null for automatic)
 157  * lin is allocated by fft5d because size of array is only known after planning phase
 158  * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
 159 */
 160 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
 161 {
 162
 163     int P[2],bMaster,prank[2],i,t;
 164     int rNG,rMG,rKG;
 165     int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
 166     int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
 167     int C[3],rC[3],nP[2];
 168     int lsize;
 169     t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
 170     fft5d_plan plan;
 171     int s;
 172
 173     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 174 #ifdef GMX_MPI
 175     if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
 176     {
 177         MPI_Comm_size(comm[0],&P[0]);
 178         MPI_Comm_rank(comm[0],&prank[0]);
 179     }
 180     else
 181 #endif
 182     {
 183         P[0] = 1;
 184         prank[0] = 0;
 185     }
 186 #ifdef GMX_MPI
 187     if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
 188     {
 189         MPI_Comm_size(comm[1],&P[1]);
 190         MPI_Comm_rank(comm[1],&prank[1]);
 191     }
 192     else
 193 #endif
 194     {
 195         P[1] = 1;
 196         prank[1] = 0;
 197     }
 198
 199     bMaster=(prank[0]==0&&prank[1]==0);
 200
 201
 202     if (debug)
 203     {
 204         fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
 205                 P[0],P[1],prank[0],prank[1]);
 206     }
 207
 208     if (bMaster) {
 209         if (debug)
 210             fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 211                 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
 212         /* The check below is not correct, one prime factor 11 or 13 is ok.
 213         if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 214             printf("WARNING: FFT very slow with prime factors larger 7\n");
 215             printf("Change FFT size or in case you cannot change it look at\n");
 216             printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 217         }
 218         */
 219     }
 220
 221     if (NG==0 || MG==0 || KG==0) {
 222         if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 223         return 0;
 224     }
 225
 226     rNG=NG;rMG=MG;rKG=KG;
 227
 228     if (flags&FFT5D_REALCOMPLEX) {
 229         if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
 230         else {
 231             if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
 232             else KG=KG/2+1;
 233         }
 234     }
 235
 236
 237     /*for transpose we need to know the size for each processor not only our own size*/
 238
 239     N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
 240     M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
 241     K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
 242     oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
 243     oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
 244     oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
 245
 246     for (i=0;i<P[0];i++)
 247     {
 248         #define EVENDIST
 249         #ifndef EVENDIST
 250         oN0[i]=i*ceil((double)NG/P[0]);
 251         oM0[i]=i*ceil((double)MG/P[0]);
 252         oK0[i]=i*ceil((double)KG/P[0]);
 253         #else
 254         oN0[i]=(NG*i)/P[0];
 255         oM0[i]=(MG*i)/P[0];
 256         oK0[i]=(KG*i)/P[0];
 257         #endif
 258     }
 259     for (i=0;i<P[1];i++)
 260     {
 261         #ifndef EVENDIST
 262         oN1[i]=i*ceil((double)NG/P[1]);
 263         oM1[i]=i*ceil((double)MG/P[1]);
 264         oK1[i]=i*ceil((double)KG/P[1]);
 265         #else
 266         oN1[i]=(NG*i)/P[1];
 267         oM1[i]=(MG*i)/P[1];
 268         oK1[i]=(KG*i)/P[1];
 269         #endif
 270     }
 271     for (i=0;i<P[0]-1;i++)
 272     {
 273         N0[i]=oN0[i+1]-oN0[i];
 274         M0[i]=oM0[i+1]-oM0[i];
 275         K0[i]=oK0[i+1]-oK0[i];
 276     }
 277     N0[P[0]-1]=NG-oN0[P[0]-1];
 278     M0[P[0]-1]=MG-oM0[P[0]-1];
 279     K0[P[0]-1]=KG-oK0[P[0]-1];
 280     for (i=0;i<P[1]-1;i++)
 281     {
 282         N1[i]=oN1[i+1]-oN1[i];
 283         M1[i]=oM1[i+1]-oM1[i];
 284         K1[i]=oK1[i+1]-oK1[i];
 285     }
 286     N1[P[1]-1]=NG-oN1[P[1]-1];
 287     M1[P[1]-1]=MG-oM1[P[1]-1];
 288     K1[P[1]-1]=KG-oK1[P[1]-1];
 289
 290     /* for step 1-3 the local N,M,K sizes of the transposed system
 291        C: contiguous dimension, and nP: number of processor in subcommunicator
 292        for that step */
 293
 294
 295     pM[0] = M0[prank[0]];
 296     oM[0] = oM0[prank[0]];
 297     pK[0] = K1[prank[1]];
 298     oK[0] = oK1[prank[1]];
 299     C[0] = NG;
 300     rC[0] = rNG;
 301     if (!(flags&FFT5D_ORDER_YZ)) {
 302         N[0] = vmax(N1,P[1]);
 303         M[0] = M0[prank[0]];
 304         K[0] = vmax(K1,P[1]);
 305         pN[0] = N1[prank[1]];
 306         iNout[0] = N1;
 307         oNout[0] = oN1;
 308         nP[0] = P[1];
 309         C[1] = KG;
 310         rC[1] =rKG;
 311         N[1] = vmax(K0,P[0]);
 312         pN[1] = K0[prank[0]];
 313         iNin[1] = K1;
 314         oNin[1] = oK1;
 315         iNout[1] = K0;
 316         oNout[1] = oK0;
 317         M[1] = vmax(M0,P[0]);
 318         pM[1] = M0[prank[0]];
 319         oM[1] = oM0[prank[0]];
 320         K[1] = N1[prank[1]];
 321         pK[1] = N1[prank[1]];
 322         oK[1] = oN1[prank[1]];
 323         nP[1] = P[0];
 324         C[2] = MG;
 325         rC[2] = rMG;
 326         iNin[2] = M0;
 327         oNin[2] = oM0;
 328         M[2] = vmax(K0,P[0]);
 329         pM[2] = K0[prank[0]];
 330         oM[2] = oK0[prank[0]];
 331         K[2] = vmax(N1,P[1]);
 332         pK[2] = N1[prank[1]];
 333         oK[2] = oN1[prank[1]];
 334         free(N0); free(oN0); /*these are not used for this order*/
 335         free(M1); free(oM1); /*the rest is freed in destroy*/
 336     } else {
 337         N[0] = vmax(N0,P[0]);
 338         M[0] = vmax(M0,P[0]);
 339         K[0] = K1[prank[1]];
 340         pN[0] = N0[prank[0]];
 341         iNout[0] = N0;
 342         oNout[0] = oN0;
 343         nP[0] = P[0];
 344         C[1] = MG;
 345         rC[1] =rMG;
 346         N[1] = vmax(M1,P[1]);
 347         pN[1] = M1[prank[1]];
 348         iNin[1] = M0;
 349         oNin[1] = oM0;
 350         iNout[1] = M1;
 351         oNout[1] = oM1;
 352         M[1] = N0[prank[0]];
 353         pM[1] = N0[prank[0]];
 354         oM[1] = oN0[prank[0]];
 355         K[1] = vmax(K1,P[1]);
 356         pK[1] = K1[prank[1]];
 357         oK[1] = oK1[prank[1]];
 358         nP[1] = P[1];
 359         C[2] = KG;
 360         rC[2] = rKG;
 361         iNin[2] = K1;
 362         oNin[2] = oK1;
 363         M[2] = vmax(N0,P[0]);
 364         pM[2] = N0[prank[0]];
 365         oM[2] = oN0[prank[0]];
 366         K[2] = vmax(M1,P[1]);
 367         pK[2] = M1[prank[1]];
 368         oK[2] = oM1[prank[1]];
 369         free(N1); free(oN1); /*these are not used for this order*/
 370         free(K0); free(oK0); /*the rest is freed in destroy*/
 371     }
 372     N[2]=pN[2]=-1; /*not used*/
 373
 374     /*
 375       Difference between x-y-z regarding 2d decomposition is whether they are
 376       distributed along axis 1, 2 or both
 377     */
 378
 379     /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 380     lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
 381     /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 382     if (!(flags&FFT5D_NOMALLOC)) {
 383         snew_aligned(lin, lsize, 32);
 384         snew_aligned(lout, lsize, 32);
 385         snew_aligned(lout2, lsize, 32);
 386         snew_aligned(lout3, lsize, 32);
 387     } else {
 388         lin = *rlin;
 389         lout = *rlout;
 390         lout2 = *rlout2;
 391         lout3 = *rlout3;
 392     }
 393
 394     plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
 395
 396
 397     if (debug)
 398     {
 399         fprintf(debug, "Running on %d threads\n",nthreads);
 400     }
 401
 402 #ifdef GMX_FFT_FFTW3  /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
 403     /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
 404      * within a parallel region. For now deactivated. If it should be supported it has to made sure that
 405      * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
 406      * and that the 3d plan is faster than the 1d plan.
 407      */
 408     if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) {  /*don't do 3d plan in parallel or if in_place requested */
 409             int fftwflags=FFTW_DESTROY_INPUT;
 410             FFTW(iodim) dims[3];
 411             int inNG=NG,outMG=MG,outKG=KG;
 412
 413             FFTW_LOCK;
 414             if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
 415             if (flags&FFT5D_REALCOMPLEX) {
 416                 if (!(flags&FFT5D_BACKWARD)) {  /*input pointer is not complex*/
 417                     inNG*=2;
 418                 } else {                        /*output pointer is not complex*/
 419                     if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
 420                     else outKG*=2;
 421                 }
 422             }
 423
 424             if (!(flags&FFT5D_BACKWARD)) {
 425                 dims[0].n  = KG;
 426                 dims[1].n  = MG;
 427                 dims[2].n  = rNG;
 428
 429                 dims[0].is = inNG*MG;     /*N M K*/
 430                 dims[1].is = inNG;
 431                 dims[2].is = 1;
 432                 if (!(flags&FFT5D_ORDER_YZ)) {
 433                     dims[0].os = MG;       /*M K N*/
 434                     dims[1].os = 1;
 435                     dims[2].os = MG*KG;
 436                 } else  {
 437                     dims[0].os = 1;       /*K N M*/
 438                     dims[1].os = KG*NG;
 439                     dims[2].os = KG;
 440                 }
 441             } else {
 442                 if (!(flags&FFT5D_ORDER_YZ)) {
 443                     dims[0].n  = NG;
 444                     dims[1].n  = KG;
 445                     dims[2].n  = rMG;
 446
 447                     dims[0].is = 1;
 448                     dims[1].is = NG*MG;
 449                     dims[2].is = NG;
 450
 451                     dims[0].os = outMG*KG;
 452                     dims[1].os = outMG;
 453                     dims[2].os = 1;
 454                 } else {
 455                     dims[0].n  = MG;
 456                     dims[1].n  = NG;
 457                     dims[2].n  = rKG;
 458
 459                     dims[0].is = NG;
 460                     dims[1].is = 1;
 461                     dims[2].is = NG*MG;
 462
 463                     dims[0].os = outKG*NG;
 464                     dims[1].os = outKG;
 465                     dims[2].os = 1;
 466                 }
 467             }
 468 #ifdef FFT5D_THREADS
 469 #ifdef FFT5D_FFTW_THREADS
 470             FFTW(plan_with_nthreads)(nthreads);
 471 #endif
 472 #endif
 473             if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
 474                 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 475                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 476                                      (real*)lin, (FFTW(complex) *)lout,
 477                                      /*flags*/ fftwflags);
 478             } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
 479                 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 480                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 481                                      (FFTW(complex) *)lin, (real*)lout,
 482                                      /*flags*/ fftwflags);
 483             } else {
 484                 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 485                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 486                                      (FFTW(complex) *)lin, (FFTW(complex) *)lout,
 487                                      /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
 488             }
 489 #ifdef FFT5D_THREADS
 490 #ifdef FFT5D_FFTW_THREADS
 491             FFTW(plan_with_nthreads)(1);
 492 #endif
 493 #endif
 494             FFTW_UNLOCK;
 495     }
 496     if (!plan->p3d) {  /* for decomposition and if 3d plan did not work */
 497 #endif /* GMX_FFT_FFTW3 */
 498         for (s=0;s<3;s++) {
 499             if (debug)
 500             {
 501                 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 502                         s,rC[s],M[s],pK[s],C[s],lsize);
 503             }
 504             plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
 505
 506             /* Make sure that the init routines are only called by one thread at a time and in order
 507                (later is only important to not confuse valgrind)
 508              */
 509 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
 510             for(t=0; t<nthreads; t++)
 511             {
 512                 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
 513
 514                 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
 515                     gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 516                 } else {
 517                     gmx_fft_init_many_1d     ( &plan->p1d[s][t],  C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 518                 }
 519             }
 520         }
 521
 522 #ifdef GMX_FFT_FFTW3
 523     }
 524 #endif
 525     if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
 526         plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
 527     } else {
 528         plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
 529     }
 530 #ifdef FFT5D_MPI_TRANSPOSE
 531     FFTW_LOCK;
 532     for (s=0;s<2;s++) {
 533         if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
 534             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 535         else
 536             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 537     }
 538     FFTW_UNLOCK;
 539 #endif
 540
 541
 542     plan->lin=lin;
 543     plan->lout=lout;
 544     plan->lout2=lout2;
 545     plan->lout3=lout3;
 546
 547     plan->NG=NG;plan->MG=MG;plan->KG=KG;
 548
 549     for (s=0;s<3;s++) {
 550         plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
 551         plan->oM[s]=oM[s];plan->oK[s]=oK[s];
 552         plan->C[s]=C[s];plan->rC[s]=rC[s];
 553         plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
 554     }
 555     for (s=0;s<2;s++) {
 556         plan->P[s]=nP[s];plan->coor[s]=prank[s];
 557     }
 558
 559 /*    plan->fftorder=fftorder;
 560     plan->direction=direction;
 561     plan->realcomplex=realcomplex;
 562 */
 563     plan->flags=flags;
 564     plan->nthreads=nthreads;
 565     *rlin=lin;
 566     *rlout=lout;
 567     *rlout2=lout2;
 568     *rlout3=lout3;
 569     return plan;
 570 }
 571
 572
 573 enum order {
 574     XYZ,
 575     XZY,
 576     YXZ,
 577     YZX,
 578     ZXY,
 579     ZYX
 580 };
 581
 582
 583
 584 /*here x,y,z and N,M,K is in rotated coordinate system!!
 585   x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
 586   maxN,maxM,maxK is max size of local data
 587   pN, pM, pK is local size specific to current processor (only different to max if not divisible)
 588   NG, MG, KG is size of global data*/
 589 static void splitaxes(t_complex* lout,const t_complex* lin,
 590                       int maxN,int maxM,int maxK, int pN, int pM, int pK,
 591                       int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
 592 {
 593     int x,y,z,i;
 594     int in_i,out_i,in_z,out_z,in_y,out_y;
 595     int s_y,e_y;
 596
 597     for (z=startz; z<endz+1; z++) /*3. z l*/
 598     {
 599         if (z==startz) {
 600             s_y=starty;
 601         } else {
 602             s_y=0;
 603         }
 604         if (z==endz) {
 605             e_y=endy;
 606         } else {
 607             e_y=pM;
 608         }
 609         out_z  = z*maxN*maxM;
 610         in_z = z*NG*pM;
 611
 612         for (i=0; i<P; i++) /*index cube along long axis*/
 613         {
 614             out_i  = out_z  + i*maxN*maxM*maxK;
 615             in_i = in_z + oN[i];
 616             for (y=s_y;y<e_y;y++) { /*2. y k*/
 617                 out_y  = out_i  + y*maxN;
 618                 in_y = in_i + y*NG;
 619                 for (x=0;x<N[i];x++) { /*1. x j*/
 620                     lout[out_y+x] = lin[in_y+x];    /*in=z*NG*pM+oN[i]+y*NG+x*/
 621                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
 622                     /*before split data contiguos - thus if different processor get different amount oN is different*/
 623                 }
 624             }
 625         }
 626     }
 627 }
 628
 629 /*make axis contiguous again (after AllToAll) and also do local transpose*/
 630 /*transpose mayor and major dimension
 631   variables see above
 632   the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
 633   N,M,K local dimensions
 634   KG global size*/
 635 static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
 636                             int maxN,int maxM,int maxK,int pN, int pM, int pK,
 637                             int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
 638 {
 639     int i,x,y,z;
 640     int out_i,in_i,out_x,in_x,out_z,in_z;
 641     int s_y,e_y;
 642
 643     for (x=startx;x<endx+1;x++) /*1.j*/
 644     {
 645         if (x==startx)
 646         {
 647             s_y=starty;
 648         }
 649         else
 650         {
 651             s_y=0;
 652         }
 653         if (x==endx)
 654         {
 655             e_y=endy;
 656         }
 657         else
 658         {
 659             e_y=pM;
 660         }
 661
 662         out_x  = x*KG*pM;
 663         in_x = x;
 664
 665         for (i=0;i<P;i++) /*index cube along long axis*/
 666         {
 667             out_i  = out_x  + oK[i];
 668             in_i = in_x + i*maxM*maxN*maxK;
 669             for (z=0;z<K[i];z++) /*3.l*/
 670             {
 671                 out_z  = out_i  + z;
 672                 in_z = in_i + z*maxM*maxN;
 673                 for (y=s_y;y<e_y;y++) /*2.k*/
 674                 {
 675                     lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
 676                 }
 677             }
 678         }
 679     }
 680 }
 681
 682 /*make axis contiguous again (after AllToAll) and also do local transpose
 683   tranpose mayor and middle dimension
 684   variables see above
 685   the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
 686   N,M,K local size
 687   MG, global size*/
 688 static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
 689                             int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
 690     int i,z,y,x;
 691     int out_i,in_i,out_z,in_z,out_x,in_x;
 692     int s_x,e_x;
 693
 694     for (z=startz; z<endz+1; z++)
 695     {
 696         if (z==startz)
 697         {
 698             s_x=startx;
 699         }
 700         else
 701         {
 702             s_x=0;
 703         }
 704         if (z==endz)
 705         {
 706             e_x=endx;
 707         }
 708         else
 709         {
 710             e_x=pN;
 711         }
 712         out_z  = z*MG*pN;
 713         in_z = z*maxM*maxN;
 714
 715         for (i=0; i<P; i++) /*index cube along long axis*/
 716         {
 717             out_i  = out_z  + oM[i];
 718             in_i = in_z + i*maxM*maxN*maxK;
 719             for (x=s_x;x<e_x;x++)
 720             {
 721                 out_x  = out_i  + x*MG;
 722                 in_x = in_i + x;
 723                 for (y=0;y<M[i];y++)
 724                 {
 725                     lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
 726                 }
 727             }
 728         }
 729     }
 730 }
 731
 732
 733 static void rotate_offsets(int x[]) {
 734     int t=x[0];
 735 /*    x[0]=x[2];
 736     x[2]=x[1];
 737     x[1]=t;*/
 738     x[0]=x[1];
 739     x[1]=x[2];
 740     x[2]=t;
 741 }
 742
 743 /*compute the offset to compare or print transposed local data in original input coordinates
 744   xs matrix dimension size, xl dimension length, xc decomposition offset
 745   s: step in computation = number of transposes*/
 746 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
 747 /*    int direction = plan->direction;
 748     int fftorder = plan->fftorder;*/
 749
 750     int o;
 751     int pos[3],i;
 752     int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
 753         *C=plan->C, *rC=plan->rC;
 754
 755     NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
 756
 757     if (!(plan->flags&FFT5D_ORDER_YZ)) {
 758         switch (s) {
 759         case 0: o=XYZ; break;
 760         case 1: o=ZYX; break;
 761         case 2: o=YZX; break;
 762         default: assert(0);
 763         }
 764     } else {
 765         switch (s) {
 766         case 0: o=XYZ; break;
 767         case 1: o=YXZ; break;
 768         case 2: o=ZXY; break;
 769         default: assert(0);
 770         }
 771     }
 772
 773     switch (o) {
 774         case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
 775         case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
 776         case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
 777         case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
 778         case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
 779         case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
 780     }
 781     /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 782
 783     /*xs, xl give dimension size and data length in local transposed coordinate system
 784       for 0(/1/2): x(/y/z) in original coordinate system*/
 785     for (i=0;i<3;i++) {
 786         switch (pos[i]) {
 787         case 1: xs[i]=1;         xc[i]=0;     xl[i]=C[s];break;
 788         case 2: xs[i]=C[s];      xc[i]=oM[s]; xl[i]=pM[s];break;
 789         case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
 790         }
 791     }
 792     /*input order is different for test program to match FFTW order
 793       (important for complex to real)*/
 794     if (plan->flags&FFT5D_BACKWARD) {
 795         rotate_offsets(xs);
 796         rotate_offsets(xl);
 797         rotate_offsets(xc);
 798         rotate_offsets(NG);
 799         if (plan->flags&FFT5D_ORDER_YZ) {
 800             rotate_offsets(xs);
 801             rotate_offsets(xl);
 802             rotate_offsets(xc);
 803             rotate_offsets(NG);
 804         }
 805     }
 806     if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
 807         xl[0] = rC[s];
 808     }
 809 }
 810
 811 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
 812     int x,y,z,l,t;
 813     int *coor = plan->coor;
 814     int xs[3],xl[3],xc[3],NG[3];
 815     int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
 816     compute_offsets(plan,xs,xl,xc,NG,s);
 817     fprintf(debug,txt,coor[0],coor[1],s);
 818     /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 819     for(z=0;z<xl[2];z++) {
 820         for(y=0;y<xl[1];y++) {
 821             fprintf(debug,"%d %d: ",coor[0],coor[1]);
 822             for (x=0;x<xl[0];x++) {
 823                 for (l=0;l<ll;l++) {
 824                     fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 825                 }
 826                 fprintf(debug,",");
 827             }
 828             fprintf(debug,"\n");
 829         }
 830     }
 831 }
 832
 833 void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
 834     t_complex *lin = plan->lin;
 835     t_complex *lout = plan->lout;
 836     t_complex *lout2 = plan->lout2;
 837     t_complex *lout3 = plan->lout3;
 838     t_complex *fftout,*joinin;
 839
 840     gmx_fft_t **p1d=plan->p1d;
 841 #ifdef FFT5D_MPI_TRANSPOSE
 842     FFTW(plan) *mpip=plan->mpip;
 843 #endif
 844 #ifdef GMX_MPI
 845     MPI_Comm *cart=plan->cart;
 846 #endif
 847
 848     double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
 849     int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
 850         *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
 851     int s=0,tstart,tend,bParallelDim;
 852
 853
 854 #ifdef GMX_FFT_FFTW3
 855     if (plan->p3d)
 856     {
 857         if (thread == 0)
 858         {
 859 #ifdef NOGMX
 860             if (times!=0)
 861             {
 862                 time=MPI_Wtime();
 863             }
 864 #endif
 865             FFTW(execute)(plan->p3d);
 866 #ifdef NOGMX
 867             if (times!=0)
 868             {
 869                 times->fft+=MPI_Wtime()-time;
 870             }
 871 #endif
 872         }
 873         return;
 874     }
 875 #endif
 876
 877         s=0;
 878
 879     /*lin: x,y,z*/
 880         if (plan->flags&FFT5D_DEBUG && thread == 0)
 881         {
 882             print_localdata(lin, "%d %d: copy in lin\n", s, plan);
 883         }
 884
 885         for (s=0;s<2;s++) {  /*loop over first two FFT steps (corner rotations)*/
 886
 887 #ifdef GMX_MPI
 888         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s]!=MPI_COMM_NULL && P[s]>1)
 889         {
 890             bParallelDim = 1;
 891         }
 892         else
 893 #endif
 894         {
 895             bParallelDim = 0;
 896         }
 897
 898         /* ---------- START FFT ------------ */
 899 #ifdef NOGMX
 900         if (times!=0 && thread == 0)
 901         {
 902             time=MPI_Wtime();
 903         }
 904 #endif
 905
 906         if (bParallelDim) {
 907             fftout = lout;
 908         }
 909         else
 910         {
 911             if (s==0)
 912             {
 913                 fftout = lout3;
 914             } else
 915             {
 916                 fftout = lout2;
 917             }
 918         }
 919
 920         tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
 921         if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
 922         {
 923             gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
 924         } else
 925         {
 926             gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,fftout+tstart);
 927
 928         }
 929
 930 #ifdef NOGMX
 931         if (times != NULL && thread == 0)
 932         {
 933             time_fft+=MPI_Wtime()-time;
 934         }
 935 #endif
 936         if (plan->flags&FFT5D_DEBUG && thread == 0)
 937         {
 938             print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 939         }
 940         /* ---------- END FFT ------------ */
 941
 942         /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
 943         if (bParallelDim) {
 944 #ifdef NOGMX
 945             if (times != NULL && thread == 0)
 946             {
 947                 time=MPI_Wtime();
 948             }
 949 #endif
 950             /*prepare for A
 951 llToAll
 952               1. (most outer) axes (x) is split into P[s] parts of size N[s]
 953               for sending*/
 954             if (pM[s]>0)
 955             {
 956                 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
 957                 tstart/=C[s];
 958                 splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
 959             }
 960 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
 961 #ifdef NOGMX
 962             if (times != NULL && thread == 0)
 963             {
 964                 time_local+=MPI_Wtime()-time;
 965             }
 966 #endif
 967
 968         /* ---------- END SPLIT , START TRANSPOSE------------ */
 969
 970             if (thread == 0)
 971             {
 972 #ifdef NOGMX
 973                 if (times!=0)
 974                 {
 975                     time=MPI_Wtime();
 976                 }
 977 #else
 978                 wallcycle_start(times,ewcPME_FFTCOMM);
 979 #endif
 980 #ifdef FFT5D_MPI_TRANSPOSE
 981                 FFTW(execute)(mpip[s]);
 982 #else
 983 #ifdef GMX_MPI
 984                 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
 985                     MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 986                 else
 987                     MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 988 #else
 989                 gmx_incons("fft5d MPI call without MPI configuration");
 990 #endif /*GMX_MPI*/
 991 #endif /*FFT5D_MPI_TRANSPOSE*/
 992 #ifdef NOGMX
 993                 if (times!=0)
 994                 {
 995                     time_mpi[s]=MPI_Wtime()-time;
 996                 }
 997 #else
 998                 wallcycle_stop(times,ewcPME_FFTCOMM);
 999 #endif
1000             }  /*master*/
1001         }  /* bPrallelDim */
1002 #pragma omp barrier  /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
1003
1004         /* ---------- END SPLIT + TRANSPOSE------------ */
1005
1006         /* ---------- START JOIN ------------ */
1007 #ifdef NOGMX
1008         if (times != NULL && thread == 0)
1009         {
1010             time=MPI_Wtime();
1011         }
1012 #endif
1013
1014         if (bParallelDim) {
1015             joinin = lout3;
1016         } else {
1017             joinin = fftout;
1018         }
1019         /*bring back in matrix form
1020           thus make  new 1. axes contiguos
1021           also local transpose 1 and 2/3
1022           runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1023         */
1024         if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
1025             if (pM[s]>0)
1026             {
1027                 tstart = ( thread   *pM[s]*pN[s]/plan->nthreads);
1028                 tend   = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1029                 joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
1030             }
1031         }
1032         else {
1033             if (pN[s]>0)
1034             {
1035                 tstart = ( thread   *pK[s]*pN[s]/plan->nthreads);
1036                 tend   = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1037                 joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
1038             }
1039         }
1040
1041 #ifdef NOGMX
1042         if (times != NULL && thread == 0)
1043         {
1044             time_local+=MPI_Wtime()-time;
1045         }
1046 #endif
1047         if (plan->flags&FFT5D_DEBUG && thread == 0)
1048         {
1049             print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1050         }
1051         /* ---------- END JOIN ------------ */
1052
1053         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1054     }  /* for(s=0;s<2;s++) */
1055 #ifdef NOGMX
1056         if (times != NULL && thread == 0)
1057         {
1058             time=MPI_Wtime();
1059         }
1060 #endif
1061
1062     if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
1063
1064     /*  ----------- FFT ----------- */
1065     tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1066     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
1067         gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
1068     } else {
1069         gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,lout+tstart);
1070     }
1071     /* ------------ END FFT ---------*/
1072
1073 #ifdef NOGMX
1074     if (times != NULL && thread == 0)
1075     {
1076         time_fft+=MPI_Wtime()-time;
1077
1078         times->fft+=time_fft;
1079         times->local+=time_local;
1080         times->mpi2+=time_mpi[1];
1081         times->mpi1+=time_mpi[0];
1082     }
1083 #endif
1084
1085     if (plan->flags&FFT5D_DEBUG && thread == 0)
1086     {
1087         print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1088     }
1089 }
1090
1091 void fft5d_destroy(fft5d_plan plan) {
1092     int s,t;
1093     for (s=0;s<3;s++)
1094     {
1095         if (plan->p1d[s])
1096         {
1097             for (t=0;t<plan->nthreads;t++)
1098             {
1099                 gmx_many_fft_destroy(plan->p1d[s][t]);
1100             }
1101             free(plan->p1d[s]);
1102         }
1103         if (plan->iNin[s])
1104         {
1105             free(plan->iNin[s]);
1106             plan->iNin[s]=0;
1107         }
1108         if (plan->oNin[s])
1109         {
1110             free(plan->oNin[s]);
1111             plan->oNin[s]=0;
1112         }
1113         if (plan->iNout[s])
1114         {
1115             free(plan->iNout[s]);
1116             plan->iNout[s]=0;
1117         }
1118         if (plan->oNout[s])
1119         {
1120             free(plan->oNout[s]);
1121             plan->oNout[s]=0;
1122         }
1123     }
1124 #ifdef GMX_FFT_FFTW3
1125     FFTW_LOCK;
1126 #ifdef FFT5D_MPI_TRANSPOS
1127     for (s=0;s<2;s++)
1128     {
1129         FFTW(destroy_plan)(plan->mpip[s]);
1130     }
1131     if (plan->p3d)
1132     {
1133         FFTW(destroy_plan)(plan->p3d);
1134     }
1135 #endif /* FFT5D_MPI_TRANSPOS */
1136 #endif /* GMX_FFT_FFTW3 */
1137
1138     if (!(plan->flags&FFT5D_NOMALLOC))
1139     {
1140         sfree_aligned(plan->lin);
1141         sfree_aligned(plan->lout);
1142         sfree_aligned(plan->lout2);
1143         sfree_aligned(plan->lout3);
1144     }
1145
1146 #ifdef FFT5D_THREADS
1147 #ifdef FFT5D_FFTW_THREADS
1148     /*FFTW(cleanup_threads)();*/
1149 #endif
1150 #endif
1151
1152     free(plan);
1153 }
1154
1155 /*Is this better than direct access of plan? enough data?
1156   here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1157 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
1158     *N1=plan->N[0];
1159     *M0=plan->M[0];
1160     *K1=plan->K[0];
1161     *K0=plan->N[1];
1162
1163     *coor=plan->coor;
1164 }
1165
1166
1167 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1168   of processor dimensions*/
1169 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
1170     MPI_Comm cart[2]={0};
1171 #ifdef GMX_MPI
1172     int size=1,prank=0;
1173     int P[2];
1174     int coor[2];
1175     int wrap[]={0,0};
1176     MPI_Comm gcart;
1177     int rdim1[] = {0,1}, rdim2[] = {1,0};
1178
1179     MPI_Comm_size(comm,&size);
1180     MPI_Comm_rank(comm,&prank);
1181
1182     if (P0==0) P0 = lfactor(size);
1183     if (size%P0!=0)
1184     {
1185         if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1186         P0 = lfactor(size);
1187     }
1188
1189     P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1190
1191     /*Difference between x-y-z regarding 2d decomposition is whether they are
1192       distributed along axis 1, 2 or both*/
1193
1194     MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1195     MPI_Cart_get(gcart,2,P,wrap,coor);
1196     MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1197     MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1198 #endif
1199     return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
1200 }
1201
1202
1203
1204 /*prints in original coordinate system of data (as the input to FFT)*/
1205 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1206     int xs[3],xl[3],xc[3],NG[3];
1207     int x,y,z,l;
1208     int *coor = plan->coor;
1209     int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1210     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1211     {
1212         ll=1;
1213     }
1214
1215     compute_offsets(plan,xs,xl,xc,NG,2);
1216     if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1217     for (z=0;z<xl[2];z++) {
1218         for(y=0;y<xl[1];y++) {
1219             if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1220             for (x=0;x<xl[0];x++) {
1221                 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1222                     real a,b;
1223                     a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1224                     if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1225                     if (!bothLocal)
1226                         b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1227                     else
1228                         b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1229                     if (plan->flags&FFT5D_DEBUG) {
1230                         printf("%f %f, ",a,b);
1231                     } else {
1232                         if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1233                             printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1234                         }
1235 /*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1236                     }
1237                 }
1238                 if (plan->flags&FFT5D_DEBUG) printf(",");
1239             }
1240             if (plan->flags&FFT5D_DEBUG) printf("\n");
1241         }
1242     }
1243
1244 }
1245