src/gromacs/mdlib/fft5d.cpp

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *                This source code is part of
   4  *
   5  *                 G   R   O   M   A   C   S
   6  *
   7  *          GROningen MAchine for Chemical Simulations
   8  *
   9  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  10  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  11  * Copyright (c) 2001-2012, The GROMACS development team,
  12  * check out http://www.gromacs.org for more information.
  13
  14  * This program is free software; you can redistribute it and/or
  15  * modify it under the terms of the GNU General Public License
  16  * as published by the Free Software Foundation; either version 2
  17  * of the License, or (at your option) any later version.
  18  *
  19  * If you want to redistribute modifications, please consider that
  20  * scientific software is very special. Version control is crucial -
  21  * bugs must be traceable. We will be happy to consider code for
  22  * inclusion in the official distribution, but derived work must not
  23  * be called official GROMACS. Details are found in the README & COPYING
  24  * files - if they are missing, get the official version at www.gromacs.org.
  25  *
  26  * To help us fund GROMACS development, we humbly ask that you cite
  27  * the papers on the package - you can find them in the top README file.
  28  *
  29  * For more info, check our website at http://www.gromacs.org
  30  *
  31  * And Hey:
  32  * Groningen Machine for Chemical Simulation
  33  */
  34 #ifdef HAVE_CONFIG_H
  35 #include <config.h>
  36 #endif
  37
  38 #include <algorithm>
  39
  40 #include <stdio.h>
  41 #include <stdlib.h>
  42 #include <string.h>
  43
  44 #ifdef NOGMX
  45 #define GMX_PARALLEL_ENV_INITIALIZED 1
  46 #else
  47 #ifdef GMX_MPI
  48 #define GMX_PARALLEL_ENV_INITIALIZED 1
  49 #else
  50 #define GMX_PARALLEL_ENV_INITIALIZED 0
  51 #endif
  52 #endif
  53
  54 #ifdef GMX_LIB_MPI
  55 #include <mpi.h>
  56 #endif
  57 #ifdef GMX_THREAD_MPI
  58 #include "tmpi.h"
  59 #endif
  60
  61 #ifdef GMX_OPENMP
  62 /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? */
  63 #define FFT5D_THREADS
  64 #endif
  65 #ifdef FFT5D_THREADS
  66 #include "gmx_omp.h"
  67 /* requires fftw compiled with openmp */
  68 /* #define FFT5D_FFTW_THREADS (now set by cmake) */
  69 #endif
  70
  71 #include "fft5d.h"
  72 #include <float.h>
  73 #include <math.h>
  74 #include <assert.h>
  75 #include "smalloc.h"
  76
  77 #ifndef __FLT_EPSILON__
  78 #define __FLT_EPSILON__ FLT_EPSILON
  79 #define __DBL_EPSILON__ DBL_EPSILON
  80 #endif
  81
  82 #ifdef NOGMX
  83 FILE* debug=0;
  84 #endif
  85
  86 #include "gmx_fatal.h"
  87
  88
  89 #ifdef GMX_FFT_FFTW3
  90 #include "thread_mpi/mutex.h"
  91 #include "gromacs/utility/exceptions.h"
  92 /* none of the fftw3 calls, except execute(), are thread-safe, so
  93    we need to serialize them with this mutex. */
  94 static tMPI::mutex big_fftw_mutex;
  95 #define FFTW_LOCK try { big_fftw_mutex.lock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  96 #define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  97 #endif /* GMX_FFT_FFTW3 */
  98
  99 /* largest factor smaller than sqrt */
 100 static int lfactor(int z) {
 101         int i;
 102         for (i=static_cast<int>(sqrt(static_cast<double>(z)));;i--)
 103                 if (z%i==0) return i;
 104         return 1;
 105 }
 106
 107 /* largest factor */
 108 static int l2factor(int z) {
 109         int i;
 110         if (z==1) return 1;
 111         for (i=z/2;;i--)
 112                 if (z%i==0) return i;
 113         return 1;
 114 }
 115
 116 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
 117 static int lpfactor(int z) {
 118         int f = l2factor(z);
 119         if (f==1) return z;
 120         return std::max(lpfactor(f),lpfactor(z/f));
 121 }
 122
 123 #ifndef GMX_MPI
 124 #ifdef HAVE_GETTIMEOFDAY
 125 #include <sys/time.h>
 126 double MPI_Wtime() {
 127     struct timeval tv;
 128     gettimeofday(&tv,0);
 129     return tv.tv_sec+tv.tv_usec*1e-6;
 130 }
 131 #else
 132 double MPI_Wtime() {
 133     return 0.0;
 134 }
 135 #endif
 136 #endif
 137
 138 static int vmax(int* a, int s) {
 139     int i,max=0;
 140     for (i=0;i<s;i++)
 141     {
 142         if (a[i]>max) max=a[i];
 143     }
 144     return max;
 145 }
 146
 147
 148 /* NxMxK the size of the data
 149  * comm communicator to use for fft5d
 150  * P0 number of processor in 1st axes (can be null for automatic)
 151  * lin is allocated by fft5d because size of array is only known after planning phase
 152  * rlout2 is only used as intermediate buffer - only returned after allocation to reuse for back transform - should not be used by caller
 153 */
 154 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
 155 {
 156
 157     int P[2],bMaster,prank[2],i,t;
 158     int rNG,rMG,rKG;
 159     int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
 160     int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
 161     int C[3],rC[3],nP[2];
 162     int lsize;
 163     t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
 164     fft5d_plan plan;
 165     int s;
 166
 167     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 168 #ifdef GMX_MPI
 169     if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
 170     {
 171         MPI_Comm_size(comm[0],&P[0]);
 172         MPI_Comm_rank(comm[0],&prank[0]);
 173     }
 174     else
 175 #endif
 176     {
 177         P[0] = 1;
 178         prank[0] = 0;
 179     }
 180 #ifdef GMX_MPI
 181     if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
 182     {
 183         MPI_Comm_size(comm[1],&P[1]);
 184         MPI_Comm_rank(comm[1],&prank[1]);
 185     }
 186     else
 187 #endif
 188     {
 189         P[1] = 1;
 190         prank[1] = 0;
 191     }
 192
 193     bMaster=(prank[0]==0&&prank[1]==0);
 194
 195
 196     if (debug)
 197     {
 198         fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
 199                 P[0],P[1],prank[0],prank[1]);
 200     }
 201
 202     if (bMaster) {
 203         if (debug)
 204             fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 205                 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
 206         /* The check below is not correct, one prime factor 11 or 13 is ok.
 207         if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 208             printf("WARNING: FFT very slow with prime factors larger 7\n");
 209             printf("Change FFT size or in case you cannot change it look at\n");
 210             printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 211         }
 212         */
 213     }
 214
 215     if (NG==0 || MG==0 || KG==0) {
 216         if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 217         return 0;
 218     }
 219
 220     rNG=NG;rMG=MG;rKG=KG;
 221
 222     if (flags&FFT5D_REALCOMPLEX) {
 223         if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
 224         else {
 225             if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
 226             else KG=KG/2+1;
 227         }
 228     }
 229
 230
 231     /*for transpose we need to know the size for each processor not only our own size*/
 232
 233     N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
 234     M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
 235     K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
 236     oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
 237     oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
 238     oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
 239
 240     for (i=0;i<P[0];i++)
 241     {
 242         #define EVENDIST
 243         #ifndef EVENDIST
 244         oN0[i]=i*ceil((double)NG/P[0]);
 245         oM0[i]=i*ceil((double)MG/P[0]);
 246         oK0[i]=i*ceil((double)KG/P[0]);
 247         #else
 248         oN0[i]=(NG*i)/P[0];
 249         oM0[i]=(MG*i)/P[0];
 250         oK0[i]=(KG*i)/P[0];
 251         #endif
 252     }
 253     for (i=0;i<P[1];i++)
 254     {
 255         #ifndef EVENDIST
 256         oN1[i]=i*ceil((double)NG/P[1]);
 257         oM1[i]=i*ceil((double)MG/P[1]);
 258         oK1[i]=i*ceil((double)KG/P[1]);
 259         #else
 260         oN1[i]=(NG*i)/P[1];
 261         oM1[i]=(MG*i)/P[1];
 262         oK1[i]=(KG*i)/P[1];
 263         #endif
 264     }
 265     for (i=0;i<P[0]-1;i++)
 266     {
 267         N0[i]=oN0[i+1]-oN0[i];
 268         M0[i]=oM0[i+1]-oM0[i];
 269         K0[i]=oK0[i+1]-oK0[i];
 270     }
 271     N0[P[0]-1]=NG-oN0[P[0]-1];
 272     M0[P[0]-1]=MG-oM0[P[0]-1];
 273     K0[P[0]-1]=KG-oK0[P[0]-1];
 274     for (i=0;i<P[1]-1;i++)
 275     {
 276         N1[i]=oN1[i+1]-oN1[i];
 277         M1[i]=oM1[i+1]-oM1[i];
 278         K1[i]=oK1[i+1]-oK1[i];
 279     }
 280     N1[P[1]-1]=NG-oN1[P[1]-1];
 281     M1[P[1]-1]=MG-oM1[P[1]-1];
 282     K1[P[1]-1]=KG-oK1[P[1]-1];
 283
 284     /* for step 1-3 the local N,M,K sizes of the transposed system
 285        C: contiguous dimension, and nP: number of processor in subcommunicator
 286        for that step */
 287
 288
 289     pM[0] = M0[prank[0]];
 290     oM[0] = oM0[prank[0]];
 291     pK[0] = K1[prank[1]];
 292     oK[0] = oK1[prank[1]];
 293     C[0] = NG;
 294     rC[0] = rNG;
 295     if (!(flags&FFT5D_ORDER_YZ)) {
 296         N[0] = vmax(N1,P[1]);
 297         M[0] = M0[prank[0]];
 298         K[0] = vmax(K1,P[1]);
 299         pN[0] = N1[prank[1]];
 300         iNout[0] = N1;
 301         oNout[0] = oN1;
 302         nP[0] = P[1];
 303         C[1] = KG;
 304         rC[1] =rKG;
 305         N[1] = vmax(K0,P[0]);
 306         pN[1] = K0[prank[0]];
 307         iNin[1] = K1;
 308         oNin[1] = oK1;
 309         iNout[1] = K0;
 310         oNout[1] = oK0;
 311         M[1] = vmax(M0,P[0]);
 312         pM[1] = M0[prank[0]];
 313         oM[1] = oM0[prank[0]];
 314         K[1] = N1[prank[1]];
 315         pK[1] = N1[prank[1]];
 316         oK[1] = oN1[prank[1]];
 317         nP[1] = P[0];
 318         C[2] = MG;
 319         rC[2] = rMG;
 320         iNin[2] = M0;
 321         oNin[2] = oM0;
 322         M[2] = vmax(K0,P[0]);
 323         pM[2] = K0[prank[0]];
 324         oM[2] = oK0[prank[0]];
 325         K[2] = vmax(N1,P[1]);
 326         pK[2] = N1[prank[1]];
 327         oK[2] = oN1[prank[1]];
 328         free(N0); free(oN0); /*these are not used for this order*/
 329         free(M1); free(oM1); /*the rest is freed in destroy*/
 330     } else {
 331         N[0] = vmax(N0,P[0]);
 332         M[0] = vmax(M0,P[0]);
 333         K[0] = K1[prank[1]];
 334         pN[0] = N0[prank[0]];
 335         iNout[0] = N0;
 336         oNout[0] = oN0;
 337         nP[0] = P[0];
 338         C[1] = MG;
 339         rC[1] =rMG;
 340         N[1] = vmax(M1,P[1]);
 341         pN[1] = M1[prank[1]];
 342         iNin[1] = M0;
 343         oNin[1] = oM0;
 344         iNout[1] = M1;
 345         oNout[1] = oM1;
 346         M[1] = N0[prank[0]];
 347         pM[1] = N0[prank[0]];
 348         oM[1] = oN0[prank[0]];
 349         K[1] = vmax(K1,P[1]);
 350         pK[1] = K1[prank[1]];
 351         oK[1] = oK1[prank[1]];
 352         nP[1] = P[1];
 353         C[2] = KG;
 354         rC[2] = rKG;
 355         iNin[2] = K1;
 356         oNin[2] = oK1;
 357         M[2] = vmax(N0,P[0]);
 358         pM[2] = N0[prank[0]];
 359         oM[2] = oN0[prank[0]];
 360         K[2] = vmax(M1,P[1]);
 361         pK[2] = M1[prank[1]];
 362         oK[2] = oM1[prank[1]];
 363         free(N1); free(oN1); /*these are not used for this order*/
 364         free(K0); free(oK0); /*the rest is freed in destroy*/
 365     }
 366     N[2]=pN[2]=-1; /*not used*/
 367
 368     /*
 369       Difference between x-y-z regarding 2d decomposition is whether they are
 370       distributed along axis 1, 2 or both
 371     */
 372
 373     /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 374     lsize = std::max(N[0]*M[0]*K[0]*nP[0],std::max(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
 375     /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 376     if (!(flags&FFT5D_NOMALLOC)) {
 377         snew_aligned(lin, lsize, 32);
 378         snew_aligned(lout, lsize, 32);
 379         snew_aligned(lout2, lsize, 32);
 380         snew_aligned(lout3, lsize, 32);
 381     } else {
 382         lin = *rlin;
 383         lout = *rlout;
 384         lout2 = *rlout2;
 385         lout3 = *rlout3;
 386     }
 387
 388     plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
 389
 390
 391     if (debug)
 392     {
 393         fprintf(debug, "Running on %d threads\n",nthreads);
 394     }
 395
 396 #ifdef GMX_FFT_FFTW3  /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
 397     /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
 398      * within a parallel region. For now deactivated. If it should be supported it has to made sure that
 399      * that the execute of the 3d plan is in a master/serial block (since it contains it own parallel region)
 400      * and that the 3d plan is faster than the 1d plan.
 401      */
 402     if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) {  /*don't do 3d plan in parallel or if in_place requested */
 403             int fftwflags=FFTW_DESTROY_INPUT;
 404             FFTW(iodim) dims[3];
 405             int inNG=NG,outMG=MG,outKG=KG;
 406
 407             FFTW_LOCK;
 408             if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
 409             if (flags&FFT5D_REALCOMPLEX) {
 410                 if (!(flags&FFT5D_BACKWARD)) {  /*input pointer is not complex*/
 411                     inNG*=2;
 412                 } else {                        /*output pointer is not complex*/
 413                     if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
 414                     else outKG*=2;
 415                 }
 416             }
 417
 418             if (!(flags&FFT5D_BACKWARD)) {
 419                 dims[0].n  = KG;
 420                 dims[1].n  = MG;
 421                 dims[2].n  = rNG;
 422
 423                 dims[0].is = inNG*MG;     /*N M K*/
 424                 dims[1].is = inNG;
 425                 dims[2].is = 1;
 426                 if (!(flags&FFT5D_ORDER_YZ)) {
 427                     dims[0].os = MG;       /*M K N*/
 428                     dims[1].os = 1;
 429                     dims[2].os = MG*KG;
 430                 } else  {
 431                     dims[0].os = 1;       /*K N M*/
 432                     dims[1].os = KG*NG;
 433                     dims[2].os = KG;
 434                 }
 435             } else {
 436                 if (!(flags&FFT5D_ORDER_YZ)) {
 437                     dims[0].n  = NG;
 438                     dims[1].n  = KG;
 439                     dims[2].n  = rMG;
 440
 441                     dims[0].is = 1;
 442                     dims[1].is = NG*MG;
 443                     dims[2].is = NG;
 444
 445                     dims[0].os = outMG*KG;
 446                     dims[1].os = outMG;
 447                     dims[2].os = 1;
 448                 } else {
 449                     dims[0].n  = MG;
 450                     dims[1].n  = NG;
 451                     dims[2].n  = rKG;
 452
 453                     dims[0].is = NG;
 454                     dims[1].is = 1;
 455                     dims[2].is = NG*MG;
 456
 457                     dims[0].os = outKG*NG;
 458                     dims[1].os = outKG;
 459                     dims[2].os = 1;
 460                 }
 461             }
 462 #ifdef FFT5D_THREADS
 463 #ifdef FFT5D_FFTW_THREADS
 464             FFTW(plan_with_nthreads)(nthreads);
 465 #endif
 466 #endif
 467             if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
 468                 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 469                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 470                                      (real*)lin, (FFTW(complex) *)lout,
 471                                      /*flags*/ fftwflags);
 472             } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
 473                 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 474                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 475                                      (FFTW(complex) *)lin, (real*)lout,
 476                                      /*flags*/ fftwflags);
 477             } else {
 478                 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 479                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 480                                      (FFTW(complex) *)lin, (FFTW(complex) *)lout,
 481                                      /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
 482             }
 483 #ifdef FFT5D_THREADS
 484 #ifdef FFT5D_FFTW_THREADS
 485             FFTW(plan_with_nthreads)(1);
 486 #endif
 487 #endif
 488             FFTW_UNLOCK;
 489     }
 490     if (!plan->p3d) {  /* for decomposition and if 3d plan did not work */
 491 #endif /* GMX_FFT_FFTW3 */
 492         for (s=0;s<3;s++) {
 493             if (debug)
 494             {
 495                 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 496                         s,rC[s],M[s],pK[s],C[s],lsize);
 497             }
 498             plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
 499
 500             /* Make sure that the init routines are only called by one thread at a time and in order
 501                (later is only important to not confuse valgrind)
 502              */
 503 #pragma omp parallel for num_threads(nthreads) schedule(static) ordered
 504             for(t=0; t<nthreads; t++)
 505             {
 506                 int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
 507
 508                 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
 509                     gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 510                 } else {
 511                     gmx_fft_init_many_1d     ( &plan->p1d[s][t],  C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 512                 }
 513             }
 514         }
 515
 516 #ifdef GMX_FFT_FFTW3
 517     }
 518 #endif
 519     if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
 520         plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
 521     } else {
 522         plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
 523     }
 524 #ifdef FFT5D_MPI_TRANSPOSE
 525     FFTW_LOCK;
 526     for (s=0;s<2;s++) {
 527         if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
 528             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 529         else
 530             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
 531     }
 532     FFTW_UNLOCK;
 533 #endif
 534
 535
 536     plan->lin=lin;
 537     plan->lout=lout;
 538     plan->lout2=lout2;
 539     plan->lout3=lout3;
 540
 541     plan->NG=NG;plan->MG=MG;plan->KG=KG;
 542
 543     for (s=0;s<3;s++) {
 544         plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
 545         plan->oM[s]=oM[s];plan->oK[s]=oK[s];
 546         plan->C[s]=C[s];plan->rC[s]=rC[s];
 547         plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
 548     }
 549     for (s=0;s<2;s++) {
 550         plan->P[s]=nP[s];plan->coor[s]=prank[s];
 551     }
 552
 553 /*    plan->fftorder=fftorder;
 554     plan->direction=direction;
 555     plan->realcomplex=realcomplex;
 556 */
 557     plan->flags=flags;
 558     plan->nthreads=nthreads;
 559     *rlin=lin;
 560     *rlout=lout;
 561     *rlout2=lout2;
 562     *rlout3=lout3;
 563     return plan;
 564 }
 565
 566
 567 enum order {
 568     XYZ,
 569     XZY,
 570     YXZ,
 571     YZX,
 572     ZXY,
 573     ZYX
 574 };
 575
 576
 577
 578 /*here x,y,z and N,M,K is in rotated coordinate system!!
 579   x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
 580   maxN,maxM,maxK is max size of local data
 581   pN, pM, pK is local size specific to current processor (only different to max if not divisible)
 582   NG, MG, KG is size of global data*/
 583 static void splitaxes(t_complex* lout,const t_complex* lin,
 584                       int maxN,int maxM,int maxK, int pN, int pM, int pK,
 585                       int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
 586 {
 587     int x,y,z,i;
 588     int in_i,out_i,in_z,out_z,in_y,out_y;
 589     int s_y,e_y;
 590
 591     for (z=startz; z<endz+1; z++) /*3. z l*/
 592     {
 593         if (z==startz) {
 594             s_y=starty;
 595         } else {
 596             s_y=0;
 597         }
 598         if (z==endz) {
 599             e_y=endy;
 600         } else {
 601             e_y=pM;
 602         }
 603         out_z  = z*maxN*maxM;
 604         in_z = z*NG*pM;
 605
 606         for (i=0; i<P; i++) /*index cube along long axis*/
 607         {
 608             out_i  = out_z  + i*maxN*maxM*maxK;
 609             in_i = in_z + oN[i];
 610             for (y=s_y;y<e_y;y++) { /*2. y k*/
 611                 out_y  = out_i  + y*maxN;
 612                 in_y = in_i + y*NG;
 613                 for (x=0;x<N[i];x++) { /*1. x j*/
 614                     lout[out_y+x] = lin[in_y+x];    /*in=z*NG*pM+oN[i]+y*NG+x*/
 615                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
 616                     /*before split data contiguos - thus if different processor get different amount oN is different*/
 617                 }
 618             }
 619         }
 620     }
 621 }
 622
 623 /*make axis contiguous again (after AllToAll) and also do local transpose*/
 624 /*transpose mayor and major dimension
 625   variables see above
 626   the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
 627   N,M,K local dimensions
 628   KG global size*/
 629 static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
 630                             int maxN,int maxM,int maxK,int pN, int pM, int pK,
 631                             int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
 632 {
 633     int i,x,y,z;
 634     int out_i,in_i,out_x,in_x,out_z,in_z;
 635     int s_y,e_y;
 636
 637     for (x=startx;x<endx+1;x++) /*1.j*/
 638     {
 639         if (x==startx)
 640         {
 641             s_y=starty;
 642         }
 643         else
 644         {
 645             s_y=0;
 646         }
 647         if (x==endx)
 648         {
 649             e_y=endy;
 650         }
 651         else
 652         {
 653             e_y=pM;
 654         }
 655
 656         out_x  = x*KG*pM;
 657         in_x = x;
 658
 659         for (i=0;i<P;i++) /*index cube along long axis*/
 660         {
 661             out_i  = out_x  + oK[i];
 662             in_i = in_x + i*maxM*maxN*maxK;
 663             for (z=0;z<K[i];z++) /*3.l*/
 664             {
 665                 out_z  = out_i  + z;
 666                 in_z = in_i + z*maxM*maxN;
 667                 for (y=s_y;y<e_y;y++) /*2.k*/
 668                 {
 669                     lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
 670                 }
 671             }
 672         }
 673     }
 674 }
 675
 676 /*make axis contiguous again (after AllToAll) and also do local transpose
 677   tranpose mayor and middle dimension
 678   variables see above
 679   the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
 680   N,M,K local size
 681   MG, global size*/
 682 static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
 683                             int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
 684     int i,z,y,x;
 685     int out_i,in_i,out_z,in_z,out_x,in_x;
 686     int s_x,e_x;
 687
 688     for (z=startz; z<endz+1; z++)
 689     {
 690         if (z==startz)
 691         {
 692             s_x=startx;
 693         }
 694         else
 695         {
 696             s_x=0;
 697         }
 698         if (z==endz)
 699         {
 700             e_x=endx;
 701         }
 702         else
 703         {
 704             e_x=pN;
 705         }
 706         out_z  = z*MG*pN;
 707         in_z = z*maxM*maxN;
 708
 709         for (i=0; i<P; i++) /*index cube along long axis*/
 710         {
 711             out_i  = out_z  + oM[i];
 712             in_i = in_z + i*maxM*maxN*maxK;
 713             for (x=s_x;x<e_x;x++)
 714             {
 715                 out_x  = out_i  + x*MG;
 716                 in_x = in_i + x;
 717                 for (y=0;y<M[i];y++)
 718                 {
 719                     lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
 720                 }
 721             }
 722         }
 723     }
 724 }
 725
 726
 727 static void rotate_offsets(int x[]) {
 728     int t=x[0];
 729 /*    x[0]=x[2];
 730     x[2]=x[1];
 731     x[1]=t;*/
 732     x[0]=x[1];
 733     x[1]=x[2];
 734     x[2]=t;
 735 }
 736
 737 /*compute the offset to compare or print transposed local data in original input coordinates
 738   xs matrix dimension size, xl dimension length, xc decomposition offset
 739   s: step in computation = number of transposes*/
 740 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
 741 /*    int direction = plan->direction;
 742     int fftorder = plan->fftorder;*/
 743
 744     int o;
 745     int pos[3],i;
 746     int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
 747         *C=plan->C, *rC=plan->rC;
 748
 749     NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
 750
 751     if (!(plan->flags&FFT5D_ORDER_YZ)) {
 752         switch (s) {
 753         case 0: o=XYZ; break;
 754         case 1: o=ZYX; break;
 755         case 2: o=YZX; break;
 756         default: assert(0);
 757         }
 758     } else {
 759         switch (s) {
 760         case 0: o=XYZ; break;
 761         case 1: o=YXZ; break;
 762         case 2: o=ZXY; break;
 763         default: assert(0);
 764         }
 765     }
 766
 767     switch (o) {
 768         case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
 769         case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
 770         case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
 771         case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
 772         case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
 773         case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
 774     }
 775     /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 776
 777     /*xs, xl give dimension size and data length in local transposed coordinate system
 778       for 0(/1/2): x(/y/z) in original coordinate system*/
 779     for (i=0;i<3;i++) {
 780         switch (pos[i]) {
 781         case 1: xs[i]=1;         xc[i]=0;     xl[i]=C[s];break;
 782         case 2: xs[i]=C[s];      xc[i]=oM[s]; xl[i]=pM[s];break;
 783         case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
 784         }
 785     }
 786     /*input order is different for test program to match FFTW order
 787       (important for complex to real)*/
 788     if (plan->flags&FFT5D_BACKWARD) {
 789         rotate_offsets(xs);
 790         rotate_offsets(xl);
 791         rotate_offsets(xc);
 792         rotate_offsets(NG);
 793         if (plan->flags&FFT5D_ORDER_YZ) {
 794             rotate_offsets(xs);
 795             rotate_offsets(xl);
 796             rotate_offsets(xc);
 797             rotate_offsets(NG);
 798         }
 799     }
 800     if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
 801         xl[0] = rC[s];
 802     }
 803 }
 804
 805 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
 806     int x,y,z,l;
 807     int *coor = plan->coor;
 808     int xs[3],xl[3],xc[3],NG[3];
 809     int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
 810     compute_offsets(plan,xs,xl,xc,NG,s);
 811     fprintf(debug,txt,coor[0],coor[1],s);
 812     /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 813     for(z=0;z<xl[2];z++) {
 814         for(y=0;y<xl[1];y++) {
 815             fprintf(debug,"%d %d: ",coor[0],coor[1]);
 816             for (x=0;x<xl[0];x++) {
 817                 for (l=0;l<ll;l++) {
 818                     fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 819                 }
 820                 fprintf(debug,",");
 821             }
 822             fprintf(debug,"\n");
 823         }
 824     }
 825 }
 826
 827 void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
 828     t_complex *lin = plan->lin;
 829     t_complex *lout = plan->lout;
 830     t_complex *lout2 = plan->lout2;
 831     t_complex *lout3 = plan->lout3;
 832     t_complex *fftout,*joinin;
 833
 834     gmx_fft_t **p1d=plan->p1d;
 835 #ifdef FFT5D_MPI_TRANSPOSE
 836     FFTW(plan) *mpip=plan->mpip;
 837 #endif
 838 #ifdef GMX_MPI
 839     MPI_Comm *cart=plan->cart;
 840 #endif
 841 #ifdef NOGMX
 842     double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
 843 #endif
 844     int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
 845         *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
 846     int s=0,tstart,tend,bParallelDim;
 847
 848
 849 #ifdef GMX_FFT_FFTW3
 850     if (plan->p3d)
 851     {
 852         if (thread == 0)
 853         {
 854 #ifdef NOGMX
 855             if (times!=0)
 856             {
 857                 time=MPI_Wtime();
 858             }
 859 #endif
 860             FFTW(execute)(plan->p3d);
 861 #ifdef NOGMX
 862             if (times!=0)
 863             {
 864                 times->fft+=MPI_Wtime()-time;
 865             }
 866 #endif
 867         }
 868         return;
 869     }
 870 #endif
 871
 872         s=0;
 873
 874     /*lin: x,y,z*/
 875         if (plan->flags&FFT5D_DEBUG && thread == 0)
 876         {
 877             print_localdata(lin, "%d %d: copy in lin\n", s, plan);
 878         }
 879
 880         for (s=0;s<2;s++) {  /*loop over first two FFT steps (corner rotations)*/
 881
 882 #ifdef GMX_MPI
 883         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s]!=MPI_COMM_NULL && P[s]>1)
 884         {
 885             bParallelDim = 1;
 886         }
 887         else
 888 #endif
 889         {
 890             bParallelDim = 0;
 891         }
 892
 893         /* ---------- START FFT ------------ */
 894 #ifdef NOGMX
 895         if (times!=0 && thread == 0)
 896         {
 897             time=MPI_Wtime();
 898         }
 899 #endif
 900
 901         if (bParallelDim) {
 902             fftout = lout;
 903         }
 904         else
 905         {
 906             if (s==0)
 907             {
 908                 fftout = lout3;
 909             } else
 910             {
 911                 fftout = lout2;
 912             }
 913         }
 914
 915         tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
 916         if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
 917         {
 918             gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
 919         } else
 920         {
 921             gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,fftout+tstart);
 922
 923         }
 924
 925 #ifdef NOGMX
 926         if (times != NULL && thread == 0)
 927         {
 928             time_fft+=MPI_Wtime()-time;
 929         }
 930 #endif
 931         if (plan->flags&FFT5D_DEBUG && thread == 0)
 932         {
 933             print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 934         }
 935         /* ---------- END FFT ------------ */
 936
 937         /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
 938         if (bParallelDim) {
 939 #ifdef NOGMX
 940             if (times != NULL && thread == 0)
 941             {
 942                 time=MPI_Wtime();
 943             }
 944 #endif
 945             /*prepare for A
 946 llToAll
 947               1. (most outer) axes (x) is split into P[s] parts of size N[s]
 948               for sending*/
 949             if (pM[s]>0)
 950             {
 951                 tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
 952                 tstart/=C[s];
 953                 splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
 954             }
 955 #pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
 956 #ifdef NOGMX
 957             if (times != NULL && thread == 0)
 958             {
 959                 time_local+=MPI_Wtime()-time;
 960             }
 961 #endif
 962
 963         /* ---------- END SPLIT , START TRANSPOSE------------ */
 964
 965             if (thread == 0)
 966             {
 967 #ifdef NOGMX
 968                 if (times!=0)
 969                 {
 970                     time=MPI_Wtime();
 971                 }
 972 #else
 973                 wallcycle_start(times,ewcPME_FFTCOMM);
 974 #endif
 975 #ifdef FFT5D_MPI_TRANSPOSE
 976                 FFTW(execute)(mpip[s]);
 977 #else
 978 #ifdef GMX_MPI
 979                 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
 980                     MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 981                 else
 982                     MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 983 #else
 984                 gmx_incons("fft5d MPI call without MPI configuration");
 985 #endif /*GMX_MPI*/
 986 #endif /*FFT5D_MPI_TRANSPOSE*/
 987 #ifdef NOGMX
 988                 if (times!=0)
 989                 {
 990                     time_mpi[s]=MPI_Wtime()-time;
 991                 }
 992 #else
 993                 wallcycle_stop(times,ewcPME_FFTCOMM);
 994 #endif
 995             }  /*master*/
 996         }  /* bPrallelDim */
 997 #pragma omp barrier  /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
 998
 999         /* ---------- END SPLIT + TRANSPOSE------------ */
1000
1001         /* ---------- START JOIN ------------ */
1002 #ifdef NOGMX
1003         if (times != NULL && thread == 0)
1004         {
1005             time=MPI_Wtime();
1006         }
1007 #endif
1008
1009         if (bParallelDim) {
1010             joinin = lout3;
1011         } else {
1012             joinin = fftout;
1013         }
1014         /*bring back in matrix form
1015           thus make  new 1. axes contiguos
1016           also local transpose 1 and 2/3
1017           runs on thread used for following FFT (thus needing a barrier before but not afterwards)
1018         */
1019         if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
1020             if (pM[s]>0)
1021             {
1022                 tstart = ( thread   *pM[s]*pN[s]/plan->nthreads);
1023                 tend   = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
1024                 joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
1025             }
1026         }
1027         else {
1028             if (pN[s]>0)
1029             {
1030                 tstart = ( thread   *pK[s]*pN[s]/plan->nthreads);
1031                 tend   = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
1032                 joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
1033             }
1034         }
1035
1036 #ifdef NOGMX
1037         if (times != NULL && thread == 0)
1038         {
1039             time_local+=MPI_Wtime()-time;
1040         }
1041 #endif
1042         if (plan->flags&FFT5D_DEBUG && thread == 0)
1043         {
1044             print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
1045         }
1046         /* ---------- END JOIN ------------ */
1047
1048         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
1049     }  /* for(s=0;s<2;s++) */
1050 #ifdef NOGMX
1051         if (times != NULL && thread == 0)
1052         {
1053             time=MPI_Wtime();
1054         }
1055 #endif
1056
1057     if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
1058
1059     /*  ----------- FFT ----------- */
1060     tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
1061     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
1062         gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
1063     } else {
1064         gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,lout+tstart);
1065     }
1066     /* ------------ END FFT ---------*/
1067
1068 #ifdef NOGMX
1069     if (times != NULL && thread == 0)
1070     {
1071         time_fft+=MPI_Wtime()-time;
1072
1073         times->fft+=time_fft;
1074         times->local+=time_local;
1075         times->mpi2+=time_mpi[1];
1076         times->mpi1+=time_mpi[0];
1077     }
1078 #endif
1079
1080     if (plan->flags&FFT5D_DEBUG && thread == 0)
1081     {
1082         print_localdata(lout, "%d %d: FFT %d\n", s, plan);
1083     }
1084 }
1085
1086 void fft5d_destroy(fft5d_plan plan) {
1087     int s,t;
1088     for (s=0;s<3;s++)
1089     {
1090         if (plan->p1d[s])
1091         {
1092             for (t=0;t<plan->nthreads;t++)
1093             {
1094                 gmx_many_fft_destroy(plan->p1d[s][t]);
1095             }
1096             free(plan->p1d[s]);
1097         }
1098         if (plan->iNin[s])
1099         {
1100             free(plan->iNin[s]);
1101             plan->iNin[s]=0;
1102         }
1103         if (plan->oNin[s])
1104         {
1105             free(plan->oNin[s]);
1106             plan->oNin[s]=0;
1107         }
1108         if (plan->iNout[s])
1109         {
1110             free(plan->iNout[s]);
1111             plan->iNout[s]=0;
1112         }
1113         if (plan->oNout[s])
1114         {
1115             free(plan->oNout[s]);
1116             plan->oNout[s]=0;
1117         }
1118     }
1119 #ifdef GMX_FFT_FFTW3
1120     FFTW_LOCK;
1121 #ifdef FFT5D_MPI_TRANSPOS
1122     for (s=0;s<2;s++)
1123     {
1124         FFTW(destroy_plan)(plan->mpip[s]);
1125     }
1126 #endif /* FFT5D_MPI_TRANSPOS */
1127     if (plan->p3d)
1128     {
1129         FFTW(destroy_plan)(plan->p3d);
1130     }
1131     FFTW_UNLOCK;
1132 #endif /* GMX_FFT_FFTW3 */
1133
1134     if (!(plan->flags&FFT5D_NOMALLOC))
1135     {
1136         sfree_aligned(plan->lin);
1137         sfree_aligned(plan->lout);
1138         sfree_aligned(plan->lout2);
1139         sfree_aligned(plan->lout3);
1140     }
1141
1142 #ifdef FFT5D_THREADS
1143 #ifdef FFT5D_FFTW_THREADS
1144     /*FFTW(cleanup_threads)();*/
1145 #endif
1146 #endif
1147
1148     free(plan);
1149 }
1150
1151 /*Is this better than direct access of plan? enough data?
1152   here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
1153 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
1154     *N1=plan->N[0];
1155     *M0=plan->M[0];
1156     *K1=plan->K[0];
1157     *K0=plan->N[1];
1158
1159     *coor=plan->coor;
1160 }
1161
1162
1163 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
1164   of processor dimensions*/
1165 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
1166     MPI_Comm cart[2]={0};
1167 #ifdef GMX_MPI
1168     int size=1,prank=0;
1169     int P[2];
1170     int coor[2];
1171     int wrap[]={0,0};
1172     MPI_Comm gcart;
1173     int rdim1[] = {0,1}, rdim2[] = {1,0};
1174
1175     MPI_Comm_size(comm,&size);
1176     MPI_Comm_rank(comm,&prank);
1177
1178     if (P0==0) P0 = lfactor(size);
1179     if (size%P0!=0)
1180     {
1181         if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1182         P0 = lfactor(size);
1183     }
1184
1185     P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1186
1187     /*Difference between x-y-z regarding 2d decomposition is whether they are
1188       distributed along axis 1, 2 or both*/
1189
1190     MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1191     MPI_Cart_get(gcart,2,P,wrap,coor);
1192     MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1193     MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1194 #endif
1195     return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
1196 }
1197
1198
1199
1200 /*prints in original coordinate system of data (as the input to FFT)*/
1201 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1202     int xs[3],xl[3],xc[3],NG[3];
1203     int x,y,z,l;
1204     int *coor = plan->coor;
1205     int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1206     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1207     {
1208         ll=1;
1209     }
1210
1211     compute_offsets(plan,xs,xl,xc,NG,2);
1212     if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1213     for (z=0;z<xl[2];z++) {
1214         for(y=0;y<xl[1];y++) {
1215             if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1216             for (x=0;x<xl[0];x++) {
1217                 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1218                     real a,b;
1219                     a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1220                     if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1221                     if (!bothLocal)
1222                         b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1223                     else
1224                         b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1225                     if (plan->flags&FFT5D_DEBUG) {
1226                         printf("%f %f, ",a,b);
1227                     } else {
1228                         if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1229                             printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1230                         }
1231 /*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1232                     }
1233                 }
1234                 if (plan->flags&FFT5D_DEBUG) printf(",");
1235             }
1236             if (plan->flags&FFT5D_DEBUG) printf("\n");
1237         }
1238     }
1239
1240 }
1241