src/mdlib/fft5d.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *                This source code is part of
   4  *
   5  *                 G   R   O   M   A   C   S
   6  *
   7  *          GROningen MAchine for Chemical Simulations
   8  *
   9  *                        VERSION 4.5
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2008, The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Groningen Machine for Chemical Simulation
  34  */
  35 #ifdef HAVE_CONFIG_H
  36 #include <config.h>
  37 #endif
  38
  39
  40 #include <stdio.h>
  41 #include <stdlib.h>
  42 #include <string.h>
  43
  44 #ifdef NOGMX
  45 #define GMX_PARALLEL_ENV_INITIALIZED 1
  46 #else
  47 #include "main.h"
  48 #define GMX_PARALLEL_ENV_INITIALIZED gmx_parallel_env_initialized()
  49 #endif
  50
  51 #ifdef GMX_LIB_MPI
  52 #include <mpi.h>
  53 #endif
  54 #ifdef GMX_THREADS
  55 #include "tmpi.h"
  56 #endif
  57
  58 #ifdef FFT5D_THREADS
  59 #include <omp.h>
  60 /* requires fftw compiled with openmp */
  61 #define FFT5D_FFTW_THREADS
  62 #endif
  63
  64 #include "fft5d.h"
  65 #include <float.h>
  66 #include <math.h>
  67 #include <assert.h>
  68
  69 #ifndef __FLT_EPSILON__
  70 #define __FLT_EPSILON__ FLT_EPSILON
  71 #define __DBL_EPSILON__ DBL_EPSILON
  72 #endif
  73
  74 #ifdef NOGMX
  75 FILE* debug=0;
  76 #endif
  77
  78 #include "gmx_fatal.h"
  79
  80
  81 #ifdef GMX_FFT_FFTW3
  82 #ifdef GMX_THREADS
  83 /* none of the fftw3 calls, except execute(), are thread-safe, so
  84    we need to serialize them with this mutex. */
  85 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
  86
  87 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex);
  88 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex);
  89 #else /* GMX_THREADS */
  90 #define FFTW_LOCK
  91 #define FFTW_UNLOCK
  92 #endif /* GMX_THREADS */
  93 #endif /* GMX_FFT_FFTW3 */
  94
  95 static double fft5d_fmax(double a, double b){
  96         return (a>b)?a:b;
  97 }
  98
  99 /* largest factor smaller than sqrt */
 100 static int lfactor(int z) {
 101         int i;
 102         for (i=sqrt(z);;i--)
 103                 if (z%i==0) return i;
 104         return 1;
 105 }
 106
 107 /* largest factor */
 108 static int l2factor(int z) {
 109         int i;
 110         if (z==1) return 1;
 111         for (i=z/2;;i--)
 112                 if (z%i==0) return i;
 113         return 1;
 114 }
 115
 116 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
 117 static int lpfactor(int z) {
 118         int f = l2factor(z);
 119         if (f==1) return z;
 120         return fft5d_fmax(lpfactor(f),lpfactor(z/f));
 121 }
 122
 123 #ifndef GMX_MPI
 124 #ifdef HAVE_GETTIMEOFDAY
 125 #include <sys/time.h>
 126 double MPI_Wtime() {
 127     struct timeval tv;
 128     gettimeofday(&tv,0);
 129     return tv.tv_sec+tv.tv_usec*1e-6;
 130 }
 131 #else
 132 double MPI_Wtime() {
 133     return 0.0;
 134 }
 135 #endif
 136 #endif
 137
 138 static int vmax(int* a, int s) {
 139     int i,max=0;
 140     for (i=0;i<s;i++)
 141     {
 142         if (a[i]>max) max=a[i];
 143     }
 144     return max;
 145 }
 146
 147 /*
 148 copied here from fftgrid, because:
 149 1. function there not publically available
 150 2. not sure whether we keep fftgrid
 151 3. less dependencies for fft5d
 152
 153 Only used for non-fftw case
 154 */
 155 static void *
 156 gmx_calloc_aligned(size_t size)
 157 {
 158     void *p0,*p;
 159
 160     /*We initialize by zero for Valgrind
 161       For non-divisible case we communicate more than the data.
 162       If we don't initialize the data we communicate uninitialized data*/
 163     p0 = calloc(size+32,1);
 164
 165     if(p0 == NULL)
 166     {
 167         gmx_fatal(FARGS,"Failed to allocated %u bytes of aligned memory.",size+32);
 168     }
 169
 170     p = (void *) (((size_t) p0 + 32) & (~((size_t) 31)));
 171
 172     /* Yeah, yeah, we cannot free this pointer, but who cares... */
 173     return p;
 174 }
 175
 176
 177 /* NxMxK the size of the data
 178  * comm communicator to use for fft5d
 179  * P0 number of processor in 1st axes (can be null for automatic)
 180  * lin is allocated by fft5d because size of array is only known after planning phase */
 181 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout)
 182 {
 183
 184     int P[2],bMaster,prank[2],i;
 185     int rNG,rMG,rKG;
 186     int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
 187     int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
 188     int C[3],rC[3],nP[2];
 189     int lsize;
 190     t_complex *lin=0,*lout=0;
 191     fft5d_plan plan;
 192     int s;
 193
 194     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 195 #ifdef GMX_MPI
 196     if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != 0)
 197     {
 198         MPI_Comm_size(comm[0],&P[0]);
 199         MPI_Comm_rank(comm[0],&prank[0]);
 200     }
 201     else
 202 #endif
 203     {
 204         P[0] = 1;
 205         prank[0] = 0;
 206     }
 207 #ifdef GMX_MPI
 208     if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != 0)
 209     {
 210         MPI_Comm_size(comm[1],&P[1]);
 211         MPI_Comm_rank(comm[1],&prank[1]);
 212     }
 213     else
 214 #endif
 215     {
 216         P[1] = 1;
 217         prank[1] = 0;
 218     }
 219
 220     bMaster=(prank[0]==0&&prank[1]==0);
 221
 222
 223     if (debug)
 224     {
 225         fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
 226                 P[0],P[1],prank[0],prank[1]);
 227     }
 228
 229     if (bMaster) {
 230         if (debug)
 231             fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 232                 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
 233         /* The check below is not correct, one prime factor 11 or 13 is ok.
 234         if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 235             printf("WARNING: FFT very slow with prime factors larger 7\n");
 236             printf("Change FFT size or in case you cannot change it look at\n");
 237             printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 238         }
 239         */
 240     }
 241
 242     if (NG==0 || MG==0 || KG==0) {
 243         if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 244         return 0;
 245     }
 246
 247     rNG=NG;rMG=MG;rKG=KG;
 248
 249     if (flags&FFT5D_REALCOMPLEX) {
 250         if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
 251         else {
 252             if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
 253             else KG=KG/2+1;
 254         }
 255     }
 256
 257
 258     /*for transpose we need to know the size for each processor not only our own size*/
 259
 260     N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
 261     M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
 262     K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
 263     oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
 264     oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
 265     oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
 266
 267     for (i=0;i<P[0];i++)
 268     {
 269         #define EVENDIST
 270         #ifndef EVENDIST
 271         oN0[i]=i*ceil((double)NG/P[0]);
 272         oM0[i]=i*ceil((double)MG/P[0]);
 273         oK0[i]=i*ceil((double)KG/P[0]);
 274         #else
 275         oN0[i]=(NG*i)/P[0];
 276         oM0[i]=(MG*i)/P[0];
 277         oK0[i]=(KG*i)/P[0];
 278         #endif
 279     }
 280     for (i=0;i<P[1];i++)
 281     {
 282         #ifndef EVENDIST
 283         oN1[i]=i*ceil((double)NG/P[1]);
 284         oM1[i]=i*ceil((double)MG/P[1]);
 285         oK1[i]=i*ceil((double)KG/P[1]);
 286         #else
 287         oN1[i]=(NG*i)/P[1];
 288         oM1[i]=(MG*i)/P[1];
 289         oK1[i]=(KG*i)/P[1];
 290         #endif
 291     }
 292     for (i=0;i<P[0]-1;i++)
 293     {
 294         N0[i]=oN0[i+1]-oN0[i];
 295         M0[i]=oM0[i+1]-oM0[i];
 296         K0[i]=oK0[i+1]-oK0[i];
 297     }
 298     N0[P[0]-1]=NG-oN0[P[0]-1];
 299     M0[P[0]-1]=MG-oM0[P[0]-1];
 300     K0[P[0]-1]=KG-oK0[P[0]-1];
 301     for (i=0;i<P[1]-1;i++)
 302     {
 303         N1[i]=oN1[i+1]-oN1[i];
 304         M1[i]=oM1[i+1]-oM1[i];
 305         K1[i]=oK1[i+1]-oK1[i];
 306     }
 307     N1[P[1]-1]=NG-oN1[P[1]-1];
 308     M1[P[1]-1]=MG-oM1[P[1]-1];
 309     K1[P[1]-1]=KG-oK1[P[1]-1];
 310
 311     /* for step 1-3 the local N,M,K sizes of the transposed system
 312        C: contiguous dimension, and nP: number of processor in subcommunicator
 313        for that step */
 314
 315
 316     pM[0] = M0[prank[0]];
 317     oM[0] = oM0[prank[0]];
 318     pK[0] = K1[prank[1]];
 319     oK[0] = oK1[prank[1]];
 320     C[0] = NG;
 321     rC[0] = rNG;
 322     if (!(flags&FFT5D_ORDER_YZ)) {
 323         N[0] = vmax(N1,P[1]);
 324         M[0] = M0[prank[0]];
 325         K[0] = vmax(K1,P[1]);
 326         pN[0] = N1[prank[1]];
 327         iNout[0] = N1;
 328         oNout[0] = oN1;
 329         nP[0] = P[1];
 330         C[1] = KG;
 331         rC[1] =rKG;
 332         N[1] = vmax(K0,P[0]);
 333         pN[1] = K0[prank[0]];
 334         iNin[1] = K1;
 335         oNin[1] = oK1;
 336         iNout[1] = K0;
 337         oNout[1] = oK0;
 338         M[1] = vmax(M0,P[0]);
 339         pM[1] = M0[prank[0]];
 340         oM[1] = oM0[prank[0]];
 341         K[1] = N1[prank[1]];
 342         pK[1] = N1[prank[1]];
 343         oK[1] = oN1[prank[1]];
 344         nP[1] = P[0];
 345         C[2] = MG;
 346         rC[2] = rMG;
 347         iNin[2] = M0;
 348         oNin[2] = oM0;
 349         M[2] = vmax(K0,P[0]);
 350         pM[2] = K0[prank[0]];
 351         oM[2] = oK0[prank[0]];
 352         K[2] = vmax(N1,P[1]);
 353         pK[2] = N1[prank[1]];
 354         oK[2] = oN1[prank[1]];
 355         free(N0); free(oN0); /*these are not used for this order*/
 356         free(M1); free(oM1); /*the rest is freed in destroy*/
 357     } else {
 358         N[0] = vmax(N0,P[0]);
 359         M[0] = vmax(M0,P[0]);
 360         K[0] = K1[prank[1]];
 361         pN[0] = N0[prank[0]];
 362         iNout[0] = N0;
 363         oNout[0] = oN0;
 364         nP[0] = P[0];
 365         C[1] = MG;
 366         rC[1] =rMG;
 367         N[1] = vmax(M1,P[1]);
 368         pN[1] = M1[prank[1]];
 369         iNin[1] = M0;
 370         oNin[1] = oM0;
 371         iNout[1] = M1;
 372         oNout[1] = oM1;
 373         M[1] = N0[prank[0]];
 374         pM[1] = N0[prank[0]];
 375         oM[1] = oN0[prank[0]];
 376         K[1] = vmax(K1,P[1]);
 377         pK[1] = K1[prank[1]];
 378         oK[1] = oK1[prank[1]];
 379         nP[1] = P[1];
 380         C[2] = KG;
 381         rC[2] = rKG;
 382         iNin[2] = K1;
 383         oNin[2] = oK1;
 384         M[2] = vmax(N0,P[0]);
 385         pM[2] = N0[prank[0]];
 386         oM[2] = oN0[prank[0]];
 387         K[2] = vmax(M1,P[1]);
 388         pK[2] = M1[prank[1]];
 389         oK[2] = oM1[prank[1]];
 390         free(N1); free(oN1); /*these are not used for this order*/
 391         free(K0); free(oK0); /*the rest is freed in destroy*/
 392     }
 393
 394     /*
 395       Difference between x-y-z regarding 2d decomposition is whether they are
 396       distributed along axis 1, 2 or both
 397     */
 398
 399     /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 400     lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
 401     /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 402     if (!(flags&FFT5D_NOMALLOC)) {
 403         lin = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
 404         lout = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
 405     } else {
 406         lin = *rlin;
 407         lout = *rlout;
 408     }
 409
 410     plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
 411
 412
 413 #ifdef FFT5D_THREADS
 414 #ifdef FFT5D_FFTW_THREADS
 415     FFTW(init_threads)();
 416     int nthreads;
 417     #pragma omp parallel
 418     {
 419         #pragma omp master
 420         {
 421             nthreads = omp_get_num_threads();
 422         }
 423     }
 424     if (prank[0] == 0 && prank[1] == 0)
 425     {
 426         printf("Running fftw on %d threads\n",nthreads);
 427     }
 428     FFTW(plan_with_nthreads)(nthreads);
 429 #endif
 430 #endif
 431
 432 #ifdef GMX_FFT_FFTW3  /*if not FFTW - then we don't do a 3d plan but insead only 1D plans */
 433     if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1))) {  /*don't do 3d plan in parallel or if in_place requested */
 434             int fftwflags=FFTW_DESTROY_INPUT;
 435             FFTW(iodim) dims[3];
 436             int inNG=NG,outMG=MG,outKG=KG;
 437
 438             FFTW_LOCK;
 439             if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
 440             if (flags&FFT5D_REALCOMPLEX) {
 441                 if (!(flags&FFT5D_BACKWARD)) {  /*input pointer is not complex*/
 442                     inNG*=2;
 443                 } else {                        /*output pointer is not complex*/
 444                     if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
 445                     else outKG*=2;
 446                 }
 447             }
 448
 449             if (!(flags&FFT5D_BACKWARD)) {
 450                 dims[0].n  = KG;
 451                 dims[1].n  = MG;
 452                 dims[2].n  = rNG;
 453
 454                 dims[0].is = inNG*MG;     /*N M K*/
 455                 dims[1].is = inNG;
 456                 dims[2].is = 1;
 457                 if (!(flags&FFT5D_ORDER_YZ)) {
 458                     dims[0].os = MG;       /*M K N*/
 459                     dims[1].os = 1;
 460                     dims[2].os = MG*KG;
 461                 } else  {
 462                     dims[0].os = 1;       /*K N M*/
 463                     dims[1].os = KG*NG;
 464                     dims[2].os = KG;
 465                 }
 466             } else {
 467                 if (!(flags&FFT5D_ORDER_YZ)) {
 468                     dims[0].n  = NG;
 469                     dims[1].n  = KG;
 470                     dims[2].n  = rMG;
 471
 472                     dims[0].is = 1;
 473                     dims[1].is = NG*MG;
 474                     dims[2].is = NG;
 475
 476                     dims[0].os = outMG*KG;
 477                     dims[1].os = outMG;
 478                     dims[2].os = 1;
 479                 } else {
 480                     dims[0].n  = MG;
 481                     dims[1].n  = NG;
 482                     dims[2].n  = rKG;
 483
 484                     dims[0].is = NG;
 485                     dims[1].is = 1;
 486                     dims[2].is = NG*MG;
 487
 488                     dims[0].os = outKG*NG;
 489                     dims[1].os = outKG;
 490                     dims[2].os = 1;
 491                 }
 492             }
 493             if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
 494                 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 495                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 496                                      (real*)lin, (FFTW(complex) *)lout,
 497                                      /*flags*/ fftwflags);
 498             } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
 499                 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 500                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 501                                      (FFTW(complex) *)lin, (real*)lout,
 502                                      /*flags*/ fftwflags);
 503             } else {
 504                 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 505                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 506                                      (FFTW(complex) *)lin, (FFTW(complex) *)lout,
 507                                      /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
 508             }
 509             FFTW_UNLOCK;
 510     }
 511     if (!plan->p3d) {  /* for decomposition and if 3d plan did not work */
 512 #endif /* GMX_FFT_FFTW3 */
 513         for (s=0;s<3;s++) {
 514             if (debug)
 515             {
 516                 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 517                         s,rC[s],M[s],pK[s],C[s],lsize);
 518             }
 519             if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
 520                 gmx_fft_init_many_1d_real( &plan->p1d[s], rC[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 521             } else {
 522                 gmx_fft_init_many_1d     ( &plan->p1d[s],  C[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 523             }
 524         }
 525 #ifdef GMX_FFT_FFTW3
 526     }
 527 #endif
 528     if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
 529         plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
 530     } else {
 531         plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
 532     }
 533 #ifdef FFT5D_MPI_TRANSPOSE
 534     FFTW_LOCK
 535     for (s=0;s<2;s++) {
 536         if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
 537             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
 538         else
 539             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
 540     }
 541     FFTW_UNLOCK
 542 #endif
 543
 544
 545     plan->lin=lin;
 546     plan->lout=lout;
 547
 548     plan->NG=NG;plan->MG=MG;plan->KG=KG;
 549
 550     for (s=0;s<3;s++) {
 551         plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
 552         plan->oM[s]=oM[s];plan->oK[s]=oK[s];
 553         plan->C[s]=C[s];plan->rC[s]=rC[s];
 554         plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
 555     }
 556     for (s=0;s<2;s++) {
 557         plan->P[s]=nP[s];plan->coor[s]=prank[s];
 558     }
 559
 560 /*    plan->fftorder=fftorder;
 561     plan->direction=direction;
 562     plan->realcomplex=realcomplex;
 563 */
 564     plan->flags=flags;
 565     *rlin=lin;
 566     *rlout=lout;
 567     return plan;
 568 }
 569
 570
 571 enum order {
 572     XYZ,
 573     XZY,
 574     YXZ,
 575     YZX,
 576     ZXY,
 577     ZYX
 578 };
 579
 580
 581
 582 /*here x,y,z and N,M,K is in rotated coordinate system!!
 583   x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
 584   maxN,maxM,maxK is max size of local data
 585   pN, pM, pK is local size specific to current processor (only different to max if not divisible)
 586   NG, MG, KG is size of global data*/
 587 static void splitaxes(t_complex* lout,const t_complex* lin,
 588                       int maxN,int maxM,int maxK, int pN, int pM, int pK,
 589                       int P,int NG,int *N, int* oN)
 590 {
 591     int x,y,z,i;
 592     int in_i,out_i,in_z,out_z,in_y,out_y;
 593
 594 #ifdef FFT5D_THREADS
 595     int zi;
 596
 597     /* In the thread parallel case we want to loop over z and i
 598      * in a single for loop to allow for better load balancing.
 599      */
 600 #pragma omp parallel for private(z,in_z,out_z,i,in_i,out_i,y,in_y,out_y,x) schedule(static)
 601     for (zi=0; zi<pK*P; zi++)
 602     {
 603         z = zi/P;
 604         i = zi - z*P;
 605 #else
 606     for (z=0; z<pK; z++) /*3. z l*/
 607     {
 608 #endif
 609         in_z  = z*maxN*maxM;
 610         out_z = z*NG*pM;
 611
 612 #ifndef FFT5D_THREADS
 613         for (i=0; i<P; i++) /*index cube along long axis*/
 614 #endif
 615         {
 616             in_i  = in_z  + i*maxN*maxM*maxK;
 617             out_i = out_z + oN[i];
 618             for (y=0;y<pM;y++) { /*2. y k*/
 619                 in_y  = in_i  + y*maxN;
 620                 out_y = out_i + y*NG;
 621                 for (x=0;x<N[i];x++) { /*1. x j*/
 622                     lout[in_y+x] = lin[out_y+x];
 623                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
 624                     /*before split data contiguos - thus if different processor get different amount oN is different*/
 625                 }
 626             }
 627         }
 628     }
 629 }
 630
 631 /*make axis contiguous again (after AllToAll) and also do local transpose*/
 632 /*transpose mayor and major dimension
 633   variables see above
 634   the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
 635   N,M,K local dimensions
 636   KG global size*/
 637 static void joinAxesTrans13(t_complex* lin,const t_complex* lout,
 638                             int maxN,int maxM,int maxK,int pN, int pM, int pK,
 639                             int P,int KG, int* K, int* oK)
 640 {
 641     int i,x,y,z;
 642     int in_i,out_i,in_x,out_x,in_z,out_z;
 643
 644 #ifdef FFT5D_THREADS
 645     int xi;
 646
 647     /* In the thread parallel case we want to loop over x and i
 648      * in a single for loop to allow for better load balancing.
 649      */
 650 #pragma omp parallel for private(x,in_x,out_x,i,in_i,out_i,z,in_z,out_z,y) schedule(static)
 651     for (xi=0; xi<pN*P; xi++)
 652     {
 653         x = xi/P;
 654         i = xi - x*P;
 655 #else
 656     for (x=0;x<pN;x++) /*1.j*/
 657     {
 658 #endif
 659         in_x  = x*KG*pM;
 660         out_x = x;
 661
 662 #ifndef FFT5D_THREADS
 663         for (i=0;i<P;i++) /*index cube along long axis*/
 664 #endif
 665         {
 666             in_i  = in_x  + oK[i];
 667             out_i = out_x + i*maxM*maxN*maxK;
 668             for (z=0;z<K[i];z++) /*3.l*/
 669             {
 670                 in_z  = in_i  + z;
 671                 out_z = out_i + z*maxM*maxN;
 672                 for (y=0;y<pM;y++) { /*2.k*/
 673                     lin[in_z+y*KG] = lout[out_z+y*maxN];
 674                 }
 675             }
 676         }
 677     }
 678 }
 679
 680 /*make axis contiguous again (after AllToAll) and also do local transpose
 681   tranpose mayor and middle dimension
 682   variables see above
 683   the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
 684   N,M,K local size
 685   MG, global size*/
 686 static void joinAxesTrans12(t_complex* lin,const t_complex* lout,int maxN,int maxM,int maxK,int pN, int pM, int pK,
 687                 int P,int MG, int* M, int* oM) {
 688     int i,z,y,x;
 689     int in_i,out_i,in_z,out_z,in_x,out_x;
 690
 691 #ifdef FFT5D_THREADS
 692     int zi;
 693
 694     /* In the thread parallel case we want to loop over z and i
 695      * in a single for loop to allow for better load balancing.
 696      */
 697 #pragma omp parallel for private(i,in_i,out_i,z,in_z,out_z,in_x,out_x,x,y) schedule(static)
 698     for (zi=0; zi<pK*P; zi++)
 699     {
 700         z = zi/P;
 701         i = zi - z*P;
 702 #else
 703     for (z=0; z<pK; z++)
 704     {
 705 #endif
 706         in_z  = z*MG*pN;
 707         out_z = z*maxM*maxN;
 708
 709 #ifndef FFT5D_THREADS
 710         for (i=0; i<P; i++) /*index cube along long axis*/
 711 #endif
 712         {
 713             in_i  = in_z  + oM[i];
 714             out_i = out_z + i*maxM*maxN*maxK;
 715             for (x=0;x<pN;x++) {
 716                 in_x  = in_i  + x*MG;
 717                 out_x = out_i + x;
 718                 for (y=0;y<M[i];y++) {
 719                     lin[in_x+y] = lout[out_x+y*maxN];
 720                 }
 721             }
 722         }
 723     }
 724 }
 725
 726
 727 static void rotate(int x[]) {
 728     int t=x[0];
 729 /*    x[0]=x[2];
 730     x[2]=x[1];
 731     x[1]=t;*/
 732     x[0]=x[1];
 733     x[1]=x[2];
 734     x[2]=t;
 735 }
 736
 737 /*compute the offset to compare or print transposed local data in original input coordinates
 738   xs matrix dimension size, xl dimension length, xc decomposition offset
 739   s: step in computation = number of transposes*/
 740 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
 741 /*    int direction = plan->direction;
 742     int fftorder = plan->fftorder;*/
 743
 744     int o;
 745     int pos[3],i;
 746     int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
 747         *C=plan->C, *rC=plan->rC;
 748
 749     NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
 750
 751     if (!(plan->flags&FFT5D_ORDER_YZ)) {
 752         switch (s) {
 753         case 0: o=XYZ; break;
 754         case 1: o=ZYX; break;
 755         case 2: o=YZX; break;
 756         default: assert(0);
 757         }
 758     } else {
 759         switch (s) {
 760         case 0: o=XYZ; break;
 761         case 1: o=YXZ; break;
 762         case 2: o=ZXY; break;
 763         default: assert(0);
 764         }
 765     }
 766
 767     switch (o) {
 768         case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
 769         case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
 770         case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
 771         case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
 772         case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
 773         case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
 774     }
 775     /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 776
 777     /*xs, xl give dimension size and data length in local transposed coordinate system
 778       for 0(/1/2): x(/y/z) in original coordinate system*/
 779     for (i=0;i<3;i++) {
 780         switch (pos[i]) {
 781         case 1: xs[i]=1;         xc[i]=0;     xl[i]=C[s];break;
 782         case 2: xs[i]=C[s];      xc[i]=oM[s]; xl[i]=pM[s];break;
 783         case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
 784         }
 785     }
 786     /*input order is different for test program to match FFTW order
 787       (important for complex to real)*/
 788     if (plan->flags&FFT5D_BACKWARD) {
 789         rotate(xs);
 790         rotate(xl);
 791         rotate(xc);
 792         rotate(NG);
 793         if (plan->flags&FFT5D_ORDER_YZ) {
 794             rotate(xs);
 795             rotate(xl);
 796             rotate(xc);
 797             rotate(NG);
 798         }
 799     }
 800     if ((plan->flags&FFT5D_REALCOMPLEX) && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || ((plan->flags&FFT5D_BACKWARD) && s==2))) {
 801         xl[0] = rC[s];
 802     }
 803 }
 804
 805 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
 806     int x,y,z,l;
 807     int *coor = plan->coor;
 808     int xs[3],xl[3],xc[3],NG[3];
 809     int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
 810     compute_offsets(plan,xs,xl,xc,NG,s);
 811     fprintf(debug,txt,coor[0],coor[1],s);
 812     /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 813     for(z=0;z<xl[2];z++) {
 814         for(y=0;y<xl[1];y++) {
 815             fprintf(debug,"%d %d: ",coor[0],coor[1]);
 816             for (x=0;x<xl[0];x++) {
 817                 for (l=0;l<ll;l++) {
 818                     fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 819                 }
 820                 fprintf(debug,",");
 821             }
 822             fprintf(debug,"\n");
 823         }
 824     }
 825 }
 826
 827 void fft5d_execute(fft5d_plan plan,fft5d_time times) {
 828     t_complex *lin = plan->lin;
 829     t_complex *lout = plan->lout;
 830
 831     gmx_fft_t *p1d=plan->p1d;
 832 #ifdef FFT5D_MPI_TRANSPOSE
 833     FFTW(plan) *mpip=plan->mpip;
 834 #endif
 835 #ifdef GMX_MPI
 836     MPI_Comm *cart=plan->cart;
 837 #endif
 838
 839     double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
 840     int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
 841         *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
 842     int s=0;
 843
 844
 845 #ifdef GMX_FFT_FFTW3
 846     if (plan->p3d) {
 847         if (times!=0)
 848             time=MPI_Wtime();
 849         FFTW(execute)(plan->p3d);
 850         if (times!=0)
 851             times->fft+=MPI_Wtime()-time;
 852         return;
 853     }
 854 #endif
 855
 856     /*lin: x,y,z*/
 857     if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: copy in lin\n", s, plan);
 858     for (s=0;s<2;s++) {
 859         if (times!=0)
 860             time=MPI_Wtime();
 861
 862         if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0) {
 863             gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
 864         } else {
 865             gmx_fft_many_1d(     p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin,lout);
 866         }
 867         if (times!=0)
 868             time_fft+=MPI_Wtime()-time;
 869
 870         if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 871
 872 #ifdef GMX_MPI
 873         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] !=0 && P[s]>1 )
 874         {
 875             if (times!=0)
 876                 time=MPI_Wtime();
 877             /*prepare for AllToAll
 878               1. (most outer) axes (x) is split into P[s] parts of size N[s]
 879               for sending*/
 880             splitaxes(lin,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s]);
 881
 882             if (times!=0)
 883             {
 884                 time_local+=MPI_Wtime()-time;
 885
 886                 /*send, recv*/
 887                 time=MPI_Wtime();
 888             }
 889
 890 #ifdef FFT5D_MPI_TRANSPOSE
 891             FFTW(execute)(mpip[s]);
 892 #else
 893             if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
 894                 MPI_Alltoall(lin,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 895             else
 896                 MPI_Alltoall(lin,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 897 #endif /*FFT5D_MPI_TRANSPOSE*/
 898             if (times!=0)
 899                 time_mpi[s]=MPI_Wtime()-time;
 900         }
 901 #endif /*GMX_MPI*/
 902
 903
 904         if (times!=0)
 905             time=MPI_Wtime();
 906         /*bring back in matrix form
 907           thus make  new 1. axes contiguos
 908           also local transpose 1 and 2/3 */
 909         if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
 910             joinAxesTrans13(lin,lout,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
 911         else
 912             joinAxesTrans12(lin,lout,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
 913         if (times!=0)
 914             time_local+=MPI_Wtime()-time;
 915
 916         if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
 917
 918         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
 919     }
 920
 921     if (times!=0)
 922         time=MPI_Wtime();
 923     if (plan->flags&FFT5D_INPLACE) lout=lin;
 924     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
 925         gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
 926     } else {
 927         gmx_fft_many_1d(     p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin,lout);
 928     }
 929
 930     if (times!=0)
 931         time_fft+=MPI_Wtime()-time;
 932     if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 933     /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
 934
 935     if (times!=0) {
 936         times->fft+=time_fft;
 937         times->local+=time_local;
 938         times->mpi2+=time_mpi[1];
 939         times->mpi1+=time_mpi[0];
 940     }
 941 }
 942
 943 void fft5d_destroy(fft5d_plan plan) {
 944     int s;
 945     for (s=0;s<3;s++) {
 946         gmx_many_fft_destroy(plan->p1d[s]);
 947         if (plan->iNin[s]) {
 948             free(plan->iNin[s]);
 949             plan->iNin[s]=0;
 950         }
 951         if (plan->oNin[s]) {
 952             free(plan->oNin[s]);
 953             plan->oNin[s]=0;
 954         }
 955         if (plan->iNout[s]) {
 956             free(plan->iNout[s]);
 957             plan->iNout[s]=0;
 958         }
 959         if (plan->oNout[s]) {
 960             free(plan->oNout[s]);
 961             plan->oNout[s]=0;
 962         }
 963     }
 964 #ifdef GMX_FFT_FFTW3
 965     FFTW_LOCK;
 966 #ifdef FFT5D_MPI_TRANSPOS
 967     for (s=0;s<2;s++)
 968         FFTW(destroy_plan)(plan->mpip[s]);
 969 #endif /* FFT5D_MPI_TRANSPOS */
 970 #endif /* GMX_FFT_FFTW3 */
 971
 972     /*We can't free lin/lout here - is allocated by gmx_calloc_aligned which can't be freed*/
 973
 974
 975 #ifdef FFT5D_THREADS
 976 #ifdef FFT5D_FFTW_THREADS
 977     FFTW(cleanup_threads)();
 978 #endif
 979 #endif
 980
 981     free(plan);
 982 }
 983
 984 /*Is this better than direct access of plan? enough data?
 985   here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
 986 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
 987     *N1=plan->N[0];
 988     *M0=plan->M[0];
 989     *K1=plan->K[0];
 990     *K0=plan->N[1];
 991
 992     *coor=plan->coor;
 993 }
 994
 995
 996 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
 997   of processor dimensions*/
 998 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout) {
 999     MPI_Comm cart[2]={0};
1000 #ifdef GMX_MPI
1001     int size=1,prank=0;
1002     int P[2];
1003     int coor[2];
1004     int wrap[]={0,0};
1005     MPI_Comm gcart;
1006     int rdim1[] = {0,1}, rdim2[] = {1,0};
1007
1008     MPI_Comm_size(comm,&size);
1009     MPI_Comm_rank(comm,&prank);
1010
1011     if (P0==0) P0 = lfactor(size);
1012     if (size%P0!=0) {
1013         if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1014         P0 = lfactor(size);
1015     }
1016
1017     P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1018
1019     /*Difference between x-y-z regarding 2d decomposition is whether they are
1020       distributed along axis 1, 2 or both*/
1021
1022     MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1023     MPI_Cart_get(gcart,2,P,wrap,coor);
1024     MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1025     MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1026 #endif
1027     return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout);
1028 }
1029
1030
1031
1032 /*prints in original coordinate system of data (as the input to FFT)*/
1033 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1034     int xs[3],xl[3],xc[3],NG[3];
1035     int x,y,z,l;
1036     int *coor = plan->coor;
1037     int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1038     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD))
1039     {
1040         ll=1;
1041     }
1042
1043     compute_offsets(plan,xs,xl,xc,NG,2);
1044     if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1045     for (z=0;z<xl[2];z++) {
1046         for(y=0;y<xl[1];y++) {
1047             if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1048             for (x=0;x<xl[0];x++) {
1049                 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1050                     real a,b;
1051                     a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1052                     if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1053                     if (!bothLocal)
1054                         b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1055                     else
1056                         b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1057                     if (plan->flags&FFT5D_DEBUG) {
1058                         printf("%f %f, ",a,b);
1059                     } else {
1060                         if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1061                             printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1062                         }
1063 /*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1064                     }
1065                 }
1066                 if (plan->flags&FFT5D_DEBUG) printf(",");
1067             }
1068             if (plan->flags&FFT5D_DEBUG) printf("\n");
1069         }
1070     }
1071
1072 }
1073