src/gromacs/mdlib/pme.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  *                        VERSION 3.2.0
  11  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  12  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  13  * Copyright (c) 2001-2004, The GROMACS development team,
  14  * check out http://www.gromacs.org for more information.
  15
  16  * This program is free software; you can redistribute it and/or
  17  * modify it under the terms of the GNU General Public License
  18  * as published by the Free Software Foundation; either version 2
  19  * of the License, or (at your option) any later version.
  20  *
  21  * If you want to redistribute modifications, please consider that
  22  * scientific software is very special. Version control is crucial -
  23  * bugs must be traceable. We will be happy to consider code for
  24  * inclusion in the official distribution, but derived work must not
  25  * be called official GROMACS. Details are found in the README & COPYING
  26  * files - if they are missing, get the official version at www.gromacs.org.
  27  *
  28  * To help us fund GROMACS development, we humbly ask that you cite
  29  * the papers on the package - you can find them in the top README file.
  30  *
  31  * For more info, check our website at http://www.gromacs.org
  32  *
  33  * And Hey:
  34  * GROwing Monsters And Cloning Shrimps
  35  */
  36 /* IMPORTANT FOR DEVELOPERS:
  37  *
  38  * Triclinic pme stuff isn't entirely trivial, and we've experienced
  39  * some bugs during development (many of them due to me). To avoid
  40  * this in the future, please check the following things if you make
  41  * changes in this file:
  42  *
  43  * 1. You should obtain identical (at least to the PME precision)
  44  *    energies, forces, and virial for
  45  *    a rectangular box and a triclinic one where the z (or y) axis is
  46  *    tilted a whole box side. For instance you could use these boxes:
  47  *
  48  *    rectangular       triclinic
  49  *     2  0  0           2  0  0
  50  *     0  2  0           0  2  0
  51  *     0  0  6           2  2  6
  52  *
  53  * 2. You should check the energy conservation in a triclinic box.
  54  *
  55  * It might seem an overkill, but better safe than sorry.
  56  * /Erik 001109
  57  */
  58
  59 #ifdef HAVE_CONFIG_H
  60 #include <config.h>
  61 #endif
  62
  63 #include "gromacs/fft/parallel_3dfft.h"
  64 #include "gromacs/utility/gmxmpi.h"
  65
  66 #include <stdio.h>
  67 #include <string.h>
  68 #include <math.h>
  69 #include <assert.h>
  70 #include "typedefs.h"
  71 #include "txtdump.h"
  72 #include "vec.h"
  73 #include "gmxcomplex.h"
  74 #include "smalloc.h"
  75 #include "futil.h"
  76 #include "coulomb.h"
  77 #include "gmx_fatal.h"
  78 #include "pme.h"
  79 #include "network.h"
  80 #include "physics.h"
  81 #include "nrnb.h"
  82 #include "gmx_wallcycle.h"
  83 #include "pdbio.h"
  84 #include "gmx_cyclecounter.h"
  85 #include "gmx_omp.h"
  86 #include "macros.h"
  87
  88 /* Single precision, with SSE2 or higher available */
  89 #if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
  90
  91 #include "gmx_x86_simd_single.h"
  92
  93 #define PME_SSE
  94 /* Some old AMD processors could have problems with unaligned loads+stores */
  95 #ifndef GMX_FAHCORE
  96 #define PME_SSE_UNALIGNED
  97 #endif
  98 #endif
  99
 100 #define DFT_TOL 1e-7
 101 /* #define PRT_FORCE */
 102 /* conditions for on the fly time-measurement */
 103 /* #define TAKETIME (step > 1 && timesteps < 10) */
 104 #define TAKETIME FALSE
 105
 106 /* #define PME_TIME_THREADS */
 107
 108 #ifdef GMX_DOUBLE
 109 #define mpi_type MPI_DOUBLE
 110 #else
 111 #define mpi_type MPI_FLOAT
 112 #endif
 113
 114 /* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
 115 #define GMX_CACHE_SEP 64
 116
 117 /* We only define a maximum to be able to use local arrays without allocation.
 118  * An order larger than 12 should never be needed, even for test cases.
 119  * If needed it can be changed here.
 120  */
 121 #define PME_ORDER_MAX 12
 122
 123 /* Internal datastructures */
 124 typedef struct {
 125     int send_index0;
 126     int send_nindex;
 127     int recv_index0;
 128     int recv_nindex;
 129     int recv_size;   /* Receive buffer width, used with OpenMP */
 130 } pme_grid_comm_t;
 131
 132 typedef struct {
 133 #ifdef GMX_MPI
 134     MPI_Comm         mpi_comm;
 135 #endif
 136     int              nnodes, nodeid;
 137     int             *s2g0;
 138     int             *s2g1;
 139     int              noverlap_nodes;
 140     int             *send_id, *recv_id;
 141     int              send_size; /* Send buffer width, used with OpenMP */
 142     pme_grid_comm_t *comm_data;
 143     real            *sendbuf;
 144     real            *recvbuf;
 145 } pme_overlap_t;
 146
 147 typedef struct {
 148     int *n;      /* Cumulative counts of the number of particles per thread */
 149     int  nalloc; /* Allocation size of i */
 150     int *i;      /* Particle indices ordered on thread index (n) */
 151 } thread_plist_t;
 152
 153 typedef struct {
 154     int      *thread_one;
 155     int       n;
 156     int      *ind;
 157     splinevec theta;
 158     real     *ptr_theta_z;
 159     splinevec dtheta;
 160     real     *ptr_dtheta_z;
 161 } splinedata_t;
 162
 163 typedef struct {
 164     int      dimind;        /* The index of the dimension, 0=x, 1=y */
 165     int      nslab;
 166     int      nodeid;
 167 #ifdef GMX_MPI
 168     MPI_Comm mpi_comm;
 169 #endif
 170
 171     int     *node_dest;     /* The nodes to send x and q to with DD */
 172     int     *node_src;      /* The nodes to receive x and q from with DD */
 173     int     *buf_index;     /* Index for commnode into the buffers */
 174
 175     int      maxshift;
 176
 177     int      npd;
 178     int      pd_nalloc;
 179     int     *pd;
 180     int     *count;         /* The number of atoms to send to each node */
 181     int    **count_thread;
 182     int     *rcount;        /* The number of atoms to receive */
 183
 184     int      n;
 185     int      nalloc;
 186     rvec    *x;
 187     real    *q;
 188     rvec    *f;
 189     gmx_bool bSpread;       /* These coordinates are used for spreading */
 190     int      pme_order;
 191     ivec    *idx;
 192     rvec    *fractx;            /* Fractional coordinate relative to the
 193                                  * lower cell boundary
 194                                  */
 195     int             nthread;
 196     int            *thread_idx; /* Which thread should spread which charge */
 197     thread_plist_t *thread_plist;
 198     splinedata_t   *spline;
 199 } pme_atomcomm_t;
 200
 201 #define FLBS  3
 202 #define FLBSZ 4
 203
 204 typedef struct {
 205     ivec  ci;     /* The spatial location of this grid         */
 206     ivec  n;      /* The used size of *grid, including order-1 */
 207     ivec  offset; /* The grid offset from the full node grid   */
 208     int   order;  /* PME spreading order                       */
 209     ivec  s;      /* The allocated size of *grid, s >= n       */
 210     real *grid;   /* The grid local thread, size n             */
 211 } pmegrid_t;
 212
 213 typedef struct {
 214     pmegrid_t  grid;         /* The full node grid (non thread-local)            */
 215     int        nthread;      /* The number of threads operating on this grid     */
 216     ivec       nc;           /* The local spatial decomposition over the threads */
 217     pmegrid_t *grid_th;      /* Array of grids for each thread                   */
 218     real      *grid_all;     /* Allocated array for the grids in *grid_th        */
 219     int      **g2t;          /* The grid to thread index                         */
 220     ivec       nthread_comm; /* The number of threads to communicate with        */
 221 } pmegrids_t;
 222
 223
 224 typedef struct {
 225 #ifdef PME_SSE
 226     /* Masks for SSE aligned spreading and gathering */
 227     __m128 mask_SSE0[6], mask_SSE1[6];
 228 #else
 229     int    dummy; /* C89 requires that struct has at least one member */
 230 #endif
 231 } pme_spline_work_t;
 232
 233 typedef struct {
 234     /* work data for solve_pme */
 235     int      nalloc;
 236     real *   mhx;
 237     real *   mhy;
 238     real *   mhz;
 239     real *   m2;
 240     real *   denom;
 241     real *   tmp1_alloc;
 242     real *   tmp1;
 243     real *   eterm;
 244     real *   m2inv;
 245
 246     real     energy;
 247     matrix   vir;
 248 } pme_work_t;
 249
 250 typedef struct gmx_pme {
 251     int           ndecompdim; /* The number of decomposition dimensions */
 252     int           nodeid;     /* Our nodeid in mpi->mpi_comm */
 253     int           nodeid_major;
 254     int           nodeid_minor;
 255     int           nnodes;    /* The number of nodes doing PME */
 256     int           nnodes_major;
 257     int           nnodes_minor;
 258
 259     MPI_Comm      mpi_comm;
 260     MPI_Comm      mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
 261 #ifdef GMX_MPI
 262     MPI_Datatype  rvec_mpi;      /* the pme vector's MPI type */
 263 #endif
 264
 265     gmx_bool   bUseThreads;   /* Does any of the PME ranks have nthread>1 ?  */
 266     int        nthread;       /* The number of threads doing PME on our rank */
 267
 268     gmx_bool   bPPnode;       /* Node also does particle-particle forces */
 269     gmx_bool   bFEP;          /* Compute Free energy contribution */
 270     int        nkx, nky, nkz; /* Grid dimensions */
 271     gmx_bool   bP3M;          /* Do P3M: optimize the influence function */
 272     int        pme_order;
 273     real       epsilon_r;
 274
 275     pmegrids_t pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
 276     pmegrids_t pmegridB;
 277     /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
 278     int        pmegrid_nx, pmegrid_ny, pmegrid_nz;
 279     /* pmegrid_nz might be larger than strictly necessary to ensure
 280      * memory alignment, pmegrid_nz_base gives the real base size.
 281      */
 282     int     pmegrid_nz_base;
 283     /* The local PME grid starting indices */
 284     int     pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
 285
 286     /* Work data for spreading and gathering */
 287     pme_spline_work_t    *spline_work;
 288
 289     real                 *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
 290     real                 *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
 291     int                   fftgrid_nx, fftgrid_ny, fftgrid_nz;
 292
 293     t_complex            *cfftgridA;  /* Grids for complex FFT data */
 294     t_complex            *cfftgridB;
 295     int                   cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
 296
 297     gmx_parallel_3dfft_t  pfft_setupA;
 298     gmx_parallel_3dfft_t  pfft_setupB;
 299
 300     int                  *nnx, *nny, *nnz;
 301     real                 *fshx, *fshy, *fshz;
 302
 303     pme_atomcomm_t        atc[2]; /* Indexed on decomposition index */
 304     matrix                recipbox;
 305     splinevec             bsp_mod;
 306
 307     pme_overlap_t         overlap[2]; /* Indexed on dimension, 0=x, 1=y */
 308
 309     pme_atomcomm_t        atc_energy; /* Only for gmx_pme_calc_energy */
 310
 311     rvec                 *bufv;       /* Communication buffer */
 312     real                 *bufr;       /* Communication buffer */
 313     int                   buf_nalloc; /* The communication buffer size */
 314
 315     /* thread local work data for solve_pme */
 316     pme_work_t *work;
 317
 318     /* Work data for PME_redist */
 319     gmx_bool redist_init;
 320     int *    scounts;
 321     int *    rcounts;
 322     int *    sdispls;
 323     int *    rdispls;
 324     int *    sidx;
 325     int *    idxa;
 326     real *   redist_buf;
 327     int      redist_buf_nalloc;
 328
 329     /* Work data for sum_qgrid */
 330     real *   sum_qgrid_tmp;
 331     real *   sum_qgrid_dd_tmp;
 332 } t_gmx_pme;
 333
 334
 335 static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
 336                                    int start, int end, int thread)
 337 {
 338     int             i;
 339     int            *idxptr, tix, tiy, tiz;
 340     real           *xptr, *fptr, tx, ty, tz;
 341     real            rxx, ryx, ryy, rzx, rzy, rzz;
 342     int             nx, ny, nz;
 343     int             start_ix, start_iy, start_iz;
 344     int            *g2tx, *g2ty, *g2tz;
 345     gmx_bool        bThreads;
 346     int            *thread_idx = NULL;
 347     thread_plist_t *tpl        = NULL;
 348     int            *tpl_n      = NULL;
 349     int             thread_i;
 350
 351     nx  = pme->nkx;
 352     ny  = pme->nky;
 353     nz  = pme->nkz;
 354
 355     start_ix = pme->pmegrid_start_ix;
 356     start_iy = pme->pmegrid_start_iy;
 357     start_iz = pme->pmegrid_start_iz;
 358
 359     rxx = pme->recipbox[XX][XX];
 360     ryx = pme->recipbox[YY][XX];
 361     ryy = pme->recipbox[YY][YY];
 362     rzx = pme->recipbox[ZZ][XX];
 363     rzy = pme->recipbox[ZZ][YY];
 364     rzz = pme->recipbox[ZZ][ZZ];
 365
 366     g2tx = pme->pmegridA.g2t[XX];
 367     g2ty = pme->pmegridA.g2t[YY];
 368     g2tz = pme->pmegridA.g2t[ZZ];
 369
 370     bThreads = (atc->nthread > 1);
 371     if (bThreads)
 372     {
 373         thread_idx = atc->thread_idx;
 374
 375         tpl   = &atc->thread_plist[thread];
 376         tpl_n = tpl->n;
 377         for (i = 0; i < atc->nthread; i++)
 378         {
 379             tpl_n[i] = 0;
 380         }
 381     }
 382
 383     for (i = start; i < end; i++)
 384     {
 385         xptr   = atc->x[i];
 386         idxptr = atc->idx[i];
 387         fptr   = atc->fractx[i];
 388
 389         /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
 390         tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
 391         ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
 392         tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
 393
 394         tix = (int)(tx);
 395         tiy = (int)(ty);
 396         tiz = (int)(tz);
 397
 398         /* Because decomposition only occurs in x and y,
 399          * we never have a fraction correction in z.
 400          */
 401         fptr[XX] = tx - tix + pme->fshx[tix];
 402         fptr[YY] = ty - tiy + pme->fshy[tiy];
 403         fptr[ZZ] = tz - tiz;
 404
 405         idxptr[XX] = pme->nnx[tix];
 406         idxptr[YY] = pme->nny[tiy];
 407         idxptr[ZZ] = pme->nnz[tiz];
 408
 409 #ifdef DEBUG
 410         range_check(idxptr[XX], 0, pme->pmegrid_nx);
 411         range_check(idxptr[YY], 0, pme->pmegrid_ny);
 412         range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
 413 #endif
 414
 415         if (bThreads)
 416         {
 417             thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
 418             thread_idx[i] = thread_i;
 419             tpl_n[thread_i]++;
 420         }
 421     }
 422
 423     if (bThreads)
 424     {
 425         /* Make a list of particle indices sorted on thread */
 426
 427         /* Get the cumulative count */
 428         for (i = 1; i < atc->nthread; i++)
 429         {
 430             tpl_n[i] += tpl_n[i-1];
 431         }
 432         /* The current implementation distributes particles equally
 433          * over the threads, so we could actually allocate for that
 434          * in pme_realloc_atomcomm_things.
 435          */
 436         if (tpl_n[atc->nthread-1] > tpl->nalloc)
 437         {
 438             tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
 439             srenew(tpl->i, tpl->nalloc);
 440         }
 441         /* Set tpl_n to the cumulative start */
 442         for (i = atc->nthread-1; i >= 1; i--)
 443         {
 444             tpl_n[i] = tpl_n[i-1];
 445         }
 446         tpl_n[0] = 0;
 447
 448         /* Fill our thread local array with indices sorted on thread */
 449         for (i = start; i < end; i++)
 450         {
 451             tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
 452         }
 453         /* Now tpl_n contains the cummulative count again */
 454     }
 455 }
 456
 457 static void make_thread_local_ind(pme_atomcomm_t *atc,
 458                                   int thread, splinedata_t *spline)
 459 {
 460     int             n, t, i, start, end;
 461     thread_plist_t *tpl;
 462
 463     /* Combine the indices made by each thread into one index */
 464
 465     n     = 0;
 466     start = 0;
 467     for (t = 0; t < atc->nthread; t++)
 468     {
 469         tpl = &atc->thread_plist[t];
 470         /* Copy our part (start - end) from the list of thread t */
 471         if (thread > 0)
 472         {
 473             start = tpl->n[thread-1];
 474         }
 475         end = tpl->n[thread];
 476         for (i = start; i < end; i++)
 477         {
 478             spline->ind[n++] = tpl->i[i];
 479         }
 480     }
 481
 482     spline->n = n;
 483 }
 484
 485
 486 static void pme_calc_pidx(int start, int end,
 487                           matrix recipbox, rvec x[],
 488                           pme_atomcomm_t *atc, int *count)
 489 {
 490     int   nslab, i;
 491     int   si;
 492     real *xptr, s;
 493     real  rxx, ryx, rzx, ryy, rzy;
 494     int  *pd;
 495
 496     /* Calculate PME task index (pidx) for each grid index.
 497      * Here we always assign equally sized slabs to each node
 498      * for load balancing reasons (the PME grid spacing is not used).
 499      */
 500
 501     nslab = atc->nslab;
 502     pd    = atc->pd;
 503
 504     /* Reset the count */
 505     for (i = 0; i < nslab; i++)
 506     {
 507         count[i] = 0;
 508     }
 509
 510     if (atc->dimind == 0)
 511     {
 512         rxx = recipbox[XX][XX];
 513         ryx = recipbox[YY][XX];
 514         rzx = recipbox[ZZ][XX];
 515         /* Calculate the node index in x-dimension */
 516         for (i = start; i < end; i++)
 517         {
 518             xptr   = x[i];
 519             /* Fractional coordinates along box vectors */
 520             s     = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
 521             si    = (int)(s + 2*nslab) % nslab;
 522             pd[i] = si;
 523             count[si]++;
 524         }
 525     }
 526     else
 527     {
 528         ryy = recipbox[YY][YY];
 529         rzy = recipbox[ZZ][YY];
 530         /* Calculate the node index in y-dimension */
 531         for (i = start; i < end; i++)
 532         {
 533             xptr   = x[i];
 534             /* Fractional coordinates along box vectors */
 535             s     = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
 536             si    = (int)(s + 2*nslab) % nslab;
 537             pd[i] = si;
 538             count[si]++;
 539         }
 540     }
 541 }
 542
 543 static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
 544                                   pme_atomcomm_t *atc)
 545 {
 546     int nthread, thread, slab;
 547
 548     nthread = atc->nthread;
 549
 550 #pragma omp parallel for num_threads(nthread) schedule(static)
 551     for (thread = 0; thread < nthread; thread++)
 552     {
 553         pme_calc_pidx(natoms* thread   /nthread,
 554                       natoms*(thread+1)/nthread,
 555                       recipbox, x, atc, atc->count_thread[thread]);
 556     }
 557     /* Non-parallel reduction, since nslab is small */
 558
 559     for (thread = 1; thread < nthread; thread++)
 560     {
 561         for (slab = 0; slab < atc->nslab; slab++)
 562         {
 563             atc->count_thread[0][slab] += atc->count_thread[thread][slab];
 564         }
 565     }
 566 }
 567
 568 static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
 569 {
 570     const int padding = 4;
 571     int       i;
 572
 573     srenew(th[XX], nalloc);
 574     srenew(th[YY], nalloc);
 575     /* In z we add padding, this is only required for the aligned SSE code */
 576     srenew(*ptr_z, nalloc+2*padding);
 577     th[ZZ] = *ptr_z + padding;
 578
 579     for (i = 0; i < padding; i++)
 580     {
 581         (*ptr_z)[               i] = 0;
 582         (*ptr_z)[padding+nalloc+i] = 0;
 583     }
 584 }
 585
 586 static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
 587 {
 588     int i, d;
 589
 590     srenew(spline->ind, atc->nalloc);
 591     /* Initialize the index to identity so it works without threads */
 592     for (i = 0; i < atc->nalloc; i++)
 593     {
 594         spline->ind[i] = i;
 595     }
 596
 597     realloc_splinevec(spline->theta, &spline->ptr_theta_z,
 598                       atc->pme_order*atc->nalloc);
 599     realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
 600                       atc->pme_order*atc->nalloc);
 601 }
 602
 603 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
 604 {
 605     int nalloc_old, i, j, nalloc_tpl;
 606
 607     /* We have to avoid a NULL pointer for atc->x to avoid
 608      * possible fatal errors in MPI routines.
 609      */
 610     if (atc->n > atc->nalloc || atc->nalloc == 0)
 611     {
 612         nalloc_old  = atc->nalloc;
 613         atc->nalloc = over_alloc_dd(max(atc->n, 1));
 614
 615         if (atc->nslab > 1)
 616         {
 617             srenew(atc->x, atc->nalloc);
 618             srenew(atc->q, atc->nalloc);
 619             srenew(atc->f, atc->nalloc);
 620             for (i = nalloc_old; i < atc->nalloc; i++)
 621             {
 622                 clear_rvec(atc->f[i]);
 623             }
 624         }
 625         if (atc->bSpread)
 626         {
 627             srenew(atc->fractx, atc->nalloc);
 628             srenew(atc->idx, atc->nalloc);
 629
 630             if (atc->nthread > 1)
 631             {
 632                 srenew(atc->thread_idx, atc->nalloc);
 633             }
 634
 635             for (i = 0; i < atc->nthread; i++)
 636             {
 637                 pme_realloc_splinedata(&atc->spline[i], atc);
 638             }
 639         }
 640     }
 641 }
 642
 643 static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
 644                          int n, gmx_bool bXF, rvec *x_f, real *charge,
 645                          pme_atomcomm_t *atc)
 646 /* Redistribute particle data for PME calculation */
 647 /* domain decomposition by x coordinate           */
 648 {
 649     int *idxa;
 650     int  i, ii;
 651
 652     if (FALSE == pme->redist_init)
 653     {
 654         snew(pme->scounts, atc->nslab);
 655         snew(pme->rcounts, atc->nslab);
 656         snew(pme->sdispls, atc->nslab);
 657         snew(pme->rdispls, atc->nslab);
 658         snew(pme->sidx, atc->nslab);
 659         pme->redist_init = TRUE;
 660     }
 661     if (n > pme->redist_buf_nalloc)
 662     {
 663         pme->redist_buf_nalloc = over_alloc_dd(n);
 664         srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
 665     }
 666
 667     pme->idxa = atc->pd;
 668
 669 #ifdef GMX_MPI
 670     if (forw && bXF)
 671     {
 672         /* forward, redistribution from pp to pme */
 673
 674         /* Calculate send counts and exchange them with other nodes */
 675         for (i = 0; (i < atc->nslab); i++)
 676         {
 677             pme->scounts[i] = 0;
 678         }
 679         for (i = 0; (i < n); i++)
 680         {
 681             pme->scounts[pme->idxa[i]]++;
 682         }
 683         MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
 684
 685         /* Calculate send and receive displacements and index into send
 686            buffer */
 687         pme->sdispls[0] = 0;
 688         pme->rdispls[0] = 0;
 689         pme->sidx[0]    = 0;
 690         for (i = 1; i < atc->nslab; i++)
 691         {
 692             pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
 693             pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
 694             pme->sidx[i]    = pme->sdispls[i];
 695         }
 696         /* Total # of particles to be received */
 697         atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
 698
 699         pme_realloc_atomcomm_things(atc);
 700
 701         /* Copy particle coordinates into send buffer and exchange*/
 702         for (i = 0; (i < n); i++)
 703         {
 704             ii = DIM*pme->sidx[pme->idxa[i]];
 705             pme->sidx[pme->idxa[i]]++;
 706             pme->redist_buf[ii+XX] = x_f[i][XX];
 707             pme->redist_buf[ii+YY] = x_f[i][YY];
 708             pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
 709         }
 710         MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
 711                       pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
 712                       pme->rvec_mpi, atc->mpi_comm);
 713     }
 714     if (forw)
 715     {
 716         /* Copy charge into send buffer and exchange*/
 717         for (i = 0; i < atc->nslab; i++)
 718         {
 719             pme->sidx[i] = pme->sdispls[i];
 720         }
 721         for (i = 0; (i < n); i++)
 722         {
 723             ii = pme->sidx[pme->idxa[i]];
 724             pme->sidx[pme->idxa[i]]++;
 725             pme->redist_buf[ii] = charge[i];
 726         }
 727         MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
 728                       atc->q, pme->rcounts, pme->rdispls, mpi_type,
 729                       atc->mpi_comm);
 730     }
 731     else   /* backward, redistribution from pme to pp */
 732     {
 733         MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
 734                       pme->redist_buf, pme->scounts, pme->sdispls,
 735                       pme->rvec_mpi, atc->mpi_comm);
 736
 737         /* Copy data from receive buffer */
 738         for (i = 0; i < atc->nslab; i++)
 739         {
 740             pme->sidx[i] = pme->sdispls[i];
 741         }
 742         for (i = 0; (i < n); i++)
 743         {
 744             ii          = DIM*pme->sidx[pme->idxa[i]];
 745             x_f[i][XX] += pme->redist_buf[ii+XX];
 746             x_f[i][YY] += pme->redist_buf[ii+YY];
 747             x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
 748             pme->sidx[pme->idxa[i]]++;
 749         }
 750     }
 751 #endif
 752 }
 753
 754 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
 755                             gmx_bool bBackward, int shift,
 756                             void *buf_s, int nbyte_s,
 757                             void *buf_r, int nbyte_r)
 758 {
 759 #ifdef GMX_MPI
 760     int        dest, src;
 761     MPI_Status stat;
 762
 763     if (bBackward == FALSE)
 764     {
 765         dest = atc->node_dest[shift];
 766         src  = atc->node_src[shift];
 767     }
 768     else
 769     {
 770         dest = atc->node_src[shift];
 771         src  = atc->node_dest[shift];
 772     }
 773
 774     if (nbyte_s > 0 && nbyte_r > 0)
 775     {
 776         MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
 777                      dest, shift,
 778                      buf_r, nbyte_r, MPI_BYTE,
 779                      src, shift,
 780                      atc->mpi_comm, &stat);
 781     }
 782     else if (nbyte_s > 0)
 783     {
 784         MPI_Send(buf_s, nbyte_s, MPI_BYTE,
 785                  dest, shift,
 786                  atc->mpi_comm);
 787     }
 788     else if (nbyte_r > 0)
 789     {
 790         MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
 791                  src, shift,
 792                  atc->mpi_comm, &stat);
 793     }
 794 #endif
 795 }
 796
 797 static void dd_pmeredist_x_q(gmx_pme_t pme,
 798                              int n, gmx_bool bX, rvec *x, real *charge,
 799                              pme_atomcomm_t *atc)
 800 {
 801     int *commnode, *buf_index;
 802     int  nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
 803
 804     commnode  = atc->node_dest;
 805     buf_index = atc->buf_index;
 806
 807     nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
 808
 809     nsend = 0;
 810     for (i = 0; i < nnodes_comm; i++)
 811     {
 812         buf_index[commnode[i]] = nsend;
 813         nsend                 += atc->count[commnode[i]];
 814     }
 815     if (bX)
 816     {
 817         if (atc->count[atc->nodeid] + nsend != n)
 818         {
 819             gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
 820                       "This usually means that your system is not well equilibrated.",
 821                       n - (atc->count[atc->nodeid] + nsend),
 822                       pme->nodeid, 'x'+atc->dimind);
 823         }
 824
 825         if (nsend > pme->buf_nalloc)
 826         {
 827             pme->buf_nalloc = over_alloc_dd(nsend);
 828             srenew(pme->bufv, pme->buf_nalloc);
 829             srenew(pme->bufr, pme->buf_nalloc);
 830         }
 831
 832         atc->n = atc->count[atc->nodeid];
 833         for (i = 0; i < nnodes_comm; i++)
 834         {
 835             scount = atc->count[commnode[i]];
 836             /* Communicate the count */
 837             if (debug)
 838             {
 839                 fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
 840                         atc->dimind, atc->nodeid, commnode[i], scount);
 841             }
 842             pme_dd_sendrecv(atc, FALSE, i,
 843                             &scount, sizeof(int),
 844                             &atc->rcount[i], sizeof(int));
 845             atc->n += atc->rcount[i];
 846         }
 847
 848         pme_realloc_atomcomm_things(atc);
 849     }
 850
 851     local_pos = 0;
 852     for (i = 0; i < n; i++)
 853     {
 854         node = atc->pd[i];
 855         if (node == atc->nodeid)
 856         {
 857             /* Copy direct to the receive buffer */
 858             if (bX)
 859             {
 860                 copy_rvec(x[i], atc->x[local_pos]);
 861             }
 862             atc->q[local_pos] = charge[i];
 863             local_pos++;
 864         }
 865         else
 866         {
 867             /* Copy to the send buffer */
 868             if (bX)
 869             {
 870                 copy_rvec(x[i], pme->bufv[buf_index[node]]);
 871             }
 872             pme->bufr[buf_index[node]] = charge[i];
 873             buf_index[node]++;
 874         }
 875     }
 876
 877     buf_pos = 0;
 878     for (i = 0; i < nnodes_comm; i++)
 879     {
 880         scount = atc->count[commnode[i]];
 881         rcount = atc->rcount[i];
 882         if (scount > 0 || rcount > 0)
 883         {
 884             if (bX)
 885             {
 886                 /* Communicate the coordinates */
 887                 pme_dd_sendrecv(atc, FALSE, i,
 888                                 pme->bufv[buf_pos], scount*sizeof(rvec),
 889                                 atc->x[local_pos], rcount*sizeof(rvec));
 890             }
 891             /* Communicate the charges */
 892             pme_dd_sendrecv(atc, FALSE, i,
 893                             pme->bufr+buf_pos, scount*sizeof(real),
 894                             atc->q+local_pos, rcount*sizeof(real));
 895             buf_pos   += scount;
 896             local_pos += atc->rcount[i];
 897         }
 898     }
 899 }
 900
 901 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
 902                            int n, rvec *f,
 903                            gmx_bool bAddF)
 904 {
 905     int *commnode, *buf_index;
 906     int  nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
 907
 908     commnode  = atc->node_dest;
 909     buf_index = atc->buf_index;
 910
 911     nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
 912
 913     local_pos = atc->count[atc->nodeid];
 914     buf_pos   = 0;
 915     for (i = 0; i < nnodes_comm; i++)
 916     {
 917         scount = atc->rcount[i];
 918         rcount = atc->count[commnode[i]];
 919         if (scount > 0 || rcount > 0)
 920         {
 921             /* Communicate the forces */
 922             pme_dd_sendrecv(atc, TRUE, i,
 923                             atc->f[local_pos], scount*sizeof(rvec),
 924                             pme->bufv[buf_pos], rcount*sizeof(rvec));
 925             local_pos += scount;
 926         }
 927         buf_index[commnode[i]] = buf_pos;
 928         buf_pos               += rcount;
 929     }
 930
 931     local_pos = 0;
 932     if (bAddF)
 933     {
 934         for (i = 0; i < n; i++)
 935         {
 936             node = atc->pd[i];
 937             if (node == atc->nodeid)
 938             {
 939                 /* Add from the local force array */
 940                 rvec_inc(f[i], atc->f[local_pos]);
 941                 local_pos++;
 942             }
 943             else
 944             {
 945                 /* Add from the receive buffer */
 946                 rvec_inc(f[i], pme->bufv[buf_index[node]]);
 947                 buf_index[node]++;
 948             }
 949         }
 950     }
 951     else
 952     {
 953         for (i = 0; i < n; i++)
 954         {
 955             node = atc->pd[i];
 956             if (node == atc->nodeid)
 957             {
 958                 /* Copy from the local force array */
 959                 copy_rvec(atc->f[local_pos], f[i]);
 960                 local_pos++;
 961             }
 962             else
 963             {
 964                 /* Copy from the receive buffer */
 965                 copy_rvec(pme->bufv[buf_index[node]], f[i]);
 966                 buf_index[node]++;
 967             }
 968         }
 969     }
 970 }
 971
 972 #ifdef GMX_MPI
 973 static void
 974 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 975 {
 976     pme_overlap_t *overlap;
 977     int            send_index0, send_nindex;
 978     int            recv_index0, recv_nindex;
 979     MPI_Status     stat;
 980     int            i, j, k, ix, iy, iz, icnt;
 981     int            ipulse, send_id, recv_id, datasize;
 982     real          *p;
 983     real          *sendptr, *recvptr;
 984
 985     /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
 986     overlap = &pme->overlap[1];
 987
 988     for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
 989     {
 990         /* Since we have already (un)wrapped the overlap in the z-dimension,
 991          * we only have to communicate 0 to nkz (not pmegrid_nz).
 992          */
 993         if (direction == GMX_SUM_QGRID_FORWARD)
 994         {
 995             send_id       = overlap->send_id[ipulse];
 996             recv_id       = overlap->recv_id[ipulse];
 997             send_index0   = overlap->comm_data[ipulse].send_index0;
 998             send_nindex   = overlap->comm_data[ipulse].send_nindex;
 999             recv_index0   = overlap->comm_data[ipulse].recv_index0;
1000             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
1001         }
1002         else
1003         {
1004             send_id       = overlap->recv_id[ipulse];
1005             recv_id       = overlap->send_id[ipulse];
1006             send_index0   = overlap->comm_data[ipulse].recv_index0;
1007             send_nindex   = overlap->comm_data[ipulse].recv_nindex;
1008             recv_index0   = overlap->comm_data[ipulse].send_index0;
1009             recv_nindex   = overlap->comm_data[ipulse].send_nindex;
1010         }
1011
1012         /* Copy data to contiguous send buffer */
1013         if (debug)
1014         {
1015             fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1016                     pme->nodeid, overlap->nodeid, send_id,
1017                     pme->pmegrid_start_iy,
1018                     send_index0-pme->pmegrid_start_iy,
1019                     send_index0-pme->pmegrid_start_iy+send_nindex);
1020         }
1021         icnt = 0;
1022         for (i = 0; i < pme->pmegrid_nx; i++)
1023         {
1024             ix = i;
1025             for (j = 0; j < send_nindex; j++)
1026             {
1027                 iy = j + send_index0 - pme->pmegrid_start_iy;
1028                 for (k = 0; k < pme->nkz; k++)
1029                 {
1030                     iz = k;
1031                     overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
1032                 }
1033             }
1034         }
1035
1036         datasize      = pme->pmegrid_nx * pme->nkz;
1037
1038         MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
1039                      send_id, ipulse,
1040                      overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
1041                      recv_id, ipulse,
1042                      overlap->mpi_comm, &stat);
1043
1044         /* Get data from contiguous recv buffer */
1045         if (debug)
1046         {
1047             fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1048                     pme->nodeid, overlap->nodeid, recv_id,
1049                     pme->pmegrid_start_iy,
1050                     recv_index0-pme->pmegrid_start_iy,
1051                     recv_index0-pme->pmegrid_start_iy+recv_nindex);
1052         }
1053         icnt = 0;
1054         for (i = 0; i < pme->pmegrid_nx; i++)
1055         {
1056             ix = i;
1057             for (j = 0; j < recv_nindex; j++)
1058             {
1059                 iy = j + recv_index0 - pme->pmegrid_start_iy;
1060                 for (k = 0; k < pme->nkz; k++)
1061                 {
1062                     iz = k;
1063                     if (direction == GMX_SUM_QGRID_FORWARD)
1064                     {
1065                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
1066                     }
1067                     else
1068                     {
1069                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
1070                     }
1071                 }
1072             }
1073         }
1074     }
1075
1076     /* Major dimension is easier, no copying required,
1077      * but we might have to sum to separate array.
1078      * Since we don't copy, we have to communicate up to pmegrid_nz,
1079      * not nkz as for the minor direction.
1080      */
1081     overlap = &pme->overlap[0];
1082
1083     for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
1084     {
1085         if (direction == GMX_SUM_QGRID_FORWARD)
1086         {
1087             send_id       = overlap->send_id[ipulse];
1088             recv_id       = overlap->recv_id[ipulse];
1089             send_index0   = overlap->comm_data[ipulse].send_index0;
1090             send_nindex   = overlap->comm_data[ipulse].send_nindex;
1091             recv_index0   = overlap->comm_data[ipulse].recv_index0;
1092             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
1093             recvptr       = overlap->recvbuf;
1094         }
1095         else
1096         {
1097             send_id       = overlap->recv_id[ipulse];
1098             recv_id       = overlap->send_id[ipulse];
1099             send_index0   = overlap->comm_data[ipulse].recv_index0;
1100             send_nindex   = overlap->comm_data[ipulse].recv_nindex;
1101             recv_index0   = overlap->comm_data[ipulse].send_index0;
1102             recv_nindex   = overlap->comm_data[ipulse].send_nindex;
1103             recvptr       = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1104         }
1105
1106         sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1107         datasize      = pme->pmegrid_ny * pme->pmegrid_nz;
1108
1109         if (debug)
1110         {
1111             fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1112                     pme->nodeid, overlap->nodeid, send_id,
1113                     pme->pmegrid_start_ix,
1114                     send_index0-pme->pmegrid_start_ix,
1115                     send_index0-pme->pmegrid_start_ix+send_nindex);
1116             fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1117                     pme->nodeid, overlap->nodeid, recv_id,
1118                     pme->pmegrid_start_ix,
1119                     recv_index0-pme->pmegrid_start_ix,
1120                     recv_index0-pme->pmegrid_start_ix+recv_nindex);
1121         }
1122
1123         MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
1124                      send_id, ipulse,
1125                      recvptr, recv_nindex*datasize, GMX_MPI_REAL,
1126                      recv_id, ipulse,
1127                      overlap->mpi_comm, &stat);
1128
1129         /* ADD data from contiguous recv buffer */
1130         if (direction == GMX_SUM_QGRID_FORWARD)
1131         {
1132             p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1133             for (i = 0; i < recv_nindex*datasize; i++)
1134             {
1135                 p[i] += overlap->recvbuf[i];
1136             }
1137         }
1138     }
1139 }
1140 #endif
1141
1142
1143 static int
1144 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
1145 {
1146     ivec    local_fft_ndata, local_fft_offset, local_fft_size;
1147     ivec    local_pme_size;
1148     int     i, ix, iy, iz;
1149     int     pmeidx, fftidx;
1150
1151     /* Dimensions should be identical for A/B grid, so we just use A here */
1152     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1153                                    local_fft_ndata,
1154                                    local_fft_offset,
1155                                    local_fft_size);
1156
1157     local_pme_size[0] = pme->pmegrid_nx;
1158     local_pme_size[1] = pme->pmegrid_ny;
1159     local_pme_size[2] = pme->pmegrid_nz;
1160
1161     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1162        the offset is identical, and the PME grid always has more data (due to overlap)
1163      */
1164     {
1165 #ifdef DEBUG_PME
1166         FILE *fp, *fp2;
1167         char  fn[STRLEN], format[STRLEN];
1168         real  val;
1169         sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
1170         fp = ffopen(fn, "w");
1171         sprintf(fn, "pmegrid%d.txt", pme->nodeid);
1172         fp2 = ffopen(fn, "w");
1173         sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
1174 #endif
1175
1176         for (ix = 0; ix < local_fft_ndata[XX]; ix++)
1177         {
1178             for (iy = 0; iy < local_fft_ndata[YY]; iy++)
1179             {
1180                 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1181                 {
1182                     pmeidx          = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
1183                     fftidx          = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
1184                     fftgrid[fftidx] = pmegrid[pmeidx];
1185 #ifdef DEBUG_PME
1186                     val = 100*pmegrid[pmeidx];
1187                     if (pmegrid[pmeidx] != 0)
1188                     {
1189                         fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
1190                                 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
1191                     }
1192                     if (pmegrid[pmeidx] != 0)
1193                     {
1194                         fprintf(fp2, "%-12s  %5d  %5d  %5d  %12.5e\n",
1195                                 "qgrid",
1196                                 pme->pmegrid_start_ix + ix,
1197                                 pme->pmegrid_start_iy + iy,
1198                                 pme->pmegrid_start_iz + iz,
1199                                 pmegrid[pmeidx]);
1200                     }
1201 #endif
1202                 }
1203             }
1204         }
1205 #ifdef DEBUG_PME
1206         ffclose(fp);
1207         ffclose(fp2);
1208 #endif
1209     }
1210     return 0;
1211 }
1212
1213
1214 static gmx_cycles_t omp_cyc_start()
1215 {
1216     return gmx_cycles_read();
1217 }
1218
1219 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1220 {
1221     return gmx_cycles_read() - c;
1222 }
1223
1224
1225 static int
1226 copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
1227                         int nthread, int thread)
1228 {
1229     ivec          local_fft_ndata, local_fft_offset, local_fft_size;
1230     ivec          local_pme_size;
1231     int           ixy0, ixy1, ixy, ix, iy, iz;
1232     int           pmeidx, fftidx;
1233 #ifdef PME_TIME_THREADS
1234     gmx_cycles_t  c1;
1235     static double cs1 = 0;
1236     static int    cnt = 0;
1237 #endif
1238
1239 #ifdef PME_TIME_THREADS
1240     c1 = omp_cyc_start();
1241 #endif
1242     /* Dimensions should be identical for A/B grid, so we just use A here */
1243     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1244                                    local_fft_ndata,
1245                                    local_fft_offset,
1246                                    local_fft_size);
1247
1248     local_pme_size[0] = pme->pmegrid_nx;
1249     local_pme_size[1] = pme->pmegrid_ny;
1250     local_pme_size[2] = pme->pmegrid_nz;
1251
1252     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1253        the offset is identical, and the PME grid always has more data (due to overlap)
1254      */
1255     ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1256     ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1257
1258     for (ixy = ixy0; ixy < ixy1; ixy++)
1259     {
1260         ix = ixy/local_fft_ndata[YY];
1261         iy = ixy - ix*local_fft_ndata[YY];
1262
1263         pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
1264         fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
1265         for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1266         {
1267             pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1268         }
1269     }
1270
1271 #ifdef PME_TIME_THREADS
1272     c1   = omp_cyc_end(c1);
1273     cs1 += (double)c1;
1274     cnt++;
1275     if (cnt % 20 == 0)
1276     {
1277         printf("copy %.2f\n", cs1*1e-9);
1278     }
1279 #endif
1280
1281     return 0;
1282 }
1283
1284
1285 static void
1286 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1287 {
1288     int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
1289
1290     nx = pme->nkx;
1291     ny = pme->nky;
1292     nz = pme->nkz;
1293
1294     pnx = pme->pmegrid_nx;
1295     pny = pme->pmegrid_ny;
1296     pnz = pme->pmegrid_nz;
1297
1298     overlap = pme->pme_order - 1;
1299
1300     /* Add periodic overlap in z */
1301     for (ix = 0; ix < pme->pmegrid_nx; ix++)
1302     {
1303         for (iy = 0; iy < pme->pmegrid_ny; iy++)
1304         {
1305             for (iz = 0; iz < overlap; iz++)
1306             {
1307                 pmegrid[(ix*pny+iy)*pnz+iz] +=
1308                     pmegrid[(ix*pny+iy)*pnz+nz+iz];
1309             }
1310         }
1311     }
1312
1313     if (pme->nnodes_minor == 1)
1314     {
1315         for (ix = 0; ix < pme->pmegrid_nx; ix++)
1316         {
1317             for (iy = 0; iy < overlap; iy++)
1318             {
1319                 for (iz = 0; iz < nz; iz++)
1320                 {
1321                     pmegrid[(ix*pny+iy)*pnz+iz] +=
1322                         pmegrid[(ix*pny+ny+iy)*pnz+iz];
1323                 }
1324             }
1325         }
1326     }
1327
1328     if (pme->nnodes_major == 1)
1329     {
1330         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1331
1332         for (ix = 0; ix < overlap; ix++)
1333         {
1334             for (iy = 0; iy < ny_x; iy++)
1335             {
1336                 for (iz = 0; iz < nz; iz++)
1337                 {
1338                     pmegrid[(ix*pny+iy)*pnz+iz] +=
1339                         pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1340                 }
1341             }
1342         }
1343     }
1344 }
1345
1346
1347 static void
1348 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1349 {
1350     int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
1351
1352     nx = pme->nkx;
1353     ny = pme->nky;
1354     nz = pme->nkz;
1355
1356     pnx = pme->pmegrid_nx;
1357     pny = pme->pmegrid_ny;
1358     pnz = pme->pmegrid_nz;
1359
1360     overlap = pme->pme_order - 1;
1361
1362     if (pme->nnodes_major == 1)
1363     {
1364         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1365
1366         for (ix = 0; ix < overlap; ix++)
1367         {
1368             int iy, iz;
1369
1370             for (iy = 0; iy < ny_x; iy++)
1371             {
1372                 for (iz = 0; iz < nz; iz++)
1373                 {
1374                     pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1375                         pmegrid[(ix*pny+iy)*pnz+iz];
1376                 }
1377             }
1378         }
1379     }
1380
1381     if (pme->nnodes_minor == 1)
1382     {
1383 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1384         for (ix = 0; ix < pme->pmegrid_nx; ix++)
1385         {
1386             int iy, iz;
1387
1388             for (iy = 0; iy < overlap; iy++)
1389             {
1390                 for (iz = 0; iz < nz; iz++)
1391                 {
1392                     pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1393                         pmegrid[(ix*pny+iy)*pnz+iz];
1394                 }
1395             }
1396         }
1397     }
1398
1399     /* Copy periodic overlap in z */
1400 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1401     for (ix = 0; ix < pme->pmegrid_nx; ix++)
1402     {
1403         int iy, iz;
1404
1405         for (iy = 0; iy < pme->pmegrid_ny; iy++)
1406         {
1407             for (iz = 0; iz < overlap; iz++)
1408             {
1409                 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1410                     pmegrid[(ix*pny+iy)*pnz+iz];
1411             }
1412         }
1413     }
1414 }
1415
1416 static void clear_grid(int nx, int ny, int nz, real *grid,
1417                        ivec fs, int *flag,
1418                        int fx, int fy, int fz,
1419                        int order)
1420 {
1421     int nc, ncz;
1422     int fsx, fsy, fsz, gx, gy, gz, g0x, g0y, x, y, z;
1423     int flind;
1424
1425     nc  = 2 + (order - 2)/FLBS;
1426     ncz = 2 + (order - 2)/FLBSZ;
1427
1428     for (fsx = fx; fsx < fx+nc; fsx++)
1429     {
1430         for (fsy = fy; fsy < fy+nc; fsy++)
1431         {
1432             for (fsz = fz; fsz < fz+ncz; fsz++)
1433             {
1434                 flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
1435                 if (flag[flind] == 0)
1436                 {
1437                     gx  = fsx*FLBS;
1438                     gy  = fsy*FLBS;
1439                     gz  = fsz*FLBSZ;
1440                     g0x = (gx*ny + gy)*nz + gz;
1441                     for (x = 0; x < FLBS; x++)
1442                     {
1443                         g0y = g0x;
1444                         for (y = 0; y < FLBS; y++)
1445                         {
1446                             for (z = 0; z < FLBSZ; z++)
1447                             {
1448                                 grid[g0y+z] = 0;
1449                             }
1450                             g0y += nz;
1451                         }
1452                         g0x += ny*nz;
1453                     }
1454
1455                     flag[flind] = 1;
1456                 }
1457             }
1458         }
1459     }
1460 }
1461
1462 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1463 #define DO_BSPLINE(order)                            \
1464     for (ithx = 0; (ithx < order); ithx++)                    \
1465     {                                                    \
1466         index_x = (i0+ithx)*pny*pnz;                     \
1467         valx    = qn*thx[ithx];                          \
1468                                                      \
1469         for (ithy = 0; (ithy < order); ithy++)                \
1470         {                                                \
1471             valxy    = valx*thy[ithy];                   \
1472             index_xy = index_x+(j0+ithy)*pnz;            \
1473                                                      \
1474             for (ithz = 0; (ithz < order); ithz++)            \
1475             {                                            \
1476                 index_xyz        = index_xy+(k0+ithz);   \
1477                 grid[index_xyz] += valxy*thz[ithz];      \
1478             }                                            \
1479         }                                                \
1480     }
1481
1482
1483 static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
1484                                      pme_atomcomm_t *atc, splinedata_t *spline,
1485                                      pme_spline_work_t *work)
1486 {
1487
1488     /* spread charges from home atoms to local grid */
1489     real          *grid;
1490     pme_overlap_t *ol;
1491     int            b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
1492     int       *    idxptr;
1493     int            order, norder, index_x, index_xy, index_xyz;
1494     real           valx, valxy, qn;
1495     real          *thx, *thy, *thz;
1496     int            localsize, bndsize;
1497     int            pnx, pny, pnz, ndatatot;
1498     int            offx, offy, offz;
1499
1500     pnx = pmegrid->s[XX];
1501     pny = pmegrid->s[YY];
1502     pnz = pmegrid->s[ZZ];
1503
1504     offx = pmegrid->offset[XX];
1505     offy = pmegrid->offset[YY];
1506     offz = pmegrid->offset[ZZ];
1507
1508     ndatatot = pnx*pny*pnz;
1509     grid     = pmegrid->grid;
1510     for (i = 0; i < ndatatot; i++)
1511     {
1512         grid[i] = 0;
1513     }
1514
1515     order = pmegrid->order;
1516
1517     for (nn = 0; nn < spline->n; nn++)
1518     {
1519         n  = spline->ind[nn];
1520         qn = atc->q[n];
1521
1522         if (qn != 0)
1523         {
1524             idxptr = atc->idx[n];
1525             norder = nn*order;
1526
1527             i0   = idxptr[XX] - offx;
1528             j0   = idxptr[YY] - offy;
1529             k0   = idxptr[ZZ] - offz;
1530
1531             thx = spline->theta[XX] + norder;
1532             thy = spline->theta[YY] + norder;
1533             thz = spline->theta[ZZ] + norder;
1534
1535             switch (order)
1536             {
1537                 case 4:
1538 #ifdef PME_SSE
1539 #ifdef PME_SSE_UNALIGNED
1540 #define PME_SPREAD_SSE_ORDER4
1541 #else
1542 #define PME_SPREAD_SSE_ALIGNED
1543 #define PME_ORDER 4
1544 #endif
1545 #include "pme_sse_single.h"
1546 #else
1547                     DO_BSPLINE(4);
1548 #endif
1549                     break;
1550                 case 5:
1551 #ifdef PME_SSE
1552 #define PME_SPREAD_SSE_ALIGNED
1553 #define PME_ORDER 5
1554 #include "pme_sse_single.h"
1555 #else
1556                     DO_BSPLINE(5);
1557 #endif
1558                     break;
1559                 default:
1560                     DO_BSPLINE(order);
1561                     break;
1562             }
1563         }
1564     }
1565 }
1566
1567 static void set_grid_alignment(int *pmegrid_nz, int pme_order)
1568 {
1569 #ifdef PME_SSE
1570     if (pme_order == 5
1571 #ifndef PME_SSE_UNALIGNED
1572         || pme_order == 4
1573 #endif
1574         )
1575     {
1576         /* Round nz up to a multiple of 4 to ensure alignment */
1577         *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
1578     }
1579 #endif
1580 }
1581
1582 static void set_gridsize_alignment(int *gridsize, int pme_order)
1583 {
1584 #ifdef PME_SSE
1585 #ifndef PME_SSE_UNALIGNED
1586     if (pme_order == 4)
1587     {
1588         /* Add extra elements to ensured aligned operations do not go
1589          * beyond the allocated grid size.
1590          * Note that for pme_order=5, the pme grid z-size alignment
1591          * ensures that we will not go beyond the grid size.
1592          */
1593         *gridsize += 4;
1594     }
1595 #endif
1596 #endif
1597 }
1598
1599 static void pmegrid_init(pmegrid_t *grid,
1600                          int cx, int cy, int cz,
1601                          int x0, int y0, int z0,
1602                          int x1, int y1, int z1,
1603                          gmx_bool set_alignment,
1604                          int pme_order,
1605                          real *ptr)
1606 {
1607     int nz, gridsize;
1608
1609     grid->ci[XX]     = cx;
1610     grid->ci[YY]     = cy;
1611     grid->ci[ZZ]     = cz;
1612     grid->offset[XX] = x0;
1613     grid->offset[YY] = y0;
1614     grid->offset[ZZ] = z0;
1615     grid->n[XX]      = x1 - x0 + pme_order - 1;
1616     grid->n[YY]      = y1 - y0 + pme_order - 1;
1617     grid->n[ZZ]      = z1 - z0 + pme_order - 1;
1618     copy_ivec(grid->n, grid->s);
1619
1620     nz = grid->s[ZZ];
1621     set_grid_alignment(&nz, pme_order);
1622     if (set_alignment)
1623     {
1624         grid->s[ZZ] = nz;
1625     }
1626     else if (nz != grid->s[ZZ])
1627     {
1628         gmx_incons("pmegrid_init call with an unaligned z size");
1629     }
1630
1631     grid->order = pme_order;
1632     if (ptr == NULL)
1633     {
1634         gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
1635         set_gridsize_alignment(&gridsize, pme_order);
1636         snew_aligned(grid->grid, gridsize, 16);
1637     }
1638     else
1639     {
1640         grid->grid = ptr;
1641     }
1642 }
1643
1644 static int div_round_up(int enumerator, int denominator)
1645 {
1646     return (enumerator + denominator - 1)/denominator;
1647 }
1648
1649 static void make_subgrid_division(const ivec n, int ovl, int nthread,
1650                                   ivec nsub)
1651 {
1652     int gsize_opt, gsize;
1653     int nsx, nsy, nsz;
1654     char *env;
1655
1656     gsize_opt = -1;
1657     for (nsx = 1; nsx <= nthread; nsx++)
1658     {
1659         if (nthread % nsx == 0)
1660         {
1661             for (nsy = 1; nsy <= nthread; nsy++)
1662             {
1663                 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1664                 {
1665                     nsz = nthread/(nsx*nsy);
1666
1667                     /* Determine the number of grid points per thread */
1668                     gsize =
1669                         (div_round_up(n[XX], nsx) + ovl)*
1670                         (div_round_up(n[YY], nsy) + ovl)*
1671                         (div_round_up(n[ZZ], nsz) + ovl);
1672
1673                     /* Minimize the number of grids points per thread
1674                      * and, secondarily, the number of cuts in minor dimensions.
1675                      */
1676                     if (gsize_opt == -1 ||
1677                         gsize < gsize_opt ||
1678                         (gsize == gsize_opt &&
1679                          (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
1680                     {
1681                         nsub[XX]  = nsx;
1682                         nsub[YY]  = nsy;
1683                         nsub[ZZ]  = nsz;
1684                         gsize_opt = gsize;
1685                     }
1686                 }
1687             }
1688         }
1689     }
1690
1691     env = getenv("GMX_PME_THREAD_DIVISION");
1692     if (env != NULL)
1693     {
1694         sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
1695     }
1696
1697     if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
1698     {
1699         gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
1700     }
1701 }
1702
1703 static void pmegrids_init(pmegrids_t *grids,
1704                           int nx, int ny, int nz, int nz_base,
1705                           int pme_order,
1706                           gmx_bool bUseThreads,
1707                           int nthread,
1708                           int overlap_x,
1709                           int overlap_y)
1710 {
1711     ivec n, n_base, g0, g1;
1712     int t, x, y, z, d, i, tfac;
1713     int max_comm_lines = -1;
1714
1715     n[XX] = nx - (pme_order - 1);
1716     n[YY] = ny - (pme_order - 1);
1717     n[ZZ] = nz - (pme_order - 1);
1718
1719     copy_ivec(n, n_base);
1720     n_base[ZZ] = nz_base;
1721
1722     pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
1723                  NULL);
1724
1725     grids->nthread = nthread;
1726
1727     make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
1728
1729     if (bUseThreads)
1730     {
1731         ivec nst;
1732         int gridsize;
1733
1734         for (d = 0; d < DIM; d++)
1735         {
1736             nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
1737         }
1738         set_grid_alignment(&nst[ZZ], pme_order);
1739
1740         if (debug)
1741         {
1742             fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
1743                     grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
1744             fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1745                     nx, ny, nz,
1746                     nst[XX], nst[YY], nst[ZZ]);
1747         }
1748
1749         snew(grids->grid_th, grids->nthread);
1750         t        = 0;
1751         gridsize = nst[XX]*nst[YY]*nst[ZZ];
1752         set_gridsize_alignment(&gridsize, pme_order);
1753         snew_aligned(grids->grid_all,
1754                      grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
1755                      16);
1756
1757         for (x = 0; x < grids->nc[XX]; x++)
1758         {
1759             for (y = 0; y < grids->nc[YY]; y++)
1760             {
1761                 for (z = 0; z < grids->nc[ZZ]; z++)
1762                 {
1763                     pmegrid_init(&grids->grid_th[t],
1764                                  x, y, z,
1765                                  (n[XX]*(x  ))/grids->nc[XX],
1766                                  (n[YY]*(y  ))/grids->nc[YY],
1767                                  (n[ZZ]*(z  ))/grids->nc[ZZ],
1768                                  (n[XX]*(x+1))/grids->nc[XX],
1769                                  (n[YY]*(y+1))/grids->nc[YY],
1770                                  (n[ZZ]*(z+1))/grids->nc[ZZ],
1771                                  TRUE,
1772                                  pme_order,
1773                                  grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
1774                     t++;
1775                 }
1776             }
1777         }
1778     }
1779     else
1780     {
1781         grids->grid_th = NULL;
1782     }
1783
1784     snew(grids->g2t, DIM);
1785     tfac = 1;
1786     for (d = DIM-1; d >= 0; d--)
1787     {
1788         snew(grids->g2t[d], n[d]);
1789         t = 0;
1790         for (i = 0; i < n[d]; i++)
1791         {
1792             /* The second check should match the parameters
1793              * of the pmegrid_init call above.
1794              */
1795             while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
1796             {
1797                 t++;
1798             }
1799             grids->g2t[d][i] = t*tfac;
1800         }
1801
1802         tfac *= grids->nc[d];
1803
1804         switch (d)
1805         {
1806             case XX: max_comm_lines = overlap_x;     break;
1807             case YY: max_comm_lines = overlap_y;     break;
1808             case ZZ: max_comm_lines = pme_order - 1; break;
1809         }
1810         grids->nthread_comm[d] = 0;
1811         while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
1812                grids->nthread_comm[d] < grids->nc[d])
1813         {
1814             grids->nthread_comm[d]++;
1815         }
1816         if (debug != NULL)
1817         {
1818             fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
1819                     'x'+d, grids->nthread_comm[d]);
1820         }
1821         /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1822          * work, but this is not a problematic restriction.
1823          */
1824         if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
1825         {
1826             gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
1827         }
1828     }
1829 }
1830
1831
1832 static void pmegrids_destroy(pmegrids_t *grids)
1833 {
1834     int t;
1835
1836     if (grids->grid.grid != NULL)
1837     {
1838         sfree(grids->grid.grid);
1839
1840         if (grids->nthread > 0)
1841         {
1842             for (t = 0; t < grids->nthread; t++)
1843             {
1844                 sfree(grids->grid_th[t].grid);
1845             }
1846             sfree(grids->grid_th);
1847         }
1848     }
1849 }
1850
1851
1852 static void realloc_work(pme_work_t *work, int nkx)
1853 {
1854     if (nkx > work->nalloc)
1855     {
1856         work->nalloc = nkx;
1857         srenew(work->mhx, work->nalloc);
1858         srenew(work->mhy, work->nalloc);
1859         srenew(work->mhz, work->nalloc);
1860         srenew(work->m2, work->nalloc);
1861         /* Allocate an aligned pointer for SSE operations, including 3 extra
1862          * elements at the end since SSE operates on 4 elements at a time.
1863          */
1864         sfree_aligned(work->denom);
1865         sfree_aligned(work->tmp1);
1866         sfree_aligned(work->eterm);
1867         snew_aligned(work->denom, work->nalloc+3, 16);
1868         snew_aligned(work->tmp1, work->nalloc+3, 16);
1869         snew_aligned(work->eterm, work->nalloc+3, 16);
1870         srenew(work->m2inv, work->nalloc);
1871     }
1872 }
1873
1874
1875 static void free_work(pme_work_t *work)
1876 {
1877     sfree(work->mhx);
1878     sfree(work->mhy);
1879     sfree(work->mhz);
1880     sfree(work->m2);
1881     sfree_aligned(work->denom);
1882     sfree_aligned(work->tmp1);
1883     sfree_aligned(work->eterm);
1884     sfree(work->m2inv);
1885 }
1886
1887
1888 #ifdef PME_SSE
1889 /* Calculate exponentials through SSE in float precision */
1890 inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1891 {
1892     {
1893         const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
1894         __m128 f_sse;
1895         __m128 lu;
1896         __m128 tmp_d1, d_inv, tmp_r, tmp_e;
1897         int kx;
1898         f_sse = _mm_load1_ps(&f);
1899         for (kx = 0; kx < end; kx += 4)
1900         {
1901             tmp_d1   = _mm_load_ps(d_aligned+kx);
1902             lu       = _mm_rcp_ps(tmp_d1);
1903             d_inv    = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
1904             tmp_r    = _mm_load_ps(r_aligned+kx);
1905             tmp_r    = gmx_mm_exp_ps(tmp_r);
1906             tmp_e    = _mm_mul_ps(f_sse, d_inv);
1907             tmp_e    = _mm_mul_ps(tmp_e, tmp_r);
1908             _mm_store_ps(e_aligned+kx, tmp_e);
1909         }
1910     }
1911 }
1912 #else
1913 inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
1914 {
1915     int kx;
1916     for (kx = start; kx < end; kx++)
1917     {
1918         d[kx] = 1.0/d[kx];
1919     }
1920     for (kx = start; kx < end; kx++)
1921     {
1922         r[kx] = exp(r[kx]);
1923     }
1924     for (kx = start; kx < end; kx++)
1925     {
1926         e[kx] = f*r[kx]*d[kx];
1927     }
1928 }
1929 #endif
1930
1931
1932 static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
1933                          real ewaldcoeff, real vol,
1934                          gmx_bool bEnerVir,
1935                          int nthread, int thread)
1936 {
1937     /* do recip sum over local cells in grid */
1938     /* y major, z middle, x minor or continuous */
1939     t_complex *p0;
1940     int     kx, ky, kz, maxkx, maxky, maxkz;
1941     int     nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
1942     real    mx, my, mz;
1943     real    factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1944     real    ets2, struct2, vfactor, ets2vf;
1945     real    d1, d2, energy = 0;
1946     real    by, bz;
1947     real    virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
1948     real    rxx, ryx, ryy, rzx, rzy, rzz;
1949     pme_work_t *work;
1950     real    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
1951     real    mhxk, mhyk, mhzk, m2k;
1952     real    corner_fac;
1953     ivec    complex_order;
1954     ivec    local_ndata, local_offset, local_size;
1955     real    elfac;
1956
1957     elfac = ONE_4PI_EPS0/pme->epsilon_r;
1958
1959     nx = pme->nkx;
1960     ny = pme->nky;
1961     nz = pme->nkz;
1962
1963     /* Dimensions should be identical for A/B grid, so we just use A here */
1964     gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1965                                       complex_order,
1966                                       local_ndata,
1967                                       local_offset,
1968                                       local_size);
1969
1970     rxx = pme->recipbox[XX][XX];
1971     ryx = pme->recipbox[YY][XX];
1972     ryy = pme->recipbox[YY][YY];
1973     rzx = pme->recipbox[ZZ][XX];
1974     rzy = pme->recipbox[ZZ][YY];
1975     rzz = pme->recipbox[ZZ][ZZ];
1976
1977     maxkx = (nx+1)/2;
1978     maxky = (ny+1)/2;
1979     maxkz = nz/2+1;
1980
1981     work  = &pme->work[thread];
1982     mhx   = work->mhx;
1983     mhy   = work->mhy;
1984     mhz   = work->mhz;
1985     m2    = work->m2;
1986     denom = work->denom;
1987     tmp1  = work->tmp1;
1988     eterm = work->eterm;
1989     m2inv = work->m2inv;
1990
1991     iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
1992     iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
1993
1994     for (iyz = iyz0; iyz < iyz1; iyz++)
1995     {
1996         iy = iyz/local_ndata[ZZ];
1997         iz = iyz - iy*local_ndata[ZZ];
1998
1999         ky = iy + local_offset[YY];
2000
2001         if (ky < maxky)
2002         {
2003             my = ky;
2004         }
2005         else
2006         {
2007             my = (ky - ny);
2008         }
2009
2010         by = M_PI*vol*pme->bsp_mod[YY][ky];
2011
2012         kz = iz + local_offset[ZZ];
2013
2014         mz = kz;
2015
2016         bz = pme->bsp_mod[ZZ][kz];
2017
2018         /* 0.5 correction for corner points */
2019         corner_fac = 1;
2020         if (kz == 0 || kz == (nz+1)/2)
2021         {
2022             corner_fac = 0.5;
2023         }
2024
2025         p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2026
2027         /* We should skip the k-space point (0,0,0) */
2028         if (local_offset[XX] > 0 || ky > 0 || kz > 0)
2029         {
2030             kxstart = local_offset[XX];
2031         }
2032         else
2033         {
2034             kxstart = local_offset[XX] + 1;
2035             p0++;
2036         }
2037         kxend = local_offset[XX] + local_ndata[XX];
2038
2039         if (bEnerVir)
2040         {
2041             /* More expensive inner loop, especially because of the storage
2042              * of the mh elements in array's.
2043              * Because x is the minor grid index, all mh elements
2044              * depend on kx for triclinic unit cells.
2045              */
2046
2047             /* Two explicit loops to avoid a conditional inside the loop */
2048             for (kx = kxstart; kx < maxkx; kx++)
2049             {
2050                 mx = kx;
2051
2052                 mhxk      = mx * rxx;
2053                 mhyk      = mx * ryx + my * ryy;
2054                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2055                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2056                 mhx[kx]   = mhxk;
2057                 mhy[kx]   = mhyk;
2058                 mhz[kx]   = mhzk;
2059                 m2[kx]    = m2k;
2060                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2061                 tmp1[kx]  = -factor*m2k;
2062             }
2063
2064             for (kx = maxkx; kx < kxend; kx++)
2065             {
2066                 mx = (kx - nx);
2067
2068                 mhxk      = mx * rxx;
2069                 mhyk      = mx * ryx + my * ryy;
2070                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2071                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2072                 mhx[kx]   = mhxk;
2073                 mhy[kx]   = mhyk;
2074                 mhz[kx]   = mhzk;
2075                 m2[kx]    = m2k;
2076                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2077                 tmp1[kx]  = -factor*m2k;
2078             }
2079
2080             for (kx = kxstart; kx < kxend; kx++)
2081             {
2082                 m2inv[kx] = 1.0/m2[kx];
2083             }
2084
2085             calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
2086
2087             for (kx = kxstart; kx < kxend; kx++, p0++)
2088             {
2089                 d1      = p0->re;
2090                 d2      = p0->im;
2091
2092                 p0->re  = d1*eterm[kx];
2093                 p0->im  = d2*eterm[kx];
2094
2095                 struct2 = 2.0*(d1*d1+d2*d2);
2096
2097                 tmp1[kx] = eterm[kx]*struct2;
2098             }
2099
2100             for (kx = kxstart; kx < kxend; kx++)
2101             {
2102                 ets2     = corner_fac*tmp1[kx];
2103                 vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2104                 energy  += ets2;
2105
2106                 ets2vf   = ets2*vfactor;
2107                 virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
2108                 virxy   += ets2vf*mhx[kx]*mhy[kx];
2109                 virxz   += ets2vf*mhx[kx]*mhz[kx];
2110                 viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
2111                 viryz   += ets2vf*mhy[kx]*mhz[kx];
2112                 virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
2113             }
2114         }
2115         else
2116         {
2117             /* We don't need to calculate the energy and the virial.
2118              * In this case the triclinic overhead is small.
2119              */
2120
2121             /* Two explicit loops to avoid a conditional inside the loop */
2122
2123             for (kx = kxstart; kx < maxkx; kx++)
2124             {
2125                 mx = kx;
2126
2127                 mhxk      = mx * rxx;
2128                 mhyk      = mx * ryx + my * ryy;
2129                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2130                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2131                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2132                 tmp1[kx]  = -factor*m2k;
2133             }
2134
2135             for (kx = maxkx; kx < kxend; kx++)
2136             {
2137                 mx = (kx - nx);
2138
2139                 mhxk      = mx * rxx;
2140                 mhyk      = mx * ryx + my * ryy;
2141                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2142                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2143                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2144                 tmp1[kx]  = -factor*m2k;
2145             }
2146
2147             calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
2148
2149             for (kx = kxstart; kx < kxend; kx++, p0++)
2150             {
2151                 d1      = p0->re;
2152                 d2      = p0->im;
2153
2154                 p0->re  = d1*eterm[kx];
2155                 p0->im  = d2*eterm[kx];
2156             }
2157         }
2158     }
2159
2160     if (bEnerVir)
2161     {
2162         /* Update virial with local values.
2163          * The virial is symmetric by definition.
2164          * this virial seems ok for isotropic scaling, but I'm
2165          * experiencing problems on semiisotropic membranes.
2166          * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2167          */
2168         work->vir[XX][XX] = 0.25*virxx;
2169         work->vir[YY][YY] = 0.25*viryy;
2170         work->vir[ZZ][ZZ] = 0.25*virzz;
2171         work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
2172         work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
2173         work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
2174
2175         /* This energy should be corrected for a charged system */
2176         work->energy = 0.5*energy;
2177     }
2178
2179     /* Return the loop count */
2180     return local_ndata[YY]*local_ndata[XX];
2181 }
2182
2183 static void get_pme_ener_vir(const gmx_pme_t pme, int nthread,
2184                              real *mesh_energy, matrix vir)
2185 {
2186     /* This function sums output over threads
2187      * and should therefore only be called after thread synchronization.
2188      */
2189     int thread;
2190
2191     *mesh_energy = pme->work[0].energy;
2192     copy_mat(pme->work[0].vir, vir);
2193
2194     for (thread = 1; thread < nthread; thread++)
2195     {
2196         *mesh_energy += pme->work[thread].energy;
2197         m_add(vir, pme->work[thread].vir, vir);
2198     }
2199 }
2200
2201 #define DO_FSPLINE(order)                      \
2202     for (ithx = 0; (ithx < order); ithx++)              \
2203     {                                              \
2204         index_x = (i0+ithx)*pny*pnz;               \
2205         tx      = thx[ithx];                       \
2206         dx      = dthx[ithx];                      \
2207                                                \
2208         for (ithy = 0; (ithy < order); ithy++)          \
2209         {                                          \
2210             index_xy = index_x+(j0+ithy)*pnz;      \
2211             ty       = thy[ithy];                  \
2212             dy       = dthy[ithy];                 \
2213             fxy1     = fz1 = 0;                    \
2214                                                \
2215             for (ithz = 0; (ithz < order); ithz++)      \
2216             {                                      \
2217                 gval  = grid[index_xy+(k0+ithz)];  \
2218                 fxy1 += thz[ithz]*gval;            \
2219                 fz1  += dthz[ithz]*gval;           \
2220             }                                      \
2221             fx += dx*ty*fxy1;                      \
2222             fy += tx*dy*fxy1;                      \
2223             fz += tx*ty*fz1;                       \
2224         }                                          \
2225     }
2226
2227
2228 static void gather_f_bsplines(gmx_pme_t pme, real *grid,
2229                               gmx_bool bClearF, pme_atomcomm_t *atc,
2230                               splinedata_t *spline,
2231                               real scale)
2232 {
2233     /* sum forces for local particles */
2234     int     nn, n, ithx, ithy, ithz, i0, j0, k0;
2235     int     index_x, index_xy;
2236     int     nx, ny, nz, pnx, pny, pnz;
2237     int *   idxptr;
2238     real    tx, ty, dx, dy, qn;
2239     real    fx, fy, fz, gval;
2240     real    fxy1, fz1;
2241     real    *thx, *thy, *thz, *dthx, *dthy, *dthz;
2242     int     norder;
2243     real    rxx, ryx, ryy, rzx, rzy, rzz;
2244     int     order;
2245
2246     pme_spline_work_t *work;
2247
2248     work = pme->spline_work;
2249
2250     order = pme->pme_order;
2251     thx   = spline->theta[XX];
2252     thy   = spline->theta[YY];
2253     thz   = spline->theta[ZZ];
2254     dthx  = spline->dtheta[XX];
2255     dthy  = spline->dtheta[YY];
2256     dthz  = spline->dtheta[ZZ];
2257     nx    = pme->nkx;
2258     ny    = pme->nky;
2259     nz    = pme->nkz;
2260     pnx   = pme->pmegrid_nx;
2261     pny   = pme->pmegrid_ny;
2262     pnz   = pme->pmegrid_nz;
2263
2264     rxx   = pme->recipbox[XX][XX];
2265     ryx   = pme->recipbox[YY][XX];
2266     ryy   = pme->recipbox[YY][YY];
2267     rzx   = pme->recipbox[ZZ][XX];
2268     rzy   = pme->recipbox[ZZ][YY];
2269     rzz   = pme->recipbox[ZZ][ZZ];
2270
2271     for (nn = 0; nn < spline->n; nn++)
2272     {
2273         n  = spline->ind[nn];
2274         qn = scale*atc->q[n];
2275
2276         if (bClearF)
2277         {
2278             atc->f[n][XX] = 0;
2279             atc->f[n][YY] = 0;
2280             atc->f[n][ZZ] = 0;
2281         }
2282         if (qn != 0)
2283         {
2284             fx     = 0;
2285             fy     = 0;
2286             fz     = 0;
2287             idxptr = atc->idx[n];
2288             norder = nn*order;
2289
2290             i0   = idxptr[XX];
2291             j0   = idxptr[YY];
2292             k0   = idxptr[ZZ];
2293
2294             /* Pointer arithmetic alert, next six statements */
2295             thx  = spline->theta[XX] + norder;
2296             thy  = spline->theta[YY] + norder;
2297             thz  = spline->theta[ZZ] + norder;
2298             dthx = spline->dtheta[XX] + norder;
2299             dthy = spline->dtheta[YY] + norder;
2300             dthz = spline->dtheta[ZZ] + norder;
2301
2302             switch (order)
2303             {
2304                 case 4:
2305 #ifdef PME_SSE
2306 #ifdef PME_SSE_UNALIGNED
2307 #define PME_GATHER_F_SSE_ORDER4
2308 #else
2309 #define PME_GATHER_F_SSE_ALIGNED
2310 #define PME_ORDER 4
2311 #endif
2312 #include "pme_sse_single.h"
2313 #else
2314                     DO_FSPLINE(4);
2315 #endif
2316                     break;
2317                 case 5:
2318 #ifdef PME_SSE
2319 #define PME_GATHER_F_SSE_ALIGNED
2320 #define PME_ORDER 5
2321 #include "pme_sse_single.h"
2322 #else
2323                     DO_FSPLINE(5);
2324 #endif
2325                     break;
2326                 default:
2327                     DO_FSPLINE(order);
2328                     break;
2329             }
2330
2331             atc->f[n][XX] += -qn*( fx*nx*rxx );
2332             atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
2333             atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
2334         }
2335     }
2336     /* Since the energy and not forces are interpolated
2337      * the net force might not be exactly zero.
2338      * This can be solved by also interpolating F, but
2339      * that comes at a cost.
2340      * A better hack is to remove the net force every
2341      * step, but that must be done at a higher level
2342      * since this routine doesn't see all atoms if running
2343      * in parallel. Don't know how important it is?  EL 990726
2344      */
2345 }
2346
2347
2348 static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
2349                                    pme_atomcomm_t *atc)
2350 {
2351     splinedata_t *spline;
2352     int     n, ithx, ithy, ithz, i0, j0, k0;
2353     int     index_x, index_xy;
2354     int *   idxptr;
2355     real    energy, pot, tx, ty, qn, gval;
2356     real    *thx, *thy, *thz;
2357     int     norder;
2358     int     order;
2359
2360     spline = &atc->spline[0];
2361
2362     order = pme->pme_order;
2363
2364     energy = 0;
2365     for (n = 0; (n < atc->n); n++)
2366     {
2367         qn      = atc->q[n];
2368
2369         if (qn != 0)
2370         {
2371             idxptr = atc->idx[n];
2372             norder = n*order;
2373
2374             i0   = idxptr[XX];
2375             j0   = idxptr[YY];
2376             k0   = idxptr[ZZ];
2377
2378             /* Pointer arithmetic alert, next three statements */
2379             thx  = spline->theta[XX] + norder;
2380             thy  = spline->theta[YY] + norder;
2381             thz  = spline->theta[ZZ] + norder;
2382
2383             pot = 0;
2384             for (ithx = 0; (ithx < order); ithx++)
2385             {
2386                 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2387                 tx      = thx[ithx];
2388
2389                 for (ithy = 0; (ithy < order); ithy++)
2390                 {
2391                     index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2392                     ty       = thy[ithy];
2393
2394                     for (ithz = 0; (ithz < order); ithz++)
2395                     {
2396                         gval  = grid[index_xy+(k0+ithz)];
2397                         pot  += tx*ty*thz[ithz]*gval;
2398                     }
2399
2400                 }
2401             }
2402
2403             energy += pot*qn;
2404         }
2405     }
2406
2407     return energy;
2408 }
2409
2410 /* Macro to force loop unrolling by fixing order.
2411  * This gives a significant performance gain.
2412  */
2413 #define CALC_SPLINE(order)                     \
2414     {                                              \
2415         int j, k, l;                                 \
2416         real dr, div;                               \
2417         real data[PME_ORDER_MAX];                  \
2418         real ddata[PME_ORDER_MAX];                 \
2419                                                \
2420         for (j = 0; (j < DIM); j++)                     \
2421         {                                          \
2422             dr  = xptr[j];                         \
2423                                                \
2424             /* dr is relative offset from lower cell limit */ \
2425             data[order-1] = 0;                     \
2426             data[1]       = dr;                          \
2427             data[0]       = 1 - dr;                      \
2428                                                \
2429             for (k = 3; (k < order); k++)               \
2430             {                                      \
2431                 div       = 1.0/(k - 1.0);               \
2432                 data[k-1] = div*dr*data[k-2];      \
2433                 for (l = 1; (l < (k-1)); l++)           \
2434                 {                                  \
2435                     data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2436                                        data[k-l-1]);                \
2437                 }                                  \
2438                 data[0] = div*(1-dr)*data[0];      \
2439             }                                      \
2440             /* differentiate */                    \
2441             ddata[0] = -data[0];                   \
2442             for (k = 1; (k < order); k++)               \
2443             {                                      \
2444                 ddata[k] = data[k-1] - data[k];    \
2445             }                                      \
2446                                                \
2447             div           = 1.0/(order - 1);                 \
2448             data[order-1] = div*dr*data[order-2];  \
2449             for (l = 1; (l < (order-1)); l++)           \
2450             {                                      \
2451                 data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
2452                                        (order-l-dr)*data[order-l-1]); \
2453             }                                      \
2454             data[0] = div*(1 - dr)*data[0];        \
2455                                                \
2456             for (k = 0; k < order; k++)                 \
2457             {                                      \
2458                 theta[j][i*order+k]  = data[k];    \
2459                 dtheta[j][i*order+k] = ddata[k];   \
2460             }                                      \
2461         }                                          \
2462     }
2463
2464 void make_bsplines(splinevec theta, splinevec dtheta, int order,
2465                    rvec fractx[], int nr, int ind[], real charge[],
2466                    gmx_bool bFreeEnergy)
2467 {
2468     /* construct splines for local atoms */
2469     int  i, ii;
2470     real *xptr;
2471
2472     for (i = 0; i < nr; i++)
2473     {
2474         /* With free energy we do not use the charge check.
2475          * In most cases this will be more efficient than calling make_bsplines
2476          * twice, since usually more than half the particles have charges.
2477          */
2478         ii = ind[i];
2479         if (bFreeEnergy || charge[ii] != 0.0)
2480         {
2481             xptr = fractx[ii];
2482             switch (order)
2483             {
2484                 case 4:  CALC_SPLINE(4);     break;
2485                 case 5:  CALC_SPLINE(5);     break;
2486                 default: CALC_SPLINE(order); break;
2487             }
2488         }
2489     }
2490 }
2491
2492
2493 void make_dft_mod(real *mod, real *data, int ndata)
2494 {
2495     int i, j;
2496     real sc, ss, arg;
2497
2498     for (i = 0; i < ndata; i++)
2499     {
2500         sc = ss = 0;
2501         for (j = 0; j < ndata; j++)
2502         {
2503             arg = (2.0*M_PI*i*j)/ndata;
2504             sc += data[j]*cos(arg);
2505             ss += data[j]*sin(arg);
2506         }
2507         mod[i] = sc*sc+ss*ss;
2508     }
2509     for (i = 0; i < ndata; i++)
2510     {
2511         if (mod[i] < 1e-7)
2512         {
2513             mod[i] = (mod[i-1]+mod[i+1])*0.5;
2514         }
2515     }
2516 }
2517
2518
2519 static void make_bspline_moduli(splinevec bsp_mod,
2520                                 int nx, int ny, int nz, int order)
2521 {
2522     int nmax = max(nx, max(ny, nz));
2523     real *data, *ddata, *bsp_data;
2524     int i, k, l;
2525     real div;
2526
2527     snew(data, order);
2528     snew(ddata, order);
2529     snew(bsp_data, nmax);
2530
2531     data[order-1] = 0;
2532     data[1]       = 0;
2533     data[0]       = 1;
2534
2535     for (k = 3; k < order; k++)
2536     {
2537         div       = 1.0/(k-1.0);
2538         data[k-1] = 0;
2539         for (l = 1; l < (k-1); l++)
2540         {
2541             data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2542         }
2543         data[0] = div*data[0];
2544     }
2545     /* differentiate */
2546     ddata[0] = -data[0];
2547     for (k = 1; k < order; k++)
2548     {
2549         ddata[k] = data[k-1]-data[k];
2550     }
2551     div           = 1.0/(order-1);
2552     data[order-1] = 0;
2553     for (l = 1; l < (order-1); l++)
2554     {
2555         data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2556     }
2557     data[0] = div*data[0];
2558
2559     for (i = 0; i < nmax; i++)
2560     {
2561         bsp_data[i] = 0;
2562     }
2563     for (i = 1; i <= order; i++)
2564     {
2565         bsp_data[i] = data[i-1];
2566     }
2567
2568     make_dft_mod(bsp_mod[XX], bsp_data, nx);
2569     make_dft_mod(bsp_mod[YY], bsp_data, ny);
2570     make_dft_mod(bsp_mod[ZZ], bsp_data, nz);
2571
2572     sfree(data);
2573     sfree(ddata);
2574     sfree(bsp_data);
2575 }
2576
2577
2578 /* Return the P3M optimal influence function */
2579 static double do_p3m_influence(double z, int order)
2580 {
2581     double z2, z4;
2582
2583     z2 = z*z;
2584     z4 = z2*z2;
2585
2586     /* The formula and most constants can be found in:
2587      * Ballenegger et al., JCTC 8, 936 (2012)
2588      */
2589     switch (order)
2590     {
2591         case 2:
2592             return 1.0 - 2.0*z2/3.0;
2593             break;
2594         case 3:
2595             return 1.0 - z2 + 2.0*z4/15.0;
2596             break;
2597         case 4:
2598             return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
2599             break;
2600         case 5:
2601             return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
2602             break;
2603         case 6:
2604             return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
2605             break;
2606         case 7:
2607             return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
2608         case 8:
2609             return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
2610             break;
2611     }
2612
2613     return 0.0;
2614 }
2615
2616 /* Calculate the P3M B-spline moduli for one dimension */
2617 static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
2618 {
2619     double zarg, zai, sinzai, infl;
2620     int    maxk, i;
2621
2622     if (order > 8)
2623     {
2624         gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
2625     }
2626
2627     zarg = M_PI/n;
2628
2629     maxk = (n + 1)/2;
2630
2631     for (i = -maxk; i < 0; i++)
2632     {
2633         zai          = zarg*i;
2634         sinzai       = sin(zai);
2635         infl         = do_p3m_influence(sinzai, order);
2636         bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
2637     }
2638     bsp_mod[0] = 1.0;
2639     for (i = 1; i < maxk; i++)
2640     {
2641         zai        = zarg*i;
2642         sinzai     = sin(zai);
2643         infl       = do_p3m_influence(sinzai, order);
2644         bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
2645     }
2646 }
2647
2648 /* Calculate the P3M B-spline moduli */
2649 static void make_p3m_bspline_moduli(splinevec bsp_mod,
2650                                     int nx, int ny, int nz, int order)
2651 {
2652     make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order);
2653     make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order);
2654     make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order);
2655 }
2656
2657
2658 static void setup_coordinate_communication(pme_atomcomm_t *atc)
2659 {
2660     int nslab, n, i;
2661     int fw, bw;
2662
2663     nslab = atc->nslab;
2664
2665     n = 0;
2666     for (i = 1; i <= nslab/2; i++)
2667     {
2668         fw = (atc->nodeid + i) % nslab;
2669         bw = (atc->nodeid - i + nslab) % nslab;
2670         if (n < nslab - 1)
2671         {
2672             atc->node_dest[n] = fw;
2673             atc->node_src[n]  = bw;
2674             n++;
2675         }
2676         if (n < nslab - 1)
2677         {
2678             atc->node_dest[n] = bw;
2679             atc->node_src[n]  = fw;
2680             n++;
2681         }
2682     }
2683 }
2684
2685 int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
2686 {
2687     int thread;
2688
2689     if (NULL != log)
2690     {
2691         fprintf(log, "Destroying PME data structures.\n");
2692     }
2693
2694     sfree((*pmedata)->nnx);
2695     sfree((*pmedata)->nny);
2696     sfree((*pmedata)->nnz);
2697
2698     pmegrids_destroy(&(*pmedata)->pmegridA);
2699
2700     sfree((*pmedata)->fftgridA);
2701     sfree((*pmedata)->cfftgridA);
2702     gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
2703
2704     if ((*pmedata)->pmegridB.grid.grid != NULL)
2705     {
2706         pmegrids_destroy(&(*pmedata)->pmegridB);
2707         sfree((*pmedata)->fftgridB);
2708         sfree((*pmedata)->cfftgridB);
2709         gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
2710     }
2711     for (thread = 0; thread < (*pmedata)->nthread; thread++)
2712     {
2713         free_work(&(*pmedata)->work[thread]);
2714     }
2715     sfree((*pmedata)->work);
2716
2717     sfree(*pmedata);
2718     *pmedata = NULL;
2719
2720     return 0;
2721 }
2722
2723 static int mult_up(int n, int f)
2724 {
2725     return ((n + f - 1)/f)*f;
2726 }
2727
2728
2729 static double pme_load_imbalance(gmx_pme_t pme)
2730 {
2731     int    nma, nmi;
2732     double n1, n2, n3;
2733
2734     nma = pme->nnodes_major;
2735     nmi = pme->nnodes_minor;
2736
2737     n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
2738     n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
2739     n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
2740
2741     /* pme_solve is roughly double the cost of an fft */
2742
2743     return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
2744 }
2745
2746 static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc, t_commrec *cr,
2747                           int dimind, gmx_bool bSpread)
2748 {
2749     int nk, k, s, thread;
2750
2751     atc->dimind    = dimind;
2752     atc->nslab     = 1;
2753     atc->nodeid    = 0;
2754     atc->pd_nalloc = 0;
2755 #ifdef GMX_MPI
2756     if (pme->nnodes > 1)
2757     {
2758         atc->mpi_comm = pme->mpi_comm_d[dimind];
2759         MPI_Comm_size(atc->mpi_comm, &atc->nslab);
2760         MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
2761     }
2762     if (debug)
2763     {
2764         fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
2765     }
2766 #endif
2767
2768     atc->bSpread   = bSpread;
2769     atc->pme_order = pme->pme_order;
2770
2771     if (atc->nslab > 1)
2772     {
2773         /* These three allocations are not required for particle decomp. */
2774         snew(atc->node_dest, atc->nslab);
2775         snew(atc->node_src, atc->nslab);
2776         setup_coordinate_communication(atc);
2777
2778         snew(atc->count_thread, pme->nthread);
2779         for (thread = 0; thread < pme->nthread; thread++)
2780         {
2781             snew(atc->count_thread[thread], atc->nslab);
2782         }
2783         atc->count = atc->count_thread[0];
2784         snew(atc->rcount, atc->nslab);
2785         snew(atc->buf_index, atc->nslab);
2786     }
2787
2788     atc->nthread = pme->nthread;
2789     if (atc->nthread > 1)
2790     {
2791         snew(atc->thread_plist, atc->nthread);
2792     }
2793     snew(atc->spline, atc->nthread);
2794     for (thread = 0; thread < atc->nthread; thread++)
2795     {
2796         if (atc->nthread > 1)
2797         {
2798             snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
2799             atc->thread_plist[thread].n += GMX_CACHE_SEP;
2800         }
2801         snew(atc->spline[thread].thread_one, pme->nthread);
2802         atc->spline[thread].thread_one[thread] = 1;
2803     }
2804 }
2805
2806 static void
2807 init_overlap_comm(pme_overlap_t *  ol,
2808                   int              norder,
2809 #ifdef GMX_MPI
2810                   MPI_Comm         comm,
2811 #endif
2812                   int              nnodes,
2813                   int              nodeid,
2814                   int              ndata,
2815                   int              commplainsize)
2816 {
2817     int lbnd, rbnd, maxlr, b, i;
2818     int exten;
2819     int nn, nk;
2820     pme_grid_comm_t *pgc;
2821     gmx_bool bCont;
2822     int fft_start, fft_end, send_index1, recv_index1;
2823 #ifdef GMX_MPI
2824     MPI_Status stat;
2825
2826     ol->mpi_comm = comm;
2827 #endif
2828
2829     ol->nnodes = nnodes;
2830     ol->nodeid = nodeid;
2831
2832     /* Linear translation of the PME grid won't affect reciprocal space
2833      * calculations, so to optimize we only interpolate "upwards",
2834      * which also means we only have to consider overlap in one direction.
2835      * I.e., particles on this node might also be spread to grid indices
2836      * that belong to higher nodes (modulo nnodes)
2837      */
2838
2839     snew(ol->s2g0, ol->nnodes+1);
2840     snew(ol->s2g1, ol->nnodes);
2841     if (debug)
2842     {
2843         fprintf(debug, "PME slab boundaries:");
2844     }
2845     for (i = 0; i < nnodes; i++)
2846     {
2847         /* s2g0 the local interpolation grid start.
2848          * s2g1 the local interpolation grid end.
2849          * Because grid overlap communication only goes forward,
2850          * the grid the slabs for fft's should be rounded down.
2851          */
2852         ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
2853         ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
2854
2855         if (debug)
2856         {
2857             fprintf(debug, "  %3d %3d", ol->s2g0[i], ol->s2g1[i]);
2858         }
2859     }
2860     ol->s2g0[nnodes] = ndata;
2861     if (debug)
2862     {
2863         fprintf(debug, "\n");
2864     }
2865
2866     /* Determine with how many nodes we need to communicate the grid overlap */
2867     b = 0;
2868     do
2869     {
2870         b++;
2871         bCont = FALSE;
2872         for (i = 0; i < nnodes; i++)
2873         {
2874             if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
2875                 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
2876             {
2877                 bCont = TRUE;
2878             }
2879         }
2880     }
2881     while (bCont && b < nnodes);
2882     ol->noverlap_nodes = b - 1;
2883
2884     snew(ol->send_id, ol->noverlap_nodes);
2885     snew(ol->recv_id, ol->noverlap_nodes);
2886     for (b = 0; b < ol->noverlap_nodes; b++)
2887     {
2888         ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
2889         ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
2890     }
2891     snew(ol->comm_data, ol->noverlap_nodes);
2892
2893     ol->send_size = 0;
2894     for (b = 0; b < ol->noverlap_nodes; b++)
2895     {
2896         pgc = &ol->comm_data[b];
2897         /* Send */
2898         fft_start        = ol->s2g0[ol->send_id[b]];
2899         fft_end          = ol->s2g0[ol->send_id[b]+1];
2900         if (ol->send_id[b] < nodeid)
2901         {
2902             fft_start += ndata;
2903             fft_end   += ndata;
2904         }
2905         send_index1       = ol->s2g1[nodeid];
2906         send_index1       = min(send_index1, fft_end);
2907         pgc->send_index0  = fft_start;
2908         pgc->send_nindex  = max(0, send_index1 - pgc->send_index0);
2909         ol->send_size    += pgc->send_nindex;
2910
2911         /* We always start receiving to the first index of our slab */
2912         fft_start        = ol->s2g0[ol->nodeid];
2913         fft_end          = ol->s2g0[ol->nodeid+1];
2914         recv_index1      = ol->s2g1[ol->recv_id[b]];
2915         if (ol->recv_id[b] > nodeid)
2916         {
2917             recv_index1 -= ndata;
2918         }
2919         recv_index1      = min(recv_index1, fft_end);
2920         pgc->recv_index0 = fft_start;
2921         pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
2922     }
2923
2924 #ifdef GMX_MPI
2925     /* Communicate the buffer sizes to receive */
2926     for (b = 0; b < ol->noverlap_nodes; b++)
2927     {
2928         MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
2929                      &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
2930                      ol->mpi_comm, &stat);
2931     }
2932 #endif
2933
2934     /* For non-divisible grid we need pme_order iso pme_order-1 */
2935     snew(ol->sendbuf, norder*commplainsize);
2936     snew(ol->recvbuf, norder*commplainsize);
2937 }
2938
2939 static void
2940 make_gridindex5_to_localindex(int n, int local_start, int local_range,
2941                               int **global_to_local,
2942                               real **fraction_shift)
2943 {
2944     int i;
2945     int * gtl;
2946     real * fsh;
2947
2948     snew(gtl, 5*n);
2949     snew(fsh, 5*n);
2950     for (i = 0; (i < 5*n); i++)
2951     {
2952         /* Determine the global to local grid index */
2953         gtl[i] = (i - local_start + n) % n;
2954         /* For coordinates that fall within the local grid the fraction
2955          * is correct, we don't need to shift it.
2956          */
2957         fsh[i] = 0;
2958         if (local_range < n)
2959         {
2960             /* Due to rounding issues i could be 1 beyond the lower or
2961              * upper boundary of the local grid. Correct the index for this.
2962              * If we shift the index, we need to shift the fraction by
2963              * the same amount in the other direction to not affect
2964              * the weights.
2965              * Note that due to this shifting the weights at the end of
2966              * the spline might change, but that will only involve values
2967              * between zero and values close to the precision of a real,
2968              * which is anyhow the accuracy of the whole mesh calculation.
2969              */
2970             /* With local_range=0 we should not change i=local_start */
2971             if (i % n != local_start)
2972             {
2973                 if (gtl[i] == n-1)
2974                 {
2975                     gtl[i] = 0;
2976                     fsh[i] = -1;
2977                 }
2978                 else if (gtl[i] == local_range)
2979                 {
2980                     gtl[i] = local_range - 1;
2981                     fsh[i] = 1;
2982                 }
2983             }
2984         }
2985     }
2986
2987     *global_to_local = gtl;
2988     *fraction_shift  = fsh;
2989 }
2990
2991 static pme_spline_work_t *make_pme_spline_work(int order)
2992 {
2993     pme_spline_work_t *work;
2994
2995 #ifdef PME_SSE
2996     float  tmp[8];
2997     __m128 zero_SSE;
2998     int    of, i;
2999
3000     snew_aligned(work, 1, 16);
3001
3002     zero_SSE = _mm_setzero_ps();
3003
3004     /* Generate bit masks to mask out the unused grid entries,
3005      * as we only operate on order of the 8 grid entries that are
3006      * load into 2 SSE float registers.
3007      */
3008     for (of = 0; of < 8-(order-1); of++)
3009     {
3010         for (i = 0; i < 8; i++)
3011         {
3012             tmp[i] = (i >= of && i < of+order ? 1 : 0);
3013         }
3014         work->mask_SSE0[of] = _mm_loadu_ps(tmp);
3015         work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
3016         work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE);
3017         work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE);
3018     }
3019 #else
3020     work = NULL;
3021 #endif
3022
3023     return work;
3024 }
3025
3026 void gmx_pme_check_restrictions(int pme_order,
3027                                 int nkx, int nky, int nkz,
3028                                 int nnodes_major,
3029                                 int nnodes_minor,
3030                                 gmx_bool bUseThreads,
3031                                 gmx_bool bFatal,
3032                                 gmx_bool *bValidSettings)
3033 {
3034     if (pme_order > PME_ORDER_MAX)
3035     {
3036         if (!bFatal)
3037         {
3038             *bValidSettings = FALSE;
3039             return;
3040         }
3041         gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3042                   pme_order, PME_ORDER_MAX);
3043     }
3044
3045     if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
3046         nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
3047         nkz <= pme_order)
3048     {
3049         if (!bFatal)
3050         {
3051             *bValidSettings = FALSE;
3052             return;
3053         }
3054         gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
3055                   pme_order);
3056     }
3057
3058     /* Check for a limitation of the (current) sum_fftgrid_dd code.
3059      * We only allow multiple communication pulses in dim 1, not in dim 0.
3060      */
3061     if (bUseThreads && (nkx < nnodes_major*pme_order &&
3062                         nkx != nnodes_major*(pme_order - 1)))
3063     {
3064         if (!bFatal)
3065         {
3066             *bValidSettings = FALSE;
3067             return;
3068         }
3069         gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
3070                   nkx/(double)nnodes_major, pme_order);
3071     }
3072
3073     if (bValidSettings != NULL)
3074     {
3075         *bValidSettings = TRUE;
3076     }
3077
3078     return;
3079 }
3080
3081 int gmx_pme_init(gmx_pme_t *         pmedata,
3082                  t_commrec *         cr,
3083                  int                 nnodes_major,
3084                  int                 nnodes_minor,
3085                  t_inputrec *        ir,
3086                  int                 homenr,
3087                  gmx_bool            bFreeEnergy,
3088                  gmx_bool            bReproducible,
3089                  int                 nthread)
3090 {
3091     gmx_pme_t pme = NULL;
3092
3093     int  use_threads, sum_use_threads;
3094     ivec ndata;
3095
3096     if (debug)
3097     {
3098         fprintf(debug, "Creating PME data structures.\n");
3099     }
3100     snew(pme, 1);
3101
3102     pme->redist_init         = FALSE;
3103     pme->sum_qgrid_tmp       = NULL;
3104     pme->sum_qgrid_dd_tmp    = NULL;
3105     pme->buf_nalloc          = 0;
3106     pme->redist_buf_nalloc   = 0;
3107
3108     pme->nnodes              = 1;
3109     pme->bPPnode             = TRUE;
3110
3111     pme->nnodes_major        = nnodes_major;
3112     pme->nnodes_minor        = nnodes_minor;
3113
3114 #ifdef GMX_MPI
3115     if (nnodes_major*nnodes_minor > 1)
3116     {
3117         pme->mpi_comm = cr->mpi_comm_mygroup;
3118
3119         MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
3120         MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
3121         if (pme->nnodes != nnodes_major*nnodes_minor)
3122         {
3123             gmx_incons("PME node count mismatch");
3124         }
3125     }
3126     else
3127     {
3128         pme->mpi_comm = MPI_COMM_NULL;
3129     }
3130 #endif
3131
3132     if (pme->nnodes == 1)
3133     {
3134 #ifdef GMX_MPI
3135         pme->mpi_comm_d[0] = MPI_COMM_NULL;
3136         pme->mpi_comm_d[1] = MPI_COMM_NULL;
3137 #endif
3138         pme->ndecompdim   = 0;
3139         pme->nodeid_major = 0;
3140         pme->nodeid_minor = 0;
3141 #ifdef GMX_MPI
3142         pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
3143 #endif
3144     }
3145     else
3146     {
3147         if (nnodes_minor == 1)
3148         {
3149 #ifdef GMX_MPI
3150             pme->mpi_comm_d[0] = pme->mpi_comm;
3151             pme->mpi_comm_d[1] = MPI_COMM_NULL;
3152 #endif
3153             pme->ndecompdim   = 1;
3154             pme->nodeid_major = pme->nodeid;
3155             pme->nodeid_minor = 0;
3156
3157         }
3158         else if (nnodes_major == 1)
3159         {
3160 #ifdef GMX_MPI
3161             pme->mpi_comm_d[0] = MPI_COMM_NULL;
3162             pme->mpi_comm_d[1] = pme->mpi_comm;
3163 #endif
3164             pme->ndecompdim   = 1;
3165             pme->nodeid_major = 0;
3166             pme->nodeid_minor = pme->nodeid;
3167         }
3168         else
3169         {
3170             if (pme->nnodes % nnodes_major != 0)
3171             {
3172                 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3173             }
3174             pme->ndecompdim = 2;
3175
3176 #ifdef GMX_MPI
3177             MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
3178                            pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
3179             MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
3180                            pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
3181
3182             MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
3183             MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
3184             MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
3185             MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
3186 #endif
3187         }
3188         pme->bPPnode = (cr->duty & DUTY_PP);
3189     }
3190
3191     pme->nthread = nthread;
3192
3193     /* Check if any of the PME MPI ranks uses threads */
3194     use_threads = (pme->nthread > 1 ? 1 : 0);
3195 #ifdef GMX_MPI
3196     if (pme->nnodes > 1)
3197     {
3198         MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
3199                       MPI_SUM, pme->mpi_comm);
3200     }
3201     else
3202 #endif
3203     {
3204         sum_use_threads = use_threads;
3205     }
3206     pme->bUseThreads = (sum_use_threads > 0);
3207
3208     if (ir->ePBC == epbcSCREW)
3209     {
3210         gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
3211     }
3212
3213     pme->bFEP        = ((ir->efep != efepNO) && bFreeEnergy);
3214     pme->nkx         = ir->nkx;
3215     pme->nky         = ir->nky;
3216     pme->nkz         = ir->nkz;
3217     pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
3218     pme->pme_order   = ir->pme_order;
3219     pme->epsilon_r   = ir->epsilon_r;
3220
3221     /* If we violate restrictions, generate a fatal error here */
3222     gmx_pme_check_restrictions(pme->pme_order,
3223                                pme->nkx, pme->nky, pme->nkz,
3224                                pme->nnodes_major,
3225                                pme->nnodes_minor,
3226                                pme->bUseThreads,
3227                                TRUE,
3228                                NULL);
3229
3230     if (pme->nnodes > 1)
3231     {
3232         double imbal;
3233
3234 #ifdef GMX_MPI
3235         MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3236         MPI_Type_commit(&(pme->rvec_mpi));
3237 #endif
3238
3239         /* Note that the charge spreading and force gathering, which usually
3240          * takes about the same amount of time as FFT+solve_pme,
3241          * is always fully load balanced
3242          * (unless the charge distribution is inhomogeneous).
3243          */
3244
3245         imbal = pme_load_imbalance(pme);
3246         if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3247         {
3248             fprintf(stderr,
3249                     "\n"
3250                     "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3251                     "      For optimal PME load balancing\n"
3252                     "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3253                     "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3254                     "\n",
3255                     (int)((imbal-1)*100 + 0.5),
3256                     pme->nkx, pme->nky, pme->nnodes_major,
3257                     pme->nky, pme->nkz, pme->nnodes_minor);
3258         }
3259     }
3260
3261     /* For non-divisible grid we need pme_order iso pme_order-1 */
3262     /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3263      * y is always copied through a buffer: we don't need padding in z,
3264      * but we do need the overlap in x because of the communication order.
3265      */
3266     init_overlap_comm(&pme->overlap[0], pme->pme_order,
3267 #ifdef GMX_MPI
3268                       pme->mpi_comm_d[0],
3269 #endif
3270                       pme->nnodes_major, pme->nodeid_major,
3271                       pme->nkx,
3272                       (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3273
3274     /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
3275      * We do this with an offset buffer of equal size, so we need to allocate
3276      * extra for the offset. That's what the (+1)*pme->nkz is for.
3277      */
3278     init_overlap_comm(&pme->overlap[1], pme->pme_order,
3279 #ifdef GMX_MPI
3280                       pme->mpi_comm_d[1],
3281 #endif
3282                       pme->nnodes_minor, pme->nodeid_minor,
3283                       pme->nky,
3284                       (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
3285
3286     /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
3287      * Note that gmx_pme_check_restrictions checked for this already.
3288      */
3289     if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
3290     {
3291         gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
3292     }
3293
3294     snew(pme->bsp_mod[XX], pme->nkx);
3295     snew(pme->bsp_mod[YY], pme->nky);
3296     snew(pme->bsp_mod[ZZ], pme->nkz);
3297
3298     /* The required size of the interpolation grid, including overlap.
3299      * The allocated size (pmegrid_n?) might be slightly larger.
3300      */
3301     pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3302         pme->overlap[0].s2g0[pme->nodeid_major];
3303     pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3304         pme->overlap[1].s2g0[pme->nodeid_minor];
3305     pme->pmegrid_nz_base = pme->nkz;
3306     pme->pmegrid_nz      = pme->pmegrid_nz_base + pme->pme_order - 1;
3307     set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
3308
3309     pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3310     pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3311     pme->pmegrid_start_iz = 0;
3312
3313     make_gridindex5_to_localindex(pme->nkx,
3314                                   pme->pmegrid_start_ix,
3315                                   pme->pmegrid_nx - (pme->pme_order-1),
3316                                   &pme->nnx, &pme->fshx);
3317     make_gridindex5_to_localindex(pme->nky,
3318                                   pme->pmegrid_start_iy,
3319                                   pme->pmegrid_ny - (pme->pme_order-1),
3320                                   &pme->nny, &pme->fshy);
3321     make_gridindex5_to_localindex(pme->nkz,
3322                                   pme->pmegrid_start_iz,
3323                                   pme->pmegrid_nz_base,
3324                                   &pme->nnz, &pme->fshz);
3325
3326     pmegrids_init(&pme->pmegridA,
3327                   pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3328                   pme->pmegrid_nz_base,
3329                   pme->pme_order,
3330                   pme->bUseThreads,
3331                   pme->nthread,
3332                   pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3333                   pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3334
3335     pme->spline_work = make_pme_spline_work(pme->pme_order);
3336
3337     ndata[0] = pme->nkx;
3338     ndata[1] = pme->nky;
3339     ndata[2] = pme->nkz;
3340
3341     /* This routine will allocate the grid data to fit the FFTs */
3342     gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
3343                             &pme->fftgridA, &pme->cfftgridA,
3344                             pme->mpi_comm_d,
3345                             bReproducible, pme->nthread);
3346
3347     if (bFreeEnergy)
3348     {
3349         pmegrids_init(&pme->pmegridB,
3350                       pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3351                       pme->pmegrid_nz_base,
3352                       pme->pme_order,
3353                       pme->bUseThreads,
3354                       pme->nthread,
3355                       pme->nkx % pme->nnodes_major != 0,
3356                       pme->nky % pme->nnodes_minor != 0);
3357
3358         gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
3359                                 &pme->fftgridB, &pme->cfftgridB,
3360                                 pme->mpi_comm_d,
3361                                 bReproducible, pme->nthread);
3362     }
3363     else
3364     {
3365         pme->pmegridB.grid.grid = NULL;
3366         pme->fftgridB           = NULL;
3367         pme->cfftgridB          = NULL;
3368     }
3369
3370     if (!pme->bP3M)
3371     {
3372         /* Use plain SPME B-spline interpolation */
3373         make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3374     }
3375     else
3376     {
3377         /* Use the P3M grid-optimized influence function */
3378         make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3379     }
3380
3381     /* Use atc[0] for spreading */
3382     init_atomcomm(pme, &pme->atc[0], cr, nnodes_major > 1 ? 0 : 1, TRUE);
3383     if (pme->ndecompdim >= 2)
3384     {
3385         init_atomcomm(pme, &pme->atc[1], cr, 1, FALSE);
3386     }
3387
3388     if (pme->nnodes == 1)
3389     {
3390         pme->atc[0].n = homenr;
3391         pme_realloc_atomcomm_things(&pme->atc[0]);
3392     }
3393
3394     {
3395         int thread;
3396
3397         /* Use fft5d, order after FFT is y major, z, x minor */
3398
3399         snew(pme->work, pme->nthread);
3400         for (thread = 0; thread < pme->nthread; thread++)
3401         {
3402             realloc_work(&pme->work[thread], pme->nkx);
3403         }
3404     }
3405
3406     *pmedata = pme;
3407
3408     return 0;
3409 }
3410
3411 static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
3412 {
3413     int d, t;
3414
3415     for (d = 0; d < DIM; d++)
3416     {
3417         if (new->grid.n[d] > old->grid.n[d])
3418         {
3419             return;
3420         }
3421     }
3422
3423     sfree_aligned(new->grid.grid);
3424     new->grid.grid = old->grid.grid;
3425
3426     if (new->grid_th != NULL && new->nthread == old->nthread)
3427     {
3428         sfree_aligned(new->grid_all);
3429         for (t = 0; t < new->nthread; t++)
3430         {
3431             new->grid_th[t].grid = old->grid_th[t].grid;
3432         }
3433     }
3434 }
3435
3436 int gmx_pme_reinit(gmx_pme_t *         pmedata,
3437                    t_commrec *         cr,
3438                    gmx_pme_t           pme_src,
3439                    const t_inputrec *  ir,
3440                    ivec                grid_size)
3441 {
3442     t_inputrec irc;
3443     int homenr;
3444     int ret;
3445
3446     irc     = *ir;
3447     irc.nkx = grid_size[XX];
3448     irc.nky = grid_size[YY];
3449     irc.nkz = grid_size[ZZ];
3450
3451     if (pme_src->nnodes == 1)
3452     {
3453         homenr = pme_src->atc[0].n;
3454     }
3455     else
3456     {
3457         homenr = -1;
3458     }
3459
3460     ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
3461                        &irc, homenr, pme_src->bFEP, FALSE, pme_src->nthread);
3462
3463     if (ret == 0)
3464     {
3465         /* We can easily reuse the allocated pme grids in pme_src */
3466         reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA);
3467         /* We would like to reuse the fft grids, but that's harder */
3468     }
3469
3470     return ret;
3471 }
3472
3473
3474 static void copy_local_grid(gmx_pme_t pme,
3475                             pmegrids_t *pmegrids, int thread, real *fftgrid)
3476 {
3477     ivec local_fft_ndata, local_fft_offset, local_fft_size;
3478     int  fft_my, fft_mz;
3479     int  nsx, nsy, nsz;
3480     ivec nf;
3481     int  offx, offy, offz, x, y, z, i0, i0t;
3482     int  d;
3483     pmegrid_t *pmegrid;
3484     real *grid_th;
3485
3486     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3487                                    local_fft_ndata,
3488                                    local_fft_offset,
3489                                    local_fft_size);
3490     fft_my = local_fft_size[YY];
3491     fft_mz = local_fft_size[ZZ];
3492
3493     pmegrid = &pmegrids->grid_th[thread];
3494
3495     nsx = pmegrid->s[XX];
3496     nsy = pmegrid->s[YY];
3497     nsz = pmegrid->s[ZZ];
3498
3499     for (d = 0; d < DIM; d++)
3500     {
3501         nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3502                     local_fft_ndata[d] - pmegrid->offset[d]);
3503     }
3504
3505     offx = pmegrid->offset[XX];
3506     offy = pmegrid->offset[YY];
3507     offz = pmegrid->offset[ZZ];
3508
3509     /* Directly copy the non-overlapping parts of the local grids.
3510      * This also initializes the full grid.
3511      */
3512     grid_th = pmegrid->grid;
3513     for (x = 0; x < nf[XX]; x++)
3514     {
3515         for (y = 0; y < nf[YY]; y++)
3516         {
3517             i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3518             i0t = (x*nsy + y)*nsz;
3519             for (z = 0; z < nf[ZZ]; z++)
3520             {
3521                 fftgrid[i0+z] = grid_th[i0t+z];
3522             }
3523         }
3524     }
3525 }
3526
3527 static void
3528 reduce_threadgrid_overlap(gmx_pme_t pme,
3529                           const pmegrids_t *pmegrids, int thread,
3530                           real *fftgrid, real *commbuf_x, real *commbuf_y)
3531 {
3532     ivec local_fft_ndata, local_fft_offset, local_fft_size;
3533     int  fft_nx, fft_ny, fft_nz;
3534     int  fft_my, fft_mz;
3535     int  buf_my = -1;
3536     int  nsx, nsy, nsz;
3537     ivec ne;
3538     int  offx, offy, offz, x, y, z, i0, i0t;
3539     int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
3540     gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
3541     gmx_bool bCommX, bCommY;
3542     int  d;
3543     int  thread_f;
3544     const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
3545     const real *grid_th;
3546     real *commbuf = NULL;
3547
3548     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3549                                    local_fft_ndata,
3550                                    local_fft_offset,
3551                                    local_fft_size);
3552     fft_nx = local_fft_ndata[XX];
3553     fft_ny = local_fft_ndata[YY];
3554     fft_nz = local_fft_ndata[ZZ];
3555
3556     fft_my = local_fft_size[YY];
3557     fft_mz = local_fft_size[ZZ];
3558
3559     /* This routine is called when all thread have finished spreading.
3560      * Here each thread sums grid contributions calculated by other threads
3561      * to the thread local grid volume.
3562      * To minimize the number of grid copying operations,
3563      * this routines sums immediately from the pmegrid to the fftgrid.
3564      */
3565
3566     /* Determine which part of the full node grid we should operate on,
3567      * this is our thread local part of the full grid.
3568      */
3569     pmegrid = &pmegrids->grid_th[thread];
3570
3571     for (d = 0; d < DIM; d++)
3572     {
3573         ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3574                     local_fft_ndata[d]);
3575     }
3576
3577     offx = pmegrid->offset[XX];
3578     offy = pmegrid->offset[YY];
3579     offz = pmegrid->offset[ZZ];
3580
3581
3582     bClearBufX  = TRUE;
3583     bClearBufY  = TRUE;
3584     bClearBufXY = TRUE;
3585
3586     /* Now loop over all the thread data blocks that contribute
3587      * to the grid region we (our thread) are operating on.
3588      */
3589     /* Note that ffy_nx/y is equal to the number of grid points
3590      * between the first point of our node grid and the one of the next node.
3591      */
3592     for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
3593     {
3594         fx     = pmegrid->ci[XX] + sx;
3595         ox     = 0;
3596         bCommX = FALSE;
3597         if (fx < 0)
3598         {
3599             fx    += pmegrids->nc[XX];
3600             ox    -= fft_nx;
3601             bCommX = (pme->nnodes_major > 1);
3602         }
3603         pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
3604         ox       += pmegrid_g->offset[XX];
3605         if (!bCommX)
3606         {
3607             tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
3608         }
3609         else
3610         {
3611             tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
3612         }
3613
3614         for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
3615         {
3616             fy     = pmegrid->ci[YY] + sy;
3617             oy     = 0;
3618             bCommY = FALSE;
3619             if (fy < 0)
3620             {
3621                 fy    += pmegrids->nc[YY];
3622                 oy    -= fft_ny;
3623                 bCommY = (pme->nnodes_minor > 1);
3624             }
3625             pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
3626             oy       += pmegrid_g->offset[YY];
3627             if (!bCommY)
3628             {
3629                 ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
3630             }
3631             else
3632             {
3633                 ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
3634             }
3635
3636             for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
3637             {
3638                 fz = pmegrid->ci[ZZ] + sz;
3639                 oz = 0;
3640                 if (fz < 0)
3641                 {
3642                     fz += pmegrids->nc[ZZ];
3643                     oz -= fft_nz;
3644                 }
3645                 pmegrid_g = &pmegrids->grid_th[fz];
3646                 oz       += pmegrid_g->offset[ZZ];
3647                 tz1       = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
3648
3649                 if (sx == 0 && sy == 0 && sz == 0)
3650                 {
3651                     /* We have already added our local contribution
3652                      * before calling this routine, so skip it here.
3653                      */
3654                     continue;
3655                 }
3656
3657                 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
3658
3659                 pmegrid_f = &pmegrids->grid_th[thread_f];
3660
3661                 grid_th = pmegrid_f->grid;
3662
3663                 nsx = pmegrid_f->s[XX];
3664                 nsy = pmegrid_f->s[YY];
3665                 nsz = pmegrid_f->s[ZZ];
3666
3667 #ifdef DEBUG_PME_REDUCE
3668                 printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3669                        pme->nodeid, thread, thread_f,
3670                        pme->pmegrid_start_ix,
3671                        pme->pmegrid_start_iy,
3672                        pme->pmegrid_start_iz,
3673                        sx, sy, sz,
3674                        offx-ox, tx1-ox, offx, tx1,
3675                        offy-oy, ty1-oy, offy, ty1,
3676                        offz-oz, tz1-oz, offz, tz1);
3677 #endif
3678
3679                 if (!(bCommX || bCommY))
3680                 {
3681                     /* Copy from the thread local grid to the node grid */
3682                     for (x = offx; x < tx1; x++)
3683                     {
3684                         for (y = offy; y < ty1; y++)
3685                         {
3686                             i0  = (x*fft_my + y)*fft_mz;
3687                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3688                             for (z = offz; z < tz1; z++)
3689                             {
3690                                 fftgrid[i0+z] += grid_th[i0t+z];
3691                             }
3692                         }
3693                     }
3694                 }
3695                 else
3696                 {
3697                     /* The order of this conditional decides
3698                      * where the corner volume gets stored with x+y decomp.
3699                      */
3700                     if (bCommY)
3701                     {
3702                         commbuf = commbuf_y;
3703                         buf_my  = ty1 - offy;
3704                         if (bCommX)
3705                         {
3706                             /* We index commbuf modulo the local grid size */
3707                             commbuf += buf_my*fft_nx*fft_nz;
3708
3709                             bClearBuf   = bClearBufXY;
3710                             bClearBufXY = FALSE;
3711                         }
3712                         else
3713                         {
3714                             bClearBuf  = bClearBufY;
3715                             bClearBufY = FALSE;
3716                         }
3717                     }
3718                     else
3719                     {
3720                         commbuf    = commbuf_x;
3721                         buf_my     = fft_ny;
3722                         bClearBuf  = bClearBufX;
3723                         bClearBufX = FALSE;
3724                     }
3725
3726                     /* Copy to the communication buffer */
3727                     for (x = offx; x < tx1; x++)
3728                     {
3729                         for (y = offy; y < ty1; y++)
3730                         {
3731                             i0  = (x*buf_my + y)*fft_nz;
3732                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3733
3734                             if (bClearBuf)
3735                             {
3736                                 /* First access of commbuf, initialize it */
3737                                 for (z = offz; z < tz1; z++)
3738                                 {
3739                                     commbuf[i0+z]  = grid_th[i0t+z];
3740                                 }
3741                             }
3742                             else
3743                             {
3744                                 for (z = offz; z < tz1; z++)
3745                                 {
3746                                     commbuf[i0+z] += grid_th[i0t+z];
3747                                 }
3748                             }
3749                         }
3750                     }
3751                 }
3752             }
3753         }
3754     }
3755 }
3756
3757
3758 static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
3759 {
3760     ivec local_fft_ndata, local_fft_offset, local_fft_size;
3761     pme_overlap_t *overlap;
3762     int  send_index0, send_nindex;
3763     int  recv_nindex;
3764 #ifdef GMX_MPI
3765     MPI_Status stat;
3766 #endif
3767     int  send_size_y, recv_size_y;
3768     int  ipulse, send_id, recv_id, datasize, gridsize, size_yx;
3769     real *sendptr, *recvptr;
3770     int  x, y, z, indg, indb;
3771
3772     /* Note that this routine is only used for forward communication.
3773      * Since the force gathering, unlike the charge spreading,
3774      * can be trivially parallelized over the particles,
3775      * the backwards process is much simpler and can use the "old"
3776      * communication setup.
3777      */
3778
3779     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3780                                    local_fft_ndata,
3781                                    local_fft_offset,
3782                                    local_fft_size);
3783
3784     if (pme->nnodes_minor > 1)
3785     {
3786         /* Major dimension */
3787         overlap = &pme->overlap[1];
3788
3789         if (pme->nnodes_major > 1)
3790         {
3791             size_yx = pme->overlap[0].comm_data[0].send_nindex;
3792         }
3793         else
3794         {
3795             size_yx = 0;
3796         }
3797         datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
3798
3799         send_size_y = overlap->send_size;
3800
3801         for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
3802         {
3803             send_id       = overlap->send_id[ipulse];
3804             recv_id       = overlap->recv_id[ipulse];
3805             send_index0   =
3806                 overlap->comm_data[ipulse].send_index0 -
3807                 overlap->comm_data[0].send_index0;
3808             send_nindex   = overlap->comm_data[ipulse].send_nindex;
3809             /* We don't use recv_index0, as we always receive starting at 0 */
3810             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
3811             recv_size_y   = overlap->comm_data[ipulse].recv_size;
3812
3813             sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
3814             recvptr = overlap->recvbuf;
3815
3816 #ifdef GMX_MPI
3817             MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
3818                          send_id, ipulse,
3819                          recvptr, recv_size_y*datasize, GMX_MPI_REAL,
3820                          recv_id, ipulse,
3821                          overlap->mpi_comm, &stat);
3822 #endif
3823
3824             for (x = 0; x < local_fft_ndata[XX]; x++)
3825             {
3826                 for (y = 0; y < recv_nindex; y++)
3827                 {
3828                     indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3829                     indb = (x*recv_size_y        + y)*local_fft_ndata[ZZ];
3830                     for (z = 0; z < local_fft_ndata[ZZ]; z++)
3831                     {
3832                         fftgrid[indg+z] += recvptr[indb+z];
3833                     }
3834                 }
3835             }
3836
3837             if (pme->nnodes_major > 1)
3838             {
3839                 /* Copy from the received buffer to the send buffer for dim 0 */
3840                 sendptr = pme->overlap[0].sendbuf;
3841                 for (x = 0; x < size_yx; x++)
3842                 {
3843                     for (y = 0; y < recv_nindex; y++)
3844                     {
3845                         indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3846                         indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
3847                         for (z = 0; z < local_fft_ndata[ZZ]; z++)
3848                         {
3849                             sendptr[indg+z] += recvptr[indb+z];
3850                         }
3851                     }
3852                 }
3853             }
3854         }
3855     }
3856
3857     /* We only support a single pulse here.
3858      * This is not a severe limitation, as this code is only used
3859      * with OpenMP and with OpenMP the (PME) domains can be larger.
3860      */
3861     if (pme->nnodes_major > 1)
3862     {
3863         /* Major dimension */
3864         overlap = &pme->overlap[0];
3865
3866         datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
3867         gridsize = local_fft_size[YY] *local_fft_size[ZZ];
3868
3869         ipulse = 0;
3870
3871         send_id       = overlap->send_id[ipulse];
3872         recv_id       = overlap->recv_id[ipulse];
3873         send_nindex   = overlap->comm_data[ipulse].send_nindex;
3874         /* We don't use recv_index0, as we always receive starting at 0 */
3875         recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
3876
3877         sendptr = overlap->sendbuf;
3878         recvptr = overlap->recvbuf;
3879
3880         if (debug != NULL)
3881         {
3882             fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
3883                     send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
3884         }
3885
3886 #ifdef GMX_MPI
3887         MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
3888                      send_id, ipulse,
3889                      recvptr, recv_nindex*datasize, GMX_MPI_REAL,
3890                      recv_id, ipulse,
3891                      overlap->mpi_comm, &stat);
3892 #endif
3893
3894         for (x = 0; x < recv_nindex; x++)
3895         {
3896             for (y = 0; y < local_fft_ndata[YY]; y++)
3897             {
3898                 indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
3899                 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3900                 for (z = 0; z < local_fft_ndata[ZZ]; z++)
3901                 {
3902                     fftgrid[indg+z] += recvptr[indb+z];
3903                 }
3904             }
3905         }
3906     }
3907 }
3908
3909
3910 static void spread_on_grid(gmx_pme_t pme,
3911                            pme_atomcomm_t *atc, pmegrids_t *grids,
3912                            gmx_bool bCalcSplines, gmx_bool bSpread,
3913                            real *fftgrid)
3914 {
3915     int nthread, thread;
3916 #ifdef PME_TIME_THREADS
3917     gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
3918     static double cs1     = 0, cs2 = 0, cs3 = 0;
3919     static double cs1a[6] = {0, 0, 0, 0, 0, 0};
3920     static int cnt        = 0;
3921 #endif
3922
3923     nthread = pme->nthread;
3924     assert(nthread > 0);
3925
3926 #ifdef PME_TIME_THREADS
3927     c1 = omp_cyc_start();
3928 #endif
3929     if (bCalcSplines)
3930     {
3931 #pragma omp parallel for num_threads(nthread) schedule(static)
3932         for (thread = 0; thread < nthread; thread++)
3933         {
3934             int start, end;
3935
3936             start = atc->n* thread   /nthread;
3937             end   = atc->n*(thread+1)/nthread;
3938
3939             /* Compute fftgrid index for all atoms,
3940              * with help of some extra variables.
3941              */
3942             calc_interpolation_idx(pme, atc, start, end, thread);
3943         }
3944     }
3945 #ifdef PME_TIME_THREADS
3946     c1   = omp_cyc_end(c1);
3947     cs1 += (double)c1;
3948 #endif
3949
3950 #ifdef PME_TIME_THREADS
3951     c2 = omp_cyc_start();
3952 #endif
3953 #pragma omp parallel for num_threads(nthread) schedule(static)
3954     for (thread = 0; thread < nthread; thread++)
3955     {
3956         splinedata_t *spline;
3957         pmegrid_t *grid = NULL;
3958
3959         /* make local bsplines  */
3960         if (grids == NULL || !pme->bUseThreads)
3961         {
3962             spline = &atc->spline[0];
3963
3964             spline->n = atc->n;
3965
3966             if (bSpread)
3967             {
3968                 grid = &grids->grid;
3969             }
3970         }
3971         else
3972         {
3973             spline = &atc->spline[thread];
3974
3975             if (grids->nthread == 1)
3976             {
3977                 /* One thread, we operate on all charges */
3978                 spline->n = atc->n;
3979             }
3980             else
3981             {
3982                 /* Get the indices our thread should operate on */
3983                 make_thread_local_ind(atc, thread, spline);
3984             }
3985
3986             grid = &grids->grid_th[thread];
3987         }
3988
3989         if (bCalcSplines)
3990         {
3991             make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
3992                           atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP);
3993         }
3994
3995         if (bSpread)
3996         {
3997             /* put local atoms on grid. */
3998 #ifdef PME_TIME_SPREAD
3999             ct1a = omp_cyc_start();
4000 #endif
4001             spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
4002
4003             if (pme->bUseThreads)
4004             {
4005                 copy_local_grid(pme, grids, thread, fftgrid);
4006             }
4007 #ifdef PME_TIME_SPREAD
4008             ct1a          = omp_cyc_end(ct1a);
4009             cs1a[thread] += (double)ct1a;
4010 #endif
4011         }
4012     }
4013 #ifdef PME_TIME_THREADS
4014     c2   = omp_cyc_end(c2);
4015     cs2 += (double)c2;
4016 #endif
4017
4018     if (bSpread && pme->bUseThreads)
4019     {
4020 #ifdef PME_TIME_THREADS
4021         c3 = omp_cyc_start();
4022 #endif
4023 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
4024         for (thread = 0; thread < grids->nthread; thread++)
4025         {
4026             reduce_threadgrid_overlap(pme, grids, thread,
4027                                       fftgrid,
4028                                       pme->overlap[0].sendbuf,
4029                                       pme->overlap[1].sendbuf);
4030         }
4031 #ifdef PME_TIME_THREADS
4032         c3   = omp_cyc_end(c3);
4033         cs3 += (double)c3;
4034 #endif
4035
4036         if (pme->nnodes > 1)
4037         {
4038             /* Communicate the overlapping part of the fftgrid.
4039              * For this communication call we need to check pme->bUseThreads
4040              * to have all ranks communicate here, regardless of pme->nthread.
4041              */
4042             sum_fftgrid_dd(pme, fftgrid);
4043         }
4044     }
4045
4046 #ifdef PME_TIME_THREADS
4047     cnt++;
4048     if (cnt % 20 == 0)
4049     {
4050         printf("idx %.2f spread %.2f red %.2f",
4051                cs1*1e-9, cs2*1e-9, cs3*1e-9);
4052 #ifdef PME_TIME_SPREAD
4053         for (thread = 0; thread < nthread; thread++)
4054         {
4055             printf(" %.2f", cs1a[thread]*1e-9);
4056         }
4057 #endif
4058         printf("\n");
4059     }
4060 #endif
4061 }
4062
4063
4064 static void dump_grid(FILE *fp,
4065                       int sx, int sy, int sz, int nx, int ny, int nz,
4066                       int my, int mz, const real *g)
4067 {
4068     int x, y, z;
4069
4070     for (x = 0; x < nx; x++)
4071     {
4072         for (y = 0; y < ny; y++)
4073         {
4074             for (z = 0; z < nz; z++)
4075             {
4076                 fprintf(fp, "%2d %2d %2d %6.3f\n",
4077                         sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
4078             }
4079         }
4080     }
4081 }
4082
4083 static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
4084 {
4085     ivec local_fft_ndata, local_fft_offset, local_fft_size;
4086
4087     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
4088                                    local_fft_ndata,
4089                                    local_fft_offset,
4090                                    local_fft_size);
4091
4092     dump_grid(stderr,
4093               pme->pmegrid_start_ix,
4094               pme->pmegrid_start_iy,
4095               pme->pmegrid_start_iz,
4096               pme->pmegrid_nx-pme->pme_order+1,
4097               pme->pmegrid_ny-pme->pme_order+1,
4098               pme->pmegrid_nz-pme->pme_order+1,
4099               local_fft_size[YY],
4100               local_fft_size[ZZ],
4101               fftgrid);
4102 }
4103
4104
4105 void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
4106 {
4107     pme_atomcomm_t *atc;
4108     pmegrids_t *grid;
4109
4110     if (pme->nnodes > 1)
4111     {
4112         gmx_incons("gmx_pme_calc_energy called in parallel");
4113     }
4114     if (pme->bFEP > 1)
4115     {
4116         gmx_incons("gmx_pme_calc_energy with free energy");
4117     }
4118
4119     atc            = &pme->atc_energy;
4120     atc->nthread   = 1;
4121     if (atc->spline == NULL)
4122     {
4123         snew(atc->spline, atc->nthread);
4124     }
4125     atc->nslab     = 1;
4126     atc->bSpread   = TRUE;
4127     atc->pme_order = pme->pme_order;
4128     atc->n         = n;
4129     pme_realloc_atomcomm_things(atc);
4130     atc->x         = x;
4131     atc->q         = q;
4132
4133     /* We only use the A-charges grid */
4134     grid = &pme->pmegridA;
4135
4136     /* Only calculate the spline coefficients, don't actually spread */
4137     spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
4138
4139     *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
4140 }
4141
4142
4143 static void reset_pmeonly_counters(t_commrec *cr, gmx_wallcycle_t wcycle,
4144                                    t_nrnb *nrnb, t_inputrec *ir,
4145                                    gmx_large_int_t step)
4146 {
4147     /* Reset all the counters related to performance over the run */
4148     wallcycle_stop(wcycle, ewcRUN);
4149     wallcycle_reset_all(wcycle);
4150     init_nrnb(nrnb);
4151     if (ir->nsteps >= 0)
4152     {
4153         /* ir->nsteps is not used here, but we update it for consistency */
4154         ir->nsteps -= step - ir->init_step;
4155     }
4156     ir->init_step = step;
4157     wallcycle_start(wcycle, ewcRUN);
4158 }
4159
4160
4161 static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
4162                                ivec grid_size,
4163                                t_commrec *cr, t_inputrec *ir,
4164                                gmx_pme_t *pme_ret)
4165 {
4166     int ind;
4167     gmx_pme_t pme = NULL;
4168
4169     ind = 0;
4170     while (ind < *npmedata)
4171     {
4172         pme = (*pmedata)[ind];
4173         if (pme->nkx == grid_size[XX] &&
4174             pme->nky == grid_size[YY] &&
4175             pme->nkz == grid_size[ZZ])
4176         {
4177             *pme_ret = pme;
4178
4179             return;
4180         }
4181
4182         ind++;
4183     }
4184
4185     (*npmedata)++;
4186     srenew(*pmedata, *npmedata);
4187
4188     /* Generate a new PME data structure, copying part of the old pointers */
4189     gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
4190
4191     *pme_ret = (*pmedata)[ind];
4192 }
4193
4194
4195 int gmx_pmeonly(gmx_pme_t pme,
4196                 t_commrec *cr,    t_nrnb *nrnb,
4197                 gmx_wallcycle_t wcycle,
4198                 real ewaldcoeff,  gmx_bool bGatherOnly,
4199                 t_inputrec *ir)
4200 {
4201     int npmedata;
4202     gmx_pme_t *pmedata;
4203     gmx_pme_pp_t pme_pp;
4204     int  ret;
4205     int  natoms;
4206     matrix box;
4207     rvec *x_pp      = NULL, *f_pp = NULL;
4208     real *chargeA   = NULL, *chargeB = NULL;
4209     real lambda     = 0;
4210     int  maxshift_x = 0, maxshift_y = 0;
4211     real energy, dvdlambda;
4212     matrix vir;
4213     float cycles;
4214     int  count;
4215     gmx_bool bEnerVir;
4216     gmx_large_int_t step, step_rel;
4217     ivec grid_switch;
4218
4219     /* This data will only use with PME tuning, i.e. switching PME grids */
4220     npmedata = 1;
4221     snew(pmedata, npmedata);
4222     pmedata[0] = pme;
4223
4224     pme_pp = gmx_pme_pp_init(cr);
4225
4226     init_nrnb(nrnb);
4227
4228     count = 0;
4229     do /****** this is a quasi-loop over time steps! */
4230     {
4231         /* The reason for having a loop here is PME grid tuning/switching */
4232         do
4233         {
4234             /* Domain decomposition */
4235             ret = gmx_pme_recv_q_x(pme_pp,
4236                                    &natoms,
4237                                    &chargeA, &chargeB, box, &x_pp, &f_pp,
4238                                    &maxshift_x, &maxshift_y,
4239                                    &pme->bFEP, &lambda,
4240                                    &bEnerVir,
4241                                    &step,
4242                                    grid_switch, &ewaldcoeff);
4243
4244             if (ret == pmerecvqxSWITCHGRID)
4245             {
4246                 /* Switch the PME grid to grid_switch */
4247                 gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
4248             }
4249
4250             if (ret == pmerecvqxRESETCOUNTERS)
4251             {
4252                 /* Reset the cycle and flop counters */
4253                 reset_pmeonly_counters(cr, wcycle, nrnb, ir, step);
4254             }
4255         }
4256         while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
4257
4258         if (ret == pmerecvqxFINISH)
4259         {
4260             /* We should stop: break out of the loop */
4261             break;
4262         }
4263
4264         step_rel = step - ir->init_step;
4265
4266         if (count == 0)
4267         {
4268             wallcycle_start(wcycle, ewcRUN);
4269         }
4270
4271         wallcycle_start(wcycle, ewcPMEMESH);
4272
4273         dvdlambda = 0;
4274         clear_mat(vir);
4275         gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box,
4276                    cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff,
4277                    &energy, lambda, &dvdlambda,
4278                    GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
4279
4280         cycles = wallcycle_stop(wcycle, ewcPMEMESH);
4281
4282         gmx_pme_send_force_vir_ener(pme_pp,
4283                                     f_pp, vir, energy, dvdlambda,
4284                                     cycles);
4285
4286         count++;
4287     } /***** end of quasi-loop, we stop with the break above */
4288     while (TRUE);
4289
4290     return 0;
4291 }
4292
4293 int gmx_pme_do(gmx_pme_t pme,
4294                int start,       int homenr,
4295                rvec x[],        rvec f[],
4296                real *chargeA,   real *chargeB,
4297                matrix box, t_commrec *cr,
4298                int  maxshift_x, int maxshift_y,
4299                t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
4300                matrix vir,      real ewaldcoeff,
4301                real *energy,    real lambda,
4302                real *dvdlambda, int flags)
4303 {
4304     int     q, d, i, j, ntot, npme;
4305     int     nx, ny, nz;
4306     int     n_d, local_ny;
4307     pme_atomcomm_t *atc = NULL;
4308     pmegrids_t *pmegrid = NULL;
4309     real    *grid       = NULL;
4310     real    *ptr;
4311     rvec    *x_d, *f_d;
4312     real    *charge = NULL, *q_d;
4313     real    energy_AB[2];
4314     matrix  vir_AB[2];
4315     gmx_bool bClearF;
4316     gmx_parallel_3dfft_t pfft_setup;
4317     real *  fftgrid;
4318     t_complex * cfftgrid;
4319     int     thread;
4320     const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4321     const gmx_bool bCalcF       = flags & GMX_PME_CALC_F;
4322
4323     assert(pme->nnodes > 0);
4324     assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4325
4326     if (pme->nnodes > 1)
4327     {
4328         atc      = &pme->atc[0];
4329         atc->npd = homenr;
4330         if (atc->npd > atc->pd_nalloc)
4331         {
4332             atc->pd_nalloc = over_alloc_dd(atc->npd);
4333             srenew(atc->pd, atc->pd_nalloc);
4334         }
4335         atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4336     }
4337     else
4338     {
4339         /* This could be necessary for TPI */
4340         pme->atc[0].n = homenr;
4341     }
4342
4343     for (q = 0; q < (pme->bFEP ? 2 : 1); q++)
4344     {
4345         if (q == 0)
4346         {
4347             pmegrid    = &pme->pmegridA;
4348             fftgrid    = pme->fftgridA;
4349             cfftgrid   = pme->cfftgridA;
4350             pfft_setup = pme->pfft_setupA;
4351             charge     = chargeA+start;
4352         }
4353         else
4354         {
4355             pmegrid    = &pme->pmegridB;
4356             fftgrid    = pme->fftgridB;
4357             cfftgrid   = pme->cfftgridB;
4358             pfft_setup = pme->pfft_setupB;
4359             charge     = chargeB+start;
4360         }
4361         grid = pmegrid->grid.grid;
4362         /* Unpack structure */
4363         if (debug)
4364         {
4365             fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
4366                     cr->nnodes, cr->nodeid);
4367             fprintf(debug, "Grid = %p\n", (void*)grid);
4368             if (grid == NULL)
4369             {
4370                 gmx_fatal(FARGS, "No grid!");
4371             }
4372         }
4373         where();
4374
4375         m_inv_ur0(box, pme->recipbox);
4376
4377         if (pme->nnodes == 1)
4378         {
4379             atc = &pme->atc[0];
4380             if (DOMAINDECOMP(cr))
4381             {
4382                 atc->n = homenr;
4383                 pme_realloc_atomcomm_things(atc);
4384             }
4385             atc->x = x;
4386             atc->q = charge;
4387             atc->f = f;
4388         }
4389         else
4390         {
4391             wallcycle_start(wcycle, ewcPME_REDISTXF);
4392             for (d = pme->ndecompdim-1; d >= 0; d--)
4393             {
4394                 if (d == pme->ndecompdim-1)
4395                 {
4396                     n_d = homenr;
4397                     x_d = x + start;
4398                     q_d = charge;
4399                 }
4400                 else
4401                 {
4402                     n_d = pme->atc[d+1].n;
4403                     x_d = atc->x;
4404                     q_d = atc->q;
4405                 }
4406                 atc      = &pme->atc[d];
4407                 atc->npd = n_d;
4408                 if (atc->npd > atc->pd_nalloc)
4409                 {
4410                     atc->pd_nalloc = over_alloc_dd(atc->npd);
4411                     srenew(atc->pd, atc->pd_nalloc);
4412                 }
4413                 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4414                 pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
4415                 where();
4416
4417                 /* Redistribute x (only once) and qA or qB */
4418                 if (DOMAINDECOMP(cr))
4419                 {
4420                     dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc);
4421                 }
4422                 else
4423                 {
4424                     pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc);
4425                 }
4426             }
4427             where();
4428
4429             wallcycle_stop(wcycle, ewcPME_REDISTXF);
4430         }
4431
4432         if (debug)
4433         {
4434             fprintf(debug, "Node= %6d, pme local particles=%6d\n",
4435                     cr->nodeid, atc->n);
4436         }
4437
4438         if (flags & GMX_PME_SPREAD_Q)
4439         {
4440             wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4441
4442             /* Spread the charges on a grid */
4443             spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid);
4444
4445             if (q == 0)
4446             {
4447                 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
4448             }
4449             inc_nrnb(nrnb, eNR_SPREADQBSP,
4450                      pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4451
4452             if (!pme->bUseThreads)
4453             {
4454                 wrap_periodic_pmegrid(pme, grid);
4455
4456                 /* sum contributions to local grid from other nodes */
4457 #ifdef GMX_MPI
4458                 if (pme->nnodes > 1)
4459                 {
4460                     gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
4461                     where();
4462                 }
4463 #endif
4464
4465                 copy_pmegrid_to_fftgrid(pme, grid, fftgrid);
4466             }
4467
4468             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4469
4470             /*
4471                dump_local_fftgrid(pme,fftgrid);
4472                exit(0);
4473              */
4474         }
4475
4476         /* Here we start a large thread parallel region */
4477 #pragma omp parallel num_threads(pme->nthread) private(thread)
4478         {
4479             thread = gmx_omp_get_thread_num();
4480             if (flags & GMX_PME_SOLVE)
4481             {
4482                 int loop_count;
4483
4484                 /* do 3d-fft */
4485                 if (thread == 0)
4486                 {
4487                     wallcycle_start(wcycle, ewcPME_FFT);
4488                 }
4489                 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
4490                                            thread, wcycle);
4491                 if (thread == 0)
4492                 {
4493                     wallcycle_stop(wcycle, ewcPME_FFT);
4494                 }
4495                 where();
4496
4497                 /* solve in k-space for our local cells */
4498                 if (thread == 0)
4499                 {
4500                     wallcycle_start(wcycle, ewcPME_SOLVE);
4501                 }
4502                 loop_count =
4503                     solve_pme_yzx(pme, cfftgrid, ewaldcoeff,
4504                                   box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4505                                   bCalcEnerVir,
4506                                   pme->nthread, thread);
4507                 if (thread == 0)
4508                 {
4509                     wallcycle_stop(wcycle, ewcPME_SOLVE);
4510                     where();
4511                     inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
4512                 }
4513             }
4514
4515             if (bCalcF)
4516             {
4517                 /* do 3d-invfft */
4518                 if (thread == 0)
4519                 {
4520                     where();
4521                     wallcycle_start(wcycle, ewcPME_FFT);
4522                 }
4523                 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
4524                                            thread, wcycle);
4525                 if (thread == 0)
4526                 {
4527                     wallcycle_stop(wcycle, ewcPME_FFT);
4528
4529                     where();
4530
4531                     if (pme->nodeid == 0)
4532                     {
4533                         ntot  = pme->nkx*pme->nky*pme->nkz;
4534                         npme  = ntot*log((real)ntot)/log(2.0);
4535                         inc_nrnb(nrnb, eNR_FFT, 2*npme);
4536                     }
4537
4538                     wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4539                 }
4540
4541                 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread);
4542             }
4543         }
4544         /* End of thread parallel section.
4545          * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4546          */
4547
4548         if (bCalcF)
4549         {
4550             /* distribute local grid to all nodes */
4551 #ifdef GMX_MPI
4552             if (pme->nnodes > 1)
4553             {
4554                 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
4555             }
4556 #endif
4557             where();
4558
4559             unwrap_periodic_pmegrid(pme, grid);
4560
4561             /* interpolate forces for our local atoms */
4562
4563             where();
4564
4565             /* If we are running without parallelization,
4566              * atc->f is the actual force array, not a buffer,
4567              * therefore we should not clear it.
4568              */
4569             bClearF = (q == 0 && PAR(cr));
4570 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4571             for (thread = 0; thread < pme->nthread; thread++)
4572             {
4573                 gather_f_bsplines(pme, grid, bClearF, atc,
4574                                   &atc->spline[thread],
4575                                   pme->bFEP ? (q == 0 ? 1.0-lambda : lambda) : 1.0);
4576             }
4577
4578             where();
4579
4580             inc_nrnb(nrnb, eNR_GATHERFBSP,
4581                      pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4582             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4583         }
4584
4585         if (bCalcEnerVir)
4586         {
4587             /* This should only be called on the master thread
4588              * and after the threads have synchronized.
4589              */
4590             get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
4591         }
4592     } /* of q-loop */
4593
4594     if (bCalcF && pme->nnodes > 1)
4595     {
4596         wallcycle_start(wcycle, ewcPME_REDISTXF);
4597         for (d = 0; d < pme->ndecompdim; d++)
4598         {
4599             atc = &pme->atc[d];
4600             if (d == pme->ndecompdim - 1)
4601             {
4602                 n_d = homenr;
4603                 f_d = f + start;
4604             }
4605             else
4606             {
4607                 n_d = pme->atc[d+1].n;
4608                 f_d = pme->atc[d+1].f;
4609             }
4610             if (DOMAINDECOMP(cr))
4611             {
4612                 dd_pmeredist_f(pme, atc, n_d, f_d,
4613                                d == pme->ndecompdim-1 && pme->bPPnode);
4614             }
4615             else
4616             {
4617                 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
4618             }
4619         }
4620
4621         wallcycle_stop(wcycle, ewcPME_REDISTXF);
4622     }
4623     where();
4624
4625     if (bCalcEnerVir)
4626     {
4627         if (!pme->bFEP)
4628         {
4629             *energy = energy_AB[0];
4630             m_add(vir, vir_AB[0], vir);
4631         }
4632         else
4633         {
4634             *energy     = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
4635             *dvdlambda += energy_AB[1] - energy_AB[0];
4636             for (i = 0; i < DIM; i++)
4637             {
4638                 for (j = 0; j < DIM; j++)
4639                 {
4640                     vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
4641                         lambda*vir_AB[1][i][j];
4642                 }
4643             }
4644         }
4645     }
4646     else
4647     {
4648         *energy = 0;
4649     }
4650
4651     if (debug)
4652     {
4653         fprintf(debug, "PME mesh energy: %g\n", *energy);
4654     }
4655
4656     return 0;
4657 }