Fixed two compilation issues.
[alexxy/gromacs.git] / src / gromacs / mdlib / pme.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  *          GROningen MAchine for Chemical Simulations
9  *
10  *                        VERSION 3.2.0
11  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13  * Copyright (c) 2001-2004, The GROMACS development team,
14  * check out http://www.gromacs.org for more information.
15  *
16  * This program is free software; you can redistribute it and/or
17  * modify it under the terms of the GNU General Public License
18  * as published by the Free Software Foundation; either version 2
19  * of the License, or (at your option) any later version.
20  *
21  * If you want to redistribute modifications, please consider that
22  * scientific software is very special. Version control is crucial -
23  * bugs must be traceable. We will be happy to consider code for
24  * inclusion in the official distribution, but derived work must not
25  * be called official GROMACS. Details are found in the README & COPYING
26  * files - if they are missing, get the official version at www.gromacs.org.
27  *
28  * To help us fund GROMACS development, we humbly ask that you cite
29  * the papers on the package - you can find them in the top README file.
30  *
31  * For more info, check our website at http://www.gromacs.org
32  *
33  * And Hey:
34  * GROwing Monsters And Cloning Shrimps
35  */
36 /* IMPORTANT FOR DEVELOPERS:
37  *
38  * Triclinic pme stuff isn't entirely trivial, and we've experienced
39  * some bugs during development (many of them due to me). To avoid
40  * this in the future, please check the following things if you make
41  * changes in this file:
42  *
43  * 1. You should obtain identical (at least to the PME precision)
44  *    energies, forces, and virial for
45  *    a rectangular box and a triclinic one where the z (or y) axis is
46  *    tilted a whole box side. For instance you could use these boxes:
47  *
48  *    rectangular       triclinic
49  *     2  0  0           2  0  0
50  *     0  2  0           0  2  0
51  *     0  0  6           2  2  6
52  *
53  * 2. You should check the energy conservation in a triclinic box.
54  *
55  * It might seem like overkill, but better safe than sorry.
56  * /Erik 001109
57  */
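/* Illustrative sketch (a hypothetical helper, not used by the code below):
 * how the reciprocal box this file works with relates to a triclinic box.
 * GROMACS stores the box with the box vectors as rows and only the lower
 * triangle non-zero; recipbox is then the matrix inverse, so the fractional
 * coordinate of a point x along dimension d is sum_j x[j]*recipbox[j][d],
 * which is the combination formed in calc_interpolation_idx() below.
 * For the triclinic test box above:
 *
 *    box              recipbox
 *     2    0    0      1/2    0    0
 *     0    2    0       0    1/2   0
 *     2    2    6     -1/6  -1/6  1/6
 */
static void example_calc_recipbox(double box[3][3], double recip[3][3])
{
    /* Assumes a lower-triangular box (all upper-right elements zero) */
    double tmp = 1.0/(box[0][0]*box[1][1]*box[2][2]);

    recip[0][0] =  box[1][1]*box[2][2]*tmp;
    recip[0][1] =  0;
    recip[0][2] =  0;
    recip[1][0] = -box[1][0]*box[2][2]*tmp;
    recip[1][1] =  box[0][0]*box[2][2]*tmp;
    recip[1][2] =  0;
    recip[2][0] = (box[1][0]*box[2][1] - box[1][1]*box[2][0])*tmp;
    recip[2][1] = -box[2][1]*box[0][0]*tmp;
    recip[2][2] =  box[0][0]*box[1][1]*tmp;
}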
58
59 #ifdef HAVE_CONFIG_H
60 #include <config.h>
61 #endif
62
63 #include "gromacs/utility/gmxmpi.h"
64
65 #include <stdio.h>
66 #include <string.h>
67 #include <math.h>
68 #include <assert.h>
69 #include "typedefs.h"
70 #include "txtdump.h"
71 #include "vec.h"
72 #include "gmxcomplex.h"
73 #include "smalloc.h"
74 #include "futil.h"
75 #include "coulomb.h"
76 #include "gmx_fatal.h"
77 #include "pme.h"
78 #include "network.h"
79 #include "physics.h"
80 #include "nrnb.h"
81 #include "copyrite.h"
82 #include "gmx_wallcycle.h"
83 #include "gmx_parallel_3dfft.h"
84 #include "pdbio.h"
85 #include "gmx_cyclecounter.h"
86 #include "gmx_omp.h"
87 #include "macros.h"
88
89 /* Single precision, with SSE2 or higher available */
90 #if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
91
92 #include "gmx_x86_simd_single.h"
93
94 #define PME_SSE
95 /* Some old AMD processors could have problems with unaligned loads+stores */
96 #ifndef GMX_FAHCORE
97 #define PME_SSE_UNALIGNED
98 #endif
99 #endif
100
101 #define DFT_TOL 1e-7
102 /* #define PRT_FORCE */
103 /* conditions for on-the-fly time measurement */
104 /* #define TAKETIME (step > 1 && timesteps < 10) */
105 #define TAKETIME FALSE
106
107 /* #define PME_TIME_THREADS */
108
109 #ifdef GMX_DOUBLE
110 #define mpi_type MPI_DOUBLE
111 #else
112 #define mpi_type MPI_FLOAT
113 #endif
114
115 /* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
116 #define GMX_CACHE_SEP 64
117
118 /* We only define a maximum to be able to use local arrays without allocation.
119  * An order larger than 12 should never be needed, even for test cases.
120  * If needed it can be changed here.
121  */
122 #define PME_ORDER_MAX 12
123
124 /* Internal datastructures */
125 typedef struct {
126     int send_index0;
127     int send_nindex;
128     int recv_index0;
129     int recv_nindex;
130     int recv_size;   /* Receive buffer width, used with OpenMP */
131 } pme_grid_comm_t;
132
133 typedef struct {
134 #ifdef GMX_MPI
135     MPI_Comm         mpi_comm;
136 #endif
137     int              nnodes, nodeid;
138     int             *s2g0;
139     int             *s2g1;
140     int              noverlap_nodes;
141     int             *send_id, *recv_id;
142     int              send_size; /* Send buffer width, used with OpenMP */
143     pme_grid_comm_t *comm_data;
144     real            *sendbuf;
145     real            *recvbuf;
146 } pme_overlap_t;
147
148 typedef struct {
149     int *n;      /* Cumulative counts of the number of particles per thread */
150     int  nalloc; /* Allocation size of i */
151     int *i;      /* Particle indices ordered on thread index (n) */
152 } thread_plist_t;
153
154 typedef struct {
155     int      *thread_one;
156     int       n;
157     int      *ind;
158     splinevec theta;
159     real     *ptr_theta_z;
160     splinevec dtheta;
161     real     *ptr_dtheta_z;
162 } splinedata_t;
163
164 typedef struct {
165     int      dimind;        /* The index of the dimension, 0=x, 1=y */
166     int      nslab;
167     int      nodeid;
168 #ifdef GMX_MPI
169     MPI_Comm mpi_comm;
170 #endif
171
172     int     *node_dest;     /* The nodes to send x and q to with DD */
173     int     *node_src;      /* The nodes to receive x and q from with DD */
174     int     *buf_index;     /* Index for commnode into the buffers */
175
176     int      maxshift;
177
178     int      npd;
179     int      pd_nalloc;
180     int     *pd;
181     int     *count;         /* The number of atoms to send to each node */
182     int    **count_thread;
183     int     *rcount;        /* The number of atoms to receive */
184
185     int      n;
186     int      nalloc;
187     rvec    *x;
188     real    *q;
189     rvec    *f;
190     gmx_bool bSpread;       /* These coordinates are used for spreading */
191     int      pme_order;
192     ivec    *idx;
193     rvec    *fractx;            /* Fractional coordinate relative to the
194                                  * lower cell boundary
195                                  */
196     int             nthread;
197     int            *thread_idx; /* Which thread should spread which charge */
198     thread_plist_t *thread_plist;
199     splinedata_t   *spline;
200 } pme_atomcomm_t;
201
202 #define FLBS  3
203 #define FLBSZ 4
204
205 typedef struct {
206     ivec  ci;     /* The spatial location of this grid         */
207     ivec  n;      /* The used size of *grid, including order-1 */
208     ivec  offset; /* The grid offset from the full node grid   */
209     int   order;  /* PME spreading order                       */
210     ivec  s;      /* The allocated size of *grid, s >= n       */
211     real *grid;   /* The grid of this local thread, size n     */
212 } pmegrid_t;
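/* A minimal sketch (hypothetical helper) of how a pmegrid_t is addressed:
 * the data is stored x-major with the allocated sizes s[] as strides
 * (s[ZZ] may be padded beyond n[ZZ] for SSE alignment), and coordinates on
 * the full node grid are made local by subtracting offset[].  This mirrors
 * the indexing used in spread_q_bsplines_thread() further down.
 */
static int example_pmegrid_index(const pmegrid_t *pg, int x, int y, int z)
{
    int lx = x - pg->offset[XX];
    int ly = y - pg->offset[YY];
    int lz = z - pg->offset[ZZ];

    return (lx*pg->s[YY] + ly)*pg->s[ZZ] + lz;
}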
213
214 typedef struct {
215     pmegrid_t  grid;         /* The full node grid (non thread-local)            */
216     int        nthread;      /* The number of threads operating on this grid     */
217     ivec       nc;           /* The local spatial decomposition over the threads */
218     pmegrid_t *grid_th;      /* Array of grids for each thread                   */
219     real      *grid_all;     /* Allocated array for the grids in *grid_th        */
220     int      **g2t;          /* The grid to thread index                         */
221     ivec       nthread_comm; /* The number of threads to communicate with        */
222 } pmegrids_t;
223
224
225 typedef struct {
226 #ifdef PME_SSE
227     /* Masks for SSE aligned spreading and gathering */
228     __m128 mask_SSE0[6], mask_SSE1[6];
229 #else
230     int    dummy; /* C89 requires that struct has at least one member */
231 #endif
232 } pme_spline_work_t;
233
234 typedef struct {
235     /* work data for solve_pme */
236     int      nalloc;
237     real *   mhx;
238     real *   mhy;
239     real *   mhz;
240     real *   m2;
241     real *   denom;
242     real *   tmp1_alloc;
243     real *   tmp1;
244     real *   eterm;
245     real *   m2inv;
246
247     real     energy;
248     matrix   vir;
249 } pme_work_t;
250
251 typedef struct gmx_pme {
252     int           ndecompdim; /* The number of decomposition dimensions */
253     int           nodeid;     /* Our nodeid in mpi->mpi_comm */
254     int           nodeid_major;
255     int           nodeid_minor;
256     int           nnodes;    /* The number of nodes doing PME */
257     int           nnodes_major;
258     int           nnodes_minor;
259
260     MPI_Comm      mpi_comm;
261     MPI_Comm      mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
262 #ifdef GMX_MPI
263     MPI_Datatype  rvec_mpi;      /* the pme vector's MPI type */
264 #endif
265
266     int        nthread;       /* The number of threads doing PME */
267
268     gmx_bool   bPPnode;       /* Node also does particle-particle forces */
269     gmx_bool   bFEP;          /* Compute Free energy contribution */
270     int        nkx, nky, nkz; /* Grid dimensions */
271     gmx_bool   bP3M;          /* Do P3M: optimize the influence function */
272     int        pme_order;
273     real       epsilon_r;
274
275     pmegrids_t pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
276     pmegrids_t pmegridB;
277     /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
278     int        pmegrid_nx, pmegrid_ny, pmegrid_nz;
279     /* pmegrid_nz might be larger than strictly necessary to ensure
280      * memory alignment, pmegrid_nz_base gives the real base size.
281      */
282     int     pmegrid_nz_base;
283     /* The local PME grid starting indices */
284     int     pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
285
286     /* Work data for spreading and gathering */
287     pme_spline_work_t    *spline_work;
288
289     real                 *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
290     real                 *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
291     int                   fftgrid_nx, fftgrid_ny, fftgrid_nz;
292
293     t_complex            *cfftgridA;  /* Grids for complex FFT data */
294     t_complex            *cfftgridB;
295     int                   cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
296
297     gmx_parallel_3dfft_t  pfft_setupA;
298     gmx_parallel_3dfft_t  pfft_setupB;
299
300     int                  *nnx, *nny, *nnz;
301     real                 *fshx, *fshy, *fshz;
302
303     pme_atomcomm_t        atc[2]; /* Indexed on decomposition index */
304     matrix                recipbox;
305     splinevec             bsp_mod;
306
307     pme_overlap_t         overlap[2]; /* Indexed on dimension, 0=x, 1=y */
308
309     pme_atomcomm_t        atc_energy; /* Only for gmx_pme_calc_energy */
310
311     rvec                 *bufv;       /* Communication buffer */
312     real                 *bufr;       /* Communication buffer */
313     int                   buf_nalloc; /* The communication buffer size */
314
315     /* thread local work data for solve_pme */
316     pme_work_t *work;
317
318     /* Work data for PME_redist */
319     gmx_bool redist_init;
320     int *    scounts;
321     int *    rcounts;
322     int *    sdispls;
323     int *    rdispls;
324     int *    sidx;
325     int *    idxa;
326     real *   redist_buf;
327     int      redist_buf_nalloc;
328
329     /* Work data for sum_qgrid */
330     real *   sum_qgrid_tmp;
331     real *   sum_qgrid_dd_tmp;
332 } t_gmx_pme;
333
334
335 static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
336                                    int start, int end, int thread)
337 {
338     int             i;
339     int            *idxptr, tix, tiy, tiz;
340     real           *xptr, *fptr, tx, ty, tz;
341     real            rxx, ryx, ryy, rzx, rzy, rzz;
342     int             nx, ny, nz;
343     int             start_ix, start_iy, start_iz;
344     int            *g2tx, *g2ty, *g2tz;
345     gmx_bool        bThreads;
346     int            *thread_idx = NULL;
347     thread_plist_t *tpl        = NULL;
348     int            *tpl_n      = NULL;
349     int             thread_i;
350
351     nx  = pme->nkx;
352     ny  = pme->nky;
353     nz  = pme->nkz;
354
355     start_ix = pme->pmegrid_start_ix;
356     start_iy = pme->pmegrid_start_iy;
357     start_iz = pme->pmegrid_start_iz;
358
359     rxx = pme->recipbox[XX][XX];
360     ryx = pme->recipbox[YY][XX];
361     ryy = pme->recipbox[YY][YY];
362     rzx = pme->recipbox[ZZ][XX];
363     rzy = pme->recipbox[ZZ][YY];
364     rzz = pme->recipbox[ZZ][ZZ];
365
366     g2tx = pme->pmegridA.g2t[XX];
367     g2ty = pme->pmegridA.g2t[YY];
368     g2tz = pme->pmegridA.g2t[ZZ];
369
370     bThreads = (atc->nthread > 1);
371     if (bThreads)
372     {
373         thread_idx = atc->thread_idx;
374
375         tpl   = &atc->thread_plist[thread];
376         tpl_n = tpl->n;
377         for (i = 0; i < atc->nthread; i++)
378         {
379             tpl_n[i] = 0;
380         }
381     }
382
383     for (i = start; i < end; i++)
384     {
385         xptr   = atc->x[i];
386         idxptr = atc->idx[i];
387         fptr   = atc->fractx[i];
388
389         /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
390         tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
391         ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
392         tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
393
394         tix = (int)(tx);
395         tiy = (int)(ty);
396         tiz = (int)(tz);
397
398         /* Because decomposition only occurs in x and y,
399          * we never have a fraction correction in z.
400          */
401         fptr[XX] = tx - tix + pme->fshx[tix];
402         fptr[YY] = ty - tiy + pme->fshy[tiy];
403         fptr[ZZ] = tz - tiz;
404
405         idxptr[XX] = pme->nnx[tix];
406         idxptr[YY] = pme->nny[tiy];
407         idxptr[ZZ] = pme->nnz[tiz];
408
409 #ifdef DEBUG
410         range_check(idxptr[XX], 0, pme->pmegrid_nx);
411         range_check(idxptr[YY], 0, pme->pmegrid_ny);
412         range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
413 #endif
414
415         if (bThreads)
416         {
417             thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
418             thread_idx[i] = thread_i;
419             tpl_n[thread_i]++;
420         }
421     }
422
423     if (bThreads)
424     {
425         /* Make a list of particle indices sorted on thread */
426
427         /* Get the cumulative count */
428         for (i = 1; i < atc->nthread; i++)
429         {
430             tpl_n[i] += tpl_n[i-1];
431         }
432         /* The current implementation distributes particles equally
433          * over the threads, so we could actually allocate for that
434          * in pme_realloc_atomcomm_things.
435          */
436         if (tpl_n[atc->nthread-1] > tpl->nalloc)
437         {
438             tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
439             srenew(tpl->i, tpl->nalloc);
440         }
441         /* Set tpl_n to the cumulative start */
442         for (i = atc->nthread-1; i >= 1; i--)
443         {
444             tpl_n[i] = tpl_n[i-1];
445         }
446         tpl_n[0] = 0;
447
448         /* Fill our thread local array with indices sorted on thread */
449         for (i = start; i < end; i++)
450         {
451             tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
452         }
453         /* Now tpl_n contains the cumulative count again */
454     }
455 }
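/* A stand-alone sketch of the counting sort used above to order particle
 * indices by destination thread (hypothetical helper with plain int arrays
 * instead of the thread_plist_t bookkeeping): count per bucket, turn the
 * counts into cumulative start offsets, then scatter the indices.
 */
static void example_bucket_sort_indices(int n, const int *bucket_of,
                                        int nbucket, int *start, int *out)
{
    int i, b, sum, c;

    for (b = 0; b < nbucket; b++)
    {
        start[b] = 0;
    }
    for (i = 0; i < n; i++)
    {
        start[bucket_of[i]]++;
    }
    /* Exclusive prefix sum: start[b] becomes the first output slot of bucket b */
    sum = 0;
    for (b = 0; b < nbucket; b++)
    {
        c        = start[b];
        start[b] = sum;
        sum     += c;
    }
    /* Scatter; afterwards start[b] again holds the cumulative count up to b */
    for (i = 0; i < n; i++)
    {
        out[start[bucket_of[i]]++] = i;
    }
}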
456
457 static void make_thread_local_ind(pme_atomcomm_t *atc,
458                                   int thread, splinedata_t *spline)
459 {
460     int             n, t, i, start, end;
461     thread_plist_t *tpl;
462
463     /* Combine the indices made by each thread into one index */
464
465     n     = 0;
466     start = 0;
467     for (t = 0; t < atc->nthread; t++)
468     {
469         tpl = &atc->thread_plist[t];
470         /* Copy our part (start - end) from the list of thread t */
471         if (thread > 0)
472         {
473             start = tpl->n[thread-1];
474         }
475         end = tpl->n[thread];
476         for (i = start; i < end; i++)
477         {
478             spline->ind[n++] = tpl->i[i];
479         }
480     }
481
482     spline->n = n;
483 }
484
485
486 static void pme_calc_pidx(int start, int end,
487                           matrix recipbox, rvec x[],
488                           pme_atomcomm_t *atc, int *count)
489 {
490     int   nslab, i;
491     int   si;
492     real *xptr, s;
493     real  rxx, ryx, rzx, ryy, rzy;
494     int  *pd;
495
496     /* Calculate the PME task index (pidx) for each atom.
497      * Here we always assign equally sized slabs to each node
498      * for load balancing reasons (the PME grid spacing is not used).
499      */
500
501     nslab = atc->nslab;
502     pd    = atc->pd;
503
504     /* Reset the count */
505     for (i = 0; i < nslab; i++)
506     {
507         count[i] = 0;
508     }
509
510     if (atc->dimind == 0)
511     {
512         rxx = recipbox[XX][XX];
513         ryx = recipbox[YY][XX];
514         rzx = recipbox[ZZ][XX];
515         /* Calculate the node index in x-dimension */
516         for (i = start; i < end; i++)
517         {
518             xptr   = x[i];
519             /* Fractional coordinates along box vectors */
520             s     = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
521             si    = (int)(s + 2*nslab) % nslab;
522             pd[i] = si;
523             count[si]++;
524         }
525     }
526     else
527     {
528         ryy = recipbox[YY][YY];
529         rzy = recipbox[ZZ][YY];
530         /* Calculate the node index in y-dimension */
531         for (i = start; i < end; i++)
532         {
533             xptr   = x[i];
534             /* Fractional coordinates along box vectors */
535             s     = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
536             si    = (int)(s + 2*nslab) % nslab;
537             pd[i] = si;
538             count[si]++;
539         }
540     }
541 }
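/* Worked example for the slab assignment above: with nslab = 4 and a scaled
 * fractional coordinate s = -0.3 (possible for triclinic boxes or atoms just
 * outside the unit cell), (int)(s + 2*nslab) = (int)7.7 = 7 and 7 % 4 = 3,
 * so the atom wraps to the last slab.  Adding 2*nslab before truncating keeps
 * the argument positive, so C's truncation towards zero plus the modulo give
 * the intended periodic image; s = 4.2 maps to (int)12.2 % 4 = 0 in the same
 * way.
 */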
542
543 static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
544                                   pme_atomcomm_t *atc)
545 {
546     int nthread, thread, slab;
547
548     nthread = atc->nthread;
549
550 #pragma omp parallel for num_threads(nthread) schedule(static)
551     for (thread = 0; thread < nthread; thread++)
552     {
553         pme_calc_pidx(natoms* thread   /nthread,
554                       natoms*(thread+1)/nthread,
555                       recipbox, x, atc, atc->count_thread[thread]);
556     }
557     /* Non-parallel reduction, since nslab is small */
558
559     for (thread = 1; thread < nthread; thread++)
560     {
561         for (slab = 0; slab < atc->nslab; slab++)
562         {
563             atc->count_thread[0][slab] += atc->count_thread[thread][slab];
564         }
565     }
566 }
567
568 static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
569 {
570     const int padding = 4;
571     int       i;
572
573     srenew(th[XX], nalloc);
574     srenew(th[YY], nalloc);
575     /* In z we add padding, this is only required for the aligned SSE code */
576     srenew(*ptr_z, nalloc+2*padding);
577     th[ZZ] = *ptr_z + padding;
578
579     for (i = 0; i < padding; i++)
580     {
581         (*ptr_z)[               i] = 0;
582         (*ptr_z)[padding+nalloc+i] = 0;
583     }
584 }
585
586 static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
587 {
588     int i, d;
589
590     srenew(spline->ind, atc->nalloc);
591     /* Initialize the index to identity so it works without threads */
592     for (i = 0; i < atc->nalloc; i++)
593     {
594         spline->ind[i] = i;
595     }
596
597     realloc_splinevec(spline->theta, &spline->ptr_theta_z,
598                       atc->pme_order*atc->nalloc);
599     realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
600                       atc->pme_order*atc->nalloc);
601 }
602
603 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
604 {
605     int nalloc_old, i, j, nalloc_tpl;
606
607     /* We have to avoid a NULL pointer for atc->x to prevent
608      * possible fatal errors in MPI routines.
609      */
610     if (atc->n > atc->nalloc || atc->nalloc == 0)
611     {
612         nalloc_old  = atc->nalloc;
613         atc->nalloc = over_alloc_dd(max(atc->n, 1));
614
615         if (atc->nslab > 1)
616         {
617             srenew(atc->x, atc->nalloc);
618             srenew(atc->q, atc->nalloc);
619             srenew(atc->f, atc->nalloc);
620             for (i = nalloc_old; i < atc->nalloc; i++)
621             {
622                 clear_rvec(atc->f[i]);
623             }
624         }
625         if (atc->bSpread)
626         {
627             srenew(atc->fractx, atc->nalloc);
628             srenew(atc->idx, atc->nalloc);
629
630             if (atc->nthread > 1)
631             {
632                 srenew(atc->thread_idx, atc->nalloc);
633             }
634
635             for (i = 0; i < atc->nthread; i++)
636             {
637                 pme_realloc_splinedata(&atc->spline[i], atc);
638             }
639         }
640     }
641 }
642
643 static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
644                          int n, gmx_bool bXF, rvec *x_f, real *charge,
645                          pme_atomcomm_t *atc)
646 /* Redistribute particle data for PME calculation */
647 /* domain decomposition by x coordinate           */
648 {
649     int *idxa;
650     int  i, ii;
651
652     if (FALSE == pme->redist_init)
653     {
654         snew(pme->scounts, atc->nslab);
655         snew(pme->rcounts, atc->nslab);
656         snew(pme->sdispls, atc->nslab);
657         snew(pme->rdispls, atc->nslab);
658         snew(pme->sidx, atc->nslab);
659         pme->redist_init = TRUE;
660     }
661     if (n > pme->redist_buf_nalloc)
662     {
663         pme->redist_buf_nalloc = over_alloc_dd(n);
664         srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
665     }
666
667     pme->idxa = atc->pd;
668
669 #ifdef GMX_MPI
670     if (forw && bXF)
671     {
672         /* forward, redistribution from pp to pme */
673
674         /* Calculate send counts and exchange them with other nodes */
675         for (i = 0; (i < atc->nslab); i++)
676         {
677             pme->scounts[i] = 0;
678         }
679         for (i = 0; (i < n); i++)
680         {
681             pme->scounts[pme->idxa[i]]++;
682         }
683         MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
684
685         /* Calculate send and receive displacements and index into send
686            buffer */
687         pme->sdispls[0] = 0;
688         pme->rdispls[0] = 0;
689         pme->sidx[0]    = 0;
690         for (i = 1; i < atc->nslab; i++)
691         {
692             pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
693             pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
694             pme->sidx[i]    = pme->sdispls[i];
695         }
696         /* Total # of particles to be received */
697         atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
698
699         pme_realloc_atomcomm_things(atc);
700
701         /* Copy particle coordinates into send buffer and exchange */
702         for (i = 0; (i < n); i++)
703         {
704             ii = DIM*pme->sidx[pme->idxa[i]];
705             pme->sidx[pme->idxa[i]]++;
706             pme->redist_buf[ii+XX] = x_f[i][XX];
707             pme->redist_buf[ii+YY] = x_f[i][YY];
708             pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
709         }
710         MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
711                       pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
712                       pme->rvec_mpi, atc->mpi_comm);
713     }
714     if (forw)
715     {
716         /* Copy charge into send buffer and exchange */
717         for (i = 0; i < atc->nslab; i++)
718         {
719             pme->sidx[i] = pme->sdispls[i];
720         }
721         for (i = 0; (i < n); i++)
722         {
723             ii = pme->sidx[pme->idxa[i]];
724             pme->sidx[pme->idxa[i]]++;
725             pme->redist_buf[ii] = charge[i];
726         }
727         MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
728                       atc->q, pme->rcounts, pme->rdispls, mpi_type,
729                       atc->mpi_comm);
730     }
731     else   /* backward, redistribution from pme to pp */
732     {
733         MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
734                       pme->redist_buf, pme->scounts, pme->sdispls,
735                       pme->rvec_mpi, atc->mpi_comm);
736
737         /* Copy data from receive buffer */
738         for (i = 0; i < atc->nslab; i++)
739         {
740             pme->sidx[i] = pme->sdispls[i];
741         }
742         for (i = 0; (i < n); i++)
743         {
744             ii          = DIM*pme->sidx[pme->idxa[i]];
745             x_f[i][XX] += pme->redist_buf[ii+XX];
746             x_f[i][YY] += pme->redist_buf[ii+YY];
747             x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
748             pme->sidx[pme->idxa[i]]++;
749         }
750     }
751 #endif
752 }
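/* Worked example for the displacement bookkeeping above: with 3 PME slabs
 * and send counts scounts = {2, 1, 3}, the send displacements become
 * sdispls = {0, 2, 3} and sidx starts out equal to sdispls, so the first
 * atom destined for slab 1 is packed at element 2 of the send buffer (times
 * DIM for coordinates).  MPI_Alltoallv is then called with exactly these
 * counts and offsets on the send side and the corresponding rcounts/rdispls
 * on the receive side.
 */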
753
754 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
755                             gmx_bool bBackward, int shift,
756                             void *buf_s, int nbyte_s,
757                             void *buf_r, int nbyte_r)
758 {
759 #ifdef GMX_MPI
760     int        dest, src;
761     MPI_Status stat;
762
763     if (bBackward == FALSE)
764     {
765         dest = atc->node_dest[shift];
766         src  = atc->node_src[shift];
767     }
768     else
769     {
770         dest = atc->node_src[shift];
771         src  = atc->node_dest[shift];
772     }
773
774     if (nbyte_s > 0 && nbyte_r > 0)
775     {
776         MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
777                      dest, shift,
778                      buf_r, nbyte_r, MPI_BYTE,
779                      src, shift,
780                      atc->mpi_comm, &stat);
781     }
782     else if (nbyte_s > 0)
783     {
784         MPI_Send(buf_s, nbyte_s, MPI_BYTE,
785                  dest, shift,
786                  atc->mpi_comm);
787     }
788     else if (nbyte_r > 0)
789     {
790         MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
791                  src, shift,
792                  atc->mpi_comm, &stat);
793     }
794 #endif
795 }
796
797 static void dd_pmeredist_x_q(gmx_pme_t pme,
798                              int n, gmx_bool bX, rvec *x, real *charge,
799                              pme_atomcomm_t *atc)
800 {
801     int *commnode, *buf_index;
802     int  nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
803
804     commnode  = atc->node_dest;
805     buf_index = atc->buf_index;
806
807     nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
808
809     nsend = 0;
810     for (i = 0; i < nnodes_comm; i++)
811     {
812         buf_index[commnode[i]] = nsend;
813         nsend                 += atc->count[commnode[i]];
814     }
815     if (bX)
816     {
817         if (atc->count[atc->nodeid] + nsend != n)
818         {
819             gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
820                       "This usually means that your system is not well equilibrated.",
821                       n - (atc->count[atc->nodeid] + nsend),
822                       pme->nodeid, 'x'+atc->dimind);
823         }
824
825         if (nsend > pme->buf_nalloc)
826         {
827             pme->buf_nalloc = over_alloc_dd(nsend);
828             srenew(pme->bufv, pme->buf_nalloc);
829             srenew(pme->bufr, pme->buf_nalloc);
830         }
831
832         atc->n = atc->count[atc->nodeid];
833         for (i = 0; i < nnodes_comm; i++)
834         {
835             scount = atc->count[commnode[i]];
836             /* Communicate the count */
837             if (debug)
838             {
839                 fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
840                         atc->dimind, atc->nodeid, commnode[i], scount);
841             }
842             pme_dd_sendrecv(atc, FALSE, i,
843                             &scount, sizeof(int),
844                             &atc->rcount[i], sizeof(int));
845             atc->n += atc->rcount[i];
846         }
847
848         pme_realloc_atomcomm_things(atc);
849     }
850
851     local_pos = 0;
852     for (i = 0; i < n; i++)
853     {
854         node = atc->pd[i];
855         if (node == atc->nodeid)
856         {
857             /* Copy direct to the receive buffer */
858             if (bX)
859             {
860                 copy_rvec(x[i], atc->x[local_pos]);
861             }
862             atc->q[local_pos] = charge[i];
863             local_pos++;
864         }
865         else
866         {
867             /* Copy to the send buffer */
868             if (bX)
869             {
870                 copy_rvec(x[i], pme->bufv[buf_index[node]]);
871             }
872             pme->bufr[buf_index[node]] = charge[i];
873             buf_index[node]++;
874         }
875     }
876
877     buf_pos = 0;
878     for (i = 0; i < nnodes_comm; i++)
879     {
880         scount = atc->count[commnode[i]];
881         rcount = atc->rcount[i];
882         if (scount > 0 || rcount > 0)
883         {
884             if (bX)
885             {
886                 /* Communicate the coordinates */
887                 pme_dd_sendrecv(atc, FALSE, i,
888                                 pme->bufv[buf_pos], scount*sizeof(rvec),
889                                 atc->x[local_pos], rcount*sizeof(rvec));
890             }
891             /* Communicate the charges */
892             pme_dd_sendrecv(atc, FALSE, i,
893                             pme->bufr+buf_pos, scount*sizeof(real),
894                             atc->q+local_pos, rcount*sizeof(real));
895             buf_pos   += scount;
896             local_pos += atc->rcount[i];
897         }
898     }
899 }
900
901 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
902                            int n, rvec *f,
903                            gmx_bool bAddF)
904 {
905     int *commnode, *buf_index;
906     int  nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
907
908     commnode  = atc->node_dest;
909     buf_index = atc->buf_index;
910
911     nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
912
913     local_pos = atc->count[atc->nodeid];
914     buf_pos   = 0;
915     for (i = 0; i < nnodes_comm; i++)
916     {
917         scount = atc->rcount[i];
918         rcount = atc->count[commnode[i]];
919         if (scount > 0 || rcount > 0)
920         {
921             /* Communicate the forces */
922             pme_dd_sendrecv(atc, TRUE, i,
923                             atc->f[local_pos], scount*sizeof(rvec),
924                             pme->bufv[buf_pos], rcount*sizeof(rvec));
925             local_pos += scount;
926         }
927         buf_index[commnode[i]] = buf_pos;
928         buf_pos               += rcount;
929     }
930
931     local_pos = 0;
932     if (bAddF)
933     {
934         for (i = 0; i < n; i++)
935         {
936             node = atc->pd[i];
937             if (node == atc->nodeid)
938             {
939                 /* Add from the local force array */
940                 rvec_inc(f[i], atc->f[local_pos]);
941                 local_pos++;
942             }
943             else
944             {
945                 /* Add from the receive buffer */
946                 rvec_inc(f[i], pme->bufv[buf_index[node]]);
947                 buf_index[node]++;
948             }
949         }
950     }
951     else
952     {
953         for (i = 0; i < n; i++)
954         {
955             node = atc->pd[i];
956             if (node == atc->nodeid)
957             {
958                 /* Copy from the local force array */
959                 copy_rvec(atc->f[local_pos], f[i]);
960                 local_pos++;
961             }
962             else
963             {
964                 /* Copy from the receive buffer */
965                 copy_rvec(pme->bufv[buf_index[node]], f[i]);
966                 buf_index[node]++;
967             }
968         }
969     }
970 }
971
972 #ifdef GMX_MPI
973 static void
974 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
975 {
976     pme_overlap_t *overlap;
977     int            send_index0, send_nindex;
978     int            recv_index0, recv_nindex;
979     MPI_Status     stat;
980     int            i, j, k, ix, iy, iz, icnt;
981     int            ipulse, send_id, recv_id, datasize;
982     real          *p;
983     real          *sendptr, *recvptr;
984
985     /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
986     overlap = &pme->overlap[1];
987
988     for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
989     {
990         /* Since we have already (un)wrapped the overlap in the z-dimension,
991          * we only have to communicate 0 to nkz (not pmegrid_nz).
992          */
993         if (direction == GMX_SUM_QGRID_FORWARD)
994         {
995             send_id       = overlap->send_id[ipulse];
996             recv_id       = overlap->recv_id[ipulse];
997             send_index0   = overlap->comm_data[ipulse].send_index0;
998             send_nindex   = overlap->comm_data[ipulse].send_nindex;
999             recv_index0   = overlap->comm_data[ipulse].recv_index0;
1000             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
1001         }
1002         else
1003         {
1004             send_id       = overlap->recv_id[ipulse];
1005             recv_id       = overlap->send_id[ipulse];
1006             send_index0   = overlap->comm_data[ipulse].recv_index0;
1007             send_nindex   = overlap->comm_data[ipulse].recv_nindex;
1008             recv_index0   = overlap->comm_data[ipulse].send_index0;
1009             recv_nindex   = overlap->comm_data[ipulse].send_nindex;
1010         }
1011
1012         /* Copy data to contiguous send buffer */
1013         if (debug)
1014         {
1015             fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1016                     pme->nodeid, overlap->nodeid, send_id,
1017                     pme->pmegrid_start_iy,
1018                     send_index0-pme->pmegrid_start_iy,
1019                     send_index0-pme->pmegrid_start_iy+send_nindex);
1020         }
1021         icnt = 0;
1022         for (i = 0; i < pme->pmegrid_nx; i++)
1023         {
1024             ix = i;
1025             for (j = 0; j < send_nindex; j++)
1026             {
1027                 iy = j + send_index0 - pme->pmegrid_start_iy;
1028                 for (k = 0; k < pme->nkz; k++)
1029                 {
1030                     iz = k;
1031                     overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
1032                 }
1033             }
1034         }
1035
1036         datasize      = pme->pmegrid_nx * pme->nkz;
1037
1038         MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
1039                      send_id, ipulse,
1040                      overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
1041                      recv_id, ipulse,
1042                      overlap->mpi_comm, &stat);
1043
1044         /* Get data from contiguous recv buffer */
1045         if (debug)
1046         {
1047             fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1048                     pme->nodeid, overlap->nodeid, recv_id,
1049                     pme->pmegrid_start_iy,
1050                     recv_index0-pme->pmegrid_start_iy,
1051                     recv_index0-pme->pmegrid_start_iy+recv_nindex);
1052         }
1053         icnt = 0;
1054         for (i = 0; i < pme->pmegrid_nx; i++)
1055         {
1056             ix = i;
1057             for (j = 0; j < recv_nindex; j++)
1058             {
1059                 iy = j + recv_index0 - pme->pmegrid_start_iy;
1060                 for (k = 0; k < pme->nkz; k++)
1061                 {
1062                     iz = k;
1063                     if (direction == GMX_SUM_QGRID_FORWARD)
1064                     {
1065                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
1066                     }
1067                     else
1068                     {
1069                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
1070                     }
1071                 }
1072             }
1073         }
1074     }
1075
1076     /* Major dimension is easier, no copying required,
1077      * but we might have to sum into a separate array.
1078      * Since we don't copy, we have to communicate up to pmegrid_nz,
1079      * not nkz as for the minor direction.
1080      */
1081     overlap = &pme->overlap[0];
1082
1083     for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
1084     {
1085         if (direction == GMX_SUM_QGRID_FORWARD)
1086         {
1087             send_id       = overlap->send_id[ipulse];
1088             recv_id       = overlap->recv_id[ipulse];
1089             send_index0   = overlap->comm_data[ipulse].send_index0;
1090             send_nindex   = overlap->comm_data[ipulse].send_nindex;
1091             recv_index0   = overlap->comm_data[ipulse].recv_index0;
1092             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
1093             recvptr       = overlap->recvbuf;
1094         }
1095         else
1096         {
1097             send_id       = overlap->recv_id[ipulse];
1098             recv_id       = overlap->send_id[ipulse];
1099             send_index0   = overlap->comm_data[ipulse].recv_index0;
1100             send_nindex   = overlap->comm_data[ipulse].recv_nindex;
1101             recv_index0   = overlap->comm_data[ipulse].send_index0;
1102             recv_nindex   = overlap->comm_data[ipulse].send_nindex;
1103             recvptr       = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1104         }
1105
1106         sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1107         datasize      = pme->pmegrid_ny * pme->pmegrid_nz;
1108
1109         if (debug)
1110         {
1111             fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1112                     pme->nodeid, overlap->nodeid, send_id,
1113                     pme->pmegrid_start_ix,
1114                     send_index0-pme->pmegrid_start_ix,
1115                     send_index0-pme->pmegrid_start_ix+send_nindex);
1116             fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1117                     pme->nodeid, overlap->nodeid, recv_id,
1118                     pme->pmegrid_start_ix,
1119                     recv_index0-pme->pmegrid_start_ix,
1120                     recv_index0-pme->pmegrid_start_ix+recv_nindex);
1121         }
1122
1123         MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
1124                      send_id, ipulse,
1125                      recvptr, recv_nindex*datasize, GMX_MPI_REAL,
1126                      recv_id, ipulse,
1127                      overlap->mpi_comm, &stat);
1128
1129         /* ADD data from contiguous recv buffer */
1130         if (direction == GMX_SUM_QGRID_FORWARD)
1131         {
1132             p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1133             for (i = 0; i < recv_nindex*datasize; i++)
1134             {
1135                 p[i] += overlap->recvbuf[i];
1136             }
1137         }
1138     }
1139 }
1140 #endif
1141
1142
1143 static int
1144 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
1145 {
1146     ivec    local_fft_ndata, local_fft_offset, local_fft_size;
1147     ivec    local_pme_size;
1148     int     i, ix, iy, iz;
1149     int     pmeidx, fftidx;
1150
1151     /* Dimensions should be identical for A/B grid, so we just use A here */
1152     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1153                                    local_fft_ndata,
1154                                    local_fft_offset,
1155                                    local_fft_size);
1156
1157     local_pme_size[0] = pme->pmegrid_nx;
1158     local_pme_size[1] = pme->pmegrid_ny;
1159     local_pme_size[2] = pme->pmegrid_nz;
1160
1161     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1162        the offset is identical, and the PME grid always has more data (due to overlap)
1163      */
1164     {
1165 #ifdef DEBUG_PME
1166         FILE *fp, *fp2;
1167         char  fn[STRLEN], format[STRLEN];
1168         real  val;
1169         sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
1170         fp = ffopen(fn, "w");
1171         sprintf(fn, "pmegrid%d.txt", pme->nodeid);
1172         fp2 = ffopen(fn, "w");
1173         sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
1174 #endif
1175
1176         for (ix = 0; ix < local_fft_ndata[XX]; ix++)
1177         {
1178             for (iy = 0; iy < local_fft_ndata[YY]; iy++)
1179             {
1180                 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1181                 {
1182                     pmeidx          = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
1183                     fftidx          = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
1184                     fftgrid[fftidx] = pmegrid[pmeidx];
1185 #ifdef DEBUG_PME
1186                     val = 100*pmegrid[pmeidx];
1187                     if (pmegrid[pmeidx] != 0)
1188                     {
1189                         fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
1190                                 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
1191                     }
1192                     if (pmegrid[pmeidx] != 0)
1193                     {
1194                         fprintf(fp2, "%-12s  %5d  %5d  %5d  %12.5e\n",
1195                                 "qgrid",
1196                                 pme->pmegrid_start_ix + ix,
1197                                 pme->pmegrid_start_iy + iy,
1198                                 pme->pmegrid_start_iz + iz,
1199                                 pmegrid[pmeidx]);
1200                     }
1201 #endif
1202                 }
1203             }
1204         }
1205 #ifdef DEBUG_PME
1206         ffclose(fp);
1207         ffclose(fp2);
1208 #endif
1209     }
1210     return 0;
1211 }
1212
1213
1214 static gmx_cycles_t omp_cyc_start()
1215 {
1216     return gmx_cycles_read();
1217 }
1218
1219 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1220 {
1221     return gmx_cycles_read() - c;
1222 }
1223
1224
1225 static int
1226 copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
1227                         int nthread, int thread)
1228 {
1229     ivec          local_fft_ndata, local_fft_offset, local_fft_size;
1230     ivec          local_pme_size;
1231     int           ixy0, ixy1, ixy, ix, iy, iz;
1232     int           pmeidx, fftidx;
1233 #ifdef PME_TIME_THREADS
1234     gmx_cycles_t  c1;
1235     static double cs1 = 0;
1236     static int    cnt = 0;
1237 #endif
1238
1239 #ifdef PME_TIME_THREADS
1240     c1 = omp_cyc_start();
1241 #endif
1242     /* Dimensions should be identical for A/B grid, so we just use A here */
1243     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1244                                    local_fft_ndata,
1245                                    local_fft_offset,
1246                                    local_fft_size);
1247
1248     local_pme_size[0] = pme->pmegrid_nx;
1249     local_pme_size[1] = pme->pmegrid_ny;
1250     local_pme_size[2] = pme->pmegrid_nz;
1251
1252     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1253        the offset is identical, and the PME grid always has more data (due to overlap)
1254      */
1255     ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1256     ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1257
1258     for (ixy = ixy0; ixy < ixy1; ixy++)
1259     {
1260         ix = ixy/local_fft_ndata[YY];
1261         iy = ixy - ix*local_fft_ndata[YY];
1262
1263         pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
1264         fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
1265         for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1266         {
1267             pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1268         }
1269     }
1270
1271 #ifdef PME_TIME_THREADS
1272     c1   = omp_cyc_end(c1);
1273     cs1 += (double)c1;
1274     cnt++;
1275     if (cnt % 20 == 0)
1276     {
1277         printf("copy %.2f\n", cs1*1e-9);
1278     }
1279 #endif
1280
1281     return 0;
1282 }
1283
1284
1285 static void
1286 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1287 {
1288     int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
1289
1290     nx = pme->nkx;
1291     ny = pme->nky;
1292     nz = pme->nkz;
1293
1294     pnx = pme->pmegrid_nx;
1295     pny = pme->pmegrid_ny;
1296     pnz = pme->pmegrid_nz;
1297
1298     overlap = pme->pme_order - 1;
1299
1300     /* Add periodic overlap in z */
1301     for (ix = 0; ix < pme->pmegrid_nx; ix++)
1302     {
1303         for (iy = 0; iy < pme->pmegrid_ny; iy++)
1304         {
1305             for (iz = 0; iz < overlap; iz++)
1306             {
1307                 pmegrid[(ix*pny+iy)*pnz+iz] +=
1308                     pmegrid[(ix*pny+iy)*pnz+nz+iz];
1309             }
1310         }
1311     }
1312
1313     if (pme->nnodes_minor == 1)
1314     {
1315         for (ix = 0; ix < pme->pmegrid_nx; ix++)
1316         {
1317             for (iy = 0; iy < overlap; iy++)
1318             {
1319                 for (iz = 0; iz < nz; iz++)
1320                 {
1321                     pmegrid[(ix*pny+iy)*pnz+iz] +=
1322                         pmegrid[(ix*pny+ny+iy)*pnz+iz];
1323                 }
1324             }
1325         }
1326     }
1327
1328     if (pme->nnodes_major == 1)
1329     {
1330         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1331
1332         for (ix = 0; ix < overlap; ix++)
1333         {
1334             for (iy = 0; iy < ny_x; iy++)
1335             {
1336                 for (iz = 0; iz < nz; iz++)
1337                 {
1338                     pmegrid[(ix*pny+iy)*pnz+iz] +=
1339                         pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1340                 }
1341             }
1342         }
1343     }
1344 }
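/* A small 1-D picture of the wrapping above and the unwrapping below,
 * assuming nz = 6 interior points, pme_order = 4 (overlap = 3) and no SSE
 * padding: the spreading grid then has z entries 0..8, where 6..8 are the
 * periodic images of 0..2.  wrap_periodic_pmegrid folds them back,
 * grid[0..2] += grid[6..8], before the charge grid is handed to the FFT;
 * unwrap_periodic_pmegrid restores them, grid[6..8] = grid[0..2], before
 * the forces are gathered.
 */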
1345
1346
1347 static void
1348 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1349 {
1350     int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
1351
1352     nx = pme->nkx;
1353     ny = pme->nky;
1354     nz = pme->nkz;
1355
1356     pnx = pme->pmegrid_nx;
1357     pny = pme->pmegrid_ny;
1358     pnz = pme->pmegrid_nz;
1359
1360     overlap = pme->pme_order - 1;
1361
1362     if (pme->nnodes_major == 1)
1363     {
1364         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1365
1366         for (ix = 0; ix < overlap; ix++)
1367         {
1368             int iy, iz;
1369
1370             for (iy = 0; iy < ny_x; iy++)
1371             {
1372                 for (iz = 0; iz < nz; iz++)
1373                 {
1374                     pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1375                         pmegrid[(ix*pny+iy)*pnz+iz];
1376                 }
1377             }
1378         }
1379     }
1380
1381     if (pme->nnodes_minor == 1)
1382     {
1383 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1384         for (ix = 0; ix < pme->pmegrid_nx; ix++)
1385         {
1386             int iy, iz;
1387
1388             for (iy = 0; iy < overlap; iy++)
1389             {
1390                 for (iz = 0; iz < nz; iz++)
1391                 {
1392                     pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1393                         pmegrid[(ix*pny+iy)*pnz+iz];
1394                 }
1395             }
1396         }
1397     }
1398
1399     /* Copy periodic overlap in z */
1400 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1401     for (ix = 0; ix < pme->pmegrid_nx; ix++)
1402     {
1403         int iy, iz;
1404
1405         for (iy = 0; iy < pme->pmegrid_ny; iy++)
1406         {
1407             for (iz = 0; iz < overlap; iz++)
1408             {
1409                 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1410                     pmegrid[(ix*pny+iy)*pnz+iz];
1411             }
1412         }
1413     }
1414 }
1415
1416 static void clear_grid(int nx, int ny, int nz, real *grid,
1417                        ivec fs, int *flag,
1418                        int fx, int fy, int fz,
1419                        int order)
1420 {
1421     int nc, ncz;
1422     int fsx, fsy, fsz, gx, gy, gz, g0x, g0y, x, y, z;
1423     int flind;
1424
1425     nc  = 2 + (order - 2)/FLBS;
1426     ncz = 2 + (order - 2)/FLBSZ;
1427
1428     for (fsx = fx; fsx < fx+nc; fsx++)
1429     {
1430         for (fsy = fy; fsy < fy+nc; fsy++)
1431         {
1432             for (fsz = fz; fsz < fz+ncz; fsz++)
1433             {
1434                 flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
1435                 if (flag[flind] == 0)
1436                 {
1437                     gx  = fsx*FLBS;
1438                     gy  = fsy*FLBS;
1439                     gz  = fsz*FLBSZ;
1440                     g0x = (gx*ny + gy)*nz + gz;
1441                     for (x = 0; x < FLBS; x++)
1442                     {
1443                         g0y = g0x;
1444                         for (y = 0; y < FLBS; y++)
1445                         {
1446                             for (z = 0; z < FLBSZ; z++)
1447                             {
1448                                 grid[g0y+z] = 0;
1449                             }
1450                             g0y += nz;
1451                         }
1452                         g0x += ny*nz;
1453                     }
1454
1455                     flag[flind] = 1;
1456                 }
1457             }
1458         }
1459     }
1460 }
1461
1462 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1463 #define DO_BSPLINE(order)                            \
1464     for (ithx = 0; (ithx < order); ithx++)                    \
1465     {                                                    \
1466         index_x = (i0+ithx)*pny*pnz;                     \
1467         valx    = qn*thx[ithx];                          \
1468                                                      \
1469         for (ithy = 0; (ithy < order); ithy++)                \
1470         {                                                \
1471             valxy    = valx*thy[ithy];                   \
1472             index_xy = index_x+(j0+ithy)*pnz;            \
1473                                                      \
1474             for (ithz = 0; (ithz < order); ithz++)            \
1475             {                                            \
1476                 index_xyz        = index_xy+(k0+ithz);   \
1477                 grid[index_xyz] += valxy*thz[ithz];      \
1478             }                                            \
1479         }                                                \
1480     }
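/* In formula form, DO_BSPLINE accumulates the usual tensor-product B-spline
 * charge assignment: for an atom with charge qn and spline weights
 * thx/thy/thz,
 *
 *   grid[i0+ithx][j0+ithy][k0+ithz] += qn * thx[ithx] * thy[ithy] * thz[ithz]
 *
 * for all 0 <= ithx,ithy,ithz < order, where (i0,j0,k0) is the atom's grid
 * cell from calc_interpolation_idx() shifted to the local (thread) grid and
 * the 3-D indices are flattened with the pny/pnz strides as in the macro.
 */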
1481
1482
1483 static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
1484                                      pme_atomcomm_t *atc, splinedata_t *spline,
1485                                      pme_spline_work_t *work)
1486 {
1487
1488     /* spread charges from home atoms to local grid */
1489     real          *grid;
1490     pme_overlap_t *ol;
1491     int            b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
1492     int       *    idxptr;
1493     int            order, norder, index_x, index_xy, index_xyz;
1494     real           valx, valxy, qn;
1495     real          *thx, *thy, *thz;
1496     int            localsize, bndsize;
1497     int            pnx, pny, pnz, ndatatot;
1498     int            offx, offy, offz;
1499
1500     pnx = pmegrid->s[XX];
1501     pny = pmegrid->s[YY];
1502     pnz = pmegrid->s[ZZ];
1503
1504     offx = pmegrid->offset[XX];
1505     offy = pmegrid->offset[YY];
1506     offz = pmegrid->offset[ZZ];
1507
1508     ndatatot = pnx*pny*pnz;
1509     grid     = pmegrid->grid;
1510     for (i = 0; i < ndatatot; i++)
1511     {
1512         grid[i] = 0;
1513     }
1514
1515     order = pmegrid->order;
1516
1517     for (nn = 0; nn < spline->n; nn++)
1518     {
1519         n  = spline->ind[nn];
1520         qn = atc->q[n];
1521
1522         if (qn != 0)
1523         {
1524             idxptr = atc->idx[n];
1525             norder = nn*order;
1526
1527             i0   = idxptr[XX] - offx;
1528             j0   = idxptr[YY] - offy;
1529             k0   = idxptr[ZZ] - offz;
1530
1531             thx = spline->theta[XX] + norder;
1532             thy = spline->theta[YY] + norder;
1533             thz = spline->theta[ZZ] + norder;
1534
1535             switch (order)
1536             {
1537                 case 4:
1538 #ifdef PME_SSE
1539 #ifdef PME_SSE_UNALIGNED
1540 #define PME_SPREAD_SSE_ORDER4
1541 #else
1542 #define PME_SPREAD_SSE_ALIGNED
1543 #define PME_ORDER 4
1544 #endif
1545 #include "pme_sse_single.h"
1546 #else
1547                     DO_BSPLINE(4);
1548 #endif
1549                     break;
1550                 case 5:
1551 #ifdef PME_SSE
1552 #define PME_SPREAD_SSE_ALIGNED
1553 #define PME_ORDER 5
1554 #include "pme_sse_single.h"
1555 #else
1556                     DO_BSPLINE(5);
1557 #endif
1558                     break;
1559                 default:
1560                     DO_BSPLINE(order);
1561                     break;
1562             }
1563         }
1564     }
1565 }
1566
1567 static void set_grid_alignment(int *pmegrid_nz, int pme_order)
1568 {
1569 #ifdef PME_SSE
1570     if (pme_order == 5
1571 #ifndef PME_SSE_UNALIGNED
1572         || pme_order == 4
1573 #endif
1574         )
1575     {
1576         /* Round nz up to a multiple of 4 to ensure alignment */
1577         *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
1578     }
1579 #endif
1580 }
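/* Worked example for the rounding above: (nz + 3) & ~3 clears the two lowest
 * bits after adding 3, so nz = 13 becomes 16 and nz = 16 stays 16, making the
 * padded z size a multiple of 4 reals for the aligned SSE spread/gather
 * kernels.
 */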
1581
1582 static void set_gridsize_alignment(int *gridsize, int pme_order)
1583 {
1584 #ifdef PME_SSE
1585 #ifndef PME_SSE_UNALIGNED
1586     if (pme_order == 4)
1587     {
1588         /* Add extra elements to ensure that aligned operations do not go
1589          * beyond the allocated grid size.
1590          * Note that for pme_order=5, the pme grid z-size alignment
1591          * ensures that we will not go beyond the grid size.
1592          */
1593         *gridsize += 4;
1594     }
1595 #endif
1596 #endif
1597 }
1598
1599 static void pmegrid_init(pmegrid_t *grid,
1600                          int cx, int cy, int cz,
1601                          int x0, int y0, int z0,
1602                          int x1, int y1, int z1,
1603                          gmx_bool set_alignment,
1604                          int pme_order,
1605                          real *ptr)
1606 {
1607     int nz, gridsize;
1608
1609     grid->ci[XX]     = cx;
1610     grid->ci[YY]     = cy;
1611     grid->ci[ZZ]     = cz;
1612     grid->offset[XX] = x0;
1613     grid->offset[YY] = y0;
1614     grid->offset[ZZ] = z0;
1615     grid->n[XX]      = x1 - x0 + pme_order - 1;
1616     grid->n[YY]      = y1 - y0 + pme_order - 1;
1617     grid->n[ZZ]      = z1 - z0 + pme_order - 1;
1618     copy_ivec(grid->n, grid->s);
1619
1620     nz = grid->s[ZZ];
1621     set_grid_alignment(&nz, pme_order);
1622     if (set_alignment)
1623     {
1624         grid->s[ZZ] = nz;
1625     }
1626     else if (nz != grid->s[ZZ])
1627     {
1628         gmx_incons("pmegrid_init call with an unaligned z size");
1629     }
1630
1631     grid->order = pme_order;
1632     if (ptr == NULL)
1633     {
1634         gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
1635         set_gridsize_alignment(&gridsize, pme_order);
1636         snew_aligned(grid->grid, gridsize, 16);
1637     }
1638     else
1639     {
1640         grid->grid = ptr;
1641     }
1642 }
1643
1644 static int div_round_up(int enumerator, int denominator)
1645 {
1646     return (enumerator + denominator - 1)/denominator;
1647 }
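/* E.g. div_round_up(10, 4) == 3 and div_round_up(12, 4) == 3:
 * integer division rounded towards +infinity for positive arguments.
 */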
1648
1649 static void make_subgrid_division(const ivec n, int ovl, int nthread,
1650                                   ivec nsub)
1651 {
1652     int gsize_opt, gsize;
1653     int nsx, nsy, nsz;
1654     char *env;
1655
1656     gsize_opt = -1;
1657     for (nsx = 1; nsx <= nthread; nsx++)
1658     {
1659         if (nthread % nsx == 0)
1660         {
1661             for (nsy = 1; nsy <= nthread; nsy++)
1662             {
1663                 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1664                 {
1665                     nsz = nthread/(nsx*nsy);
1666
1667                     /* Determine the number of grid points per thread */
1668                     gsize =
1669                         (div_round_up(n[XX], nsx) + ovl)*
1670                         (div_round_up(n[YY], nsy) + ovl)*
1671                         (div_round_up(n[ZZ], nsz) + ovl);
1672
1673                     /* Minimize the number of grid points per thread
1674                      * and, secondarily, the number of cuts in minor dimensions.
1675                      */
1676                     if (gsize_opt == -1 ||
1677                         gsize < gsize_opt ||
1678                         (gsize == gsize_opt &&
1679                          (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
1680                     {
1681                         nsub[XX]  = nsx;
1682                         nsub[YY]  = nsy;
1683                         nsub[ZZ]  = nsz;
1684                         gsize_opt = gsize;
1685                     }
1686                 }
1687             }
1688         }
1689     }
1690
1691     env = getenv("GMX_PME_THREAD_DIVISION");
1692     if (env != NULL)
1693     {
1694         sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
1695     }
1696
1697     if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
1698     {
1699         gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
1700     }
1701 }
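/* Worked example: for n = {32, 32, 32}, ovl = 3 and nthread = 4 the
 * divisions 2x2x1, 2x1x2 and 1x2x2 all give the minimal
 * (16+3)*(16+3)*(32+3) = 12635 grid points per thread; the tie-break on
 * fewer cuts in z (then y) selects 2x2x1. GMX_PME_THREAD_DIVISION can
 * still override the result.
 */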
1702
1703 static void pmegrids_init(pmegrids_t *grids,
1704                           int nx, int ny, int nz, int nz_base,
1705                           int pme_order,
1706                           int nthread,
1707                           int overlap_x,
1708                           int overlap_y)
1709 {
1710     ivec n, n_base, g0, g1;
1711     int t, x, y, z, d, i, tfac;
1712     int max_comm_lines = -1;
1713
1714     n[XX] = nx - (pme_order - 1);
1715     n[YY] = ny - (pme_order - 1);
1716     n[ZZ] = nz - (pme_order - 1);
1717
1718     copy_ivec(n, n_base);
1719     n_base[ZZ] = nz_base;
1720
1721     pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
1722                  NULL);
1723
1724     grids->nthread = nthread;
1725
1726     make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
1727
1728     if (grids->nthread > 1)
1729     {
1730         ivec nst;
1731         int gridsize;
1732
1733         for (d = 0; d < DIM; d++)
1734         {
1735             nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
1736         }
1737         set_grid_alignment(&nst[ZZ], pme_order);
1738
1739         if (debug)
1740         {
1741             fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
1742                     grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
1743             fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1744                     nx, ny, nz,
1745                     nst[XX], nst[YY], nst[ZZ]);
1746         }
1747
1748         snew(grids->grid_th, grids->nthread);
1749         t        = 0;
1750         gridsize = nst[XX]*nst[YY]*nst[ZZ];
1751         set_gridsize_alignment(&gridsize, pme_order);
1752         snew_aligned(grids->grid_all,
1753                      grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
1754                      16);
1755
1756         for (x = 0; x < grids->nc[XX]; x++)
1757         {
1758             for (y = 0; y < grids->nc[YY]; y++)
1759             {
1760                 for (z = 0; z < grids->nc[ZZ]; z++)
1761                 {
1762                     pmegrid_init(&grids->grid_th[t],
1763                                  x, y, z,
1764                                  (n[XX]*(x  ))/grids->nc[XX],
1765                                  (n[YY]*(y  ))/grids->nc[YY],
1766                                  (n[ZZ]*(z  ))/grids->nc[ZZ],
1767                                  (n[XX]*(x+1))/grids->nc[XX],
1768                                  (n[YY]*(y+1))/grids->nc[YY],
1769                                  (n[ZZ]*(z+1))/grids->nc[ZZ],
1770                                  TRUE,
1771                                  pme_order,
1772                                  grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
1773                     t++;
1774                 }
1775             }
1776         }
1777     }
1778
1779     snew(grids->g2t, DIM);
1780     tfac = 1;
1781     for (d = DIM-1; d >= 0; d--)
1782     {
1783         snew(grids->g2t[d], n[d]);
1784         t = 0;
1785         for (i = 0; i < n[d]; i++)
1786         {
1787             /* The second check should match the parameters
1788              * of the pmegrid_init call above.
1789              */
1790             while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
1791             {
1792                 t++;
1793             }
1794             grids->g2t[d][i] = t*tfac;
1795         }
1796
1797         tfac *= grids->nc[d];
1798
1799         switch (d)
1800         {
1801             case XX: max_comm_lines = overlap_x;     break;
1802             case YY: max_comm_lines = overlap_y;     break;
1803             case ZZ: max_comm_lines = pme_order - 1; break;
1804         }
1805         grids->nthread_comm[d] = 0;
1806         while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
1807                grids->nthread_comm[d] < grids->nc[d])
1808         {
1809             grids->nthread_comm[d]++;
1810         }
1811         if (debug != NULL)
1812         {
1813             fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
1814                     'x'+d, grids->nthread_comm[d]);
1815         }
1816         /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1817          * work, but this is not a problematic restriction.
1818          */
1819         if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
1820         {
1821             gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
1822         }
1823     }
1824 }
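/* Sketch of the g2t tables built above: with nc = {2, 2, 1} and n[XX] = 32,
 * x lines 0..15 belong to thread block x = 0 and lines 16..31 to x = 1.
 * Each g2t[d][i] is pre-multiplied by the stride tfac, so the owning thread
 * of a grid point is simply g2t[XX][ix] + g2t[YY][iy] + g2t[ZZ][iz].
 */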
1825
1826
1827 static void pmegrids_destroy(pmegrids_t *grids)
1828 {
1829     int t;
1830
1831     if (grids->grid.grid != NULL)
1832     {
1833         sfree(grids->grid.grid);
1834
1835         if (grids->nthread > 0)
1836         {
1837             for (t = 0; t < grids->nthread; t++)
1838             {
1839                 sfree(grids->grid_th[t].grid);
1840             }
1841             sfree(grids->grid_th);
1842         }
1843     }
1844 }
1845
1846
1847 static void realloc_work(pme_work_t *work, int nkx)
1848 {
1849     if (nkx > work->nalloc)
1850     {
1851         work->nalloc = nkx;
1852         srenew(work->mhx, work->nalloc);
1853         srenew(work->mhy, work->nalloc);
1854         srenew(work->mhz, work->nalloc);
1855         srenew(work->m2, work->nalloc);
1856         /* Allocate an aligned pointer for SSE operations, including 3 extra
1857          * elements at the end since SSE operates on 4 elements at a time.
1858          */
1859         sfree_aligned(work->denom);
1860         sfree_aligned(work->tmp1);
1861         sfree_aligned(work->eterm);
1862         snew_aligned(work->denom, work->nalloc+3, 16);
1863         snew_aligned(work->tmp1, work->nalloc+3, 16);
1864         snew_aligned(work->eterm, work->nalloc+3, 16);
1865         srenew(work->m2inv, work->nalloc);
1866     }
1867 }
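/* The 3 extra elements allocated for denom/tmp1/eterm let the 4-wide SSE
 * loop in calc_exponentials run without a scalar remainder: e.g. for
 * nkx = 42 the last block starts at kx = 40 and touches elements 40..43,
 * which still fits within the 42+3 = 45 allocated entries.
 */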
1868
1869
1870 static void free_work(pme_work_t *work)
1871 {
1872     sfree(work->mhx);
1873     sfree(work->mhy);
1874     sfree(work->mhz);
1875     sfree(work->m2);
1876     sfree_aligned(work->denom);
1877     sfree_aligned(work->tmp1);
1878     sfree_aligned(work->eterm);
1879     sfree(work->m2inv);
1880 }
1881
1882
1883 #ifdef PME_SSE
1884 /* Calculate exponentials through SSE in float precision */
1885 inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1886 {
1887     {
1888         const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
1889         __m128 f_sse;
1890         __m128 lu;
1891         __m128 tmp_d1, d_inv, tmp_r, tmp_e;
1892         int kx;
1893         f_sse = _mm_load1_ps(&f);
1894         for (kx = 0; kx < end; kx += 4)
1895         {
1896             tmp_d1   = _mm_load_ps(d_aligned+kx);
1897             lu       = _mm_rcp_ps(tmp_d1);
1898             d_inv    = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
1899             tmp_r    = _mm_load_ps(r_aligned+kx);
1900             tmp_r    = gmx_mm_exp_ps(tmp_r);
1901             tmp_e    = _mm_mul_ps(f_sse, d_inv);
1902             tmp_e    = _mm_mul_ps(tmp_e, tmp_r);
1903             _mm_store_ps(e_aligned+kx, tmp_e);
1904         }
1905     }
1906 }
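/* Note on the reciprocal above: _mm_rcp_ps only gives roughly 12 bits of
 * accuracy, so one Newton-Raphson step lu*(2 - lu*d) is applied, which
 * roughly doubles the number of correct bits and is sufficient here in
 * single precision.
 */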
1907 #else
1908 inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
1909 {
1910     int kx;
1911     for (kx = start; kx < end; kx++)
1912     {
1913         d[kx] = 1.0/d[kx];
1914     }
1915     for (kx = start; kx < end; kx++)
1916     {
1917         r[kx] = exp(r[kx]);
1918     }
1919     for (kx = start; kx < end; kx++)
1920     {
1921         e[kx] = f*r[kx]*d[kx];
1922     }
1923 }
1924 #endif
1925
1926
1927 static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
1928                          real ewaldcoeff, real vol,
1929                          gmx_bool bEnerVir,
1930                          int nthread, int thread)
1931 {
1932     /* do recip sum over local cells in grid */
1933     /* y major, z middle, x minor or continuous */
1934     t_complex *p0;
1935     int     kx, ky, kz, maxkx, maxky, maxkz;
1936     int     nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
1937     real    mx, my, mz;
1938     real    factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1939     real    ets2, struct2, vfactor, ets2vf;
1940     real    d1, d2, energy = 0;
1941     real    by, bz;
1942     real    virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
1943     real    rxx, ryx, ryy, rzx, rzy, rzz;
1944     pme_work_t *work;
1945     real    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
1946     real    mhxk, mhyk, mhzk, m2k;
1947     real    corner_fac;
1948     ivec    complex_order;
1949     ivec    local_ndata, local_offset, local_size;
1950     real    elfac;
1951
1952     elfac = ONE_4PI_EPS0/pme->epsilon_r;
1953
1954     nx = pme->nkx;
1955     ny = pme->nky;
1956     nz = pme->nkz;
1957
1958     /* Dimensions should be identical for A/B grid, so we just use A here */
1959     gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1960                                       complex_order,
1961                                       local_ndata,
1962                                       local_offset,
1963                                       local_size);
1964
1965     rxx = pme->recipbox[XX][XX];
1966     ryx = pme->recipbox[YY][XX];
1967     ryy = pme->recipbox[YY][YY];
1968     rzx = pme->recipbox[ZZ][XX];
1969     rzy = pme->recipbox[ZZ][YY];
1970     rzz = pme->recipbox[ZZ][ZZ];
1971
1972     maxkx = (nx+1)/2;
1973     maxky = (ny+1)/2;
1974     maxkz = nz/2+1;
1975
1976     work  = &pme->work[thread];
1977     mhx   = work->mhx;
1978     mhy   = work->mhy;
1979     mhz   = work->mhz;
1980     m2    = work->m2;
1981     denom = work->denom;
1982     tmp1  = work->tmp1;
1983     eterm = work->eterm;
1984     m2inv = work->m2inv;
1985
1986     iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
1987     iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
1988
1989     for (iyz = iyz0; iyz < iyz1; iyz++)
1990     {
1991         iy = iyz/local_ndata[ZZ];
1992         iz = iyz - iy*local_ndata[ZZ];
1993
1994         ky = iy + local_offset[YY];
1995
1996         if (ky < maxky)
1997         {
1998             my = ky;
1999         }
2000         else
2001         {
2002             my = (ky - ny);
2003         }
2004
2005         by = M_PI*vol*pme->bsp_mod[YY][ky];
2006
2007         kz = iz + local_offset[ZZ];
2008
2009         mz = kz;
2010
2011         bz = pme->bsp_mod[ZZ][kz];
2012
2013         /* 0.5 correction for corner points */
2014         corner_fac = 1;
2015         if (kz == 0 || kz == (nz+1)/2)
2016         {
2017             corner_fac = 0.5;
2018         }
2019
2020         p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2021
2022         /* We should skip the k-space point (0,0,0) */
2023         if (local_offset[XX] > 0 || ky > 0 || kz > 0)
2024         {
2025             kxstart = local_offset[XX];
2026         }
2027         else
2028         {
2029             kxstart = local_offset[XX] + 1;
2030             p0++;
2031         }
2032         kxend = local_offset[XX] + local_ndata[XX];
2033
2034         if (bEnerVir)
2035         {
2036             /* More expensive inner loop, especially because of the storage
2037              * of the mh elements in arrays.
2038              * Because x is the minor grid index, all mh elements
2039              * depend on kx for triclinic unit cells.
2040              */
2041
2042             /* Two explicit loops to avoid a conditional inside the loop */
2043             for (kx = kxstart; kx < maxkx; kx++)
2044             {
2045                 mx = kx;
2046
2047                 mhxk      = mx * rxx;
2048                 mhyk      = mx * ryx + my * ryy;
2049                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2050                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2051                 mhx[kx]   = mhxk;
2052                 mhy[kx]   = mhyk;
2053                 mhz[kx]   = mhzk;
2054                 m2[kx]    = m2k;
2055                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2056                 tmp1[kx]  = -factor*m2k;
2057             }
2058
2059             for (kx = maxkx; kx < kxend; kx++)
2060             {
2061                 mx = (kx - nx);
2062
2063                 mhxk      = mx * rxx;
2064                 mhyk      = mx * ryx + my * ryy;
2065                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2066                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2067                 mhx[kx]   = mhxk;
2068                 mhy[kx]   = mhyk;
2069                 mhz[kx]   = mhzk;
2070                 m2[kx]    = m2k;
2071                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2072                 tmp1[kx]  = -factor*m2k;
2073             }
2074
2075             for (kx = kxstart; kx < kxend; kx++)
2076             {
2077                 m2inv[kx] = 1.0/m2[kx];
2078             }
2079
2080             calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
2081
2082             for (kx = kxstart; kx < kxend; kx++, p0++)
2083             {
2084                 d1      = p0->re;
2085                 d2      = p0->im;
2086
2087                 p0->re  = d1*eterm[kx];
2088                 p0->im  = d2*eterm[kx];
2089
2090                 struct2 = 2.0*(d1*d1+d2*d2);
2091
2092                 tmp1[kx] = eterm[kx]*struct2;
2093             }
2094
2095             for (kx = kxstart; kx < kxend; kx++)
2096             {
2097                 ets2     = corner_fac*tmp1[kx];
2098                 vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2099                 energy  += ets2;
2100
2101                 ets2vf   = ets2*vfactor;
2102                 virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
2103                 virxy   += ets2vf*mhx[kx]*mhy[kx];
2104                 virxz   += ets2vf*mhx[kx]*mhz[kx];
2105                 viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
2106                 viryz   += ets2vf*mhy[kx]*mhz[kx];
2107                 virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
2108             }
2109         }
2110         else
2111         {
2112             /* We don't need to calculate the energy and the virial.
2113              * In this case the triclinic overhead is small.
2114              */
2115
2116             /* Two explicit loops to avoid a conditional inside the loop */
2117
2118             for (kx = kxstart; kx < maxkx; kx++)
2119             {
2120                 mx = kx;
2121
2122                 mhxk      = mx * rxx;
2123                 mhyk      = mx * ryx + my * ryy;
2124                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2125                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2126                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2127                 tmp1[kx]  = -factor*m2k;
2128             }
2129
2130             for (kx = maxkx; kx < kxend; kx++)
2131             {
2132                 mx = (kx - nx);
2133
2134                 mhxk      = mx * rxx;
2135                 mhyk      = mx * ryx + my * ryy;
2136                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2137                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2138                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2139                 tmp1[kx]  = -factor*m2k;
2140             }
2141
2142             calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
2143
2144             for (kx = kxstart; kx < kxend; kx++, p0++)
2145             {
2146                 d1      = p0->re;
2147                 d2      = p0->im;
2148
2149                 p0->re  = d1*eterm[kx];
2150                 p0->im  = d2*eterm[kx];
2151             }
2152         }
2153     }
2154
2155     if (bEnerVir)
2156     {
2157         /* Update virial with local values.
2158          * The virial is symmetric by definition.
2159          * This virial seems OK for isotropic scaling, but I'm
2160          * experiencing problems with semi-isotropic membranes.
2161          * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2162          */
2163         work->vir[XX][XX] = 0.25*virxx;
2164         work->vir[YY][YY] = 0.25*viryy;
2165         work->vir[ZZ][ZZ] = 0.25*virzz;
2166         work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
2167         work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
2168         work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
2169
2170         /* This energy should be corrected for a charged system */
2171         work->energy = 0.5*energy;
2172     }
2173
2174     /* Return the loop count */
2175     return local_ndata[YY]*local_ndata[XX];
2176 }
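/* In essence the loop above evaluates the smooth-PME reciprocal energy
 * (cf. Essmann et al., J. Chem. Phys. 103, 8577 (1995)), roughly
 *   E_rec = f/(2*pi*V) * sum_{m != 0} exp(-pi^2 m^2/beta^2) |S(m)|^2
 *           / (m^2 * bsp_mod_x * bsp_mod_y * bsp_mod_z)
 * with f = ONE_4PI_EPS0/epsilon_r and beta = ewaldcoeff; eterm[] is the
 * per-m prefactor applied to the grid, and the factor 2 in struct2 together
 * with the 0.5 corner correction accounts for the half-complex storage in z.
 */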
2177
2178 static void get_pme_ener_vir(const gmx_pme_t pme, int nthread,
2179                              real *mesh_energy, matrix vir)
2180 {
2181     /* This function sums output over threads
2182      * and should therefore only be called after thread synchronization.
2183      */
2184     int thread;
2185
2186     *mesh_energy = pme->work[0].energy;
2187     copy_mat(pme->work[0].vir, vir);
2188
2189     for (thread = 1; thread < nthread; thread++)
2190     {
2191         *mesh_energy += pme->work[thread].energy;
2192         m_add(vir, pme->work[thread].vir, vir);
2193     }
2194 }
2195
2196 #define DO_FSPLINE(order)                      \
2197     for (ithx = 0; (ithx < order); ithx++)              \
2198     {                                              \
2199         index_x = (i0+ithx)*pny*pnz;               \
2200         tx      = thx[ithx];                       \
2201         dx      = dthx[ithx];                      \
2202                                                \
2203         for (ithy = 0; (ithy < order); ithy++)          \
2204         {                                          \
2205             index_xy = index_x+(j0+ithy)*pnz;      \
2206             ty       = thy[ithy];                  \
2207             dy       = dthy[ithy];                 \
2208             fxy1     = fz1 = 0;                    \
2209                                                \
2210             for (ithz = 0; (ithz < order); ithz++)      \
2211             {                                      \
2212                 gval  = grid[index_xy+(k0+ithz)];  \
2213                 fxy1 += thz[ithz]*gval;            \
2214                 fz1  += dthz[ithz]*gval;           \
2215             }                                      \
2216             fx += dx*ty*fxy1;                      \
2217             fy += tx*dy*fxy1;                      \
2218             fz += tx*ty*fz1;                       \
2219         }                                          \
2220     }
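/* DO_FSPLINE accumulates, for one atom, the derivatives of the interpolated
 * potential with respect to the fractional coordinates:
 *   fx = sum dtheta_x*theta_y*theta_z*grid
 *   fy = sum theta_x*dtheta_y*theta_z*grid
 *   fz = sum theta_x*theta_y*dtheta_z*grid
 * which gather_f_bsplines below converts to Cartesian forces using the
 * reciprocal box.
 */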
2221
2222
2223 static void gather_f_bsplines(gmx_pme_t pme, real *grid,
2224                               gmx_bool bClearF, pme_atomcomm_t *atc,
2225                               splinedata_t *spline,
2226                               real scale)
2227 {
2228     /* sum forces for local particles */
2229     int     nn, n, ithx, ithy, ithz, i0, j0, k0;
2230     int     index_x, index_xy;
2231     int     nx, ny, nz, pnx, pny, pnz;
2232     int *   idxptr;
2233     real    tx, ty, dx, dy, qn;
2234     real    fx, fy, fz, gval;
2235     real    fxy1, fz1;
2236     real    *thx, *thy, *thz, *dthx, *dthy, *dthz;
2237     int     norder;
2238     real    rxx, ryx, ryy, rzx, rzy, rzz;
2239     int     order;
2240
2241     pme_spline_work_t *work;
2242
2243     work = pme->spline_work;
2244
2245     order = pme->pme_order;
2246     thx   = spline->theta[XX];
2247     thy   = spline->theta[YY];
2248     thz   = spline->theta[ZZ];
2249     dthx  = spline->dtheta[XX];
2250     dthy  = spline->dtheta[YY];
2251     dthz  = spline->dtheta[ZZ];
2252     nx    = pme->nkx;
2253     ny    = pme->nky;
2254     nz    = pme->nkz;
2255     pnx   = pme->pmegrid_nx;
2256     pny   = pme->pmegrid_ny;
2257     pnz   = pme->pmegrid_nz;
2258
2259     rxx   = pme->recipbox[XX][XX];
2260     ryx   = pme->recipbox[YY][XX];
2261     ryy   = pme->recipbox[YY][YY];
2262     rzx   = pme->recipbox[ZZ][XX];
2263     rzy   = pme->recipbox[ZZ][YY];
2264     rzz   = pme->recipbox[ZZ][ZZ];
2265
2266     for (nn = 0; nn < spline->n; nn++)
2267     {
2268         n  = spline->ind[nn];
2269         qn = scale*atc->q[n];
2270
2271         if (bClearF)
2272         {
2273             atc->f[n][XX] = 0;
2274             atc->f[n][YY] = 0;
2275             atc->f[n][ZZ] = 0;
2276         }
2277         if (qn != 0)
2278         {
2279             fx     = 0;
2280             fy     = 0;
2281             fz     = 0;
2282             idxptr = atc->idx[n];
2283             norder = nn*order;
2284
2285             i0   = idxptr[XX];
2286             j0   = idxptr[YY];
2287             k0   = idxptr[ZZ];
2288
2289             /* Pointer arithmetic alert, next six statements */
2290             thx  = spline->theta[XX] + norder;
2291             thy  = spline->theta[YY] + norder;
2292             thz  = spline->theta[ZZ] + norder;
2293             dthx = spline->dtheta[XX] + norder;
2294             dthy = spline->dtheta[YY] + norder;
2295             dthz = spline->dtheta[ZZ] + norder;
2296
2297             switch (order)
2298             {
2299                 case 4:
2300 #ifdef PME_SSE
2301 #ifdef PME_SSE_UNALIGNED
2302 #define PME_GATHER_F_SSE_ORDER4
2303 #else
2304 #define PME_GATHER_F_SSE_ALIGNED
2305 #define PME_ORDER 4
2306 #endif
2307 #include "pme_sse_single.h"
2308 #else
2309                     DO_FSPLINE(4);
2310 #endif
2311                     break;
2312                 case 5:
2313 #ifdef PME_SSE
2314 #define PME_GATHER_F_SSE_ALIGNED
2315 #define PME_ORDER 5
2316 #include "pme_sse_single.h"
2317 #else
2318                     DO_FSPLINE(5);
2319 #endif
2320                     break;
2321                 default:
2322                     DO_FSPLINE(order);
2323                     break;
2324             }
2325
2326             atc->f[n][XX] += -qn*( fx*nx*rxx );
2327             atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
2328             atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
2329         }
2330     }
2331     /* Since the energy and not the forces are interpolated,
2332      * the net force might not be exactly zero.
2333      * This can be solved by also interpolating F, but
2334      * that comes at a cost.
2335      * A better hack is to remove the net force every
2336      * step, but that must be done at a higher level
2337      * since this routine doesn't see all atoms if running
2338      * in parallel. I don't know how important it is.  EL 990726
2339      */
2340 }
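/* Chain rule behind the force conversion above: the fractional coordinate
 * along dimension a is (roughly) u_a = n_a * sum_b x_b * recipbox[b][a], so
 * F_b = -q * sum_a (dphi/du_a) * n_a * recipbox[b][a]; with a triclinic
 * (lower-triangular) box only the terms written out are non-zero.
 */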
2341
2342
2343 static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
2344                                    pme_atomcomm_t *atc)
2345 {
2346     splinedata_t *spline;
2347     int     n, ithx, ithy, ithz, i0, j0, k0;
2348     int     index_x, index_xy;
2349     int *   idxptr;
2350     real    energy, pot, tx, ty, qn, gval;
2351     real    *thx, *thy, *thz;
2352     int     norder;
2353     int     order;
2354
2355     spline = &atc->spline[0];
2356
2357     order = pme->pme_order;
2358
2359     energy = 0;
2360     for (n = 0; (n < atc->n); n++)
2361     {
2362         qn      = atc->q[n];
2363
2364         if (qn != 0)
2365         {
2366             idxptr = atc->idx[n];
2367             norder = n*order;
2368
2369             i0   = idxptr[XX];
2370             j0   = idxptr[YY];
2371             k0   = idxptr[ZZ];
2372
2373             /* Pointer arithmetic alert, next three statements */
2374             thx  = spline->theta[XX] + norder;
2375             thy  = spline->theta[YY] + norder;
2376             thz  = spline->theta[ZZ] + norder;
2377
2378             pot = 0;
2379             for (ithx = 0; (ithx < order); ithx++)
2380             {
2381                 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2382                 tx      = thx[ithx];
2383
2384                 for (ithy = 0; (ithy < order); ithy++)
2385                 {
2386                     index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2387                     ty       = thy[ithy];
2388
2389                     for (ithz = 0; (ithz < order); ithz++)
2390                     {
2391                         gval  = grid[index_xy+(k0+ithz)];
2392                         pot  += tx*ty*thz[ithz]*gval;
2393                     }
2394
2395                 }
2396             }
2397
2398             energy += pot*qn;
2399         }
2400     }
2401
2402     return energy;
2403 }
2404
2405 /* Macro to force loop unrolling by fixing order.
2406  * This gives a significant performance gain.
2407  */
2408 #define CALC_SPLINE(order)                     \
2409     {                                              \
2410         int j, k, l;                                 \
2411         real dr, div;                               \
2412         real data[PME_ORDER_MAX];                  \
2413         real ddata[PME_ORDER_MAX];                 \
2414                                                \
2415         for (j = 0; (j < DIM); j++)                     \
2416         {                                          \
2417             dr  = xptr[j];                         \
2418                                                \
2419             /* dr is relative offset from lower cell limit */ \
2420             data[order-1] = 0;                     \
2421             data[1]       = dr;                          \
2422             data[0]       = 1 - dr;                      \
2423                                                \
2424             for (k = 3; (k < order); k++)               \
2425             {                                      \
2426                 div       = 1.0/(k - 1.0);               \
2427                 data[k-1] = div*dr*data[k-2];      \
2428                 for (l = 1; (l < (k-1)); l++)           \
2429                 {                                  \
2430                     data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2431                                        data[k-l-1]);                \
2432                 }                                  \
2433                 data[0] = div*(1-dr)*data[0];      \
2434             }                                      \
2435             /* differentiate */                    \
2436             ddata[0] = -data[0];                   \
2437             for (k = 1; (k < order); k++)               \
2438             {                                      \
2439                 ddata[k] = data[k-1] - data[k];    \
2440             }                                      \
2441                                                \
2442             div           = 1.0/(order - 1);                 \
2443             data[order-1] = div*dr*data[order-2];  \
2444             for (l = 1; (l < (order-1)); l++)           \
2445             {                                      \
2446                 data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
2447                                        (order-l-dr)*data[order-l-1]); \
2448             }                                      \
2449             data[0] = div*(1 - dr)*data[0];        \
2450                                                \
2451             for (k = 0; k < order; k++)                 \
2452             {                                      \
2453                 theta[j][i*order+k]  = data[k];    \
2454                 dtheta[j][i*order+k] = ddata[k];   \
2455             }                                      \
2456         }                                          \
2457     }
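/* CALC_SPLINE uses the standard cardinal B-spline recursion
 *   M_n(u) = u/(n-1)*M_{n-1}(u) + (n-u)/(n-1)*M_{n-1}(u-1)
 * and the derivative relation M_n'(u) = M_{n-1}(u) - M_{n-1}(u-1).
 * E.g. for order 4 and dr = 0 the weights come out as {1/6, 2/3, 1/6, 0}.
 */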
2458
2459 void make_bsplines(splinevec theta, splinevec dtheta, int order,
2460                    rvec fractx[], int nr, int ind[], real charge[],
2461                    gmx_bool bFreeEnergy)
2462 {
2463     /* construct splines for local atoms */
2464     int  i, ii;
2465     real *xptr;
2466
2467     for (i = 0; i < nr; i++)
2468     {
2469         /* With free energy we do not use the charge check.
2470          * In most cases this will be more efficient than calling make_bsplines
2471          * twice, since usually more than half the particles have charges.
2472          */
2473         ii = ind[i];
2474         if (bFreeEnergy || charge[ii] != 0.0)
2475         {
2476             xptr = fractx[ii];
2477             switch (order)
2478             {
2479                 case 4:  CALC_SPLINE(4);     break;
2480                 case 5:  CALC_SPLINE(5);     break;
2481                 default: CALC_SPLINE(order); break;
2482             }
2483         }
2484     }
2485 }
2486
2487
2488 void make_dft_mod(real *mod, real *data, int ndata)
2489 {
2490     int i, j;
2491     real sc, ss, arg;
2492
2493     for (i = 0; i < ndata; i++)
2494     {
2495         sc = ss = 0;
2496         for (j = 0; j < ndata; j++)
2497         {
2498             arg = (2.0*M_PI*i*j)/ndata;
2499             sc += data[j]*cos(arg);
2500             ss += data[j]*sin(arg);
2501         }
2502         mod[i] = sc*sc+ss*ss;
2503     }
2504     for (i = 0; i < ndata; i++)
2505     {
2506         if (mod[i] < 1e-7)
2507         {
2508             mod[i] = (mod[i-1]+mod[i+1])*0.5;
2509         }
2510     }
2511 }
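/* make_dft_mod returns the squared modulus of the DFT of the spline weights,
 *   mod[k] = |sum_j data[j]*exp(2*pi*i*j*k/ndata)|^2,
 * which ends up in the denominator of eterm in solve_pme_yzx. Near-zero
 * values (this can happen for odd interpolation orders at k = ndata/2) are
 * patched by averaging the two neighbours.
 */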
2512
2513
2514 static void make_bspline_moduli(splinevec bsp_mod,
2515                                 int nx, int ny, int nz, int order)
2516 {
2517     int nmax = max(nx, max(ny, nz));
2518     real *data, *ddata, *bsp_data;
2519     int i, k, l;
2520     real div;
2521
2522     snew(data, order);
2523     snew(ddata, order);
2524     snew(bsp_data, nmax);
2525
2526     data[order-1] = 0;
2527     data[1]       = 0;
2528     data[0]       = 1;
2529
2530     for (k = 3; k < order; k++)
2531     {
2532         div       = 1.0/(k-1.0);
2533         data[k-1] = 0;
2534         for (l = 1; l < (k-1); l++)
2535         {
2536             data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2537         }
2538         data[0] = div*data[0];
2539     }
2540     /* differentiate */
2541     ddata[0] = -data[0];
2542     for (k = 1; k < order; k++)
2543     {
2544         ddata[k] = data[k-1]-data[k];
2545     }
2546     div           = 1.0/(order-1);
2547     data[order-1] = 0;
2548     for (l = 1; l < (order-1); l++)
2549     {
2550         data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2551     }
2552     data[0] = div*data[0];
2553
2554     for (i = 0; i < nmax; i++)
2555     {
2556         bsp_data[i] = 0;
2557     }
2558     for (i = 1; i <= order; i++)
2559     {
2560         bsp_data[i] = data[i-1];
2561     }
2562
2563     make_dft_mod(bsp_mod[XX], bsp_data, nx);
2564     make_dft_mod(bsp_mod[YY], bsp_data, ny);
2565     make_dft_mod(bsp_mod[ZZ], bsp_data, nz);
2566
2567     sfree(data);
2568     sfree(ddata);
2569     sfree(bsp_data);
2570 }
2571
2572
2573 /* Return the P3M optimal influence function */
2574 static double do_p3m_influence(double z, int order)
2575 {
2576     double z2, z4;
2577
2578     z2 = z*z;
2579     z4 = z2*z2;
2580
2581     /* The formula and most constants can be found in:
2582      * Ballenegger et al., JCTC 8, 936 (2012)
2583      */
2584     switch (order)
2585     {
2586         case 2:
2587             return 1.0 - 2.0*z2/3.0;
2588             break;
2589         case 3:
2590             return 1.0 - z2 + 2.0*z4/15.0;
2591             break;
2592         case 4:
2593             return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
2594             break;
2595         case 5:
2596             return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
2597             break;
2598         case 6:
2599             return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
2600             break;
2601         case 7:
2602             return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
2603         case 8:
2604             return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
2605             break;
2606     }
2607
2608     return 0.0;
2609 }
2610
2611 /* Calculate the P3M B-spline moduli for one dimension */
2612 static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
2613 {
2614     double zarg, zai, sinzai, infl;
2615     int    maxk, i;
2616
2617     if (order > 8)
2618     {
2619         gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
2620     }
2621
2622     zarg = M_PI/n;
2623
2624     maxk = (n + 1)/2;
2625
2626     for (i = -maxk; i < 0; i++)
2627     {
2628         zai          = zarg*i;
2629         sinzai       = sin(zai);
2630         infl         = do_p3m_influence(sinzai, order);
2631         bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
2632     }
2633     bsp_mod[0] = 1.0;
2634     for (i = 1; i < maxk; i++)
2635     {
2636         zai        = zarg*i;
2637         sinzai     = sin(zai);
2638         infl       = do_p3m_influence(sinzai, order);
2639         bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
2640     }
2641 }
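/* For P3M the moduli are filled with the optimal influence-function weights,
 * bsp_mod[k] = infl(sin(pi*k/n))^2 * (sin(z)/z)^(-2*order) with z = pi*k/n,
 * so that solve_pme_yzx can use the same bsp_mod-based denominator for both
 * PME and P3M.
 */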
2642
2643 /* Calculate the P3M B-spline moduli */
2644 static void make_p3m_bspline_moduli(splinevec bsp_mod,
2645                                     int nx, int ny, int nz, int order)
2646 {
2647     make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order);
2648     make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order);
2649     make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order);
2650 }
2651
2652
2653 static void setup_coordinate_communication(pme_atomcomm_t *atc)
2654 {
2655     int nslab, n, i;
2656     int fw, bw;
2657
2658     nslab = atc->nslab;
2659
2660     n = 0;
2661     for (i = 1; i <= nslab/2; i++)
2662     {
2663         fw = (atc->nodeid + i) % nslab;
2664         bw = (atc->nodeid - i + nslab) % nslab;
2665         if (n < nslab - 1)
2666         {
2667             atc->node_dest[n] = fw;
2668             atc->node_src[n]  = bw;
2669             n++;
2670         }
2671         if (n < nslab - 1)
2672         {
2673             atc->node_dest[n] = bw;
2674             atc->node_src[n]  = fw;
2675             n++;
2676         }
2677     }
2678 }
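/* Example of the resulting communication order: with nslab = 4 and
 * nodeid = 0 this gives node_dest = {1, 3, 2} and node_src = {3, 1, 2},
 * i.e. forward/backward neighbour pairs at increasing distance, with the
 * opposite slab appearing only once.
 */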
2679
2680 int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
2681 {
2682     int thread;
2683
2684     if (NULL != log)
2685     {
2686         fprintf(log, "Destroying PME data structures.\n");
2687     }
2688
2689     sfree((*pmedata)->nnx);
2690     sfree((*pmedata)->nny);
2691     sfree((*pmedata)->nnz);
2692
2693     pmegrids_destroy(&(*pmedata)->pmegridA);
2694
2695     sfree((*pmedata)->fftgridA);
2696     sfree((*pmedata)->cfftgridA);
2697     gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
2698
2699     if ((*pmedata)->pmegridB.grid.grid != NULL)
2700     {
2701         pmegrids_destroy(&(*pmedata)->pmegridB);
2702         sfree((*pmedata)->fftgridB);
2703         sfree((*pmedata)->cfftgridB);
2704         gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
2705     }
2706     for (thread = 0; thread < (*pmedata)->nthread; thread++)
2707     {
2708         free_work(&(*pmedata)->work[thread]);
2709     }
2710     sfree((*pmedata)->work);
2711
2712     sfree(*pmedata);
2713     *pmedata = NULL;
2714
2715     return 0;
2716 }
2717
2718 static int mult_up(int n, int f)
2719 {
2720     return ((n + f - 1)/f)*f;
2721 }
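/* mult_up rounds up to the next multiple of f, e.g. mult_up(40, 3) == 42
 * and mult_up(42, 3) == 42; below it models the padding applied when a grid
 * dimension is not divisible by the number of PME nodes.
 */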
2722
2723
2724 static double pme_load_imbalance(gmx_pme_t pme)
2725 {
2726     int    nma, nmi;
2727     double n1, n2, n3;
2728
2729     nma = pme->nnodes_major;
2730     nmi = pme->nnodes_minor;
2731
2732     n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
2733     n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
2734     n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
2735
2736     /* pme_solve is roughly double the cost of an fft */
2737
2738     return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
2739 }
2740
2741 static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc, t_commrec *cr,
2742                           int dimind, gmx_bool bSpread)
2743 {
2744     int nk, k, s, thread;
2745
2746     atc->dimind    = dimind;
2747     atc->nslab     = 1;
2748     atc->nodeid    = 0;
2749     atc->pd_nalloc = 0;
2750 #ifdef GMX_MPI
2751     if (pme->nnodes > 1)
2752     {
2753         atc->mpi_comm = pme->mpi_comm_d[dimind];
2754         MPI_Comm_size(atc->mpi_comm, &atc->nslab);
2755         MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
2756     }
2757     if (debug)
2758     {
2759         fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
2760     }
2761 #endif
2762
2763     atc->bSpread   = bSpread;
2764     atc->pme_order = pme->pme_order;
2765
2766     if (atc->nslab > 1)
2767     {
2768         /* These three allocations are not required for particle decomp. */
2769         snew(atc->node_dest, atc->nslab);
2770         snew(atc->node_src, atc->nslab);
2771         setup_coordinate_communication(atc);
2772
2773         snew(atc->count_thread, pme->nthread);
2774         for (thread = 0; thread < pme->nthread; thread++)
2775         {
2776             snew(atc->count_thread[thread], atc->nslab);
2777         }
2778         atc->count = atc->count_thread[0];
2779         snew(atc->rcount, atc->nslab);
2780         snew(atc->buf_index, atc->nslab);
2781     }
2782
2783     atc->nthread = pme->nthread;
2784     if (atc->nthread > 1)
2785     {
2786         snew(atc->thread_plist, atc->nthread);
2787     }
2788     snew(atc->spline, atc->nthread);
2789     for (thread = 0; thread < atc->nthread; thread++)
2790     {
2791         if (atc->nthread > 1)
2792         {
2793             snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
2794             atc->thread_plist[thread].n += GMX_CACHE_SEP;
2795         }
2796         snew(atc->spline[thread].thread_one, pme->nthread);
2797         atc->spline[thread].thread_one[thread] = 1;
2798     }
2799 }
2800
2801 static void
2802 init_overlap_comm(pme_overlap_t *  ol,
2803                   int              norder,
2804 #ifdef GMX_MPI
2805                   MPI_Comm         comm,
2806 #endif
2807                   int              nnodes,
2808                   int              nodeid,
2809                   int              ndata,
2810                   int              commplainsize)
2811 {
2812     int lbnd, rbnd, maxlr, b, i;
2813     int exten;
2814     int nn, nk;
2815     pme_grid_comm_t *pgc;
2816     gmx_bool bCont;
2817     int fft_start, fft_end, send_index1, recv_index1;
2818 #ifdef GMX_MPI
2819     MPI_Status stat;
2820
2821     ol->mpi_comm = comm;
2822 #endif
2823
2824     ol->nnodes = nnodes;
2825     ol->nodeid = nodeid;
2826
2827     /* Linear translation of the PME grid won't affect reciprocal space
2828      * calculations, so to optimize we only interpolate "upwards",
2829      * which also means we only have to consider overlap in one direction.
2830      * I.e., particles on this node might also be spread to grid indices
2831      * that belong to higher nodes (modulo nnodes)
2832      */
2833
2834     snew(ol->s2g0, ol->nnodes+1);
2835     snew(ol->s2g1, ol->nnodes);
2836     if (debug)
2837     {
2838         fprintf(debug, "PME slab boundaries:");
2839     }
2840     for (i = 0; i < nnodes; i++)
2841     {
2842         /* s2g0 is the local interpolation grid start,
2843          * s2g1 is the local interpolation grid end.
2844          * Because grid overlap communication only goes forward,
2845          * the grid slabs for the FFTs should be rounded down.
2846          */
2847         ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
2848         ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
2849
2850         if (debug)
2851         {
2852             fprintf(debug, "  %3d %3d", ol->s2g0[i], ol->s2g1[i]);
2853         }
2854     }
2855     ol->s2g0[nnodes] = ndata;
2856     if (debug)
2857     {
2858         fprintf(debug, "\n");
2859     }
2860
2861     /* Determine with how many nodes we need to communicate the grid overlap */
2862     b = 0;
2863     do
2864     {
2865         b++;
2866         bCont = FALSE;
2867         for (i = 0; i < nnodes; i++)
2868         {
2869             if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
2870                 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
2871             {
2872                 bCont = TRUE;
2873             }
2874         }
2875     }
2876     while (bCont && b < nnodes);
2877     ol->noverlap_nodes = b - 1;
2878
2879     snew(ol->send_id, ol->noverlap_nodes);
2880     snew(ol->recv_id, ol->noverlap_nodes);
2881     for (b = 0; b < ol->noverlap_nodes; b++)
2882     {
2883         ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
2884         ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
2885     }
2886     snew(ol->comm_data, ol->noverlap_nodes);
2887
2888     ol->send_size = 0;
2889     for (b = 0; b < ol->noverlap_nodes; b++)
2890     {
2891         pgc = &ol->comm_data[b];
2892         /* Send */
2893         fft_start        = ol->s2g0[ol->send_id[b]];
2894         fft_end          = ol->s2g0[ol->send_id[b]+1];
2895         if (ol->send_id[b] < nodeid)
2896         {
2897             fft_start += ndata;
2898             fft_end   += ndata;
2899         }
2900         send_index1       = ol->s2g1[nodeid];
2901         send_index1       = min(send_index1, fft_end);
2902         pgc->send_index0  = fft_start;
2903         pgc->send_nindex  = max(0, send_index1 - pgc->send_index0);
2904         ol->send_size    += pgc->send_nindex;
2905
2906         /* We always start receiving at the first index of our slab */
2907         fft_start        = ol->s2g0[ol->nodeid];
2908         fft_end          = ol->s2g0[ol->nodeid+1];
2909         recv_index1      = ol->s2g1[ol->recv_id[b]];
2910         if (ol->recv_id[b] > nodeid)
2911         {
2912             recv_index1 -= ndata;
2913         }
2914         recv_index1      = min(recv_index1, fft_end);
2915         pgc->recv_index0 = fft_start;
2916         pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
2917     }
2918
2919 #ifdef GMX_MPI
2920     /* Communicate the buffer sizes to receive */
2921     for (b = 0; b < ol->noverlap_nodes; b++)
2922     {
2923         MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
2924                      &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
2925                      ol->mpi_comm, &stat);
2926     }
2927 #endif
2928
2929     /* For a non-divisible grid we need pme_order instead of pme_order-1 */
2930     snew(ol->sendbuf, norder*commplainsize);
2931     snew(ol->recvbuf, norder*commplainsize);
2932 }
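/* Worked example of the slab setup above: ndata = 30, nnodes = 4 and
 * norder = 4 give s2g0 = {0, 7, 15, 22, 30} and s2g1 = {11, 18, 26, 33},
 * so every interpolation slab reaches a few lines into the next node's FFT
 * slab (wrapping around for the last node) and the overlap search finds
 * noverlap_nodes = 1.
 */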
2933
2934 static void
2935 make_gridindex5_to_localindex(int n, int local_start, int local_range,
2936                               int **global_to_local,
2937                               real **fraction_shift)
2938 {
2939     int i;
2940     int * gtl;
2941     real * fsh;
2942
2943     snew(gtl, 5*n);
2944     snew(fsh, 5*n);
2945     for (i = 0; (i < 5*n); i++)
2946     {
2947         /* Determine the global to local grid index */
2948         gtl[i] = (i - local_start + n) % n;
2949         /* For coordinates that fall within the local grid the fraction
2950          * is correct, we don't need to shift it.
2951          */
2952         fsh[i] = 0;
2953         if (local_range < n)
2954         {
2955             /* Due to rounding issues i could be 1 beyond the lower or
2956              * upper boundary of the local grid. Correct the index for this.
2957              * If we shift the index, we need to shift the fraction by
2958              * the same amount in the other direction to not affect
2959              * the weights.
2960              * Note that due to this shifting the weights at the end of
2961              * the spline might change, but that will only involve values
2962              * between zero and values close to the precision of a real,
2963              * which is anyhow the accuracy of the whole mesh calculation.
2964              */
2965             /* With local_range=0 we should not change i=local_start */
2966             if (i % n != local_start)
2967             {
2968                 if (gtl[i] == n-1)
2969                 {
2970                     gtl[i] = 0;
2971                     fsh[i] = -1;
2972                 }
2973                 else if (gtl[i] == local_range)
2974                 {
2975                     gtl[i] = local_range - 1;
2976                     fsh[i] = 1;
2977                 }
2978             }
2979         }
2980     }
2981
2982     *global_to_local = gtl;
2983     *fraction_shift  = fsh;
2984 }
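/* The tables span indices 0..5*n-1 so that the positively shifted fractional
 * coordinates used during spreading never index out of range; e.g. with
 * n = 20 and local_start = 0 the entries gtl[0], gtl[20], gtl[40], ... all
 * fold back to local index 0.
 */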
2985
2986 static pme_spline_work_t *make_pme_spline_work(int order)
2987 {
2988     pme_spline_work_t *work;
2989
2990 #ifdef PME_SSE
2991     float  tmp[8];
2992     __m128 zero_SSE;
2993     int    of, i;
2994
2995     snew_aligned(work, 1, 16);
2996
2997     zero_SSE = _mm_setzero_ps();
2998
2999     /* Generate bit masks to mask out the unused grid entries,
3000      * as we only operate on 'order' of the 8 grid entries that are
3001      * loaded into 2 SSE float registers.
3002      */
3003     for (of = 0; of < 8-(order-1); of++)
3004     {
3005         for (i = 0; i < 8; i++)
3006         {
3007             tmp[i] = (i >= of && i < of+order ? 1 : 0);
3008         }
3009         work->mask_SSE0[of] = _mm_loadu_ps(tmp);
3010         work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
3011         work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE);
3012         work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE);
3013     }
3014 #else
3015     work = NULL;
3016 #endif
3017
3018     return work;
3019 }
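/* Example of the masks built above for order 4 and offset of = 1: tmp is
 * {0,1,1,1, 1,0,0,0}, so mask_SSE0 enables lanes 1..3 of the first register
 * and mask_SSE1 enables lane 0 of the second, selecting exactly the grid
 * entries of..of+3 out of the 8 loaded values.
 */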
3020
3021 static void
3022 gmx_pme_check_grid_restrictions(FILE *fplog, char dim, int nnodes, int *nk)
3023 {
3024     int nk_new;
3025
3026     if (*nk % nnodes != 0)
3027     {
3028         nk_new = nnodes*(*nk/nnodes + 1);
3029
3030         if (2*nk_new >= 3*(*nk))
3031         {
3032             gmx_fatal(FARGS, "The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
3033                       dim, *nk, dim, nnodes, dim);
3034         }
3035
3036         if (fplog != NULL)
3037         {
3038             fprintf(fplog, "\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
3039                     dim, *nk, dim, nnodes, dim, nk_new, dim);
3040         }
3041
3042         *nk = nk_new;
3043     }
3044 }
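/* Example: nk = 25 with 4 PME nodes would be rounded up to nk_new = 28
 * (= 4*(25/4 + 1)) with a note to fplog; only when the required increase is
 * 50% or more, e.g. nk = 5 with 4 nodes -> 8, does the fatal error trigger.
 */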
3045
3046 int gmx_pme_init(gmx_pme_t *         pmedata,
3047                  t_commrec *         cr,
3048                  int                 nnodes_major,
3049                  int                 nnodes_minor,
3050                  t_inputrec *        ir,
3051                  int                 homenr,
3052                  gmx_bool            bFreeEnergy,
3053                  gmx_bool            bReproducible,
3054                  int                 nthread)
3055 {
3056     gmx_pme_t pme = NULL;
3057
3058     pme_atomcomm_t *atc;
3059     ivec ndata;
3060
3061     if (debug)
3062     {
3063         fprintf(debug, "Creating PME data structures.\n");
3064     }
3065     snew(pme, 1);
3066
3067     pme->redist_init         = FALSE;
3068     pme->sum_qgrid_tmp       = NULL;
3069     pme->sum_qgrid_dd_tmp    = NULL;
3070     pme->buf_nalloc          = 0;
3071     pme->redist_buf_nalloc   = 0;
3072
3073     pme->nnodes              = 1;
3074     pme->bPPnode             = TRUE;
3075
3076     pme->nnodes_major        = nnodes_major;
3077     pme->nnodes_minor        = nnodes_minor;
3078
3079 #ifdef GMX_MPI
3080     if (nnodes_major*nnodes_minor > 1)
3081     {
3082         pme->mpi_comm = cr->mpi_comm_mygroup;
3083
3084         MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
3085         MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
3086         if (pme->nnodes != nnodes_major*nnodes_minor)
3087         {
3088             gmx_incons("PME node count mismatch");
3089         }
3090     }
3091     else
3092     {
3093         pme->mpi_comm = MPI_COMM_NULL;
3094     }
3095 #endif
3096
3097     if (pme->nnodes == 1)
3098     {
3099 #ifdef GMX_MPI
3100         pme->mpi_comm_d[0] = MPI_COMM_NULL;
3101         pme->mpi_comm_d[1] = MPI_COMM_NULL;
3102 #endif
3103         pme->ndecompdim   = 0;
3104         pme->nodeid_major = 0;
3105         pme->nodeid_minor = 0;
3109     }
3110     else
3111     {
3112         if (nnodes_minor == 1)
3113         {
3114 #ifdef GMX_MPI
3115             pme->mpi_comm_d[0] = pme->mpi_comm;
3116             pme->mpi_comm_d[1] = MPI_COMM_NULL;
3117 #endif
3118             pme->ndecompdim   = 1;
3119             pme->nodeid_major = pme->nodeid;
3120             pme->nodeid_minor = 0;
3121
3122         }
3123         else if (nnodes_major == 1)
3124         {
3125 #ifdef GMX_MPI
3126             pme->mpi_comm_d[0] = MPI_COMM_NULL;
3127             pme->mpi_comm_d[1] = pme->mpi_comm;
3128 #endif
3129             pme->ndecompdim   = 1;
3130             pme->nodeid_major = 0;
3131             pme->nodeid_minor = pme->nodeid;
3132         }
3133         else
3134         {
3135             if (pme->nnodes % nnodes_major != 0)
3136             {
3137                 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3138             }
3139             pme->ndecompdim = 2;
3140
3141 #ifdef GMX_MPI
3142             MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
3143                            pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
3144             MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
3145                            pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
3146
3147             MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
3148             MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
3149             MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
3150             MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
3151 #endif
3152         }
3153         pme->bPPnode = (cr->duty & DUTY_PP);
3154     }
3155
3156     pme->nthread = nthread;
3157
3158     if (ir->ePBC == epbcSCREW)
3159     {
3160         gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
3161     }
3162
3163     pme->bFEP        = ((ir->efep != efepNO) && bFreeEnergy);
3164     pme->nkx         = ir->nkx;
3165     pme->nky         = ir->nky;
3166     pme->nkz         = ir->nkz;
3167     pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
3168     pme->pme_order   = ir->pme_order;
3169     pme->epsilon_r   = ir->epsilon_r;
3170
3171     if (pme->pme_order > PME_ORDER_MAX)
3172     {
3173         gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3174                   pme->pme_order, PME_ORDER_MAX);
3175     }
3176
3177     /* Currently pme.c supports only the fft5d FFT code.
3178      * Therefore the grid always needs to be divisible by nnodes.
3179      * When the old 1D code is also supported again, change this check.
3180      *
3181      * This check should be done before calling gmx_pme_init
3182      * and fplog should be passed instead of stderr.
3183      *
3184        if (pme->ndecompdim >= 2)
3185      */
3186     if (pme->ndecompdim >= 1)
3187     {
3188         /*
3189            gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3190                                         'x',nnodes_major,&pme->nkx);
3191            gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3192                                         'y',nnodes_minor,&pme->nky);
3193          */
3194     }
3195
3196     if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
3197         pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
3198         pme->nkz <= pme->pme_order)
3199     {
3200         gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order", pme->pme_order);
3201     }
3202
3203     if (pme->nnodes > 1)
3204     {
3205         double imbal;
3206
3207 #ifdef GMX_MPI
3208         MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3209         MPI_Type_commit(&(pme->rvec_mpi));
3210 #endif
3211
3212         /* Note that the charge spreading and force gathering, which usually
3213          * takes about the same amount of time as FFT+solve_pme,
3214          * is always fully load balanced
3215          * (unless the charge distribution is inhomogeneous).
3216          */
3217
3218         imbal = pme_load_imbalance(pme);
3219         if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3220         {
3221             fprintf(stderr,
3222                     "\n"
3223                     "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3224                     "      For optimal PME load balancing\n"
3225                     "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3226                     "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3227                     "\n",
3228                     (int)((imbal-1)*100 + 0.5),
3229                     pme->nkx, pme->nky, pme->nnodes_major,
3230                     pme->nky, pme->nkz, pme->nnodes_minor);
3231         }
3232     }
3233
3234     /* For a non-divisible grid we need pme_order instead of pme_order-1 */
3235     /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3236      * y is always copied through a buffer: we don't need padding in z,
3237      * but we do need the overlap in x because of the communication order.
3238      */
3239     init_overlap_comm(&pme->overlap[0], pme->pme_order,
3240 #ifdef GMX_MPI
3241                       pme->mpi_comm_d[0],
3242 #endif
3243                       pme->nnodes_major, pme->nodeid_major,
3244                       pme->nkx,
3245                       (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
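    /* Worked example of the send-buffer size above (illustrative numbers,
     * assuming div_round_up() is the ceiling division its name suggests):
     * with nky = 64, nnodes_minor = 3, pme_order = 4 and nkz = 60 this is
     * (div_round_up(64,3) + 4)*(60 + 4 - 1) = (22 + 4)*63 = 1638 reals.
     */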
3246
3247     /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
3248      * We do this with an offset buffer of equal size, so we need to allocate
3249      * extra for the offset. That's what the (+1)*pme->nkz is for.
3250      */
3251     init_overlap_comm(&pme->overlap[1], pme->pme_order,
3252 #ifdef GMX_MPI
3253                       pme->mpi_comm_d[1],
3254 #endif
3255                       pme->nnodes_minor, pme->nodeid_minor,
3256                       pme->nky,
3257                       (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
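    /* With the same illustrative numbers (nkx = 64, nnodes_major = 2,
     * pme_order = 4, nkz = 60) the dim-1 buffer above becomes
     * (32 + 4 + 1)*60 = 2220 reals; the extra "+1" slab holds the offset
     * copy described in the comment above.
     */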
3258
3259     /* Check for a limitation of the (current) sum_fftgrid_dd code.
3260      * We only allow multiple communication pulses in dim 1, not in dim 0.
3261      */
3262     if (pme->nthread > 1 && (pme->overlap[0].noverlap_nodes > 1 ||
3263                              pme->nkx < pme->nnodes_major*pme->pme_order))
3264     {
3265         gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d). To resolve this issue, use fewer nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
3266                   pme->nkx/(double)pme->nnodes_major, pme->pme_order);
3267     }
3268
3269     snew(pme->bsp_mod[XX], pme->nkx);
3270     snew(pme->bsp_mod[YY], pme->nky);
3271     snew(pme->bsp_mod[ZZ], pme->nkz);
3272
3273     /* The required size of the interpolation grid, including overlap.
3274      * The allocated size (pmegrid_n?) might be slightly larger.
3275      */
3276     pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3277         pme->overlap[0].s2g0[pme->nodeid_major];
3278     pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3279         pme->overlap[1].s2g0[pme->nodeid_minor];
3280     pme->pmegrid_nz_base = pme->nkz;
3281     pme->pmegrid_nz      = pme->pmegrid_nz_base + pme->pme_order - 1;
3282     set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
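    /* Illustrative example: with nkz = 60 and pme_order = 4, pmegrid_nz is
     * 60 + 4 - 1 = 63 before set_grid_alignment() possibly pads it further
     * (e.g. for aligned SIMD access; the exact padding is decided elsewhere).
     */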
3283
3284     pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3285     pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3286     pme->pmegrid_start_iz = 0;
3287
3288     make_gridindex5_to_localindex(pme->nkx,
3289                                   pme->pmegrid_start_ix,
3290                                   pme->pmegrid_nx - (pme->pme_order-1),
3291                                   &pme->nnx, &pme->fshx);
3292     make_gridindex5_to_localindex(pme->nky,
3293                                   pme->pmegrid_start_iy,
3294                                   pme->pmegrid_ny - (pme->pme_order-1),
3295                                   &pme->nny, &pme->fshy);
3296     make_gridindex5_to_localindex(pme->nkz,
3297                                   pme->pmegrid_start_iz,
3298                                   pme->pmegrid_nz_base,
3299                                   &pme->nnz, &pme->fshz);
3300
3301     pmegrids_init(&pme->pmegridA,
3302                   pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3303                   pme->pmegrid_nz_base,
3304                   pme->pme_order,
3305                   pme->nthread,
3306                   pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3307                   pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3308
3309     pme->spline_work = make_pme_spline_work(pme->pme_order);
3310
3311     ndata[0] = pme->nkx;
3312     ndata[1] = pme->nky;
3313     ndata[2] = pme->nkz;
3314
3315     /* This routine will allocate the grid data to fit the FFTs */
3316     gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
3317                             &pme->fftgridA, &pme->cfftgridA,
3318                             pme->mpi_comm_d,
3319                             pme->overlap[0].s2g0, pme->overlap[1].s2g0,
3320                             bReproducible, pme->nthread);
3321
3322     if (bFreeEnergy)
3323     {
3324         pmegrids_init(&pme->pmegridB,
3325                       pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3326                       pme->pmegrid_nz_base,
3327                       pme->pme_order,
3328                       pme->nthread,
3329                       pme->nkx % pme->nnodes_major != 0,
3330                       pme->nky % pme->nnodes_minor != 0);
3331
3332         gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
3333                                 &pme->fftgridB, &pme->cfftgridB,
3334                                 pme->mpi_comm_d,
3335                                 pme->overlap[0].s2g0, pme->overlap[1].s2g0,
3336                                 bReproducible, pme->nthread);
3337     }
3338     else
3339     {
3340         pme->pmegridB.grid.grid = NULL;
3341         pme->fftgridB           = NULL;
3342         pme->cfftgridB          = NULL;
3343     }
3344
3345     if (!pme->bP3M)
3346     {
3347         /* Use plain SPME B-spline interpolation */
3348         make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3349     }
3350     else
3351     {
3352         /* Use the P3M grid-optimized influence function */
3353         make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3354     }
3355
3356     /* Use atc[0] for spreading */
3357     init_atomcomm(pme, &pme->atc[0], cr, nnodes_major > 1 ? 0 : 1, TRUE);
3358     if (pme->ndecompdim >= 2)
3359     {
3360         init_atomcomm(pme, &pme->atc[1], cr, 1, FALSE);
3361     }
3362
3363     if (pme->nnodes == 1)
3364     {
3365         pme->atc[0].n = homenr;
3366         pme_realloc_atomcomm_things(&pme->atc[0]);
3367     }
3368
3369     {
3370         int thread;
3371
3372         /* Use fft5d, order after FFT is y major, z, x minor */
3373
3374         snew(pme->work, pme->nthread);
3375         for (thread = 0; thread < pme->nthread; thread++)
3376         {
3377             realloc_work(&pme->work[thread], pme->nkx);
3378         }
3379     }
3380
3381     *pmedata = pme;
3382
3383     return 0;
3384 }
3385
3386 static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
3387 {
3388     int d, t;
3389
3390     for (d = 0; d < DIM; d++)
3391     {
3392         if (new->grid.n[d] > old->grid.n[d])
3393         {
3394             return;
3395         }
3396     }
3397
3398     sfree_aligned(new->grid.grid);
3399     new->grid.grid = old->grid.grid;
3400
3401     if (new->nthread > 1 && new->nthread == old->nthread)
3402     {
3403         sfree_aligned(new->grid_all);
3404         for (t = 0; t < new->nthread; t++)
3405         {
3406             new->grid_th[t].grid = old->grid_th[t].grid;
3407         }
3408     }
3409 }
3410
3411 int gmx_pme_reinit(gmx_pme_t *         pmedata,
3412                    t_commrec *         cr,
3413                    gmx_pme_t           pme_src,
3414                    const t_inputrec *  ir,
3415                    ivec                grid_size)
3416 {
3417     t_inputrec irc;
3418     int homenr;
3419     int ret;
3420
3421     irc     = *ir;
3422     irc.nkx = grid_size[XX];
3423     irc.nky = grid_size[YY];
3424     irc.nkz = grid_size[ZZ];
3425
3426     if (pme_src->nnodes == 1)
3427     {
3428         homenr = pme_src->atc[0].n;
3429     }
3430     else
3431     {
3432         homenr = -1;
3433     }
3434
3435     ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
3436                        &irc, homenr, pme_src->bFEP, FALSE, pme_src->nthread);
3437
3438     if (ret == 0)
3439     {
3440         /* We can easily reuse the allocated pme grids in pme_src */
3441         reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA);
3442         /* We would like to reuse the fft grids, but that's harder */
3443     }
3444
3445     return ret;
3446 }
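/* Usage sketch for gmx_pme_reinit() (illustrative only; pme_old, cr and ir
 * are hypothetical variables assumed to be in scope):
 *
 *     ivec      gs = { 48, 48, 48 };
 *     gmx_pme_t pme_new;
 *
 *     if (gmx_pme_reinit(&pme_new, cr, pme_old, ir, gs) == 0)
 *     {
 *         // pme_new reuses the grids of pme_old where possible
 *     }
 *
 * This is how gmx_pmeonly_switch() below creates one PME data structure per
 * grid size during PME tuning.
 */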
3447
3448
3449 static void copy_local_grid(gmx_pme_t pme,
3450                             pmegrids_t *pmegrids, int thread, real *fftgrid)
3451 {
3452     ivec local_fft_ndata, local_fft_offset, local_fft_size;
3453     int  fft_my, fft_mz;
3454     int  nsx, nsy, nsz;
3455     ivec nf;
3456     int  offx, offy, offz, x, y, z, i0, i0t;
3457     int  d;
3458     pmegrid_t *pmegrid;
3459     real *grid_th;
3460
3461     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3462                                    local_fft_ndata,
3463                                    local_fft_offset,
3464                                    local_fft_size);
3465     fft_my = local_fft_size[YY];
3466     fft_mz = local_fft_size[ZZ];
3467
3468     pmegrid = &pmegrids->grid_th[thread];
3469
3470     nsx = pmegrid->s[XX];
3471     nsy = pmegrid->s[YY];
3472     nsz = pmegrid->s[ZZ];
3473
3474     for (d = 0; d < DIM; d++)
3475     {
3476         nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3477                     local_fft_ndata[d] - pmegrid->offset[d]);
3478     }
3479
3480     offx = pmegrid->offset[XX];
3481     offy = pmegrid->offset[YY];
3482     offz = pmegrid->offset[ZZ];
3483
3484     /* Directly copy the non-overlapping parts of the local grids.
3485      * This also initializes the full grid.
3486      */
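    /* Worked example of the flat indexing below (illustrative numbers only):
     * with fft_my = 66, fft_mz = 64 and offsets (offx,offy,offz) = (8,0,0),
     * the grid point (x,y,z) = (1,2,3) lands at
     * i0 + z = ((8+1)*66 + (0+2))*64 + 0 + 3 = 38147 in fftgrid, while with
     * thread-grid strides nsy = nsz = 8 it sits at i0t + z = (1*8 + 2)*8 + 3 = 83.
     */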
3487     grid_th = pmegrid->grid;
3488     for (x = 0; x < nf[XX]; x++)
3489     {
3490         for (y = 0; y < nf[YY]; y++)
3491         {
3492             i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3493             i0t = (x*nsy + y)*nsz;
3494             for (z = 0; z < nf[ZZ]; z++)
3495             {
3496                 fftgrid[i0+z] = grid_th[i0t+z];
3497             }
3498         }
3499     }
3500 }
3501
3502 static void
3503 reduce_threadgrid_overlap(gmx_pme_t pme,
3504                           const pmegrids_t *pmegrids, int thread,
3505                           real *fftgrid, real *commbuf_x, real *commbuf_y)
3506 {
3507     ivec local_fft_ndata, local_fft_offset, local_fft_size;
3508     int  fft_nx, fft_ny, fft_nz;
3509     int  fft_my, fft_mz;
3510     int  buf_my = -1;
3511     int  nsx, nsy, nsz;
3512     ivec ne;
3513     int  offx, offy, offz, x, y, z, i0, i0t;
3514     int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
3515     gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
3516     gmx_bool bCommX, bCommY;
3517     int  d;
3518     int  thread_f;
3519     const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
3520     const real *grid_th;
3521     real *commbuf = NULL;
3522
3523     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3524                                    local_fft_ndata,
3525                                    local_fft_offset,
3526                                    local_fft_size);
3527     fft_nx = local_fft_ndata[XX];
3528     fft_ny = local_fft_ndata[YY];
3529     fft_nz = local_fft_ndata[ZZ];
3530
3531     fft_my = local_fft_size[YY];
3532     fft_mz = local_fft_size[ZZ];
3533
3534     /* This routine is called when all threads have finished spreading.
3535      * Here each thread sums grid contributions calculated by other threads
3536      * to the thread local grid volume.
3537      * To minimize the number of grid copying operations,
3538      * this routine sums immediately from the pmegrid to the fftgrid.
3539      */
3540
3541     /* Determine which part of the full node grid we should operate on;
3542      * this is our thread-local part of the full grid.
3543      */
3544     pmegrid = &pmegrids->grid_th[thread];
3545
3546     for (d = 0; d < DIM; d++)
3547     {
3548         ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3549                     local_fft_ndata[d]);
3550     }
3551
3552     offx = pmegrid->offset[XX];
3553     offy = pmegrid->offset[YY];
3554     offz = pmegrid->offset[ZZ];
3555
3556
3557     bClearBufX  = TRUE;
3558     bClearBufY  = TRUE;
3559     bClearBufXY = TRUE;
3560
3561     /* Now loop over all the thread data blocks that contribute
3562      * to the grid region we (our thread) are operating on.
3563      */
3564     /* Note that fft_nx/fft_ny is equal to the number of grid points
3565      * between the first point of our node grid and that of the next node.
3566      */
3567     for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
3568     {
3569         fx     = pmegrid->ci[XX] + sx;
3570         ox     = 0;
3571         bCommX = FALSE;
3572         if (fx < 0)
3573         {
3574             fx    += pmegrids->nc[XX];
3575             ox    -= fft_nx;
3576             bCommX = (pme->nnodes_major > 1);
3577         }
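        /* Illustrative wrap: with pmegrids->nc[XX] = 4, pmegrid->ci[XX] = 0 and
         * sx = -1, fx becomes -1 -> 3 and ox is shifted by -fft_nx, so the
         * contribution of the last thread block along x is added periodically.
         */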
3578         pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
3579         ox       += pmegrid_g->offset[XX];
3580         if (!bCommX)
3581         {
3582             tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
3583         }
3584         else
3585         {
3586             tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
3587         }
3588
3589         for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
3590         {
3591             fy     = pmegrid->ci[YY] + sy;
3592             oy     = 0;
3593             bCommY = FALSE;
3594             if (fy < 0)
3595             {
3596                 fy    += pmegrids->nc[YY];
3597                 oy    -= fft_ny;
3598                 bCommY = (pme->nnodes_minor > 1);
3599             }
3600             pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
3601             oy       += pmegrid_g->offset[YY];
3602             if (!bCommY)
3603             {
3604                 ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
3605             }
3606             else
3607             {
3608                 ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
3609             }
3610
3611             for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
3612             {
3613                 fz = pmegrid->ci[ZZ] + sz;
3614                 oz = 0;
3615                 if (fz < 0)
3616                 {
3617                     fz += pmegrids->nc[ZZ];
3618                     oz -= fft_nz;
3619                 }
3620                 pmegrid_g = &pmegrids->grid_th[fz];
3621                 oz       += pmegrid_g->offset[ZZ];
3622                 tz1       = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
3623
3624                 if (sx == 0 && sy == 0 && sz == 0)
3625                 {
3626                     /* We have already added our local contribution
3627                      * before calling this routine, so skip it here.
3628                      */
3629                     continue;
3630                 }
3631
3632                 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
3633
3634                 pmegrid_f = &pmegrids->grid_th[thread_f];
3635
3636                 grid_th = pmegrid_f->grid;
3637
3638                 nsx = pmegrid_f->s[XX];
3639                 nsy = pmegrid_f->s[YY];
3640                 nsz = pmegrid_f->s[ZZ];
3641
3642 #ifdef DEBUG_PME_REDUCE
3643                 printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3644                        pme->nodeid, thread, thread_f,
3645                        pme->pmegrid_start_ix,
3646                        pme->pmegrid_start_iy,
3647                        pme->pmegrid_start_iz,
3648                        sx, sy, sz,
3649                        offx-ox, tx1-ox, offx, tx1,
3650                        offy-oy, ty1-oy, offy, ty1,
3651                        offz-oz, tz1-oz, offz, tz1);
3652 #endif
3653
3654                 if (!(bCommX || bCommY))
3655                 {
3656                     /* Copy from the thread local grid to the node grid */
3657                     for (x = offx; x < tx1; x++)
3658                     {
3659                         for (y = offy; y < ty1; y++)
3660                         {
3661                             i0  = (x*fft_my + y)*fft_mz;
3662                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3663                             for (z = offz; z < tz1; z++)
3664                             {
3665                                 fftgrid[i0+z] += grid_th[i0t+z];
3666                             }
3667                         }
3668                     }
3669                 }
3670                 else
3671                 {
3672                     /* The order of this conditional decides
3673                      * where the corner volume gets stored with x+y decomp.
3674                      */
3675                     if (bCommY)
3676                     {
3677                         commbuf = commbuf_y;
3678                         buf_my  = ty1 - offy;
3679                         if (bCommX)
3680                         {
3681                             /* We index commbuf modulo the local grid size */
3682                             commbuf += buf_my*fft_nx*fft_nz;
3683
3684                             bClearBuf   = bClearBufXY;
3685                             bClearBufXY = FALSE;
3686                         }
3687                         else
3688                         {
3689                             bClearBuf  = bClearBufY;
3690                             bClearBufY = FALSE;
3691                         }
3692                     }
3693                     else
3694                     {
3695                         commbuf    = commbuf_x;
3696                         buf_my     = fft_ny;
3697                         bClearBuf  = bClearBufX;
3698                         bClearBufX = FALSE;
3699                     }
3700
3701                     /* Copy to the communication buffer */
3702                     for (x = offx; x < tx1; x++)
3703                     {
3704                         for (y = offy; y < ty1; y++)
3705                         {
3706                             i0  = (x*buf_my + y)*fft_nz;
3707                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3708
3709                             if (bClearBuf)
3710                             {
3711                                 /* First access of commbuf, initialize it */
3712                                 for (z = offz; z < tz1; z++)
3713                                 {
3714                                     commbuf[i0+z]  = grid_th[i0t+z];
3715                                 }
3716                             }
3717                             else
3718                             {
3719                                 for (z = offz; z < tz1; z++)
3720                                 {
3721                                     commbuf[i0+z] += grid_th[i0t+z];
3722                                 }
3723                             }
3724                         }
3725                     }
3726                 }
3727             }
3728         }
3729     }
3730 }
3731
3732
3733 static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
3734 {
3735     ivec local_fft_ndata, local_fft_offset, local_fft_size;
3736     pme_overlap_t *overlap;
3737     int  send_index0, send_nindex;
3738     int  recv_nindex;
3739 #ifdef GMX_MPI
3740     MPI_Status stat;
3741 #endif
3742     int  send_size_y, recv_size_y;
3743     int  ipulse, send_id, recv_id, datasize, gridsize, size_yx;
3744     real *sendptr, *recvptr;
3745     int  x, y, z, indg, indb;
3746
3747     /* Note that this routine is only used for forward communication.
3748      * Since the force gathering, unlike the charge spreading,
3749      * can be trivially parallelized over the particles,
3750      * the backwards process is much simpler and can use the "old"
3751      * communication setup.
3752      */
3753
3754     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3755                                    local_fft_ndata,
3756                                    local_fft_offset,
3757                                    local_fft_size);
3758
3759     if (pme->nnodes_minor > 1)
3760     {
3761         /* Minor dimension */
3762         overlap = &pme->overlap[1];
3763
3764         if (pme->nnodes_major > 1)
3765         {
3766             size_yx = pme->overlap[0].comm_data[0].send_nindex;
3767         }
3768         else
3769         {
3770             size_yx = 0;
3771         }
3772         datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
3773
3774         send_size_y = overlap->send_size;
3775
3776         for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
3777         {
3778             send_id       = overlap->send_id[ipulse];
3779             recv_id       = overlap->recv_id[ipulse];
3780             send_index0   =
3781                 overlap->comm_data[ipulse].send_index0 -
3782                 overlap->comm_data[0].send_index0;
3783             send_nindex   = overlap->comm_data[ipulse].send_nindex;
3784             /* We don't use recv_index0, as we always receive starting at 0 */
3785             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
3786             recv_size_y   = overlap->comm_data[ipulse].recv_size;
3787
3788             sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
3789             recvptr = overlap->recvbuf;
3790
3791 #ifdef GMX_MPI
3792             MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
3793                          send_id, ipulse,
3794                          recvptr, recv_size_y*datasize, GMX_MPI_REAL,
3795                          recv_id, ipulse,
3796                          overlap->mpi_comm, &stat);
3797 #endif
3798
3799             for (x = 0; x < local_fft_ndata[XX]; x++)
3800             {
3801                 for (y = 0; y < recv_nindex; y++)
3802                 {
3803                     indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3804                     indb = (x*recv_size_y        + y)*local_fft_ndata[ZZ];
3805                     for (z = 0; z < local_fft_ndata[ZZ]; z++)
3806                     {
3807                         fftgrid[indg+z] += recvptr[indb+z];
3808                     }
3809                 }
3810             }
3811
3812             if (pme->nnodes_major > 1)
3813             {
3814                 /* Copy from the received buffer to the send buffer for dim 0 */
3815                 sendptr = pme->overlap[0].sendbuf;
3816                 for (x = 0; x < size_yx; x++)
3817                 {
3818                     for (y = 0; y < recv_nindex; y++)
3819                     {
3820                         indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3821                         indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
3822                         for (z = 0; z < local_fft_ndata[ZZ]; z++)
3823                         {
3824                             sendptr[indg+z] += recvptr[indb+z];
3825                         }
3826                     }
3827                 }
3828             }
3829         }
3830     }
3831
3832     /* We only support a single pulse here.
3833      * This is not a severe limitation, as this code is only used
3834      * with OpenMP and with OpenMP the (PME) domains can be larger.
3835      */
3836     if (pme->nnodes_major > 1)
3837     {
3838         /* Major dimension */
3839         overlap = &pme->overlap[0];
3840
3841         datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
3842         gridsize = local_fft_size[YY] *local_fft_size[ZZ];
3843
3844         ipulse = 0;
3845
3846         send_id       = overlap->send_id[ipulse];
3847         recv_id       = overlap->recv_id[ipulse];
3848         send_nindex   = overlap->comm_data[ipulse].send_nindex;
3849         /* We don't use recv_index0, as we always receive starting at 0 */
3850         recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
3851
3852         sendptr = overlap->sendbuf;
3853         recvptr = overlap->recvbuf;
3854
3855         if (debug != NULL)
3856         {
3857             fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
3858                     send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
3859         }
3860
3861 #ifdef GMX_MPI
3862         MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
3863                      send_id, ipulse,
3864                      recvptr, recv_nindex*datasize, GMX_MPI_REAL,
3865                      recv_id, ipulse,
3866                      overlap->mpi_comm, &stat);
3867 #endif
3868
3869         for (x = 0; x < recv_nindex; x++)
3870         {
3871             for (y = 0; y < local_fft_ndata[YY]; y++)
3872             {
3873                 indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
3874                 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3875                 for (z = 0; z < local_fft_ndata[ZZ]; z++)
3876                 {
3877                     fftgrid[indg+z] += recvptr[indb+z];
3878                 }
3879             }
3880         }
3881     }
3882 }
3883
3884
3885 static void spread_on_grid(gmx_pme_t pme,
3886                            pme_atomcomm_t *atc, pmegrids_t *grids,
3887                            gmx_bool bCalcSplines, gmx_bool bSpread,
3888                            real *fftgrid)
3889 {
3890     int nthread, thread;
3891 #ifdef PME_TIME_THREADS
3892     gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
3893     static double cs1     = 0, cs2 = 0, cs3 = 0;
3894     static double cs1a[6] = {0, 0, 0, 0, 0, 0};
3895     static int cnt        = 0;
3896 #endif
3897
3898     nthread = pme->nthread;
3899     assert(nthread > 0);
3900
3901 #ifdef PME_TIME_THREADS
3902     c1 = omp_cyc_start();
3903 #endif
3904     if (bCalcSplines)
3905     {
3906 #pragma omp parallel for num_threads(nthread) schedule(static)
3907         for (thread = 0; thread < nthread; thread++)
3908         {
3909             int start, end;
3910
3911             start = atc->n* thread   /nthread;
3912             end   = atc->n*(thread+1)/nthread;
3913
3914             /* Compute fftgrid index for all atoms,
3915              * with help of some extra variables.
3916              */
3917             calc_interpolation_idx(pme, atc, start, end, thread);
3918         }
3919     }
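    /* Illustrative split of the static partitioning above: with atc->n = 10
     * and nthread = 4 the (start,end) ranges are (0,2), (2,5), (5,7), (7,10),
     * so the remainder atoms are distributed over the threads.
     */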
3920 #ifdef PME_TIME_THREADS
3921     c1   = omp_cyc_end(c1);
3922     cs1 += (double)c1;
3923 #endif
3924
3925 #ifdef PME_TIME_THREADS
3926     c2 = omp_cyc_start();
3927 #endif
3928 #pragma omp parallel for num_threads(nthread) schedule(static)
3929     for (thread = 0; thread < nthread; thread++)
3930     {
3931         splinedata_t *spline;
3932         pmegrid_t *grid;
3933
3934         /* make local bsplines  */
3935         if (grids == NULL || grids->nthread == 1)
3936         {
3937             spline = &atc->spline[0];
3938
3939             spline->n = atc->n;
3940
3941             grid = &grids->grid;
3942         }
3943         else
3944         {
3945             spline = &atc->spline[thread];
3946
3947             make_thread_local_ind(atc, thread, spline);
3948
3949             grid = &grids->grid_th[thread];
3950         }
3951
3952         if (bCalcSplines)
3953         {
3954             make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
3955                           atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP);
3956         }
3957
3958         if (bSpread)
3959         {
3960             /* put local atoms on grid. */
3961 #ifdef PME_TIME_SPREAD
3962             ct1a = omp_cyc_start();
3963 #endif
3964             spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
3965
3966             if (grids->nthread > 1)
3967             {
3968                 copy_local_grid(pme, grids, thread, fftgrid);
3969             }
3970 #ifdef PME_TIME_SPREAD
3971             ct1a          = omp_cyc_end(ct1a);
3972             cs1a[thread] += (double)ct1a;
3973 #endif
3974         }
3975     }
3976 #ifdef PME_TIME_THREADS
3977     c2   = omp_cyc_end(c2);
3978     cs2 += (double)c2;
3979 #endif
3980
3981     if (bSpread && grids->nthread > 1)
3982     {
3983 #ifdef PME_TIME_THREADS
3984         c3 = omp_cyc_start();
3985 #endif
3986 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
3987         for (thread = 0; thread < grids->nthread; thread++)
3988         {
3989             reduce_threadgrid_overlap(pme, grids, thread,
3990                                       fftgrid,
3991                                       pme->overlap[0].sendbuf,
3992                                       pme->overlap[1].sendbuf);
3993         }
3994 #ifdef PME_TIME_THREADS
3995         c3   = omp_cyc_end(c3);
3996         cs3 += (double)c3;
3997 #endif
3998
3999         if (pme->nnodes > 1)
4000         {
4001             /* Communicate the overlapping part of the fftgrid */
4002             sum_fftgrid_dd(pme, fftgrid);
4003         }
4004     }
4005
4006 #ifdef PME_TIME_THREADS
4007     cnt++;
4008     if (cnt % 20 == 0)
4009     {
4010         printf("idx %.2f spread %.2f red %.2f",
4011                cs1*1e-9, cs2*1e-9, cs3*1e-9);
4012 #ifdef PME_TIME_SPREAD
4013         for (thread = 0; thread < nthread; thread++)
4014         {
4015             printf(" %.2f", cs1a[thread]*1e-9);
4016         }
4017 #endif
4018         printf("\n");
4019     }
4020 #endif
4021 }
4022
4023
4024 static void dump_grid(FILE *fp,
4025                       int sx, int sy, int sz, int nx, int ny, int nz,
4026                       int my, int mz, const real *g)
4027 {
4028     int x, y, z;
4029
4030     for (x = 0; x < nx; x++)
4031     {
4032         for (y = 0; y < ny; y++)
4033         {
4034             for (z = 0; z < nz; z++)
4035             {
4036                 fprintf(fp, "%2d %2d %2d %6.3f\n",
4037                         sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
4038             }
4039         }
4040     }
4041 }
4042
4043 static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
4044 {
4045     ivec local_fft_ndata, local_fft_offset, local_fft_size;
4046
4047     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
4048                                    local_fft_ndata,
4049                                    local_fft_offset,
4050                                    local_fft_size);
4051
4052     dump_grid(stderr,
4053               pme->pmegrid_start_ix,
4054               pme->pmegrid_start_iy,
4055               pme->pmegrid_start_iz,
4056               pme->pmegrid_nx-pme->pme_order+1,
4057               pme->pmegrid_ny-pme->pme_order+1,
4058               pme->pmegrid_nz-pme->pme_order+1,
4059               local_fft_size[YY],
4060               local_fft_size[ZZ],
4061               fftgrid);
4062 }
4063
4064
4065 void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
4066 {
4067     pme_atomcomm_t *atc;
4068     pmegrids_t *grid;
4069
4070     if (pme->nnodes > 1)
4071     {
4072         gmx_incons("gmx_pme_calc_energy called in parallel");
4073     }
4074     if (pme->bFEP)
4075     {
4076         gmx_incons("gmx_pme_calc_energy with free energy");
4077     }
4078
4079     atc            = &pme->atc_energy;
4080     atc->nthread   = 1;
4081     if (atc->spline == NULL)
4082     {
4083         snew(atc->spline, atc->nthread);
4084     }
4085     atc->nslab     = 1;
4086     atc->bSpread   = TRUE;
4087     atc->pme_order = pme->pme_order;
4088     atc->n         = n;
4089     pme_realloc_atomcomm_things(atc);
4090     atc->x         = x;
4091     atc->q         = q;
4092
4093     /* We only use the A-charges grid */
4094     grid = &pme->pmegridA;
4095
4096     spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
4097
4098     *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
4099 }
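/* Usage sketch (illustrative only; pme, n, x and q are hypothetical variables
 * assumed to be in scope, serial runs without free energy only, as the checks
 * above enforce):
 *
 *     real V;
 *
 *     gmx_pme_calc_energy(pme, n, x, q, &V);
 *     // V now holds the PME mesh energy of the n charges in x
 */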
4100
4101
4102 static void reset_pmeonly_counters(t_commrec *cr, gmx_wallcycle_t wcycle,
4103                                    t_nrnb *nrnb, t_inputrec *ir,
4104                                    gmx_large_int_t step)
4105 {
4106     /* Reset all the counters related to performance over the run */
4107     wallcycle_stop(wcycle, ewcRUN);
4108     wallcycle_reset_all(wcycle);
4109     init_nrnb(nrnb);
4110     if (ir->nsteps >= 0)
4111     {
4112         /* ir->nsteps is not used here, but we update it for consistency */
4113         ir->nsteps -= step - ir->init_step;
4114     }
4115     ir->init_step = step;
4116     wallcycle_start(wcycle, ewcRUN);
4117 }
4118
4119
4120 static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
4121                                ivec grid_size,
4122                                t_commrec *cr, t_inputrec *ir,
4123                                gmx_pme_t *pme_ret)
4124 {
4125     int ind;
4126     gmx_pme_t pme = NULL;
4127
4128     ind = 0;
4129     while (ind < *npmedata)
4130     {
4131         pme = (*pmedata)[ind];
4132         if (pme->nkx == grid_size[XX] &&
4133             pme->nky == grid_size[YY] &&
4134             pme->nkz == grid_size[ZZ])
4135         {
4136             *pme_ret = pme;
4137
4138             return;
4139         }
4140
4141         ind++;
4142     }
4143
4144     (*npmedata)++;
4145     srenew(*pmedata, *npmedata);
4146
4147     /* Generate a new PME data structure, copying part of the old pointers */
4148     gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
4149
4150     *pme_ret = (*pmedata)[ind];
4151 }
4152
4153
4154 int gmx_pmeonly(gmx_pme_t pme,
4155                 t_commrec *cr,    t_nrnb *nrnb,
4156                 gmx_wallcycle_t wcycle,
4157                 real ewaldcoeff,  gmx_bool bGatherOnly,
4158                 t_inputrec *ir)
4159 {
4160     int npmedata;
4161     gmx_pme_t *pmedata;
4162     gmx_pme_pp_t pme_pp;
4163     int  ret;
4164     int  natoms;
4165     matrix box;
4166     rvec *x_pp      = NULL, *f_pp = NULL;
4167     real *chargeA   = NULL, *chargeB = NULL;
4168     real lambda     = 0;
4169     int  maxshift_x = 0, maxshift_y = 0;
4170     real energy, dvdlambda;
4171     matrix vir;
4172     float cycles;
4173     int  count;
4174     gmx_bool bEnerVir;
4175     gmx_large_int_t step, step_rel;
4176     ivec grid_switch;
4177
4178     /* This data is only used with PME tuning, i.e. switching PME grids */
4179     npmedata = 1;
4180     snew(pmedata, npmedata);
4181     pmedata[0] = pme;
4182
4183     pme_pp = gmx_pme_pp_init(cr);
4184
4185     init_nrnb(nrnb);
4186
4187     count = 0;
4188     do /****** this is a quasi-loop over time steps! */
4189     {
4190         /* The reason for having a loop here is PME grid tuning/switching */
4191         do
4192         {
4193             /* Domain decomposition */
4194             ret = gmx_pme_recv_q_x(pme_pp,
4195                                    &natoms,
4196                                    &chargeA, &chargeB, box, &x_pp, &f_pp,
4197                                    &maxshift_x, &maxshift_y,
4198                                    &pme->bFEP, &lambda,
4199                                    &bEnerVir,
4200                                    &step,
4201                                    grid_switch, &ewaldcoeff);
4202
4203             if (ret == pmerecvqxSWITCHGRID)
4204             {
4205                 /* Switch the PME grid to grid_switch */
4206                 gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
4207             }
4208
4209             if (ret == pmerecvqxRESETCOUNTERS)
4210             {
4211                 /* Reset the cycle and flop counters */
4212                 reset_pmeonly_counters(cr, wcycle, nrnb, ir, step);
4213             }
4214         }
4215         while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
4216
4217         if (ret == pmerecvqxFINISH)
4218         {
4219             /* We should stop: break out of the loop */
4220             break;
4221         }
4222
4223         step_rel = step - ir->init_step;
4224
4225         if (count == 0)
4226         {
4227             wallcycle_start(wcycle, ewcRUN);
4228         }
4229
4230         wallcycle_start(wcycle, ewcPMEMESH);
4231
4232         dvdlambda = 0;
4233         clear_mat(vir);
4234         gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box,
4235                    cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff,
4236                    &energy, lambda, &dvdlambda,
4237                    GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
4238
4239         cycles = wallcycle_stop(wcycle, ewcPMEMESH);
4240
4241         gmx_pme_send_force_vir_ener(pme_pp,
4242                                     f_pp, vir, energy, dvdlambda,
4243                                     cycles);
4244
4245         count++;
4246     } /***** end of quasi-loop, we stop with the break above */
4247     while (TRUE);
4248
4249     return 0;
4250 }
4251
4252 int gmx_pme_do(gmx_pme_t pme,
4253                int start,       int homenr,
4254                rvec x[],        rvec f[],
4255                real *chargeA,   real *chargeB,
4256                matrix box, t_commrec *cr,
4257                int  maxshift_x, int maxshift_y,
4258                t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
4259                matrix vir,      real ewaldcoeff,
4260                real *energy,    real lambda,
4261                real *dvdlambda, int flags)
4262 {
4263     int     q, d, i, j, ntot, npme;
4264     int     nx, ny, nz;
4265     int     n_d, local_ny;
4266     pme_atomcomm_t *atc = NULL;
4267     pmegrids_t *pmegrid = NULL;
4268     real    *grid       = NULL;
4269     real    *ptr;
4270     rvec    *x_d, *f_d;
4271     real    *charge = NULL, *q_d;
4272     real    energy_AB[2];
4273     matrix  vir_AB[2];
4274     gmx_bool bClearF;
4275     gmx_parallel_3dfft_t pfft_setup;
4276     real *  fftgrid;
4277     t_complex * cfftgrid;
4278     int     thread;
4279     const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4280     const gmx_bool bCalcF       = flags & GMX_PME_CALC_F;
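    /* Illustrative flag usage: within this routine, flags is tested against
     * GMX_PME_SPREAD_Q, GMX_PME_SOLVE, GMX_PME_CALC_F and GMX_PME_CALC_ENER_VIR;
     * gmx_pmeonly() above calls it with
     * flags = GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0).
     */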
4281
4282     assert(pme->nnodes > 0);
4283     assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4284
4285     if (pme->nnodes > 1)
4286     {
4287         atc      = &pme->atc[0];
4288         atc->npd = homenr;
4289         if (atc->npd > atc->pd_nalloc)
4290         {
4291             atc->pd_nalloc = over_alloc_dd(atc->npd);
4292             srenew(atc->pd, atc->pd_nalloc);
4293         }
4294         atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4295     }
4296     else
4297     {
4298         /* This could be necessary for TPI */
4299         pme->atc[0].n = homenr;
4300     }
4301
4302     for (q = 0; q < (pme->bFEP ? 2 : 1); q++)
4303     {
4304         if (q == 0)
4305         {
4306             pmegrid    = &pme->pmegridA;
4307             fftgrid    = pme->fftgridA;
4308             cfftgrid   = pme->cfftgridA;
4309             pfft_setup = pme->pfft_setupA;
4310             charge     = chargeA+start;
4311         }
4312         else
4313         {
4314             pmegrid    = &pme->pmegridB;
4315             fftgrid    = pme->fftgridB;
4316             cfftgrid   = pme->cfftgridB;
4317             pfft_setup = pme->pfft_setupB;
4318             charge     = chargeB+start;
4319         }
4320         grid = pmegrid->grid.grid;
4321         /* Unpack structure */
4322         if (debug)
4323         {
4324             fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
4325                     cr->nnodes, cr->nodeid);
4326             fprintf(debug, "Grid = %p\n", (void*)grid);
4327             if (grid == NULL)
4328             {
4329                 gmx_fatal(FARGS, "No grid!");
4330             }
4331         }
4332         where();
4333
4334         m_inv_ur0(box, pme->recipbox);
4335
4336         if (pme->nnodes == 1)
4337         {
4338             atc = &pme->atc[0];
4339             if (DOMAINDECOMP(cr))
4340             {
4341                 atc->n = homenr;
4342                 pme_realloc_atomcomm_things(atc);
4343             }
4344             atc->x = x;
4345             atc->q = charge;
4346             atc->f = f;
4347         }
4348         else
4349         {
4350             wallcycle_start(wcycle, ewcPME_REDISTXF);
4351             for (d = pme->ndecompdim-1; d >= 0; d--)
4352             {
4353                 if (d == pme->ndecompdim-1)
4354                 {
4355                     n_d = homenr;
4356                     x_d = x + start;
4357                     q_d = charge;
4358                 }
4359                 else
4360                 {
4361                     n_d = pme->atc[d+1].n;
4362                     x_d = atc->x;
4363                     q_d = atc->q;
4364                 }
4365                 atc      = &pme->atc[d];
4366                 atc->npd = n_d;
4367                 if (atc->npd > atc->pd_nalloc)
4368                 {
4369                     atc->pd_nalloc = over_alloc_dd(atc->npd);
4370                     srenew(atc->pd, atc->pd_nalloc);
4371                 }
4372                 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4373                 pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
4374                 where();
4375
4376                 /* Redistribute x (only once) and qA or qB */
4377                 if (DOMAINDECOMP(cr))
4378                 {
4379                     dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc);
4380                 }
4381                 else
4382                 {
4383                     pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc);
4384                 }
4385             }
4386             where();
4387
4388             wallcycle_stop(wcycle, ewcPME_REDISTXF);
4389         }
4390
4391         if (debug)
4392         {
4393             fprintf(debug, "Node= %6d, pme local particles=%6d\n",
4394                     cr->nodeid, atc->n);
4395         }
4396
4397         if (flags & GMX_PME_SPREAD_Q)
4398         {
4399             wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4400
4401             /* Spread the charges on a grid */
4402             spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid);
4403
4404             if (q == 0)
4405             {
4406                 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
4407             }
4408             inc_nrnb(nrnb, eNR_SPREADQBSP,
4409                      pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4410
4411             if (pme->nthread == 1)
4412             {
4413                 wrap_periodic_pmegrid(pme, grid);
4414
4415                 /* sum contributions to local grid from other nodes */
4416 #ifdef GMX_MPI
4417                 if (pme->nnodes > 1)
4418                 {
4419                     gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
4420                     where();
4421                 }
4422 #endif
4423
4424                 copy_pmegrid_to_fftgrid(pme, grid, fftgrid);
4425             }
4426
4427             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4428
4429             /*
4430                dump_local_fftgrid(pme,fftgrid);
4431                exit(0);
4432              */
4433         }
4434
4435         /* Here we start a large thread parallel region */
4436 #pragma omp parallel num_threads(pme->nthread) private(thread)
4437         {
4438             thread = gmx_omp_get_thread_num();
4439             if (flags & GMX_PME_SOLVE)
4440             {
4441                 int loop_count;
4442
4443                 /* do 3d-fft */
4444                 if (thread == 0)
4445                 {
4446                     wallcycle_start(wcycle, ewcPME_FFT);
4447                 }
4448                 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
4449                                            fftgrid, cfftgrid, thread, wcycle);
4450                 if (thread == 0)
4451                 {
4452                     wallcycle_stop(wcycle, ewcPME_FFT);
4453                 }
4454                 where();
4455
4456                 /* solve in k-space for our local cells */
4457                 if (thread == 0)
4458                 {
4459                     wallcycle_start(wcycle, ewcPME_SOLVE);
4460                 }
4461                 loop_count =
4462                     solve_pme_yzx(pme, cfftgrid, ewaldcoeff,
4463                                   box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4464                                   bCalcEnerVir,
4465                                   pme->nthread, thread);
4466                 if (thread == 0)
4467                 {
4468                     wallcycle_stop(wcycle, ewcPME_SOLVE);
4469                     where();
4470                     inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
4471                 }
4472             }
4473
4474             if (bCalcF)
4475             {
4476                 /* do 3d-invfft */
4477                 if (thread == 0)
4478                 {
4479                     where();
4480                     wallcycle_start(wcycle, ewcPME_FFT);
4481                 }
4482                 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
4483                                            cfftgrid, fftgrid, thread, wcycle);
4484                 if (thread == 0)
4485                 {
4486                     wallcycle_stop(wcycle, ewcPME_FFT);
4487
4488                     where();
4489
4490                     if (pme->nodeid == 0)
4491                     {
4492                         ntot  = pme->nkx*pme->nky*pme->nkz;
4493                         npme  = ntot*log((real)ntot)/log(2.0);
4494                         inc_nrnb(nrnb, eNR_FFT, 2*npme);
4495                     }
4496
4497                     wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4498                 }
4499
4500                 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread);
4501             }
4502         }
4503         /* End of thread parallel section.
4504          * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4505          */
4506
4507         if (bCalcF)
4508         {
4509             /* distribute local grid to all nodes */
4510 #ifdef GMX_MPI
4511             if (pme->nnodes > 1)
4512             {
4513                 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
4514             }
4515 #endif
4516             where();
4517
4518             unwrap_periodic_pmegrid(pme, grid);
4519
4520             /* interpolate forces for our local atoms */
4521
4522             where();
4523
4524             /* If we are running without parallelization,
4525              * atc->f is the actual force array, not a buffer,
4526              * so we should not clear it.
4527              */
4528             bClearF = (q == 0 && PAR(cr));
4529 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4530             for (thread = 0; thread < pme->nthread; thread++)
4531             {
4532                 gather_f_bsplines(pme, grid, bClearF, atc,
4533                                   &atc->spline[thread],
4534                                   pme->bFEP ? (q == 0 ? 1.0-lambda : lambda) : 1.0);
4535             }
4536
4537             where();
4538
4539             inc_nrnb(nrnb, eNR_GATHERFBSP,
4540                      pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4541             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4542         }
4543
4544         if (bCalcEnerVir)
4545         {
4546             /* This should only be called on the master thread
4547              * and after the threads have synchronized.
4548              */
4549             get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
4550         }
4551     } /* of q-loop */
4552
4553     if (bCalcF && pme->nnodes > 1)
4554     {
4555         wallcycle_start(wcycle, ewcPME_REDISTXF);
4556         for (d = 0; d < pme->ndecompdim; d++)
4557         {
4558             atc = &pme->atc[d];
4559             if (d == pme->ndecompdim - 1)
4560             {
4561                 n_d = homenr;
4562                 f_d = f + start;
4563             }
4564             else
4565             {
4566                 n_d = pme->atc[d+1].n;
4567                 f_d = pme->atc[d+1].f;
4568             }
4569             if (DOMAINDECOMP(cr))
4570             {
4571                 dd_pmeredist_f(pme, atc, n_d, f_d,
4572                                d == pme->ndecompdim-1 && pme->bPPnode);
4573             }
4574             else
4575             {
4576                 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
4577             }
4578         }
4579
4580         wallcycle_stop(wcycle, ewcPME_REDISTXF);
4581     }
4582     where();
4583
4584     if (bCalcEnerVir)
4585     {
4586         if (!pme->bFEP)
4587         {
4588             *energy = energy_AB[0];
4589             m_add(vir, vir_AB[0], vir);
4590         }
4591         else
4592         {
4593             *energy     = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
4594             *dvdlambda += energy_AB[1] - energy_AB[0];
4595             for (i = 0; i < DIM; i++)
4596             {
4597                 for (j = 0; j < DIM; j++)
4598                 {
4599                     vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
4600                         lambda*vir_AB[1][i][j];
4601                 }
4602             }
4603         }
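        /* Worked example of the mixing above (illustrative numbers): with
         * lambda = 0.25, energy_AB[0] = -100 and energy_AB[1] = -90 the mixed
         * energy is 0.75*(-100) + 0.25*(-90) = -97.5 and dvdlambda is
         * incremented by -90 - (-100) = 10.
         */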
4604     }
4605     else
4606     {
4607         *energy = 0;
4608     }
4609
4610     if (debug)
4611     {
4612         fprintf(debug, "PME mesh energy: %g\n", *energy);
4613     }
4614
4615     return 0;
4616 }