src/gromacs/mdlib/pme.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /* IMPORTANT FOR DEVELOPERS:
  38  *
  39  * Triclinic pme stuff isn't entirely trivial, and we've experienced
  40  * some bugs during development (many of them due to me). To avoid
  41  * this in the future, please check the following things if you make
  42  * changes in this file:
  43  *
  44  * 1. You should obtain identical (at least to the PME precision)
  45  *    energies, forces, and virial for
  46  *    a rectangular box and a triclinic one where the z (or y) axis is
  47  *    tilted a whole box side. For instance you could use these boxes:
  48  *
  49  *    rectangular       triclinic
  50  *     2  0  0           2  0  0
  51  *     0  2  0           0  2  0
  52  *     0  0  6           2  2  6
  53  *
  54  * 2. You should check the energy conservation in a triclinic box.
  55  *
 * This might seem like overkill, but better safe than sorry.
  57  * /Erik 001109
  58  */
  59
  60 #ifdef HAVE_CONFIG_H
  61 #include <config.h>
  62 #endif
  63
  64 #include <assert.h>
  65 #include <math.h>
  66 #include <stdio.h>
  67 #include <stdlib.h>
  68 #include <string.h>
  69
  70 #include "typedefs.h"
  71 #include "txtdump.h"
  72 #include "gromacs/math/vec.h"
  73 #include "gromacs/utility/smalloc.h"
  74 #include "coulomb.h"
  75 #include "gromacs/utility/fatalerror.h"
  76 #include "pme.h"
  77 #include "network.h"
  78 #include "physics.h"
  79 #include "nrnb.h"
  80 #include "macros.h"
  81
  82 #include "gromacs/legacyheaders/types/commrec.h"
  83 #include "gromacs/fft/parallel_3dfft.h"
  84 #include "gromacs/utility/futil.h"
  85 #include "gromacs/fileio/pdbio.h"
  86 #include "gromacs/math/gmxcomplex.h"
  87 #include "gromacs/timing/cyclecounter.h"
  88 #include "gromacs/timing/wallcycle.h"
  89 #include "gromacs/utility/gmxmpi.h"
  90 #include "gromacs/utility/gmxomp.h"
  91
  92 /* Include the SIMD macro file and then check for support */
  93 #include "gromacs/simd/simd.h"
  94 #include "gromacs/simd/simd_math.h"
  95 #ifdef GMX_SIMD_HAVE_REAL
  96 /* Turn on arbitrary width SIMD intrinsics for PME solve */
  97 #    define PME_SIMD_SOLVE
  98 #endif
  99
 100 #define PME_GRID_QA    0 /* Gridindex for A-state for Q */
 101 #define PME_GRID_C6A   2 /* Gridindex for A-state for LJ */
 102 #define DO_Q           2 /* Electrostatic grids have index q<2 */
 103 #define DO_Q_AND_LJ    4 /* non-LB LJ grids have index 2 <= q < 4 */
 104 #define DO_Q_AND_LJ_LB 9 /* With LB rules we need a total of 2+7 grids */
 105
 106 /* Pascal triangle coefficients scaled with (1/2)^6 for LJ-PME with LB-rules */
 107 const real lb_scale_factor[] = {
 108     1.0/64, 6.0/64, 15.0/64, 20.0/64,
 109     15.0/64, 6.0/64, 1.0/64
 110 };
 111
 112 /* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
 113 const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
 114
 115 /* Check if we have 4-wide SIMD macro support */
 116 #if (defined GMX_SIMD4_HAVE_REAL)
 117 /* Do PME spread and gather with 4-wide SIMD.
 118  * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
 119  */
 120 #    define PME_SIMD4_SPREAD_GATHER
 121
 122 #    if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
 123 /* With PME-order=4 on x86, unaligned load+store is slightly faster
 124  * than doubling all SIMD operations when using aligned load+store.
 125  */
 126 #        define PME_SIMD4_UNALIGNED
 127 #    endif
 128 #endif
 129
 130 #define DFT_TOL 1e-7
 131 /* #define PRT_FORCE */
 132 /* conditions for on the fly time-measurement */
 133 /* #define TAKETIME (step > 1 && timesteps < 10) */
 134 #define TAKETIME FALSE
 135
 136 /* #define PME_TIME_THREADS */
 137
 138 #ifdef GMX_DOUBLE
 139 #define mpi_type MPI_DOUBLE
 140 #else
 141 #define mpi_type MPI_FLOAT
 142 #endif
 143
 144 #ifdef PME_SIMD4_SPREAD_GATHER
 145 #    define SIMD4_ALIGNMENT  (GMX_SIMD4_WIDTH*sizeof(real))
 146 #else
 147 /* We can use any alignment, apart from 0, so we use 4 reals */
 148 #    define SIMD4_ALIGNMENT  (4*sizeof(real))
 149 #endif
 150
 151 /* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
 152  * to preserve alignment.
 153  */
 154 #define GMX_CACHE_SEP 64
 155
 156 /* We only define a maximum to be able to use local arrays without allocation.
 157  * An order larger than 12 should never be needed, even for test cases.
 158  * If needed it can be changed here.
 159  */
 160 #define PME_ORDER_MAX 12
 161
 162 /* Internal datastructures */
 163 typedef struct {
 164     int send_index0;
 165     int send_nindex;
 166     int recv_index0;
 167     int recv_nindex;
 168     int recv_size;   /* Receive buffer width, used with OpenMP */
 169 } pme_grid_comm_t;
 170
 171 typedef struct {
 172 #ifdef GMX_MPI
 173     MPI_Comm         mpi_comm;
 174 #endif
 175     int              nnodes, nodeid;
 176     int             *s2g0;
 177     int             *s2g1;
 178     int              noverlap_nodes;
 179     int             *send_id, *recv_id;
 180     int              send_size; /* Send buffer width, used with OpenMP */
 181     pme_grid_comm_t *comm_data;
 182     real            *sendbuf;
 183     real            *recvbuf;
 184 } pme_overlap_t;
 185
 186 typedef struct {
 187     int *n;      /* Cumulative counts of the number of particles per thread */
 188     int  nalloc; /* Allocation size of i */
 189     int *i;      /* Particle indices ordered on thread index (n) */
 190 } thread_plist_t;
 191
 192 typedef struct {
 193     int      *thread_one;
 194     int       n;
 195     int      *ind;
 196     splinevec theta;
 197     real     *ptr_theta_z;
 198     splinevec dtheta;
 199     real     *ptr_dtheta_z;
 200 } splinedata_t;
 201
 202 typedef struct {
 203     int      dimind;        /* The index of the dimension, 0=x, 1=y */
 204     int      nslab;
 205     int      nodeid;
 206 #ifdef GMX_MPI
 207     MPI_Comm mpi_comm;
 208 #endif
 209
 210     int     *node_dest;     /* The nodes to send x and q to with DD */
 211     int     *node_src;      /* The nodes to receive x and q from with DD */
 212     int     *buf_index;     /* Index for commnode into the buffers */
 213
 214     int      maxshift;
 215
 216     int      npd;
 217     int      pd_nalloc;
 218     int     *pd;
 219     int     *count;         /* The number of atoms to send to each node */
 220     int    **count_thread;
 221     int     *rcount;        /* The number of atoms to receive */
 222
 223     int      n;
 224     int      nalloc;
 225     rvec    *x;
 226     real    *coefficient;
 227     rvec    *f;
 228     gmx_bool bSpread;       /* These coordinates are used for spreading */
 229     int      pme_order;
 230     ivec    *idx;
 231     rvec    *fractx;            /* Fractional coordinate relative to
 232                                  * the lower cell boundary
 233                                  */
 234     int             nthread;
 235     int            *thread_idx; /* Which thread should spread which coefficient */
 236     thread_plist_t *thread_plist;
 237     splinedata_t   *spline;
 238 } pme_atomcomm_t;
 239
 240 #define FLBS  3
 241 #define FLBSZ 4
 242
 243 typedef struct {
 244     ivec  ci;     /* The spatial location of this grid         */
 245     ivec  n;      /* The used size of *grid, including order-1 */
 246     ivec  offset; /* The grid offset from the full node grid   */
 247     int   order;  /* PME spreading order                       */
 248     ivec  s;      /* The allocated size of *grid, s >= n       */
 249     real *grid;   /* The grid local thread, size n             */
 250 } pmegrid_t;
 251
 252 typedef struct {
 253     pmegrid_t  grid;         /* The full node grid (non thread-local)            */
 254     int        nthread;      /* The number of threads operating on this grid     */
 255     ivec       nc;           /* The local spatial decomposition over the threads */
 256     pmegrid_t *grid_th;      /* Array of grids for each thread                   */
 257     real      *grid_all;     /* Allocated array for the grids in *grid_th        */
 258     int      **g2t;          /* The grid to thread index                         */
 259     ivec       nthread_comm; /* The number of threads to communicate with        */
 260 } pmegrids_t;
 261
 262 typedef struct {
 263 #ifdef PME_SIMD4_SPREAD_GATHER
 264     /* Masks for 4-wide SIMD aligned spreading and gathering */
 265     gmx_simd4_bool_t mask_S0[6], mask_S1[6];
 266 #else
 267     int              dummy; /* C89 requires that struct has at least one member */
 268 #endif
 269 } pme_spline_work_t;
 270
 271 typedef struct {
 272     /* work data for solve_pme */
 273     int      nalloc;
 274     real *   mhx;
 275     real *   mhy;
 276     real *   mhz;
 277     real *   m2;
 278     real *   denom;
 279     real *   tmp1_alloc;
 280     real *   tmp1;
 281     real *   tmp2;
 282     real *   eterm;
 283     real *   m2inv;
 284
 285     real     energy_q;
 286     matrix   vir_q;
 287     real     energy_lj;
 288     matrix   vir_lj;
 289 } pme_work_t;
 290
 291 typedef struct gmx_pme {
 292     int           ndecompdim; /* The number of decomposition dimensions */
 293     int           nodeid;     /* Our nodeid in mpi->mpi_comm */
 294     int           nodeid_major;
 295     int           nodeid_minor;
 296     int           nnodes;    /* The number of nodes doing PME */
 297     int           nnodes_major;
 298     int           nnodes_minor;
 299
 300     MPI_Comm      mpi_comm;
 301     MPI_Comm      mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
 302 #ifdef GMX_MPI
 303     MPI_Datatype  rvec_mpi;      /* the pme vector's MPI type */
 304 #endif
 305
 306     gmx_bool   bUseThreads;   /* Does any of the PME ranks have nthread>1 ?  */
 307     int        nthread;       /* The number of threads doing PME on our rank */
 308
 309     gmx_bool   bPPnode;       /* Node also does particle-particle forces */
 310     gmx_bool   bFEP;          /* Compute Free energy contribution */
 311     gmx_bool   bFEP_q;
 312     gmx_bool   bFEP_lj;
 313     int        nkx, nky, nkz; /* Grid dimensions */
 314     gmx_bool   bP3M;          /* Do P3M: optimize the influence function */
 315     int        pme_order;
 316     real       epsilon_r;
 317
 318     int        ljpme_combination_rule;  /* Type of combination rule in LJ-PME */
 319
 320     int        ngrids;                  /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups*/
 321
 322     pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
 323                                          * includes overlap Grid indices are ordered as
 324                                          * follows:
 325                                          * 0: Coloumb PME, state A
 326                                          * 1: Coloumb PME, state B
 327                                          * 2-8: LJ-PME
 328                                          * This can probably be done in a better way
 329                                          * but this simple hack works for now
 330                                          */
 331     /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
 332     int        pmegrid_nx, pmegrid_ny, pmegrid_nz;
 333     /* pmegrid_nz might be larger than strictly necessary to ensure
 334      * memory alignment, pmegrid_nz_base gives the real base size.
 335      */
 336     int     pmegrid_nz_base;
 337     /* The local PME grid starting indices */
 338     int     pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
 339
 340     /* Work data for spreading and gathering */
 341     pme_spline_work_t     *spline_work;
 342
 343     real                 **fftgrid; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
 344     /* inside the interpolation grid, but separate for 2D PME decomp. */
 345     int                    fftgrid_nx, fftgrid_ny, fftgrid_nz;
 346
 347     t_complex            **cfftgrid;  /* Grids for complex FFT data */
 348
 349     int                    cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
 350
 351     gmx_parallel_3dfft_t  *pfft_setup;
 352
 353     int                   *nnx, *nny, *nnz;
 354     real                  *fshx, *fshy, *fshz;
 355
 356     pme_atomcomm_t         atc[2]; /* Indexed on decomposition index */
 357     matrix                 recipbox;
 358     splinevec              bsp_mod;
 359     /* Buffers to store data for local atoms for L-B combination rule
 360      * calculations in LJ-PME. lb_buf1 stores either the coefficients
 361      * for spreading/gathering (in serial), or the C6 coefficient for
 362      * local atoms (in parallel).  lb_buf2 is only used in parallel,
 363      * and stores the sigma values for local atoms. */
 364     real                 *lb_buf1, *lb_buf2;
 365     int                   lb_buf_nalloc; /* Allocation size for the above buffers. */
 366
 367     pme_overlap_t         overlap[2];    /* Indexed on dimension, 0=x, 1=y */
 368
 369     pme_atomcomm_t        atc_energy;    /* Only for gmx_pme_calc_energy */
 370
 371     rvec                 *bufv;          /* Communication buffer */
 372     real                 *bufr;          /* Communication buffer */
 373     int                   buf_nalloc;    /* The communication buffer size */
 374
 375     /* thread local work data for solve_pme */
 376     pme_work_t *work;
 377
 378     /* Work data for sum_qgrid */
 379     real *   sum_qgrid_tmp;
 380     real *   sum_qgrid_dd_tmp;
 381 } t_gmx_pme;
 382
 383 static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
 384                                    int start, int grid_index, int end, int thread)
 385 {
 386     int             i;
 387     int            *idxptr, tix, tiy, tiz;
 388     real           *xptr, *fptr, tx, ty, tz;
 389     real            rxx, ryx, ryy, rzx, rzy, rzz;
 390     int             nx, ny, nz;
 391     int             start_ix, start_iy, start_iz;
 392     int            *g2tx, *g2ty, *g2tz;
 393     gmx_bool        bThreads;
 394     int            *thread_idx = NULL;
 395     thread_plist_t *tpl        = NULL;
 396     int            *tpl_n      = NULL;
 397     int             thread_i;
 398
 399     nx  = pme->nkx;
 400     ny  = pme->nky;
 401     nz  = pme->nkz;
 402
 403     start_ix = pme->pmegrid_start_ix;
 404     start_iy = pme->pmegrid_start_iy;
 405     start_iz = pme->pmegrid_start_iz;
 406
 407     rxx = pme->recipbox[XX][XX];
 408     ryx = pme->recipbox[YY][XX];
 409     ryy = pme->recipbox[YY][YY];
 410     rzx = pme->recipbox[ZZ][XX];
 411     rzy = pme->recipbox[ZZ][YY];
 412     rzz = pme->recipbox[ZZ][ZZ];
 413
 414     g2tx = pme->pmegrid[grid_index].g2t[XX];
 415     g2ty = pme->pmegrid[grid_index].g2t[YY];
 416     g2tz = pme->pmegrid[grid_index].g2t[ZZ];
 417
 418     bThreads = (atc->nthread > 1);
 419     if (bThreads)
 420     {
 421         thread_idx = atc->thread_idx;
 422
 423         tpl   = &atc->thread_plist[thread];
 424         tpl_n = tpl->n;
 425         for (i = 0; i < atc->nthread; i++)
 426         {
 427             tpl_n[i] = 0;
 428         }
 429     }
 430
 431     for (i = start; i < end; i++)
 432     {
 433         xptr   = atc->x[i];
 434         idxptr = atc->idx[i];
 435         fptr   = atc->fractx[i];
 436
 437         /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
 438         tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
 439         ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
 440         tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
 441
 442         tix = (int)(tx);
 443         tiy = (int)(ty);
 444         tiz = (int)(tz);
 445
 446         /* Because decomposition only occurs in x and y,
 447          * we never have a fraction correction in z.
 448          */
 449         fptr[XX] = tx - tix + pme->fshx[tix];
 450         fptr[YY] = ty - tiy + pme->fshy[tiy];
 451         fptr[ZZ] = tz - tiz;
 452
 453         idxptr[XX] = pme->nnx[tix];
 454         idxptr[YY] = pme->nny[tiy];
 455         idxptr[ZZ] = pme->nnz[tiz];
 456
 457 #ifdef DEBUG
 458         range_check(idxptr[XX], 0, pme->pmegrid_nx);
 459         range_check(idxptr[YY], 0, pme->pmegrid_ny);
 460         range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
 461 #endif
 462
 463         if (bThreads)
 464         {
 465             thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
 466             thread_idx[i] = thread_i;
 467             tpl_n[thread_i]++;
 468         }
 469     }
 470
 471     if (bThreads)
 472     {
 473         /* Make a list of particle indices sorted on thread */
 474
 475         /* Get the cumulative count */
 476         for (i = 1; i < atc->nthread; i++)
 477         {
 478             tpl_n[i] += tpl_n[i-1];
 479         }
 480         /* The current implementation distributes particles equally
 481          * over the threads, so we could actually allocate for that
 482          * in pme_realloc_atomcomm_things.
 483          */
 484         if (tpl_n[atc->nthread-1] > tpl->nalloc)
 485         {
 486             tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
 487             srenew(tpl->i, tpl->nalloc);
 488         }
 489         /* Set tpl_n to the cumulative start */
 490         for (i = atc->nthread-1; i >= 1; i--)
 491         {
 492             tpl_n[i] = tpl_n[i-1];
 493         }
 494         tpl_n[0] = 0;
 495
 496         /* Fill our thread local array with indices sorted on thread */
 497         for (i = start; i < end; i++)
 498         {
 499             tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
 500         }
 501         /* Now tpl_n contains the cummulative count again */
 502     }
 503 }
 504
 505 static void make_thread_local_ind(pme_atomcomm_t *atc,
 506                                   int thread, splinedata_t *spline)
 507 {
 508     int             n, t, i, start, end;
 509     thread_plist_t *tpl;
 510
 511     /* Combine the indices made by each thread into one index */
 512
 513     n     = 0;
 514     start = 0;
 515     for (t = 0; t < atc->nthread; t++)
 516     {
 517         tpl = &atc->thread_plist[t];
 518         /* Copy our part (start - end) from the list of thread t */
 519         if (thread > 0)
 520         {
 521             start = tpl->n[thread-1];
 522         }
 523         end = tpl->n[thread];
 524         for (i = start; i < end; i++)
 525         {
 526             spline->ind[n++] = tpl->i[i];
 527         }
 528     }
 529
 530     spline->n = n;
 531 }
 532
 533
 534 static void pme_calc_pidx(int start, int end,
 535                           matrix recipbox, rvec x[],
 536                           pme_atomcomm_t *atc, int *count)
 537 {
 538     int   nslab, i;
 539     int   si;
 540     real *xptr, s;
 541     real  rxx, ryx, rzx, ryy, rzy;
 542     int  *pd;
 543
 544     /* Calculate PME task index (pidx) for each grid index.
 545      * Here we always assign equally sized slabs to each node
 546      * for load balancing reasons (the PME grid spacing is not used).
 547      */
 548
 549     nslab = atc->nslab;
 550     pd    = atc->pd;
 551
 552     /* Reset the count */
 553     for (i = 0; i < nslab; i++)
 554     {
 555         count[i] = 0;
 556     }
 557
 558     if (atc->dimind == 0)
 559     {
 560         rxx = recipbox[XX][XX];
 561         ryx = recipbox[YY][XX];
 562         rzx = recipbox[ZZ][XX];
 563         /* Calculate the node index in x-dimension */
 564         for (i = start; i < end; i++)
 565         {
 566             xptr   = x[i];
 567             /* Fractional coordinates along box vectors */
 568             s     = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
 569             si    = (int)(s + 2*nslab) % nslab;
 570             pd[i] = si;
 571             count[si]++;
 572         }
 573     }
 574     else
 575     {
 576         ryy = recipbox[YY][YY];
 577         rzy = recipbox[ZZ][YY];
 578         /* Calculate the node index in y-dimension */
 579         for (i = start; i < end; i++)
 580         {
 581             xptr   = x[i];
 582             /* Fractional coordinates along box vectors */
 583             s     = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
 584             si    = (int)(s + 2*nslab) % nslab;
 585             pd[i] = si;
 586             count[si]++;
 587         }
 588     }
 589 }
 590
 591 static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
 592                                   pme_atomcomm_t *atc)
 593 {
 594     int nthread, thread, slab;
 595
 596     nthread = atc->nthread;
 597
 598 #pragma omp parallel for num_threads(nthread) schedule(static)
 599     for (thread = 0; thread < nthread; thread++)
 600     {
 601         pme_calc_pidx(natoms* thread   /nthread,
 602                       natoms*(thread+1)/nthread,
 603                       recipbox, x, atc, atc->count_thread[thread]);
 604     }
 605     /* Non-parallel reduction, since nslab is small */
 606
 607     for (thread = 1; thread < nthread; thread++)
 608     {
 609         for (slab = 0; slab < atc->nslab; slab++)
 610         {
 611             atc->count_thread[0][slab] += atc->count_thread[thread][slab];
 612         }
 613     }
 614 }
 615
 616 static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
 617 {
 618     const int padding = 4;
 619     int       i;
 620
 621     srenew(th[XX], nalloc);
 622     srenew(th[YY], nalloc);
 623     /* In z we add padding, this is only required for the aligned SIMD code */
 624     sfree_aligned(*ptr_z);
 625     snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT);
 626     th[ZZ] = *ptr_z + padding;
 627
 628     for (i = 0; i < padding; i++)
 629     {
 630         (*ptr_z)[               i] = 0;
 631         (*ptr_z)[padding+nalloc+i] = 0;
 632     }
 633 }
 634
 635 static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
 636 {
 637     int i, d;
 638
 639     srenew(spline->ind, atc->nalloc);
 640     /* Initialize the index to identity so it works without threads */
 641     for (i = 0; i < atc->nalloc; i++)
 642     {
 643         spline->ind[i] = i;
 644     }
 645
 646     realloc_splinevec(spline->theta, &spline->ptr_theta_z,
 647                       atc->pme_order*atc->nalloc);
 648     realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
 649                       atc->pme_order*atc->nalloc);
 650 }
 651
 652 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
 653 {
 654     int nalloc_old, i, j, nalloc_tpl;
 655
 656     /* We have to avoid a NULL pointer for atc->x to avoid
 657      * possible fatal errors in MPI routines.
 658      */
 659     if (atc->n > atc->nalloc || atc->nalloc == 0)
 660     {
 661         nalloc_old  = atc->nalloc;
 662         atc->nalloc = over_alloc_dd(max(atc->n, 1));
 663
 664         if (atc->nslab > 1)
 665         {
 666             srenew(atc->x, atc->nalloc);
 667             srenew(atc->coefficient, atc->nalloc);
 668             srenew(atc->f, atc->nalloc);
 669             for (i = nalloc_old; i < atc->nalloc; i++)
 670             {
 671                 clear_rvec(atc->f[i]);
 672             }
 673         }
 674         if (atc->bSpread)
 675         {
 676             srenew(atc->fractx, atc->nalloc);
 677             srenew(atc->idx, atc->nalloc);
 678
 679             if (atc->nthread > 1)
 680             {
 681                 srenew(atc->thread_idx, atc->nalloc);
 682             }
 683
 684             for (i = 0; i < atc->nthread; i++)
 685             {
 686                 pme_realloc_splinedata(&atc->spline[i], atc);
 687             }
 688         }
 689     }
 690 }
 691
 692 static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused *atc,
 693                             gmx_bool gmx_unused bBackward, int gmx_unused shift,
 694                             void gmx_unused *buf_s, int gmx_unused nbyte_s,
 695                             void gmx_unused *buf_r, int gmx_unused nbyte_r)
 696 {
 697 #ifdef GMX_MPI
 698     int        dest, src;
 699     MPI_Status stat;
 700
 701     if (bBackward == FALSE)
 702     {
 703         dest = atc->node_dest[shift];
 704         src  = atc->node_src[shift];
 705     }
 706     else
 707     {
 708         dest = atc->node_src[shift];
 709         src  = atc->node_dest[shift];
 710     }
 711
 712     if (nbyte_s > 0 && nbyte_r > 0)
 713     {
 714         MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
 715                      dest, shift,
 716                      buf_r, nbyte_r, MPI_BYTE,
 717                      src, shift,
 718                      atc->mpi_comm, &stat);
 719     }
 720     else if (nbyte_s > 0)
 721     {
 722         MPI_Send(buf_s, nbyte_s, MPI_BYTE,
 723                  dest, shift,
 724                  atc->mpi_comm);
 725     }
 726     else if (nbyte_r > 0)
 727     {
 728         MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
 729                  src, shift,
 730                  atc->mpi_comm, &stat);
 731     }
 732 #endif
 733 }
 734
 735 static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
 736                                     int n, gmx_bool bX, rvec *x, real *data,
 737                                     pme_atomcomm_t *atc)
 738 {
 739     int *commnode, *buf_index;
 740     int  nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
 741
 742     commnode  = atc->node_dest;
 743     buf_index = atc->buf_index;
 744
 745     nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
 746
 747     nsend = 0;
 748     for (i = 0; i < nnodes_comm; i++)
 749     {
 750         buf_index[commnode[i]] = nsend;
 751         nsend                 += atc->count[commnode[i]];
 752     }
 753     if (bX)
 754     {
 755         if (atc->count[atc->nodeid] + nsend != n)
 756         {
 757             gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
 758                       "This usually means that your system is not well equilibrated.",
 759                       n - (atc->count[atc->nodeid] + nsend),
 760                       pme->nodeid, 'x'+atc->dimind);
 761         }
 762
 763         if (nsend > pme->buf_nalloc)
 764         {
 765             pme->buf_nalloc = over_alloc_dd(nsend);
 766             srenew(pme->bufv, pme->buf_nalloc);
 767             srenew(pme->bufr, pme->buf_nalloc);
 768         }
 769
 770         atc->n = atc->count[atc->nodeid];
 771         for (i = 0; i < nnodes_comm; i++)
 772         {
 773             scount = atc->count[commnode[i]];
 774             /* Communicate the count */
 775             if (debug)
 776             {
 777                 fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
 778                         atc->dimind, atc->nodeid, commnode[i], scount);
 779             }
 780             pme_dd_sendrecv(atc, FALSE, i,
 781                             &scount, sizeof(int),
 782                             &atc->rcount[i], sizeof(int));
 783             atc->n += atc->rcount[i];
 784         }
 785
 786         pme_realloc_atomcomm_things(atc);
 787     }
 788
 789     local_pos = 0;
 790     for (i = 0; i < n; i++)
 791     {
 792         node = atc->pd[i];
 793         if (node == atc->nodeid)
 794         {
 795             /* Copy direct to the receive buffer */
 796             if (bX)
 797             {
 798                 copy_rvec(x[i], atc->x[local_pos]);
 799             }
 800             atc->coefficient[local_pos] = data[i];
 801             local_pos++;
 802         }
 803         else
 804         {
 805             /* Copy to the send buffer */
 806             if (bX)
 807             {
 808                 copy_rvec(x[i], pme->bufv[buf_index[node]]);
 809             }
 810             pme->bufr[buf_index[node]] = data[i];
 811             buf_index[node]++;
 812         }
 813     }
 814
 815     buf_pos = 0;
 816     for (i = 0; i < nnodes_comm; i++)
 817     {
 818         scount = atc->count[commnode[i]];
 819         rcount = atc->rcount[i];
 820         if (scount > 0 || rcount > 0)
 821         {
 822             if (bX)
 823             {
 824                 /* Communicate the coordinates */
 825                 pme_dd_sendrecv(atc, FALSE, i,
 826                                 pme->bufv[buf_pos], scount*sizeof(rvec),
 827                                 atc->x[local_pos], rcount*sizeof(rvec));
 828             }
 829             /* Communicate the coefficients */
 830             pme_dd_sendrecv(atc, FALSE, i,
 831                             pme->bufr+buf_pos, scount*sizeof(real),
 832                             atc->coefficient+local_pos, rcount*sizeof(real));
 833             buf_pos   += scount;
 834             local_pos += atc->rcount[i];
 835         }
 836     }
 837 }
 838
 839 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
 840                            int n, rvec *f,
 841                            gmx_bool bAddF)
 842 {
 843     int *commnode, *buf_index;
 844     int  nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
 845
 846     commnode  = atc->node_dest;
 847     buf_index = atc->buf_index;
 848
 849     nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
 850
 851     local_pos = atc->count[atc->nodeid];
 852     buf_pos   = 0;
 853     for (i = 0; i < nnodes_comm; i++)
 854     {
 855         scount = atc->rcount[i];
 856         rcount = atc->count[commnode[i]];
 857         if (scount > 0 || rcount > 0)
 858         {
 859             /* Communicate the forces */
 860             pme_dd_sendrecv(atc, TRUE, i,
 861                             atc->f[local_pos], scount*sizeof(rvec),
 862                             pme->bufv[buf_pos], rcount*sizeof(rvec));
 863             local_pos += scount;
 864         }
 865         buf_index[commnode[i]] = buf_pos;
 866         buf_pos               += rcount;
 867     }
 868
 869     local_pos = 0;
 870     if (bAddF)
 871     {
 872         for (i = 0; i < n; i++)
 873         {
 874             node = atc->pd[i];
 875             if (node == atc->nodeid)
 876             {
 877                 /* Add from the local force array */
 878                 rvec_inc(f[i], atc->f[local_pos]);
 879                 local_pos++;
 880             }
 881             else
 882             {
 883                 /* Add from the receive buffer */
 884                 rvec_inc(f[i], pme->bufv[buf_index[node]]);
 885                 buf_index[node]++;
 886             }
 887         }
 888     }
 889     else
 890     {
 891         for (i = 0; i < n; i++)
 892         {
 893             node = atc->pd[i];
 894             if (node == atc->nodeid)
 895             {
 896                 /* Copy from the local force array */
 897                 copy_rvec(atc->f[local_pos], f[i]);
 898                 local_pos++;
 899             }
 900             else
 901             {
 902                 /* Copy from the receive buffer */
 903                 copy_rvec(pme->bufv[buf_index[node]], f[i]);
 904                 buf_index[node]++;
 905             }
 906         }
 907     }
 908 }
 909
 910 #ifdef GMX_MPI
/* Exchange the overlapping grid regions of the local PME grid with the
 * neighboring PME ranks, first along the minor (y) and then along the
 * major (x) decomposition dimension.
 *
 * pme       - PME data; the overlap setup is read from pme->overlap[0] (major)
 *             and pme->overlap[1] (minor)
 * grid      - the local PME grid, with pmegrid_nx*pmegrid_ny*pmegrid_nz layout
 * direction - with GMX_SUM_GRID_FORWARD the received data is added to the
 *             local grid; with any other value the send and receive roles of
 *             each pulse are swapped and the received data overwrites the
 *             local grid entries
 */
static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
{
    pme_overlap_t *overlap;
    int            send_index0, send_nindex;
    int            recv_index0, recv_nindex;
    MPI_Status     stat;
    int            i, j, k, ix, iy, iz, icnt;
    int            ipulse, send_id, recv_id, datasize;
    real          *p;
    real          *sendptr, *recvptr;

    /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
    overlap = &pme->overlap[1];

    for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
    {
        /* Since we have already (un)wrapped the overlap in the z-dimension,
         * we only have to communicate 0 to nkz (not pmegrid_nz).
         */
        if (direction == GMX_SUM_GRID_FORWARD)
        {
            send_id       = overlap->send_id[ipulse];
            recv_id       = overlap->recv_id[ipulse];
            send_index0   = overlap->comm_data[ipulse].send_index0;
            send_nindex   = overlap->comm_data[ipulse].send_nindex;
            recv_index0   = overlap->comm_data[ipulse].recv_index0;
            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
        }
        else
        {
            /* Backward: swap the roles of the send and receive setup */
            send_id       = overlap->recv_id[ipulse];
            recv_id       = overlap->send_id[ipulse];
            send_index0   = overlap->comm_data[ipulse].recv_index0;
            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
            recv_index0   = overlap->comm_data[ipulse].send_index0;
            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
        }

        /* Copy data to contiguous send buffer */
        if (debug)
        {
            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
                    pme->nodeid, overlap->nodeid, send_id,
                    pme->pmegrid_start_iy,
                    send_index0-pme->pmegrid_start_iy,
                    send_index0-pme->pmegrid_start_iy+send_nindex);
        }
        /* Pack send_nindex y-lines, for all local x and z < nkz, in x,y,z order */
        icnt = 0;
        for (i = 0; i < pme->pmegrid_nx; i++)
        {
            ix = i;
            for (j = 0; j < send_nindex; j++)
            {
                /* Convert the y index to an index local to this grid */
                iy = j + send_index0 - pme->pmegrid_start_iy;
                for (k = 0; k < pme->nkz; k++)
                {
                    iz = k;
                    overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
                }
            }
        }

        /* Number of reals per communicated y index */
        datasize      = pme->pmegrid_nx * pme->nkz;

        MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
                     send_id, ipulse,
                     overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
                     recv_id, ipulse,
                     overlap->mpi_comm, &stat);

        /* Get data from contiguous recv buffer */
        if (debug)
        {
            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
                    pme->nodeid, overlap->nodeid, recv_id,
                    pme->pmegrid_start_iy,
                    recv_index0-pme->pmegrid_start_iy,
                    recv_index0-pme->pmegrid_start_iy+recv_nindex);
        }
        /* Unpack in the same x,y,z order as used for packing */
        icnt = 0;
        for (i = 0; i < pme->pmegrid_nx; i++)
        {
            ix = i;
            for (j = 0; j < recv_nindex; j++)
            {
                iy = j + recv_index0 - pme->pmegrid_start_iy;
                for (k = 0; k < pme->nkz; k++)
                {
                    iz = k;
                    if (direction == GMX_SUM_GRID_FORWARD)
                    {
                        /* Forward: accumulate the contribution of the neighbor */
                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
                    }
                    else
                    {
                        /* Backward: overwrite with the value of the neighbor */
                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
                    }
                }
            }
        }
    }

    /* Major dimension is easier, no copying required,
     * but we might have to sum to separate array.
     * Since we don't copy, we have to communicate up to pmegrid_nz,
     * not nkz as for the minor direction.
     */
    overlap = &pme->overlap[0];

    for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
    {
        if (direction == GMX_SUM_GRID_FORWARD)
        {
            send_id       = overlap->send_id[ipulse];
            recv_id       = overlap->recv_id[ipulse];
            send_index0   = overlap->comm_data[ipulse].send_index0;
            send_nindex   = overlap->comm_data[ipulse].send_nindex;
            recv_index0   = overlap->comm_data[ipulse].recv_index0;
            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
            /* Receive in a separate buffer, as the data has to be added */
            recvptr       = overlap->recvbuf;
        }
        else
        {
            send_id       = overlap->recv_id[ipulse];
            recv_id       = overlap->send_id[ipulse];
            send_index0   = overlap->comm_data[ipulse].recv_index0;
            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
            recv_index0   = overlap->comm_data[ipulse].send_index0;
            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
            /* Receive directly in the grid, as the data only has to be stored */
            recvptr       = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
        }

        /* Complete x-slabs are contiguous, so we can send straight from the grid */
        sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
        /* Number of reals per communicated x index */
        datasize      = pme->pmegrid_ny * pme->pmegrid_nz;

        if (debug)
        {
            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
                    pme->nodeid, overlap->nodeid, send_id,
                    pme->pmegrid_start_ix,
                    send_index0-pme->pmegrid_start_ix,
                    send_index0-pme->pmegrid_start_ix+send_nindex);
            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
                    pme->nodeid, overlap->nodeid, recv_id,
                    pme->pmegrid_start_ix,
                    recv_index0-pme->pmegrid_start_ix,
                    recv_index0-pme->pmegrid_start_ix+recv_nindex);
        }

        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
                     send_id, ipulse,
                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
                     recv_id, ipulse,
                     overlap->mpi_comm, &stat);

        /* ADD data from contiguous recv buffer */
        if (direction == GMX_SUM_GRID_FORWARD)
        {
            p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
            for (i = 0; i < recv_nindex*datasize; i++)
            {
                p[i] += overlap->recvbuf[i];
            }
        }
    }
}
1077 #endif
1078
1079
1080 static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid, int grid_index)
1081 {
1082     ivec    local_fft_ndata, local_fft_offset, local_fft_size;
1083     ivec    local_pme_size;
1084     int     i, ix, iy, iz;
1085     int     pmeidx, fftidx;
1086
1087     /* Dimensions should be identical for A/B grid, so we just use A here */
1088     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
1089                                    local_fft_ndata,
1090                                    local_fft_offset,
1091                                    local_fft_size);
1092
1093     local_pme_size[0] = pme->pmegrid_nx;
1094     local_pme_size[1] = pme->pmegrid_ny;
1095     local_pme_size[2] = pme->pmegrid_nz;
1096
1097     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1098        the offset is identical, and the PME grid always has more data (due to overlap)
1099      */
1100     {
1101 #ifdef DEBUG_PME
1102         FILE *fp, *fp2;
1103         char  fn[STRLEN], format[STRLEN];
1104         real  val;
1105         sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
1106         fp = gmx_ffopen(fn, "w");
1107         sprintf(fn, "pmegrid%d.txt", pme->nodeid);
1108         fp2 = gmx_ffopen(fn, "w");
1109         sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
1110 #endif
1111
1112         for (ix = 0; ix < local_fft_ndata[XX]; ix++)
1113         {
1114             for (iy = 0; iy < local_fft_ndata[YY]; iy++)
1115             {
1116                 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1117                 {
1118                     pmeidx          = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
1119                     fftidx          = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
1120                     fftgrid[fftidx] = pmegrid[pmeidx];
1121 #ifdef DEBUG_PME
1122                     val = 100*pmegrid[pmeidx];
1123                     if (pmegrid[pmeidx] != 0)
1124                     {
1125                         fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
1126                                 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
1127                     }
1128                     if (pmegrid[pmeidx] != 0)
1129                     {
1130                         fprintf(fp2, "%-12s  %5d  %5d  %5d  %12.5e\n",
1131                                 "qgrid",
1132                                 pme->pmegrid_start_ix + ix,
1133                                 pme->pmegrid_start_iy + iy,
1134                                 pme->pmegrid_start_iz + iz,
1135                                 pmegrid[pmeidx]);
1136                     }
1137 #endif
1138                 }
1139             }
1140         }
1141 #ifdef DEBUG_PME
1142         gmx_ffclose(fp);
1143         gmx_ffclose(fp2);
1144 #endif
1145     }
1146     return 0;
1147 }
1148
1149
1150 static gmx_cycles_t omp_cyc_start()
1151 {
1152     return gmx_cycles_read();
1153 }
1154
1155 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1156 {
1157     return gmx_cycles_read() - c;
1158 }
1159
1160
1161 static int copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid, int grid_index,
1162                                    int nthread, int thread)
1163 {
1164     ivec          local_fft_ndata, local_fft_offset, local_fft_size;
1165     ivec          local_pme_size;
1166     int           ixy0, ixy1, ixy, ix, iy, iz;
1167     int           pmeidx, fftidx;
1168 #ifdef PME_TIME_THREADS
1169     gmx_cycles_t  c1;
1170     static double cs1 = 0;
1171     static int    cnt = 0;
1172 #endif
1173
1174 #ifdef PME_TIME_THREADS
1175     c1 = omp_cyc_start();
1176 #endif
1177     /* Dimensions should be identical for A/B grid, so we just use A here */
1178     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
1179                                    local_fft_ndata,
1180                                    local_fft_offset,
1181                                    local_fft_size);
1182
1183     local_pme_size[0] = pme->pmegrid_nx;
1184     local_pme_size[1] = pme->pmegrid_ny;
1185     local_pme_size[2] = pme->pmegrid_nz;
1186
1187     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1188        the offset is identical, and the PME grid always has more data (due to overlap)
1189      */
1190     ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1191     ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1192
1193     for (ixy = ixy0; ixy < ixy1; ixy++)
1194     {
1195         ix = ixy/local_fft_ndata[YY];
1196         iy = ixy - ix*local_fft_ndata[YY];
1197
1198         pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
1199         fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
1200         for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1201         {
1202             pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1203         }
1204     }
1205
1206 #ifdef PME_TIME_THREADS
1207     c1   = omp_cyc_end(c1);
1208     cs1 += (double)c1;
1209     cnt++;
1210     if (cnt % 20 == 0)
1211     {
1212         printf("copy %.2f\n", cs1*1e-9);
1213     }
1214 #endif
1215
1216     return 0;
1217 }
1218
1219
1220 static void wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1221 {
1222     int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
1223
1224     nx = pme->nkx;
1225     ny = pme->nky;
1226     nz = pme->nkz;
1227
1228     pnx = pme->pmegrid_nx;
1229     pny = pme->pmegrid_ny;
1230     pnz = pme->pmegrid_nz;
1231
1232     overlap = pme->pme_order - 1;
1233
1234     /* Add periodic overlap in z */
1235     for (ix = 0; ix < pme->pmegrid_nx; ix++)
1236     {
1237         for (iy = 0; iy < pme->pmegrid_ny; iy++)
1238         {
1239             for (iz = 0; iz < overlap; iz++)
1240             {
1241                 pmegrid[(ix*pny+iy)*pnz+iz] +=
1242                     pmegrid[(ix*pny+iy)*pnz+nz+iz];
1243             }
1244         }
1245     }
1246
1247     if (pme->nnodes_minor == 1)
1248     {
1249         for (ix = 0; ix < pme->pmegrid_nx; ix++)
1250         {
1251             for (iy = 0; iy < overlap; iy++)
1252             {
1253                 for (iz = 0; iz < nz; iz++)
1254                 {
1255                     pmegrid[(ix*pny+iy)*pnz+iz] +=
1256                         pmegrid[(ix*pny+ny+iy)*pnz+iz];
1257                 }
1258             }
1259         }
1260     }
1261
1262     if (pme->nnodes_major == 1)
1263     {
1264         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1265
1266         for (ix = 0; ix < overlap; ix++)
1267         {
1268             for (iy = 0; iy < ny_x; iy++)
1269             {
1270                 for (iz = 0; iz < nz; iz++)
1271                 {
1272                     pmegrid[(ix*pny+iy)*pnz+iz] +=
1273                         pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1274                 }
1275             }
1276         }
1277     }
1278 }
1279
1280
1281 static void unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1282 {
1283     int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
1284
1285     nx = pme->nkx;
1286     ny = pme->nky;
1287     nz = pme->nkz;
1288
1289     pnx = pme->pmegrid_nx;
1290     pny = pme->pmegrid_ny;
1291     pnz = pme->pmegrid_nz;
1292
1293     overlap = pme->pme_order - 1;
1294
1295     if (pme->nnodes_major == 1)
1296     {
1297         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1298
1299         for (ix = 0; ix < overlap; ix++)
1300         {
1301             int iy, iz;
1302
1303             for (iy = 0; iy < ny_x; iy++)
1304             {
1305                 for (iz = 0; iz < nz; iz++)
1306                 {
1307                     pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1308                         pmegrid[(ix*pny+iy)*pnz+iz];
1309                 }
1310             }
1311         }
1312     }
1313
1314     if (pme->nnodes_minor == 1)
1315     {
1316 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1317         for (ix = 0; ix < pme->pmegrid_nx; ix++)
1318         {
1319             int iy, iz;
1320
1321             for (iy = 0; iy < overlap; iy++)
1322             {
1323                 for (iz = 0; iz < nz; iz++)
1324                 {
1325                     pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1326                         pmegrid[(ix*pny+iy)*pnz+iz];
1327                 }
1328             }
1329         }
1330     }
1331
1332     /* Copy periodic overlap in z */
1333 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1334     for (ix = 0; ix < pme->pmegrid_nx; ix++)
1335     {
1336         int iy, iz;
1337
1338         for (iy = 0; iy < pme->pmegrid_ny; iy++)
1339         {
1340             for (iz = 0; iz < overlap; iz++)
1341             {
1342                 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1343                     pmegrid[(ix*pny+iy)*pnz+iz];
1344             }
1345         }
1346     }
1347 }
1348
1349
/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
/* Spread the coefficient of one atom onto order x order x order grid points.
 *
 * The macro expands to a triple loop and relies on the following variables
 * being declared at the place of use:
 *   ithx, ithy, ithz             - loop counters
 *   index_x, index_xy, index_xyz - temporaries for the linear grid index
 *   valx, valxy                  - temporaries for the partial weight products
 *   i0, j0, k0                   - grid index of the first point in x, y, z
 *   pny, pnz                     - grid strides: a step in x is pny*pnz
 *                                  elements, a step in y is pnz elements
 *   coefficient                  - the value to spread
 *   thx, thy, thz                - the weights along x, y and z
 *   grid                         - the grid that is added to
 *
 * The argument order is used unparenthesized in the loop conditions, so it
 * should be a plain constant or variable. The expansion is a bare for
 * statement, not wrapped in do { } while (0).
 */
#define DO_BSPLINE(order)                            \
    for (ithx = 0; (ithx < order); ithx++)                    \
    {                                                    \
        index_x = (i0+ithx)*pny*pnz;                     \
        valx    = coefficient*thx[ithx];                          \
                                                     \
        for (ithy = 0; (ithy < order); ithy++)                \
        {                                                \
            valxy    = valx*thy[ithy];                   \
            index_xy = index_x+(j0+ithy)*pnz;            \
                                                     \
            for (ithz = 0; (ithz < order); ithz++)            \
            {                                            \
                index_xyz        = index_xy+(k0+ithz);   \
                grid[index_xyz] += valxy*thz[ithz];      \
            }                                            \
        }                                                \
    }
1369
1370
/* Spread the coefficients of the atoms listed in spline onto the grid
 * pmegrid, using the B-spline weights stored in spline.
 *
 * pmegrid - the grid to spread on; it is completely zeroed first
 * atc     - provides the coefficient (atc->coefficient) and the grid index
 *           (atc->idx) of each atom
 * spline  - spline->ind lists the spline->n atoms to spread, spline->theta
 *           holds order weights per dimension for each listed atom
 * work    - not referenced in the code visible here
 *           NOTE(review): it may be used by the textually included
 *           pme_simd4.h - confirm in that header.
 *
 * Do not rename the local variables of this function without checking
 * DO_BSPLINE and pme_simd4.h: both are expanded inside the function body and
 * refer to the locals by name.
 */
static void spread_coefficients_bsplines_thread(pmegrid_t                    *pmegrid,
                                                pme_atomcomm_t               *atc,
                                                splinedata_t                 *spline,
                                                pme_spline_work_t gmx_unused *work)
{

    /* spread coefficients from home atoms to local grid */
    real          *grid;
    pme_overlap_t *ol;
    int            b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
    int       *    idxptr;
    int            order, norder, index_x, index_xy, index_xyz;
    real           valx, valxy, coefficient;
    real          *thx, *thy, *thz;
    int            localsize, bndsize;
    int            pnx, pny, pnz, ndatatot;
    int            offx, offy, offz;

#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
    /* Stack buffer from which an aligned pointer is obtained.
     * NOTE(review): thz_aligned is not used in the code visible here,
     * presumably it is used by pme_simd4.h - confirm.
     */
    real           thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;

    thz_aligned = gmx_simd4_align_r(thz_buffer);
#endif

    /* Allocated grid dimensions, these determine the strides */
    pnx = pmegrid->s[XX];
    pny = pmegrid->s[YY];
    pnz = pmegrid->s[ZZ];

    /* Offset of this grid, subtracted from the atom grid indices below */
    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    /* Clear the complete grid, including any padding in s[] */
    ndatatot = pnx*pny*pnz;
    grid     = pmegrid->grid;
    for (i = 0; i < ndatatot; i++)
    {
        grid[i] = 0;
    }

    order = pmegrid->order;

    for (nn = 0; nn < spline->n; nn++)
    {
        /* nn indexes the spline data, n is the index into the atc arrays */
        n           = spline->ind[nn];
        coefficient = atc->coefficient[n];

        /* Atoms with a zero coefficient do not contribute */
        if (coefficient != 0)
        {
            idxptr = atc->idx[n];
            norder = nn*order;

            /* Index of the first grid point of this atom in this grid */
            i0   = idxptr[XX] - offx;
            j0   = idxptr[YY] - offy;
            k0   = idxptr[ZZ] - offz;

            /* The order weights of this atom for each dimension */
            thx = spline->theta[XX] + norder;
            thy = spline->theta[YY] + norder;
            thz = spline->theta[ZZ] + norder;

            /* Orders 4 and 5 get a compile-time constant order, or the code
             * from pme_simd4.h when PME_SIMD4_SPREAD_GATHER is defined;
             * all other orders use the generic loop.
             */
            switch (order)
            {
                case 4:
#ifdef PME_SIMD4_SPREAD_GATHER
#ifdef PME_SIMD4_UNALIGNED
#define PME_SPREAD_SIMD4_ORDER4
#else
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
#include "pme_simd4.h"
#else
                    DO_BSPLINE(4);
#endif
                    break;
                case 5:
#ifdef PME_SIMD4_SPREAD_GATHER
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 5
#include "pme_simd4.h"
#else
                    DO_BSPLINE(5);
#endif
                    break;
                default:
                    DO_BSPLINE(order);
                    break;
            }
        }
    }
}
1461
1462 static void set_grid_alignment(int gmx_unused *pmegrid_nz, int gmx_unused pme_order)
1463 {
1464 #ifdef PME_SIMD4_SPREAD_GATHER
1465     if (pme_order == 5
1466 #ifndef PME_SIMD4_UNALIGNED
1467         || pme_order == 4
1468 #endif
1469         )
1470     {
1471         /* Round nz up to a multiple of 4 to ensure alignment */
1472         *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
1473     }
1474 #endif
1475 }
1476
1477 static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
1478 {
1479 #ifdef PME_SIMD4_SPREAD_GATHER
1480 #ifndef PME_SIMD4_UNALIGNED
1481     if (pme_order == 4)
1482     {
1483         /* Add extra elements to ensured aligned operations do not go
1484          * beyond the allocated grid size.
1485          * Note that for pme_order=5, the pme grid z-size alignment
1486          * ensures that we will not go beyond the grid size.
1487          */
1488         *gridsize += 4;
1489     }
1490 #endif
1491 #endif
1492 }
1493
1494 static void pmegrid_init(pmegrid_t *grid,
1495                          int cx, int cy, int cz,
1496                          int x0, int y0, int z0,
1497                          int x1, int y1, int z1,
1498                          gmx_bool set_alignment,
1499                          int pme_order,
1500                          real *ptr)
1501 {
1502     int nz, gridsize;
1503
1504     grid->ci[XX]     = cx;
1505     grid->ci[YY]     = cy;
1506     grid->ci[ZZ]     = cz;
1507     grid->offset[XX] = x0;
1508     grid->offset[YY] = y0;
1509     grid->offset[ZZ] = z0;
1510     grid->n[XX]      = x1 - x0 + pme_order - 1;
1511     grid->n[YY]      = y1 - y0 + pme_order - 1;
1512     grid->n[ZZ]      = z1 - z0 + pme_order - 1;
1513     copy_ivec(grid->n, grid->s);
1514
1515     nz = grid->s[ZZ];
1516     set_grid_alignment(&nz, pme_order);
1517     if (set_alignment)
1518     {
1519         grid->s[ZZ] = nz;
1520     }
1521     else if (nz != grid->s[ZZ])
1522     {
1523         gmx_incons("pmegrid_init call with an unaligned z size");
1524     }
1525
1526     grid->order = pme_order;
1527     if (ptr == NULL)
1528     {
1529         gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
1530         set_gridsize_alignment(&gridsize, pme_order);
1531         snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
1532     }
1533     else
1534     {
1535         grid->grid = ptr;
1536     }
1537 }
1538
/* Return enumerator divided by denominator, rounded up for positive
 * arguments, by adding denominator-1 before the integer division.
 */
static int div_round_up(int enumerator, int denominator)
{
    int padded;

    padded = enumerator + denominator - 1;

    return padded/denominator;
}
1543
1544 static void make_subgrid_division(const ivec n, int ovl, int nthread,
1545                                   ivec nsub)
1546 {
1547     int gsize_opt, gsize;
1548     int nsx, nsy, nsz;
1549     char *env;
1550
1551     gsize_opt = -1;
1552     for (nsx = 1; nsx <= nthread; nsx++)
1553     {
1554         if (nthread % nsx == 0)
1555         {
1556             for (nsy = 1; nsy <= nthread; nsy++)
1557             {
1558                 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1559                 {
1560                     nsz = nthread/(nsx*nsy);
1561
1562                     /* Determine the number of grid points per thread */
1563                     gsize =
1564                         (div_round_up(n[XX], nsx) + ovl)*
1565                         (div_round_up(n[YY], nsy) + ovl)*
1566                         (div_round_up(n[ZZ], nsz) + ovl);
1567
1568                     /* Minimize the number of grids points per thread
1569                      * and, secondarily, the number of cuts in minor dimensions.
1570                      */
1571                     if (gsize_opt == -1 ||
1572                         gsize < gsize_opt ||
1573                         (gsize == gsize_opt &&
1574                          (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
1575                     {
1576                         nsub[XX]  = nsx;
1577                         nsub[YY]  = nsy;
1578                         nsub[ZZ]  = nsz;
1579                         gsize_opt = gsize;
1580                     }
1581                 }
1582             }
1583         }
1584     }
1585
1586     env = getenv("GMX_PME_THREAD_DIVISION");
1587     if (env != NULL)
1588     {
1589         sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
1590     }
1591
1592     if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
1593     {
1594         gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
1595     }
1596 }
1597
/* Set up the PME grid of a rank, and optionally the thread-local grids.
 *
 * grids       - the structure to initialize
 * nx, ny, nz  - the size of the full local grid, including pme_order-1
 *               overlap lines in each dimension
 * nz_base     - z-size passed, instead of nz-(pme_order-1), to
 *               make_subgrid_division for choosing the thread division
 * pme_order   - the interpolation order
 * bUseThreads - when TRUE, one sub-grid per thread is set up in
 *               grids->grid_th, all using memory from the single allocation
 *               grids->grid_all; when FALSE grids->grid_th is set to NULL and
 *               grids->grid_all is not touched
 * nthread     - the number of threads, stored in grids->nthread
 * overlap_x, overlap_y - the maximum number of grid lines to communicate
 *               between threads along x and y; for z pme_order-1 is used
 *
 * Besides the grids this sets up grids->nc (thread division),
 * grids->g2t (grid line to thread index contribution, per dimension) and
 * grids->nthread_comm (thread communication range, per dimension).
 */
static void pmegrids_init(pmegrids_t *grids,
                          int nx, int ny, int nz, int nz_base,
                          int pme_order,
                          gmx_bool bUseThreads,
                          int nthread,
                          int overlap_x,
                          int overlap_y)
{
    ivec n, n_base, g0, g1;
    int t, x, y, z, d, i, tfac;
    int max_comm_lines = -1;

    /* The grid size without the overlap lines */
    n[XX] = nx - (pme_order - 1);
    n[YY] = ny - (pme_order - 1);
    n[ZZ] = nz - (pme_order - 1);

    copy_ivec(n, n_base);
    n_base[ZZ] = nz_base;

    /* The full grid: starts at 0,0,0 and allocates its own memory (ptr NULL).
     * With set_alignment FALSE the z-size has to be aligned already.
     */
    pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
                 NULL);

    grids->nthread = nthread;

    make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);

    if (bUseThreads)
    {
        ivec nst;
        int gridsize;

        /* The maximum size of a thread-local grid, including overlap */
        for (d = 0; d < DIM; d++)
        {
            nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
        }
        set_grid_alignment(&nst[ZZ], pme_order);

        if (debug)
        {
            fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
                    grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
            fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
                    nx, ny, nz,
                    nst[XX], nst[YY], nst[ZZ]);
        }

        snew(grids->grid_th, grids->nthread);
        t        = 0;
        gridsize = nst[XX]*nst[YY]*nst[ZZ];
        set_gridsize_alignment(&gridsize, pme_order);
        /* One allocation for all thread grids, with GMX_CACHE_SEP elements
         * before, between and after the grids.
         */
        snew_aligned(grids->grid_all,
                     grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
                     SIMD4_ALIGNMENT);

        /* Thread index t runs over x, y, z with z running fastest */
        for (x = 0; x < grids->nc[XX]; x++)
        {
            for (y = 0; y < grids->nc[YY]; y++)
            {
                for (z = 0; z < grids->nc[ZZ]; z++)
                {
                    /* The thread grid does not own its memory:
                     * it points into grids->grid_all.
                     */
                    pmegrid_init(&grids->grid_th[t],
                                 x, y, z,
                                 (n[XX]*(x  ))/grids->nc[XX],
                                 (n[YY]*(y  ))/grids->nc[YY],
                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
                                 (n[XX]*(x+1))/grids->nc[XX],
                                 (n[YY]*(y+1))/grids->nc[YY],
                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
                                 TRUE,
                                 pme_order,
                                 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
                    t++;
                }
            }
        }
    }
    else
    {
        grids->grid_th = NULL;
    }

    /* Set up g2t: for grid line i along dimension d, g2t[d][i] is the cell
     * index along d that contains line i, multiplied by tfac, the product
     * of nc over the dimensions after d. This matches the thread ordering
     * used above, where z runs fastest.
     */
    snew(grids->g2t, DIM);
    tfac = 1;
    for (d = DIM-1; d >= 0; d--)
    {
        snew(grids->g2t[d], n[d]);
        t = 0;
        for (i = 0; i < n[d]; i++)
        {
            /* The second check should match the parameters
             * of the pmegrid_init call above.
             */
            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
            {
                t++;
            }
            grids->g2t[d][i] = t*tfac;
        }

        tfac *= grids->nc[d];

        switch (d)
        {
            case XX: max_comm_lines = overlap_x;     break;
            case YY: max_comm_lines = overlap_y;     break;
            case ZZ: max_comm_lines = pme_order - 1; break;
        }
        /* Count the number of thread cells along d, at most nc[d],
         * that are needed to cover max_comm_lines grid lines.
         */
        grids->nthread_comm[d] = 0;
        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
               grids->nthread_comm[d] < grids->nc[d])
        {
            grids->nthread_comm[d]++;
        }
        if (debug != NULL)
        {
            fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
                    'x'+d, grids->nthread_comm[d]);
        }
        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
         * work, but this is not a problematic restriction.
         */
        /* NOTE(review): the loop above stops as soon as nthread_comm[d]
         * reaches nc[d], so nthread_comm[d] > nc[d] can not be true here and
         * this error is never triggered. Given the comment above, a check
         * with >= may have been intended - confirm before changing.
         */
        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
        {
            gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
        }
    }
}
1725
1726
1727 static void pmegrids_destroy(pmegrids_t *grids)
1728 {
1729     int t;
1730
1731     if (grids->grid.grid != NULL)
1732     {
1733         sfree(grids->grid.grid);
1734
1735         if (grids->nthread > 0)
1736         {
1737             for (t = 0; t < grids->nthread; t++)
1738             {
1739                 sfree(grids->grid_th[t].grid);
1740             }
1741             sfree(grids->grid_th);
1742         }
1743     }
1744 }
1745
1746
1747 static void realloc_work(pme_work_t *work, int nkx)
1748 {
1749     int simd_width;
1750
1751     if (nkx > work->nalloc)
1752     {
1753         work->nalloc = nkx;
1754         srenew(work->mhx, work->nalloc);
1755         srenew(work->mhy, work->nalloc);
1756         srenew(work->mhz, work->nalloc);
1757         srenew(work->m2, work->nalloc);
1758         /* Allocate an aligned pointer for SIMD operations, including extra
1759          * elements at the end for padding.
1760          */
1761 #ifdef PME_SIMD_SOLVE
1762         simd_width = GMX_SIMD_REAL_WIDTH;
1763 #else
1764         /* We can use any alignment, apart from 0, so we use 4 */
1765         simd_width = 4;
1766 #endif
1767         sfree_aligned(work->denom);
1768         sfree_aligned(work->tmp1);
1769         sfree_aligned(work->tmp2);
1770         sfree_aligned(work->eterm);
1771         snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
1772         snew_aligned(work->tmp1,  work->nalloc+simd_width, simd_width*sizeof(real));
1773         snew_aligned(work->tmp2,  work->nalloc+simd_width, simd_width*sizeof(real));
1774         snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
1775         srenew(work->m2inv, work->nalloc);
1776     }
1777 }
1778
1779
1780 static void free_work(pme_work_t *work)
1781 {
1782     sfree(work->mhx);
1783     sfree(work->mhy);
1784     sfree(work->mhz);
1785     sfree(work->m2);
1786     sfree_aligned(work->denom);
1787     sfree_aligned(work->tmp1);
1788     sfree_aligned(work->tmp2);
1789     sfree_aligned(work->eterm);
1790     sfree(work->m2inv);
1791 }
1792
1793
1794 #if defined PME_SIMD_SOLVE
1795 /* Calculate exponentials through SIMD */
1796 gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1797 {
1798     {
1799         const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
1800         gmx_simd_real_t f_simd;
1801         gmx_simd_real_t lu;
1802         gmx_simd_real_t tmp_d1, d_inv, tmp_r, tmp_e;
1803         int kx;
1804         f_simd = gmx_simd_set1_r(f);
1805         /* We only need to calculate from start. But since start is 0 or 1
1806          * and we want to use aligned loads/stores, we always start from 0.
1807          */
1808         for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
1809         {
1810             tmp_d1   = gmx_simd_load_r(d_aligned+kx);
1811             d_inv    = gmx_simd_inv_r(tmp_d1);
1812             tmp_r    = gmx_simd_load_r(r_aligned+kx);
1813             tmp_r    = gmx_simd_exp_r(tmp_r);
1814             tmp_e    = gmx_simd_mul_r(f_simd, d_inv);
1815             tmp_e    = gmx_simd_mul_r(tmp_e, tmp_r);
1816             gmx_simd_store_r(e_aligned+kx, tmp_e);
1817         }
1818     }
1819 }
1820 #else
1821 gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
1822 {
1823     int kx;
1824     for (kx = start; kx < end; kx++)
1825     {
1826         d[kx] = 1.0/d[kx];
1827     }
1828     for (kx = start; kx < end; kx++)
1829     {
1830         r[kx] = exp(r[kx]);
1831     }
1832     for (kx = start; kx < end; kx++)
1833     {
1834         e[kx] = f*r[kx]*d[kx];
1835     }
1836 }
1837 #endif
1838
1839 #if defined PME_SIMD_SOLVE
1840 /* Calculate exponentials through SIMD */
1841 gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
1842 {
1843     gmx_simd_real_t tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
1844     const gmx_simd_real_t sqr_PI = gmx_simd_sqrt_r(gmx_simd_set1_r(M_PI));
1845     int kx;
1846     for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
1847     {
1848         /* We only need to calculate from start. But since start is 0 or 1
1849          * and we want to use aligned loads/stores, we always start from 0.
1850          */
1851         tmp_d = gmx_simd_load_r(d_aligned+kx);
1852         d_inv = gmx_simd_inv_r(tmp_d);
1853         gmx_simd_store_r(d_aligned+kx, d_inv);
1854         tmp_r = gmx_simd_load_r(r_aligned+kx);
1855         tmp_r = gmx_simd_exp_r(tmp_r);
1856         gmx_simd_store_r(r_aligned+kx, tmp_r);
1857         tmp_mk  = gmx_simd_load_r(factor_aligned+kx);
1858         tmp_fac = gmx_simd_mul_r(sqr_PI, gmx_simd_mul_r(tmp_mk, gmx_simd_erfc_r(tmp_mk)));
1859         gmx_simd_store_r(factor_aligned+kx, tmp_fac);
1860     }
1861 }
1862 #else
1863 gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
1864 {
1865     int kx;
1866     real mk;
1867     for (kx = start; kx < end; kx++)
1868     {
1869         d[kx] = 1.0/d[kx];
1870     }
1871
1872     for (kx = start; kx < end; kx++)
1873     {
1874         r[kx] = exp(r[kx]);
1875     }
1876
1877     for (kx = start; kx < end; kx++)
1878     {
1879         mk       = tmp2[kx];
1880         tmp2[kx] = sqrt(M_PI)*mk*gmx_erfc(mk);
1881     }
1882 }
1883 #endif
1884
/* Reciprocal-space solve for the Coulomb grid, executed by one thread.
 *
 * Every complex grid element handled by this thread is multiplied by
 *   eterm = elfac*exp(-factor*m2)/(m2*bz*by*bsp_mod[XX][kx])
 * with elfac = ONE_4PI_EPS0/epsilon_r, factor = pi^2/ewaldcoeff^2,
 * by = pi*vol*bsp_mod[YY][ky], bz = bsp_mod[ZZ][kz] and m2 the squared
 * length of the vector (mhx, mhy, mhz) built from the wave numbers and
 * pme->recipbox.  The k-space point (0,0,0) is skipped.
 *
 * The combined local y/z index range is divided into nthread contiguous
 * chunks; the chunk with index thread is processed here, using the
 * scratch arrays of pme->work[thread].
 *
 * With bEnerVir set, the energy and the symmetric virial of the processed
 * points are stored in work->energy_q and work->vir_q; otherwise those
 * fields are not written.
 *
 * Returns local_ndata[YY]*local_ndata[XX], which the code labels as the
 * loop count.
 *
 * NOTE(review): maxkz is assigned but never read in this function.
 */
1885 static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
1886                          real ewaldcoeff, real vol,
1887                          gmx_bool bEnerVir,
1888                          int nthread, int thread)
1889 {
1890     /* do recip sum over local cells in grid */
1891     /* y major, z middle, x minor or continuous */
1892     t_complex *p0;
1893     int     kx, ky, kz, maxkx, maxky, maxkz;
1894     int     nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
1895     real    mx, my, mz;
1896     real    factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1897     real    ets2, struct2, vfactor, ets2vf;
1898     real    d1, d2, energy = 0;
1899     real    by, bz;
1900     real    virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
1901     real    rxx, ryx, ryy, rzx, rzy, rzz;
1902     pme_work_t *work;
1903     real    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
1904     real    mhxk, mhyk, mhzk, m2k;
1905     real    corner_fac;
1906     ivec    complex_order;
1907     ivec    local_ndata, local_offset, local_size;
1908     real    elfac;
1909
1910     elfac = ONE_4PI_EPS0/pme->epsilon_r;
1911
1912     nx = pme->nkx;
1913     ny = pme->nky;
1914     nz = pme->nkz;
1915
1916     /* Dimensions should be identical for A/B grid, so we just use A here */
1917     gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA],
1918                                       complex_order,
1919                                       local_ndata,
1920                                       local_offset,
1921                                       local_size);
1922
         /* Only these six recipbox elements are used below */
1923     rxx = pme->recipbox[XX][XX];
1924     ryx = pme->recipbox[YY][XX];
1925     ryy = pme->recipbox[YY][YY];
1926     rzx = pme->recipbox[ZZ][XX];
1927     rzy = pme->recipbox[ZZ][YY];
1928     rzz = pme->recipbox[ZZ][ZZ];
1929
         /* Grid indices at or above maxkx/maxky are mapped to negative
          * wave numbers further down.
          */
1930     maxkx = (nx+1)/2;
1931     maxky = (ny+1)/2;
1932     maxkz = nz/2+1;
1933
         /* Scratch arrays of this thread, all indexed by kx */
1934     work  = &pme->work[thread];
1935     mhx   = work->mhx;
1936     mhy   = work->mhy;
1937     mhz   = work->mhz;
1938     m2    = work->m2;
1939     denom = work->denom;
1940     tmp1  = work->tmp1;
1941     eterm = work->eterm;
1942     m2inv = work->m2inv;
1943
         /* This thread handles the combined y/z indices [iyz0, iyz1) */
1944     iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
1945     iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
1946
1947     for (iyz = iyz0; iyz < iyz1; iyz++)
1948     {
             /* Split the combined index into local y and z indices */
1949         iy = iyz/local_ndata[ZZ];
1950         iz = iyz - iy*local_ndata[ZZ];
1951
1952         ky = iy + local_offset[YY];
1953
1954         if (ky < maxky)
1955         {
1956             my = ky;
1957         }
1958         else
1959         {
1960             my = (ky - ny);
1961         }
1962
1963         by = M_PI*vol*pme->bsp_mod[YY][ky];
1964
1965         kz = iz + local_offset[ZZ];
1966
             /* In z the wave number is the grid index itself, no wrapping */
1967         mz = kz;
1968
1969         bz = pme->bsp_mod[ZZ][kz];
1970
1971         /* 0.5 correction for corner points */
1972         corner_fac = 1;
1973         if (kz == 0 || kz == (nz+1)/2)
1974         {
1975             corner_fac = 0.5;
1976         }
1977
             /* First element of the x-line belonging to (iy, iz) */
1978         p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
1979
1980         /* We should skip the k-space point (0,0,0) */
1981         /* Note that since here x is the minor index, local_offset[XX]=0 */
1982         if (local_offset[XX] > 0 || ky > 0 || kz > 0)
1983         {
1984             kxstart = local_offset[XX];
1985         }
1986         else
1987         {
1988             kxstart = local_offset[XX] + 1;
1989             p0++;
1990         }
1991         kxend = local_offset[XX] + local_ndata[XX];
1992
1993         if (bEnerVir)
1994         {
1995             /* More expensive inner loop, especially because of the storage
1996              * of the mh elements in array's.
1997              * Because x is the minor grid index, all mh elements
1998              * depend on kx for triclinic unit cells.
1999              */
2000
2001             /* Two explicit loops to avoid a conditional inside the loop */
                 /* First loop: kx below maxkx, mx = kx */
2002             for (kx = kxstart; kx < maxkx; kx++)
2003             {
2004                 mx = kx;
2005
2006                 mhxk      = mx * rxx;
2007                 mhyk      = mx * ryx + my * ryy;
2008                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2009                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2010                 mhx[kx]   = mhxk;
2011                 mhy[kx]   = mhyk;
2012                 mhz[kx]   = mhzk;
2013                 m2[kx]    = m2k;
2014                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2015                 tmp1[kx]  = -factor*m2k;
2016             }
2017
                 /* Second loop: kx from maxkx on, mx = kx - nx (negative) */
2018             for (kx = maxkx; kx < kxend; kx++)
2019             {
2020                 mx = (kx - nx);
2021
2022                 mhxk      = mx * rxx;
2023                 mhyk      = mx * ryx + my * ryy;
2024                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2025                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2026                 mhx[kx]   = mhxk;
2027                 mhy[kx]   = mhyk;
2028                 mhz[kx]   = mhzk;
2029                 m2[kx]    = m2k;
2030                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2031                 tmp1[kx]  = -factor*m2k;
2032             }
2033
2034             for (kx = kxstart; kx < kxend; kx++)
2035             {
2036                 m2inv[kx] = 1.0/m2[kx];
2037             }
2038
                 /* eterm[kx] = elfac*exp(tmp1[kx])/denom[kx] */
2039             calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
2040
                 /* Scale the grid; struct2 uses the values read before
                  * scaling.  tmp1 is reused to hold eterm*struct2.
                  */
2041             for (kx = kxstart; kx < kxend; kx++, p0++)
2042             {
2043                 d1      = p0->re;
2044                 d2      = p0->im;
2045
2046                 p0->re  = d1*eterm[kx];
2047                 p0->im  = d2*eterm[kx];
2048
2049                 struct2 = 2.0*(d1*d1+d2*d2);
2050
2051                 tmp1[kx] = eterm[kx]*struct2;
2052             }
2053
                 /* Accumulate energy and the six independent virial terms */
2054             for (kx = kxstart; kx < kxend; kx++)
2055             {
2056                 ets2     = corner_fac*tmp1[kx];
2057                 vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2058                 energy  += ets2;
2059
2060                 ets2vf   = ets2*vfactor;
2061                 virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
2062                 virxy   += ets2vf*mhx[kx]*mhy[kx];
2063                 virxz   += ets2vf*mhx[kx]*mhz[kx];
2064                 viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
2065                 viryz   += ets2vf*mhy[kx]*mhz[kx];
2066                 virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
2067             }
2068         }
2069         else
2070         {
2071             /* We don't need to calculate the energy and the virial.
2072              * In this case the triclinic overhead is small.
2073              */
2074
2075             /* Two explicit loops to avoid a conditional inside the loop */
2076
2077             for (kx = kxstart; kx < maxkx; kx++)
2078             {
2079                 mx = kx;
2080
2081                 mhxk      = mx * rxx;
2082                 mhyk      = mx * ryx + my * ryy;
2083                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2084                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2085                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2086                 tmp1[kx]  = -factor*m2k;
2087             }
2088
2089             for (kx = maxkx; kx < kxend; kx++)
2090             {
2091                 mx = (kx - nx);
2092
2093                 mhxk      = mx * rxx;
2094                 mhyk      = mx * ryx + my * ryy;
2095                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2096                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2097                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2098                 tmp1[kx]  = -factor*m2k;
2099             }
2100
2101             calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
2102
2103             for (kx = kxstart; kx < kxend; kx++, p0++)
2104             {
2105                 d1      = p0->re;
2106                 d2      = p0->im;
2107
2108                 p0->re  = d1*eterm[kx];
2109                 p0->im  = d2*eterm[kx];
2110             }
2111         }
2112     }
2113
2114     if (bEnerVir)
2115     {
2116         /* Update virial with local values.
2117          * The virial is symmetric by definition.
2118          * this virial seems ok for isotropic scaling, but I'm
2119          * experiencing problems on semiisotropic membranes.
2120          * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2121          */
2122         work->vir_q[XX][XX] = 0.25*virxx;
2123         work->vir_q[YY][YY] = 0.25*viryy;
2124         work->vir_q[ZZ][ZZ] = 0.25*virzz;
2125         work->vir_q[XX][YY] = work->vir_q[YY][XX] = 0.25*virxy;
2126         work->vir_q[XX][ZZ] = work->vir_q[ZZ][XX] = 0.25*virxz;
2127         work->vir_q[YY][ZZ] = work->vir_q[ZZ][YY] = 0.25*viryz;
2128
2129         /* This energy should be corrected for a charged system */
2130         work->energy_q = 0.5*energy;
2131     }
2132
2133     /* Return the loop count */
2134     return local_ndata[YY]*local_ndata[XX];
2135 }
2136
/* Reciprocal-space solve for the Lennard-Jones grid(s), executed by one
 * thread.
 *
 * grid is an array of grid pointers: only grid[0] is used when bLB is
 * false, grid[0] through grid[6] when bLB is true.
 *
 * With m2k = factor*m2 (factor = pi^2/ewaldcoeff^2), b = sqrt(m2k) and
 *   denom = bz*by*bsp_mod[XX][kx],
 *   by    = 3*vol*bsp_mod[YY][ky]/(pi*sqrt(pi)*ewaldcoeff^3),
 *   bz    = bsp_mod[ZZ][kz],
 * every grid element handled by this thread is multiplied by
 *   eterm = -((1 - 2*m2k)*exp(-m2k) + 2*m2k*sqrt(pi)*b*erfc(b))/denom.
 * Unlike solve_pme_yzx(), the k-space point (0,0,0) is not skipped here:
 * kxstart is always local_offset[XX].
 *
 * The combined local y/z index range is divided into nthread contiguous
 * chunks; the chunk with index thread is processed here, using the
 * scratch arrays of pme->work[thread].
 *
 * With bEnerVir set, the energy and the symmetric virial of the processed
 * points are stored in work->energy_lj and work->vir_lj; otherwise those
 * fields are not written.
 *
 * Returns local_ndata[YY]*local_ndata[XX], which the code labels as the
 * loop count.
 *
 * NOTE(review): maxkz and mk are never read in this function.
 */
2137 static int solve_pme_lj_yzx(gmx_pme_t pme, t_complex **grid, gmx_bool bLB,
2138                             real ewaldcoeff, real vol,
2139                             gmx_bool bEnerVir, int nthread, int thread)
2140 {
2141     /* do recip sum over local cells in grid */
2142     /* y major, z middle, x minor or continuous */
2143     int     ig, gcount;
2144     int     kx, ky, kz, maxkx, maxky, maxkz;
2145     int     nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
2146     real    mx, my, mz;
2147     real    factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
2148     real    ets2, ets2vf;
2149     real    eterm, vterm, d1, d2, energy = 0;
2150     real    by, bz;
2151     real    virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
2152     real    rxx, ryx, ryy, rzx, rzy, rzz;
2153     real    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
2154     real    mhxk, mhyk, mhzk, m2k;
2155     real    mk;
2156     pme_work_t *work;
2157     real    corner_fac;
2158     ivec    complex_order;
2159     ivec    local_ndata, local_offset, local_size;
2160     nx = pme->nkx;
2161     ny = pme->nky;
2162     nz = pme->nkz;
2163
2164     /* Dimensions should be identical for A/B grid, so we just use A here */
2165     gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A],
2166                                       complex_order,
2167                                       local_ndata,
2168                                       local_offset,
2169                                       local_size);
         /* Only these six recipbox elements are used below */
2170     rxx = pme->recipbox[XX][XX];
2171     ryx = pme->recipbox[YY][XX];
2172     ryy = pme->recipbox[YY][YY];
2173     rzx = pme->recipbox[ZZ][XX];
2174     rzy = pme->recipbox[ZZ][YY];
2175     rzz = pme->recipbox[ZZ][ZZ];
2176
         /* Grid indices at or above maxkx/maxky are mapped to negative
          * wave numbers further down.
          */
2177     maxkx = (nx+1)/2;
2178     maxky = (ny+1)/2;
2179     maxkz = nz/2+1;
2180
         /* Scratch arrays of this thread, all indexed by kx */
2181     work  = &pme->work[thread];
2182     mhx   = work->mhx;
2183     mhy   = work->mhy;
2184     mhz   = work->mhz;
2185     m2    = work->m2;
2186     denom = work->denom;
2187     tmp1  = work->tmp1;
2188     tmp2  = work->tmp2;
2189
         /* This thread handles the combined y/z indices [iyz0, iyz1) */
2190     iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
2191     iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
2192
2193     for (iyz = iyz0; iyz < iyz1; iyz++)
2194     {
             /* Split the combined index into local y and z indices */
2195         iy = iyz/local_ndata[ZZ];
2196         iz = iyz - iy*local_ndata[ZZ];
2197
2198         ky = iy + local_offset[YY];
2199
2200         if (ky < maxky)
2201         {
2202             my = ky;
2203         }
2204         else
2205         {
2206             my = (ky - ny);
2207         }
2208
2209         by = 3.0*vol*pme->bsp_mod[YY][ky]
2210             / (M_PI*sqrt(M_PI)*ewaldcoeff*ewaldcoeff*ewaldcoeff);
2211
2212         kz = iz + local_offset[ZZ];
2213
             /* In z the wave number is the grid index itself, no wrapping */
2214         mz = kz;
2215
2216         bz = pme->bsp_mod[ZZ][kz];
2217
2218         /* 0.5 correction for corner points */
2219         corner_fac = 1;
2220         if (kz == 0 || kz == (nz+1)/2)
2221         {
2222             corner_fac = 0.5;
2223         }
2224
2225         kxstart = local_offset[XX];
2226         kxend   = local_offset[XX] + local_ndata[XX];
2227         if (bEnerVir)
2228         {
2229             /* More expensive inner loop, especially because of the
2230              * storage of the mh elements in array's.  Because x is the
2231              * minor grid index, all mh elements depend on kx for
2232              * triclinic unit cells.
2233              */
2234
2235             /* Two explicit loops to avoid a conditional inside the loop */
                 /* First loop: kx below maxkx, mx = kx */
2236             for (kx = kxstart; kx < maxkx; kx++)
2237             {
2238                 mx = kx;
2239
2240                 mhxk      = mx * rxx;
2241                 mhyk      = mx * ryx + my * ryy;
2242                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2243                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2244                 mhx[kx]   = mhxk;
2245                 mhy[kx]   = mhyk;
2246                 mhz[kx]   = mhzk;
2247                 m2[kx]    = m2k;
2248                 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
2249                 tmp1[kx]  = -factor*m2k;
2250                 tmp2[kx]  = sqrt(factor*m2k);
2251             }
2252
                 /* Second loop: kx from maxkx on, mx = kx - nx (negative) */
2253             for (kx = maxkx; kx < kxend; kx++)
2254             {
2255                 mx = (kx - nx);
2256
2257                 mhxk      = mx * rxx;
2258                 mhyk      = mx * ryx + my * ryy;
2259                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2260                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2261                 mhx[kx]   = mhxk;
2262                 mhy[kx]   = mhyk;
2263                 mhz[kx]   = mhzk;
2264                 m2[kx]    = m2k;
2265                 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
2266                 tmp1[kx]  = -factor*m2k;
2267                 tmp2[kx]  = sqrt(factor*m2k);
2268             }
2269
                 /* In place: tmp1 -> exp(tmp1), tmp2 -> sqrt(pi)*tmp2*erfc(tmp2),
                  * denom -> 1/denom.
                  */
2270             calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
2271
                 /* tmp1 and tmp2 are reused to hold the energy and virial
                  * terms, each already multiplied by the inverted denom.
                  */
2272             for (kx = kxstart; kx < kxend; kx++)
2273             {
2274                 m2k   = factor*m2[kx];
2275                 eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
2276                           + 2.0*m2k*tmp2[kx]);
2277                 vterm    = 3.0*(-tmp1[kx] + tmp2[kx]);
2278                 tmp1[kx] = eterm*denom[kx];
2279                 tmp2[kx] = vterm*denom[kx];
2280             }
2281
2282             if (!bLB)
2283             {
                     /* Single grid: scale grid[0]; struct2 uses the values
                      * read before scaling.
                      */
2284                 t_complex *p0;
2285                 real       struct2;
2286
2287                 p0 = grid[0] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2288                 for (kx = kxstart; kx < kxend; kx++, p0++)
2289                 {
2290                     d1      = p0->re;
2291                     d2      = p0->im;
2292
2293                     eterm   = tmp1[kx];
2294                     vterm   = tmp2[kx];
2295                     p0->re  = d1*eterm;
2296                     p0->im  = d2*eterm;
2297
2298                     struct2 = 2.0*(d1*d1+d2*d2);
2299
2300                     tmp1[kx] = eterm*struct2;
2301                     tmp2[kx] = vterm*struct2;
2302                 }
2303             }
2304             else
2305             {
                     /* Seven grids.  denom is no longer needed at this
                      * point, so its storage is reused for struct2.
                      */
2306                 real *struct2 = denom;
2307                 real  str2;
2308
2309                 for (kx = kxstart; kx < kxend; kx++)
2310                 {
2311                     struct2[kx] = 0.0;
2312                 }
2313                 /* Due to symmetry we only need to calculate 4 of the 7 terms */
                     /* Grid ig is paired with grid 6-ig; this runs before
                      * any of the grids is scaled below.
                      */
2314                 for (ig = 0; ig <= 3; ++ig)
2315                 {
2316                     t_complex *p0, *p1;
2317                     real       scale;
2318
2319                     p0    = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2320                     p1    = grid[6-ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2321                     scale = 2.0*lb_scale_factor_symm[ig];
2322                     for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
2323                     {
2324                         struct2[kx] += scale*(p0->re*p1->re + p0->im*p1->im);
2325                     }
2326
2327                 }
                     /* Scale all seven grids by the energy term */
2328                 for (ig = 0; ig <= 6; ++ig)
2329                 {
2330                     t_complex *p0;
2331
2332                     p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2333                     for (kx = kxstart; kx < kxend; kx++, p0++)
2334                     {
2335                         d1     = p0->re;
2336                         d2     = p0->im;
2337
2338                         eterm  = tmp1[kx];
2339                         p0->re = d1*eterm;
2340                         p0->im = d2*eterm;
2341                     }
2342                 }
2343                 for (kx = kxstart; kx < kxend; kx++)
2344                 {
2345                     eterm    = tmp1[kx];
2346                     vterm    = tmp2[kx];
2347                     str2     = struct2[kx];
2348                     tmp1[kx] = eterm*str2;
2349                     tmp2[kx] = vterm*str2;
2350                 }
2351             }
2352
                 /* Accumulate energy and the six independent virial terms */
2353             for (kx = kxstart; kx < kxend; kx++)
2354             {
2355                 ets2     = corner_fac*tmp1[kx];
2356                 vterm    = 2.0*factor*tmp2[kx];
2357                 energy  += ets2;
2358                 ets2vf   = corner_fac*vterm;
2359                 virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
2360                 virxy   += ets2vf*mhx[kx]*mhy[kx];
2361                 virxz   += ets2vf*mhx[kx]*mhz[kx];
2362                 viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
2363                 viryz   += ets2vf*mhy[kx]*mhz[kx];
2364                 virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
2365             }
2366         }
2367         else
2368         {
2369             /* We don't need to calculate the energy and the virial.
2370              *  In this case the triclinic overhead is small.
2371              */
2372
2373             /* Two explicit loops to avoid a conditional inside the loop */
2374
2375             for (kx = kxstart; kx < maxkx; kx++)
2376             {
2377                 mx = kx;
2378
2379                 mhxk      = mx * rxx;
2380                 mhyk      = mx * ryx + my * ryy;
2381                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2382                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2383                 m2[kx]    = m2k;
2384                 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
2385                 tmp1[kx]  = -factor*m2k;
2386                 tmp2[kx]  = sqrt(factor*m2k);
2387             }
2388
2389             for (kx = maxkx; kx < kxend; kx++)
2390             {
2391                 mx = (kx - nx);
2392
2393                 mhxk      = mx * rxx;
2394                 mhyk      = mx * ryx + my * ryy;
2395                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2396                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2397                 m2[kx]    = m2k;
2398                 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
2399                 tmp1[kx]  = -factor*m2k;
2400                 tmp2[kx]  = sqrt(factor*m2k);
2401             }
2402
2403             calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
2404
2405             for (kx = kxstart; kx < kxend; kx++)
2406             {
2407                 m2k    = factor*m2[kx];
2408                 eterm  = -((1.0 - 2.0*m2k)*tmp1[kx]
2409                            + 2.0*m2k*tmp2[kx]);
2410                 tmp1[kx] = eterm*denom[kx];
2411             }
                 /* Seven grids with bLB, otherwise only grid[0] */
2412             gcount = (bLB ? 7 : 1);
2413             for (ig = 0; ig < gcount; ++ig)
2414             {
2415                 t_complex *p0;
2416
2417                 p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2418                 for (kx = kxstart; kx < kxend; kx++, p0++)
2419                 {
2420                     d1      = p0->re;
2421                     d2      = p0->im;
2422
2423                     eterm   = tmp1[kx];
2424
2425                     p0->re  = d1*eterm;
2426                     p0->im  = d2*eterm;
2427                 }
2428             }
2429         }
2430     }
2431     if (bEnerVir)
2432     {
             /* Store the symmetric virial and the energy for this thread */
2433         work->vir_lj[XX][XX] = 0.25*virxx;
2434         work->vir_lj[YY][YY] = 0.25*viryy;
2435         work->vir_lj[ZZ][ZZ] = 0.25*virzz;
2436         work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25*virxy;
2437         work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25*virxz;
2438         work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25*viryz;
2439
2440         /* This energy should be corrected for a charged system */
2441         work->energy_lj = 0.5*energy;
2442     }
2443     /* Return the loop count */
2444     return local_ndata[YY]*local_ndata[XX];
2445 }
2446
2447 static void get_pme_ener_vir_q(const gmx_pme_t pme, int nthread,
2448                                real *mesh_energy, matrix vir)
2449 {
2450     /* This function sums output over threads and should therefore
2451      * only be called after thread synchronization.
2452      */
2453     int thread;
2454
2455     *mesh_energy = pme->work[0].energy_q;
2456     copy_mat(pme->work[0].vir_q, vir);
2457
2458     for (thread = 1; thread < nthread; thread++)
2459     {
2460         *mesh_energy += pme->work[thread].energy_q;
2461         m_add(vir, pme->work[thread].vir_q, vir);
2462     }
2463 }
2464
2465 static void get_pme_ener_vir_lj(const gmx_pme_t pme, int nthread,
2466                                 real *mesh_energy, matrix vir)
2467 {
2468     /* This function sums output over threads and should therefore
2469      * only be called after thread synchronization.
2470      */
2471     int thread;
2472
2473     *mesh_energy = pme->work[0].energy_lj;
2474     copy_mat(pme->work[0].vir_lj, vir);
2475
2476     for (thread = 1; thread < nthread; thread++)
2477     {
2478         *mesh_energy += pme->work[thread].energy_lj;
2479         m_add(vir, pme->work[thread].vir_lj, vir);
2480     }
2481 }
2482
2483
/* DO_FSPLINE(order): triple loop over an order x order x order block of
 * grid points that accumulates three sums into fx, fy and fz.
 *
 * The macro has no local variables; it relies on these names in the
 * expanding scope:
 *   read:    grid, i0, j0, k0, pny, pnz, thx, thy, thz, dthx, dthy, dthz
 *   scratch: ithx, ithy, ithz, index_x, index_xy, tx, ty, dx, dy,
 *            fxy1, fz1, gval
 *   updated: fx, fy, fz (added to, so they must be initialized before)
 *
 * The grid is read at ((i0+ithx)*pny + (j0+ithy))*pnz + (k0+ithz).
 * For each (ithx, ithy) pair the inner loop forms
 *   fxy1 = sum over ithz of thz[ithz]*grid value,
 *   fz1  = sum over ithz of dthz[ithz]*grid value,
 * after which fx += dx*ty*fxy1, fy += tx*dy*fxy1 and fz += tx*ty*fz1.
 *
 * The order argument is pasted without extra parentheses into the loop
 * conditions; the visible uses pass a literal or a plain variable.
 */
2484 #define DO_FSPLINE(order)                      \
2485     for (ithx = 0; (ithx < order); ithx++)              \
2486     {                                              \
2487         index_x = (i0+ithx)*pny*pnz;               \
2488         tx      = thx[ithx];                       \
2489         dx      = dthx[ithx];                      \
2490                                                \
2491         for (ithy = 0; (ithy < order); ithy++)          \
2492         {                                          \
2493             index_xy = index_x+(j0+ithy)*pnz;      \
2494             ty       = thy[ithy];                  \
2495             dy       = dthy[ithy];                 \
2496             fxy1     = fz1 = 0;                    \
2497                                                \
2498             for (ithz = 0; (ithz < order); ithz++)      \
2499             {                                      \
2500                 gval  = grid[index_xy+(k0+ithz)];  \
2501                 fxy1 += thz[ithz]*gval;            \
2502                 fz1  += dthz[ithz]*gval;           \
2503             }                                      \
2504             fx += dx*ty*fxy1;                      \
2505             fy += tx*dy*fxy1;                      \
2506             fz += tx*ty*fz1;                       \
2507         }                                          \
2508     }
2509
2510
/* Gather forces from the grid onto the atoms listed in spline.
 *
 * For every entry nn in [0, spline->n) the atom index n = spline->ind[nn]
 * is looked up and its coefficient is scale*atc->coefficient[n].
 * With bClearF set, atc->f[n] is zeroed first, also for atoms whose
 * coefficient is zero.  For a non-zero coefficient the sums fx, fy, fz
 * are gathered over the atom's order^3 grid points (order is
 * pme->pme_order) and the force
 *   -coefficient*(fx*nx*r_x + fy*ny*r_y + fz*nz*r_z)
 * is added to atc->f[n], where r_x, r_y, r_z stand for the recipbox
 * elements used in the three update statements at the end of the loop.
 *
 * Orders 4 and 5 have dedicated paths: with PME_SIMD4_SPREAD_GATHER
 * defined the code comes from the textual include of pme_simd4.h,
 * otherwise from DO_FSPLINE with a literal order.  Any other order uses
 * DO_FSPLINE(order).
 *
 * NOTE(review): work, thz_aligned, dthz_aligned and pnx are not referenced
 * by the code visible here; presumably the included pme_simd4.h uses
 * some of them - confirm against that header.
 */
2511 static void gather_f_bsplines(gmx_pme_t pme, real *grid,
2512                               gmx_bool bClearF, pme_atomcomm_t *atc,
2513                               splinedata_t *spline,
2514                               real scale)
2515 {
2516     /* sum forces for local particles */
2517     int     nn, n, ithx, ithy, ithz, i0, j0, k0;
2518     int     index_x, index_xy;
2519     int     nx, ny, nz, pnx, pny, pnz;
2520     int *   idxptr;
2521     real    tx, ty, dx, dy, coefficient;
2522     real    fx, fy, fz, gval;
2523     real    fxy1, fz1;
2524     real    *thx, *thy, *thz, *dthx, *dthy, *dthz;
2525     int     norder;
2526     real    rxx, ryx, ryy, rzx, rzy, rzz;
2527     int     order;
2528
2529     pme_spline_work_t *work;
2530
2531 #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
2532     real           thz_buffer[GMX_SIMD4_WIDTH*3],  *thz_aligned;
2533     real           dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;
2534
2535     thz_aligned  = gmx_simd4_align_r(thz_buffer);
2536     dthz_aligned = gmx_simd4_align_r(dthz_buffer);
2537 #endif
2538
2539     work = pme->spline_work;
2540
2541     order = pme->pme_order;
         /* These six pointers are set again for every atom inside the loop */
2542     thx   = spline->theta[XX];
2543     thy   = spline->theta[YY];
2544     thz   = spline->theta[ZZ];
2545     dthx  = spline->dtheta[XX];
2546     dthy  = spline->dtheta[YY];
2547     dthz  = spline->dtheta[ZZ];
2548     nx    = pme->nkx;
2549     ny    = pme->nky;
2550     nz    = pme->nkz;
2551     pnx   = pme->pmegrid_nx;
2552     pny   = pme->pmegrid_ny;
2553     pnz   = pme->pmegrid_nz;
2554
2555     rxx   = pme->recipbox[XX][XX];
2556     ryx   = pme->recipbox[YY][XX];
2557     ryy   = pme->recipbox[YY][YY];
2558     rzx   = pme->recipbox[ZZ][XX];
2559     rzy   = pme->recipbox[ZZ][YY];
2560     rzz   = pme->recipbox[ZZ][ZZ];
2561
2562     for (nn = 0; nn < spline->n; nn++)
2563     {
             /* nn indexes the spline data, n indexes the atom arrays in atc */
2564         n           = spline->ind[nn];
2565         coefficient = scale*atc->coefficient[n];
2566
2567         if (bClearF)
2568         {
2569             atc->f[n][XX] = 0;
2570             atc->f[n][YY] = 0;
2571             atc->f[n][ZZ] = 0;
2572         }
2573         if (coefficient != 0)
2574         {
2575             fx     = 0;
2576             fy     = 0;
2577             fz     = 0;
2578             idxptr = atc->idx[n];
                 /* Each spline entry occupies order consecutive elements */
2579             norder = nn*order;
2580
2581             i0   = idxptr[XX];
2582             j0   = idxptr[YY];
2583             k0   = idxptr[ZZ];
2584
2585             /* Pointer arithmetic alert, next six statements */
2586             thx  = spline->theta[XX] + norder;
2587             thy  = spline->theta[YY] + norder;
2588             thz  = spline->theta[ZZ] + norder;
2589             dthx = spline->dtheta[XX] + norder;
2590             dthy = spline->dtheta[YY] + norder;
2591             dthz = spline->dtheta[ZZ] + norder;
2592
2593             switch (order)
2594             {
2595                 case 4:
2596 #ifdef PME_SIMD4_SPREAD_GATHER
2597 #ifdef PME_SIMD4_UNALIGNED
2598 #define PME_GATHER_F_SIMD4_ORDER4
2599 #else
2600 #define PME_GATHER_F_SIMD4_ALIGNED
2601 #define PME_ORDER 4
2602 #endif
2603 #include "pme_simd4.h"
2604 #else
2605                     DO_FSPLINE(4);
2606 #endif
2607                     break;
2608                 case 5:
2609 #ifdef PME_SIMD4_SPREAD_GATHER
2610 #define PME_GATHER_F_SIMD4_ALIGNED
2611 #define PME_ORDER 5
2612 #include "pme_simd4.h"
2613 #else
2614                     DO_FSPLINE(5);
2615 #endif
2616                     break;
2617                 default:
2618                     DO_FSPLINE(order);
2619                     break;
2620             }
2621
                 /* Combine the gathered sums with the grid sizes and the
                  * recipbox elements, then add the result to the atom's force.
                  */
2622             atc->f[n][XX] += -coefficient*( fx*nx*rxx );
2623             atc->f[n][YY] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
2624             atc->f[n][ZZ] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
2625         }
2626     }
2627     /* Since the energy and not forces are interpolated
2628      * the net force might not be exactly zero.
2629      * This can be solved by also interpolating F, but
2630      * that comes at a cost.
2631      * A better hack is to remove the net force every
2632      * step, but that must be done at a higher level
2633      * since this routine doesn't see all atoms if running
2634      * in parallel. Don't know how important it is?  EL 990726
2635      */
2636 }
2637
2638
2639 static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
2640                                    pme_atomcomm_t *atc)
2641 {
2642     splinedata_t *spline;
2643     int     n, ithx, ithy, ithz, i0, j0, k0;
2644     int     index_x, index_xy;
2645     int *   idxptr;
2646     real    energy, pot, tx, ty, coefficient, gval;
2647     real    *thx, *thy, *thz;
2648     int     norder;
2649     int     order;
2650
2651     spline = &atc->spline[0];
2652
2653     order = pme->pme_order;
2654
2655     energy = 0;
2656     for (n = 0; (n < atc->n); n++)
2657     {
2658         coefficient      = atc->coefficient[n];
2659
2660         if (coefficient != 0)
2661         {
2662             idxptr = atc->idx[n];
2663             norder = n*order;
2664
2665             i0   = idxptr[XX];
2666             j0   = idxptr[YY];
2667             k0   = idxptr[ZZ];
2668
2669             /* Pointer arithmetic alert, next three statements */
2670             thx  = spline->theta[XX] + norder;
2671             thy  = spline->theta[YY] + norder;
2672             thz  = spline->theta[ZZ] + norder;
2673
2674             pot = 0;
2675             for (ithx = 0; (ithx < order); ithx++)
2676             {
2677                 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2678                 tx      = thx[ithx];
2679
2680                 for (ithy = 0; (ithy < order); ithy++)
2681                 {
2682                     index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2683                     ty       = thy[ithy];
2684
2685                     for (ithz = 0; (ithz < order); ithz++)
2686                     {
2687                         gval  = grid[index_xy+(k0+ithz)];
2688                         pot  += tx*ty*thz[ithz]*gval;
2689                     }
2690
2691                 }
2692             }
2693
2694             energy += pot*coefficient;
2695         }
2696     }
2697
2698     return energy;
2699 }
2700
/* Compute the B-spline weights and their derivatives for one particle.
 * The order is a macro argument so that a literal can be passed, which
 * lets the compiler unroll the loops; this gives a significant
 * performance gain.
 *
 * The macro expands to a block that uses these names from the enclosing
 * scope: xptr (the DIM fractional offsets of the particle), i (the particle
 * slot) and theta/dtheta (output, written at [dim][i*order+k]).
 */
#define CALC_SPLINE(order)                                                   \
    {                                                                        \
        int  spl_d, spl_k, spl_l;                                            \
        real frac, inv;                                                      \
        real wt[PME_ORDER_MAX];                                              \
        real dwt[PME_ORDER_MAX];                                             \
                                                                             \
        for (spl_d = 0; spl_d < DIM; spl_d++)                                \
        {                                                                    \
            /* frac is the relative offset from the lower cell limit */     \
            frac = xptr[spl_d];                                              \
                                                                             \
            wt[(order)-1] = 0;                                               \
            wt[1]         = frac;                                            \
            wt[0]         = 1 - frac;                                        \
                                                                             \
            /* Recursion up to order-1 */                                    \
            for (spl_k = 3; spl_k < (order); spl_k++)                        \
            {                                                                \
                inv         = 1.0/(spl_k - 1.0);                             \
                wt[spl_k-1] = inv*frac*wt[spl_k-2];                          \
                for (spl_l = 1; spl_l < (spl_k-1); spl_l++)                  \
                {                                                            \
                    wt[spl_k-spl_l-1] =                                      \
                        inv*((frac+spl_l)*wt[spl_k-spl_l-2]+                 \
                             (spl_k-spl_l-frac)*wt[spl_k-spl_l-1]);          \
                }                                                            \
                wt[0] = inv*(1-frac)*wt[0];                                  \
            }                                                                \
                                                                             \
            /* Differentiate: uses the weights of order-1 */                 \
            dwt[0] = -wt[0];                                                 \
            for (spl_k = 1; spl_k < (order); spl_k++)                        \
            {                                                                \
                dwt[spl_k] = wt[spl_k-1] - wt[spl_k];                        \
            }                                                                \
                                                                             \
            /* Final recursion step, from order-1 to order */                \
            inv           = 1.0/((order) - 1);                               \
            wt[(order)-1] = inv*frac*wt[(order)-2];                          \
            for (spl_l = 1; spl_l < ((order)-1); spl_l++)                    \
            {                                                                \
                wt[(order)-spl_l-1] =                                        \
                    inv*((frac+spl_l)*wt[(order)-spl_l-2]+                   \
                         ((order)-spl_l-frac)*wt[(order)-spl_l-1]);          \
            }                                                                \
            wt[0] = inv*(1 - frac)*wt[0];                                    \
                                                                             \
            /* Store the results for particle slot i */                      \
            for (spl_k = 0; spl_k < (order); spl_k++)                        \
            {                                                                \
                theta[spl_d][i*(order)+spl_k]  = wt[spl_k];                  \
                dtheta[spl_d][i*(order)+spl_k] = dwt[spl_k];                 \
            }                                                                \
        }                                                                    \
    }
2754
2755 void make_bsplines(splinevec theta, splinevec dtheta, int order,
2756                    rvec fractx[], int nr, int ind[], real coefficient[],
2757                    gmx_bool bDoSplines)
2758 {
2759     /* construct splines for local atoms */
2760     int  i, ii;
2761     real *xptr;
2762
2763     for (i = 0; i < nr; i++)
2764     {
2765         /* With free energy we do not use the coefficient check.
2766          * In most cases this will be more efficient than calling make_bsplines
2767          * twice, since usually more than half the particles have non-zero coefficients.
2768          */
2769         ii = ind[i];
2770         if (bDoSplines || coefficient[ii] != 0.0)
2771         {
2772             xptr = fractx[ii];
2773             switch (order)
2774             {
2775                 case 4:  CALC_SPLINE(4);     break;
2776                 case 5:  CALC_SPLINE(5);     break;
2777                 default: CALC_SPLINE(order); break;
2778             }
2779         }
2780     }
2781 }
2782
2783
2784 void make_dft_mod(real *mod, real *data, int ndata)
2785 {
2786     int i, j;
2787     real sc, ss, arg;
2788
2789     for (i = 0; i < ndata; i++)
2790     {
2791         sc = ss = 0;
2792         for (j = 0; j < ndata; j++)
2793         {
2794             arg = (2.0*M_PI*i*j)/ndata;
2795             sc += data[j]*cos(arg);
2796             ss += data[j]*sin(arg);
2797         }
2798         mod[i] = sc*sc+ss*ss;
2799     }
2800     for (i = 0; i < ndata; i++)
2801     {
2802         if (mod[i] < 1e-7)
2803         {
2804             mod[i] = (mod[i-1]+mod[i+1])*0.5;
2805         }
2806     }
2807 }
2808
2809
2810 static void make_bspline_moduli(splinevec bsp_mod,
2811                                 int nx, int ny, int nz, int order)
2812 {
2813     int nmax = max(nx, max(ny, nz));
2814     real *data, *ddata, *bsp_data;
2815     int i, k, l;
2816     real div;
2817
2818     snew(data, order);
2819     snew(ddata, order);
2820     snew(bsp_data, nmax);
2821
2822     data[order-1] = 0;
2823     data[1]       = 0;
2824     data[0]       = 1;
2825
2826     for (k = 3; k < order; k++)
2827     {
2828         div       = 1.0/(k-1.0);
2829         data[k-1] = 0;
2830         for (l = 1; l < (k-1); l++)
2831         {
2832             data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2833         }
2834         data[0] = div*data[0];
2835     }
2836     /* differentiate */
2837     ddata[0] = -data[0];
2838     for (k = 1; k < order; k++)
2839     {
2840         ddata[k] = data[k-1]-data[k];
2841     }
2842     div           = 1.0/(order-1);
2843     data[order-1] = 0;
2844     for (l = 1; l < (order-1); l++)
2845     {
2846         data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2847     }
2848     data[0] = div*data[0];
2849
2850     for (i = 0; i < nmax; i++)
2851     {
2852         bsp_data[i] = 0;
2853     }
2854     for (i = 1; i <= order; i++)
2855     {
2856         bsp_data[i] = data[i-1];
2857     }
2858
2859     make_dft_mod(bsp_mod[XX], bsp_data, nx);
2860     make_dft_mod(bsp_mod[YY], bsp_data, ny);
2861     make_dft_mod(bsp_mod[ZZ], bsp_data, nz);
2862
2863     sfree(data);
2864     sfree(ddata);
2865     sfree(bsp_data);
2866 }
2867
2868
/* Return the P3M optimal influence function for argument z and the given
 * interpolation order. Orders 2 up to and including 8 are supported;
 * for any other order 0.0 is returned.
 */
static double do_p3m_influence(double z, int order)
{
    const double z2   = z*z;
    const double z4   = z2*z2;
    double       infl = 0.0;

    /* The formula and most constants can be found in:
     * Ballenegger et al., JCTC 8, 936 (2012)
     */
    switch (order)
    {
        case 2:
            infl = 1.0 - 2.0*z2/3.0;
            break;
        case 3:
            infl = 1.0 - z2 + 2.0*z4/15.0;
            break;
        case 4:
            infl = 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
            break;
        case 5:
            infl = 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
            break;
        case 6:
            infl = 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
            break;
        case 7:
            infl = 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
            break;
        case 8:
            infl = 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
            break;
        default:
            /* Unsupported order: keep 0.0 */
            break;
    }

    return infl;
}
2906
2907 /* Calculate the P3M B-spline moduli for one dimension */
2908 static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
2909 {
2910     double zarg, zai, sinzai, infl;
2911     int    maxk, i;
2912
2913     if (order > 8)
2914     {
2915         gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
2916     }
2917
2918     zarg = M_PI/n;
2919
2920     maxk = (n + 1)/2;
2921
2922     for (i = -maxk; i < 0; i++)
2923     {
2924         zai          = zarg*i;
2925         sinzai       = sin(zai);
2926         infl         = do_p3m_influence(sinzai, order);
2927         bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
2928     }
2929     bsp_mod[0] = 1.0;
2930     for (i = 1; i < maxk; i++)
2931     {
2932         zai        = zarg*i;
2933         sinzai     = sin(zai);
2934         infl       = do_p3m_influence(sinzai, order);
2935         bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
2936     }
2937 }
2938
2939 /* Calculate the P3M B-spline moduli */
2940 static void make_p3m_bspline_moduli(splinevec bsp_mod,
2941                                     int nx, int ny, int nz, int order)
2942 {
2943     make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order);
2944     make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order);
2945     make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order);
2946 }
2947
2948
2949 static void setup_coordinate_communication(pme_atomcomm_t *atc)
2950 {
2951     int nslab, n, i;
2952     int fw, bw;
2953
2954     nslab = atc->nslab;
2955
2956     n = 0;
2957     for (i = 1; i <= nslab/2; i++)
2958     {
2959         fw = (atc->nodeid + i) % nslab;
2960         bw = (atc->nodeid - i + nslab) % nslab;
2961         if (n < nslab - 1)
2962         {
2963             atc->node_dest[n] = fw;
2964             atc->node_src[n]  = bw;
2965             n++;
2966         }
2967         if (n < nslab - 1)
2968         {
2969             atc->node_dest[n] = bw;
2970             atc->node_src[n]  = fw;
2971             n++;
2972         }
2973     }
2974 }
2975
2976 int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
2977 {
2978     int thread, i;
2979
2980     if (NULL != log)
2981     {
2982         fprintf(log, "Destroying PME data structures.\n");
2983     }
2984
2985     sfree((*pmedata)->nnx);
2986     sfree((*pmedata)->nny);
2987     sfree((*pmedata)->nnz);
2988
2989     for (i = 0; i < (*pmedata)->ngrids; ++i)
2990     {
2991         pmegrids_destroy(&(*pmedata)->pmegrid[i]);
2992         sfree((*pmedata)->fftgrid[i]);
2993         sfree((*pmedata)->cfftgrid[i]);
2994         gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]);
2995     }
2996
2997     sfree((*pmedata)->lb_buf1);
2998     sfree((*pmedata)->lb_buf2);
2999
3000     for (thread = 0; thread < (*pmedata)->nthread; thread++)
3001     {
3002         free_work(&(*pmedata)->work[thread]);
3003     }
3004     sfree((*pmedata)->work);
3005
3006     sfree(*pmedata);
3007     *pmedata = NULL;
3008
3009     return 0;
3010 }
3011
/* Return n rounded up to a multiple of f, computed with integer division
 * as ((n + f - 1)/f)*f.
 */
static int mult_up(int n, int f)
{
    int nblocks = (n + f - 1)/f;

    return nblocks*f;
}
3016
3017
3018 static double pme_load_imbalance(gmx_pme_t pme)
3019 {
3020     int    nma, nmi;
3021     double n1, n2, n3;
3022
3023     nma = pme->nnodes_major;
3024     nmi = pme->nnodes_minor;
3025
3026     n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
3027     n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
3028     n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
3029
3030     /* pme_solve is roughly double the cost of an fft */
3031
3032     return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
3033 }
3034
3035 static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc,
3036                           int dimind, gmx_bool bSpread)
3037 {
3038     int nk, k, s, thread;
3039
3040     atc->dimind    = dimind;
3041     atc->nslab     = 1;
3042     atc->nodeid    = 0;
3043     atc->pd_nalloc = 0;
3044 #ifdef GMX_MPI
3045     if (pme->nnodes > 1)
3046     {
3047         atc->mpi_comm = pme->mpi_comm_d[dimind];
3048         MPI_Comm_size(atc->mpi_comm, &atc->nslab);
3049         MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
3050     }
3051     if (debug)
3052     {
3053         fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
3054     }
3055 #endif
3056
3057     atc->bSpread   = bSpread;
3058     atc->pme_order = pme->pme_order;
3059
3060     if (atc->nslab > 1)
3061     {
3062         snew(atc->node_dest, atc->nslab);
3063         snew(atc->node_src, atc->nslab);
3064         setup_coordinate_communication(atc);
3065
3066         snew(atc->count_thread, pme->nthread);
3067         for (thread = 0; thread < pme->nthread; thread++)
3068         {
3069             snew(atc->count_thread[thread], atc->nslab);
3070         }
3071         atc->count = atc->count_thread[0];
3072         snew(atc->rcount, atc->nslab);
3073         snew(atc->buf_index, atc->nslab);
3074     }
3075
3076     atc->nthread = pme->nthread;
3077     if (atc->nthread > 1)
3078     {
3079         snew(atc->thread_plist, atc->nthread);
3080     }
3081     snew(atc->spline, atc->nthread);
3082     for (thread = 0; thread < atc->nthread; thread++)
3083     {
3084         if (atc->nthread > 1)
3085         {
3086             snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
3087             atc->thread_plist[thread].n += GMX_CACHE_SEP;
3088         }
3089         snew(atc->spline[thread].thread_one, pme->nthread);
3090         atc->spline[thread].thread_one[thread] = 1;
3091     }
3092 }
3093
3094 static void
3095 init_overlap_comm(pme_overlap_t *  ol,
3096                   int              norder,
3097 #ifdef GMX_MPI
3098                   MPI_Comm         comm,
3099 #endif
3100                   int              nnodes,
3101                   int              nodeid,
3102                   int              ndata,
3103                   int              commplainsize)
3104 {
3105     int lbnd, rbnd, maxlr, b, i;
3106     int exten;
3107     int nn, nk;
3108     pme_grid_comm_t *pgc;
3109     gmx_bool bCont;
3110     int fft_start, fft_end, send_index1, recv_index1;
3111 #ifdef GMX_MPI
3112     MPI_Status stat;
3113
3114     ol->mpi_comm = comm;
3115 #endif
3116
3117     ol->nnodes = nnodes;
3118     ol->nodeid = nodeid;
3119
3120     /* Linear translation of the PME grid won't affect reciprocal space
3121      * calculations, so to optimize we only interpolate "upwards",
3122      * which also means we only have to consider overlap in one direction.
3123      * I.e., particles on this node might also be spread to grid indices
3124      * that belong to higher nodes (modulo nnodes)
3125      */
3126
3127     snew(ol->s2g0, ol->nnodes+1);
3128     snew(ol->s2g1, ol->nnodes);
3129     if (debug)
3130     {
3131         fprintf(debug, "PME slab boundaries:");
3132     }
3133     for (i = 0; i < nnodes; i++)
3134     {
3135         /* s2g0 the local interpolation grid start.
3136          * s2g1 the local interpolation grid end.
3137          * Because grid overlap communication only goes forward,
3138          * the grid the slabs for fft's should be rounded down.
3139          */
3140         ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
3141         ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
3142
3143         if (debug)
3144         {
3145             fprintf(debug, "  %3d %3d", ol->s2g0[i], ol->s2g1[i]);
3146         }
3147     }
3148     ol->s2g0[nnodes] = ndata;
3149     if (debug)
3150     {
3151         fprintf(debug, "\n");
3152     }
3153
3154     /* Determine with how many nodes we need to communicate the grid overlap */
3155     b = 0;
3156     do
3157     {
3158         b++;
3159         bCont = FALSE;
3160         for (i = 0; i < nnodes; i++)
3161         {
3162             if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
3163                 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
3164             {
3165                 bCont = TRUE;
3166             }
3167         }
3168     }
3169     while (bCont && b < nnodes);
3170     ol->noverlap_nodes = b - 1;
3171
3172     snew(ol->send_id, ol->noverlap_nodes);
3173     snew(ol->recv_id, ol->noverlap_nodes);
3174     for (b = 0; b < ol->noverlap_nodes; b++)
3175     {
3176         ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
3177         ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
3178     }
3179     snew(ol->comm_data, ol->noverlap_nodes);
3180
3181     ol->send_size = 0;
3182     for (b = 0; b < ol->noverlap_nodes; b++)
3183     {
3184         pgc = &ol->comm_data[b];
3185         /* Send */
3186         fft_start        = ol->s2g0[ol->send_id[b]];
3187         fft_end          = ol->s2g0[ol->send_id[b]+1];
3188         if (ol->send_id[b] < nodeid)
3189         {
3190             fft_start += ndata;
3191             fft_end   += ndata;
3192         }
3193         send_index1       = ol->s2g1[nodeid];
3194         send_index1       = min(send_index1, fft_end);
3195         pgc->send_index0  = fft_start;
3196         pgc->send_nindex  = max(0, send_index1 - pgc->send_index0);
3197         ol->send_size    += pgc->send_nindex;
3198
3199         /* We always start receiving to the first index of our slab */
3200         fft_start        = ol->s2g0[ol->nodeid];
3201         fft_end          = ol->s2g0[ol->nodeid+1];
3202         recv_index1      = ol->s2g1[ol->recv_id[b]];
3203         if (ol->recv_id[b] > nodeid)
3204         {
3205             recv_index1 -= ndata;
3206         }
3207         recv_index1      = min(recv_index1, fft_end);
3208         pgc->recv_index0 = fft_start;
3209         pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
3210     }
3211
3212 #ifdef GMX_MPI
3213     /* Communicate the buffer sizes to receive */
3214     for (b = 0; b < ol->noverlap_nodes; b++)
3215     {
3216         MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
3217                      &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
3218                      ol->mpi_comm, &stat);
3219     }
3220 #endif
3221
3222     /* For non-divisible grid we need pme_order iso pme_order-1 */
3223     snew(ol->sendbuf, norder*commplainsize);
3224     snew(ol->recvbuf, norder*commplainsize);
3225 }
3226
3227 static void
3228 make_gridindex5_to_localindex(int n, int local_start, int local_range,
3229                               int **global_to_local,
3230                               real **fraction_shift)
3231 {
3232     int i;
3233     int * gtl;
3234     real * fsh;
3235
3236     snew(gtl, 5*n);
3237     snew(fsh, 5*n);
3238     for (i = 0; (i < 5*n); i++)
3239     {
3240         /* Determine the global to local grid index */
3241         gtl[i] = (i - local_start + n) % n;
3242         /* For coordinates that fall within the local grid the fraction
3243          * is correct, we don't need to shift it.
3244          */
3245         fsh[i] = 0;
3246         if (local_range < n)
3247         {
3248             /* Due to rounding issues i could be 1 beyond the lower or
3249              * upper boundary of the local grid. Correct the index for this.
3250              * If we shift the index, we need to shift the fraction by
3251              * the same amount in the other direction to not affect
3252              * the weights.
3253              * Note that due to this shifting the weights at the end of
3254              * the spline might change, but that will only involve values
3255              * between zero and values close to the precision of a real,
3256              * which is anyhow the accuracy of the whole mesh calculation.
3257              */
3258             /* With local_range=0 we should not change i=local_start */
3259             if (i % n != local_start)
3260             {
3261                 if (gtl[i] == n-1)
3262                 {
3263                     gtl[i] = 0;
3264                     fsh[i] = -1;
3265                 }
3266                 else if (gtl[i] == local_range)
3267                 {
3268                     gtl[i] = local_range - 1;
3269                     fsh[i] = 1;
3270                 }
3271             }
3272         }
3273     }
3274
3275     *global_to_local = gtl;
3276     *fraction_shift  = fsh;
3277 }
3278
3279 static pme_spline_work_t *make_pme_spline_work(int gmx_unused order)
3280 {
3281     pme_spline_work_t *work;
3282
3283 #ifdef PME_SIMD4_SPREAD_GATHER
3284     real             tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
3285     gmx_simd4_real_t zero_S;
3286     gmx_simd4_real_t real_mask_S0, real_mask_S1;
3287     int              of, i;
3288
3289     snew_aligned(work, 1, SIMD4_ALIGNMENT);
3290
3291     tmp_aligned = gmx_simd4_align_r(tmp);
3292
3293     zero_S = gmx_simd4_setzero_r();
3294
3295     /* Generate bit masks to mask out the unused grid entries,
3296      * as we only operate on order of the 8 grid entries that are
3297      * load into 2 SIMD registers.
3298      */
3299     for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
3300     {
3301         for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
3302         {
3303             tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
3304         }
3305         real_mask_S0      = gmx_simd4_load_r(tmp_aligned);
3306         real_mask_S1      = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
3307         work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
3308         work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
3309     }
3310 #else
3311     work = NULL;
3312 #endif
3313
3314     return work;
3315 }
3316
3317 void gmx_pme_check_restrictions(int pme_order,
3318                                 int nkx, int nky, int nkz,
3319                                 int nnodes_major,
3320                                 int nnodes_minor,
3321                                 gmx_bool bUseThreads,
3322                                 gmx_bool bFatal,
3323                                 gmx_bool *bValidSettings)
3324 {
3325     if (pme_order > PME_ORDER_MAX)
3326     {
3327         if (!bFatal)
3328         {
3329             *bValidSettings = FALSE;
3330             return;
3331         }
3332         gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3333                   pme_order, PME_ORDER_MAX);
3334     }
3335
3336     if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
3337         nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
3338         nkz <= pme_order)
3339     {
3340         if (!bFatal)
3341         {
3342             *bValidSettings = FALSE;
3343             return;
3344         }
3345         gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
3346                   pme_order);
3347     }
3348
3349     /* Check for a limitation of the (current) sum_fftgrid_dd code.
3350      * We only allow multiple communication pulses in dim 1, not in dim 0.
3351      */
3352     if (bUseThreads && (nkx < nnodes_major*pme_order &&
3353                         nkx != nnodes_major*(pme_order - 1)))
3354     {
3355         if (!bFatal)
3356         {
3357             *bValidSettings = FALSE;
3358             return;
3359         }
3360         gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
3361                   nkx/(double)nnodes_major, pme_order);
3362     }
3363
3364     if (bValidSettings != NULL)
3365     {
3366         *bValidSettings = TRUE;
3367     }
3368
3369     return;
3370 }
3371
3372 int gmx_pme_init(gmx_pme_t *         pmedata,
3373                  t_commrec *         cr,
3374                  int                 nnodes_major,
3375                  int                 nnodes_minor,
3376                  t_inputrec *        ir,
3377                  int                 homenr,
3378                  gmx_bool            bFreeEnergy_q,
3379                  gmx_bool            bFreeEnergy_lj,
3380                  gmx_bool            bReproducible,
3381                  int                 nthread)
3382 {
3383     gmx_pme_t pme = NULL;
3384
3385     int  use_threads, sum_use_threads, i;
3386     ivec ndata;
3387
3388     if (debug)
3389     {
3390         fprintf(debug, "Creating PME data structures.\n");
3391     }
3392     snew(pme, 1);
3393
3394     pme->sum_qgrid_tmp       = NULL;
3395     pme->sum_qgrid_dd_tmp    = NULL;
3396     pme->buf_nalloc          = 0;
3397
3398     pme->nnodes              = 1;
3399     pme->bPPnode             = TRUE;
3400
3401     pme->nnodes_major        = nnodes_major;
3402     pme->nnodes_minor        = nnodes_minor;
3403
3404 #ifdef GMX_MPI
3405     if (nnodes_major*nnodes_minor > 1)
3406     {
3407         pme->mpi_comm = cr->mpi_comm_mygroup;
3408
3409         MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
3410         MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
3411         if (pme->nnodes != nnodes_major*nnodes_minor)
3412         {
3413             gmx_incons("PME node count mismatch");
3414         }
3415     }
3416     else
3417     {
3418         pme->mpi_comm = MPI_COMM_NULL;
3419     }
3420 #endif
3421
3422     if (pme->nnodes == 1)
3423     {
3424 #ifdef GMX_MPI
3425         pme->mpi_comm_d[0] = MPI_COMM_NULL;
3426         pme->mpi_comm_d[1] = MPI_COMM_NULL;
3427 #endif
3428         pme->ndecompdim   = 0;
3429         pme->nodeid_major = 0;
3430         pme->nodeid_minor = 0;
3431 #ifdef GMX_MPI
3432         pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
3433 #endif
3434     }
3435     else
3436     {
3437         if (nnodes_minor == 1)
3438         {
3439 #ifdef GMX_MPI
3440             pme->mpi_comm_d[0] = pme->mpi_comm;
3441             pme->mpi_comm_d[1] = MPI_COMM_NULL;
3442 #endif
3443             pme->ndecompdim   = 1;
3444             pme->nodeid_major = pme->nodeid;
3445             pme->nodeid_minor = 0;
3446
3447         }
3448         else if (nnodes_major == 1)
3449         {
3450 #ifdef GMX_MPI
3451             pme->mpi_comm_d[0] = MPI_COMM_NULL;
3452             pme->mpi_comm_d[1] = pme->mpi_comm;
3453 #endif
3454             pme->ndecompdim   = 1;
3455             pme->nodeid_major = 0;
3456             pme->nodeid_minor = pme->nodeid;
3457         }
3458         else
3459         {
3460             if (pme->nnodes % nnodes_major != 0)
3461             {
3462                 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3463             }
3464             pme->ndecompdim = 2;
3465
3466 #ifdef GMX_MPI
3467             MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
3468                            pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
3469             MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
3470                            pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
3471
3472             MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
3473             MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
3474             MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
3475             MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
3476 #endif
3477         }
3478         pme->bPPnode = (cr->duty & DUTY_PP);
3479     }
3480
3481     pme->nthread = nthread;
3482
3483     /* Check if any of the PME MPI ranks uses threads */
3484     use_threads = (pme->nthread > 1 ? 1 : 0);
3485 #ifdef GMX_MPI
3486     if (pme->nnodes > 1)
3487     {
3488         MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
3489                       MPI_SUM, pme->mpi_comm);
3490     }
3491     else
3492 #endif
3493     {
3494         sum_use_threads = use_threads;
3495     }
3496     pme->bUseThreads = (sum_use_threads > 0);
3497
3498     if (ir->ePBC == epbcSCREW)
3499     {
3500         gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
3501     }
3502
3503     pme->bFEP_q      = ((ir->efep != efepNO) && bFreeEnergy_q);
3504     pme->bFEP_lj     = ((ir->efep != efepNO) && bFreeEnergy_lj);
3505     pme->bFEP        = (pme->bFEP_q || pme->bFEP_lj);
3506     pme->nkx         = ir->nkx;
3507     pme->nky         = ir->nky;
3508     pme->nkz         = ir->nkz;
3509     pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
3510     pme->pme_order   = ir->pme_order;
3511
3512     /* Always constant electrostatics coefficients */
3513     pme->epsilon_r   = ir->epsilon_r;
3514
3515     /* Always constant LJ coefficients */
3516     pme->ljpme_combination_rule = ir->ljpme_combination_rule;
3517
3518     /* If we violate restrictions, generate a fatal error here */
3519     gmx_pme_check_restrictions(pme->pme_order,
3520                                pme->nkx, pme->nky, pme->nkz,
3521                                pme->nnodes_major,
3522                                pme->nnodes_minor,
3523                                pme->bUseThreads,
3524                                TRUE,
3525                                NULL);
3526
3527     if (pme->nnodes > 1)
3528     {
3529         double imbal;
3530
3531 #ifdef GMX_MPI
3532         MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3533         MPI_Type_commit(&(pme->rvec_mpi));
3534 #endif
3535
3536         /* Note that the coefficient spreading and force gathering, which usually
3537          * takes about the same amount of time as FFT+solve_pme,
3538          * is always fully load balanced
3539          * (unless the coefficient distribution is inhomogeneous).
3540          */
3541
3542         imbal = pme_load_imbalance(pme);
3543         if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3544         {
3545             fprintf(stderr,
3546                     "\n"
3547                     "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3548                     "      For optimal PME load balancing\n"
3549                     "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3550                     "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3551                     "\n",
3552                     (int)((imbal-1)*100 + 0.5),
3553                     pme->nkx, pme->nky, pme->nnodes_major,
3554                     pme->nky, pme->nkz, pme->nnodes_minor);
3555         }
3556     }
3557
3558     /* For non-divisible grid we need pme_order iso pme_order-1 */
3559     /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3560      * y is always copied through a buffer: we don't need padding in z,
3561      * but we do need the overlap in x because of the communication order.
3562      */
3563     init_overlap_comm(&pme->overlap[0], pme->pme_order,
3564 #ifdef GMX_MPI
3565                       pme->mpi_comm_d[0],
3566 #endif
3567                       pme->nnodes_major, pme->nodeid_major,
3568                       pme->nkx,
3569                       (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3570
3571     /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
3572      * We do this with an offset buffer of equal size, so we need to allocate
3573      * extra for the offset. That's what the (+1)*pme->nkz is for.
3574      */
3575     init_overlap_comm(&pme->overlap[1], pme->pme_order,
3576 #ifdef GMX_MPI
3577                       pme->mpi_comm_d[1],
3578 #endif
3579                       pme->nnodes_minor, pme->nodeid_minor,
3580                       pme->nky,
3581                       (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
3582
3583     /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
3584      * Note that gmx_pme_check_restrictions checked for this already.
3585      */
3586     if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
3587     {
3588         gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
3589     }
3590
3591     snew(pme->bsp_mod[XX], pme->nkx);
3592     snew(pme->bsp_mod[YY], pme->nky);
3593     snew(pme->bsp_mod[ZZ], pme->nkz);
3594
3595     /* The required size of the interpolation grid, including overlap.
3596      * The allocated size (pmegrid_n?) might be slightly larger.
3597      */
3598     pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3599         pme->overlap[0].s2g0[pme->nodeid_major];
3600     pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3601         pme->overlap[1].s2g0[pme->nodeid_minor];
3602     pme->pmegrid_nz_base = pme->nkz;
3603     pme->pmegrid_nz      = pme->pmegrid_nz_base + pme->pme_order - 1;
3604     set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
3605
3606     pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3607     pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3608     pme->pmegrid_start_iz = 0;
3609
3610     make_gridindex5_to_localindex(pme->nkx,
3611                                   pme->pmegrid_start_ix,
3612                                   pme->pmegrid_nx - (pme->pme_order-1),
3613                                   &pme->nnx, &pme->fshx);
3614     make_gridindex5_to_localindex(pme->nky,
3615                                   pme->pmegrid_start_iy,
3616                                   pme->pmegrid_ny - (pme->pme_order-1),
3617                                   &pme->nny, &pme->fshy);
3618     make_gridindex5_to_localindex(pme->nkz,
3619                                   pme->pmegrid_start_iz,
3620                                   pme->pmegrid_nz_base,
3621                                   &pme->nnz, &pme->fshz);
3622
3623     pme->spline_work = make_pme_spline_work(pme->pme_order);
3624
3625     ndata[0]    = pme->nkx;
3626     ndata[1]    = pme->nky;
3627     ndata[2]    = pme->nkz;
3628     /* It doesn't matter if we allocate too many grids here,
3629      * we only allocate and use the ones we need.
3630      */
3631     if (EVDW_PME(ir->vdwtype))
3632     {
3633         pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
3634     }
3635     else
3636     {
3637         pme->ngrids = DO_Q;
3638     }
3639     snew(pme->fftgrid, pme->ngrids);
3640     snew(pme->cfftgrid, pme->ngrids);
3641     snew(pme->pfft_setup, pme->ngrids);
3642
3643     for (i = 0; i < pme->ngrids; ++i)
3644     {
3645         if ((i <  DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
3646                                                        bFreeEnergy_q)) ||
3647             (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
3648                                                     bFreeEnergy_lj ||
3649                                                     ir->ljpme_combination_rule == eljpmeLB)))
3650         {
3651             pmegrids_init(&pme->pmegrid[i],
3652                           pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3653                           pme->pmegrid_nz_base,
3654                           pme->pme_order,
3655                           pme->bUseThreads,
3656                           pme->nthread,
3657                           pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3658                           pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3659             /* This routine will allocate the grid data to fit the FFTs */
3660             gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
3661                                     &pme->fftgrid[i], &pme->cfftgrid[i],
3662                                     pme->mpi_comm_d,
3663                                     bReproducible, pme->nthread);
3664
3665         }
3666     }
3667
3668     if (!pme->bP3M)
3669     {
3670         /* Use plain SPME B-spline interpolation */
3671         make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3672     }
3673     else
3674     {
3675         /* Use the P3M grid-optimized influence function */
3676         make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3677     }
3678
3679     /* Use atc[0] for spreading */
3680     init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
3681     if (pme->ndecompdim >= 2)
3682     {
3683         init_atomcomm(pme, &pme->atc[1], 1, FALSE);
3684     }
3685
3686     if (pme->nnodes == 1)
3687     {
3688         pme->atc[0].n = homenr;
3689         pme_realloc_atomcomm_things(&pme->atc[0]);
3690     }
3691
3692     pme->lb_buf1       = NULL;
3693     pme->lb_buf2       = NULL;
3694     pme->lb_buf_nalloc = 0;
3695
3696     {
3697         int thread;
3698
3699         /* Use fft5d, order after FFT is y major, z, x minor */
3700
3701         snew(pme->work, pme->nthread);
3702         for (thread = 0; thread < pme->nthread; thread++)
3703         {
3704             realloc_work(&pme->work[thread], pme->nkx);
3705         }
3706     }
3707
3708     *pmedata = pme;
3709
3710     return 0;
3711 }
3712
3713 static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
3714 {
3715     int d, t;
3716
3717     for (d = 0; d < DIM; d++)
3718     {
3719         if (new->grid.n[d] > old->grid.n[d])
3720         {
3721             return;
3722         }
3723     }
3724
3725     sfree_aligned(new->grid.grid);
3726     new->grid.grid = old->grid.grid;
3727
3728     if (new->grid_th != NULL && new->nthread == old->nthread)
3729     {
3730         sfree_aligned(new->grid_all);
3731         for (t = 0; t < new->nthread; t++)
3732         {
3733             new->grid_th[t].grid = old->grid_th[t].grid;
3734         }
3735     }
3736 }
3737
3738 int gmx_pme_reinit(gmx_pme_t *         pmedata,
3739                    t_commrec *         cr,
3740                    gmx_pme_t           pme_src,
3741                    const t_inputrec *  ir,
3742                    ivec                grid_size)
3743 {
3744     t_inputrec irc;
3745     int homenr;
3746     int ret;
3747
3748     irc     = *ir;
3749     irc.nkx = grid_size[XX];
3750     irc.nky = grid_size[YY];
3751     irc.nkz = grid_size[ZZ];
3752
3753     if (pme_src->nnodes == 1)
3754     {
3755         homenr = pme_src->atc[0].n;
3756     }
3757     else
3758     {
3759         homenr = -1;
3760     }
3761
3762     ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
3763                        &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, pme_src->nthread);
3764
3765     if (ret == 0)
3766     {
3767         /* We can easily reuse the allocated pme grids in pme_src */
3768         reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
3769         /* We would like to reuse the fft grids, but that's harder */
3770     }
3771
3772     return ret;
3773 }
3774
3775
3776 static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids,
3777                             int grid_index, int thread, real *fftgrid)
3778 {
3779     ivec local_fft_ndata, local_fft_offset, local_fft_size;
3780     int  fft_my, fft_mz;
3781     int  nsx, nsy, nsz;
3782     ivec nf;
3783     int  offx, offy, offz, x, y, z, i0, i0t;
3784     int  d;
3785     pmegrid_t *pmegrid;
3786     real *grid_th;
3787
3788     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
3789                                    local_fft_ndata,
3790                                    local_fft_offset,
3791                                    local_fft_size);
3792     fft_my = local_fft_size[YY];
3793     fft_mz = local_fft_size[ZZ];
3794
3795     pmegrid = &pmegrids->grid_th[thread];
3796
3797     nsx = pmegrid->s[XX];
3798     nsy = pmegrid->s[YY];
3799     nsz = pmegrid->s[ZZ];
3800
3801     for (d = 0; d < DIM; d++)
3802     {
3803         nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3804                     local_fft_ndata[d] - pmegrid->offset[d]);
3805     }
3806
3807     offx = pmegrid->offset[XX];
3808     offy = pmegrid->offset[YY];
3809     offz = pmegrid->offset[ZZ];
3810
3811     /* Directly copy the non-overlapping parts of the local grids.
3812      * This also initializes the full grid.
3813      */
3814     grid_th = pmegrid->grid;
3815     for (x = 0; x < nf[XX]; x++)
3816     {
3817         for (y = 0; y < nf[YY]; y++)
3818         {
3819             i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3820             i0t = (x*nsy + y)*nsz;
3821             for (z = 0; z < nf[ZZ]; z++)
3822             {
3823                 fftgrid[i0+z] = grid_th[i0t+z];
3824             }
3825         }
3826     }
3827 }
3828
/* Add the grid contributions that other threads spread onto the region
 * handled by thread "thread".
 *
 * pme        : PME data; used for the FFT limits, the rank counts along
 *              both decomposition dimensions and pme_order
 * pmegrids   : the set of thread-local grids
 * thread     : index of the thread-local grid whose region is reduced
 * fftgrid    : destination for contributions that stay on this rank
 * commbuf_x  : destination for contributions that cross the x boundary only
 * commbuf_y  : destination for contributions that cross the y boundary;
 *              contributions that cross both x and y are stored behind the
 *              y-only data in this buffer
 * grid_index : index of the FFT setup used to query the local grid limits
 *
 * The own contribution of "thread" (sx == sy == sz == 0) is skipped here.
 */
static void
reduce_threadgrid_overlap(gmx_pme_t pme,
                          const pmegrids_t *pmegrids, int thread,
                          real *fftgrid, real *commbuf_x, real *commbuf_y,
                          int grid_index)
{
    ivec local_fft_ndata, local_fft_offset, local_fft_size;
    int  fft_nx, fft_ny, fft_nz;
    int  fft_my, fft_mz;
    int  buf_my = -1;
    int  nsx, nsy, nsz;
    ivec ne;
    int  offx, offy, offz, x, y, z, i0, i0t;
    int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
    gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
    gmx_bool bCommX, bCommY;
    int  d;
    int  thread_f;
    const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
    const real *grid_th;
    real *commbuf = NULL;

    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);
    /* fft_n? is used as the shift for data that wraps around and as the
     * stride of the communication buffers.
     */
    fft_nx = local_fft_ndata[XX];
    fft_ny = local_fft_ndata[YY];
    fft_nz = local_fft_ndata[ZZ];

    /* fft_m? is used as the stride when indexing fftgrid */
    fft_my = local_fft_size[YY];
    fft_mz = local_fft_size[ZZ];

    /* This routine is called when all threads have finished spreading.
     * Here each thread sums grid contributions calculated by other threads
     * to the thread local grid volume.
     * To minimize the number of grid copying operations,
     * this routine sums immediately from the pmegrid to the fftgrid.
     */

    /* Determine which part of the full node grid we should operate on,
     * this is our thread local part of the full grid.
     */
    pmegrid = &pmegrids->grid_th[thread];

    /* ne[d] is the end (exclusive) of our region along d, limited to
     * the local FFT grid.
     */
    for (d = 0; d < DIM; d++)
    {
        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
                    local_fft_ndata[d]);
    }

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];


    /* The first write to each communication buffer region assigns,
     * later writes accumulate; these flags track the first write.
     */
    bClearBufX  = TRUE;
    bClearBufY  = TRUE;
    bClearBufXY = TRUE;

    /* Now loop over all the thread data blocks that contribute
     * to the grid region we (our thread) are operating on.
     */
    /* Note that fft_nx/y is equal to the number of grid points
     * between the first point of our node grid and the one of the next node.
     */
    for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
    {
        fx     = pmegrid->ci[XX] + sx;
        ox     = 0;
        bCommX = FALSE;
        if (fx < 0)
        {
            /* Wrap around to the last thread cells along x; with more than
             * one rank along the major dimension this data goes to a
             * communication buffer instead of fftgrid.
             */
            fx    += pmegrids->nc[XX];
            ox    -= fft_nx;
            bCommX = (pme->nnodes_major > 1);
        }
        /* Only the x offset and size of thread cell (fx,0,0) are used here */
        pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
        ox       += pmegrid_g->offset[XX];
        if (!bCommX)
        {
            tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
        }
        else
        {
            tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
        }

        for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
        {
            fy     = pmegrid->ci[YY] + sy;
            oy     = 0;
            bCommY = FALSE;
            if (fy < 0)
            {
                fy    += pmegrids->nc[YY];
                oy    -= fft_ny;
                bCommY = (pme->nnodes_minor > 1);
            }
            /* Only the y offset and size of thread cell (0,fy,0) are used here */
            pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
            oy       += pmegrid_g->offset[YY];
            if (!bCommY)
            {
                ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
            }
            else
            {
                ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
            }

            for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
            {
                fz = pmegrid->ci[ZZ] + sz;
                oz = 0;
                if (fz < 0)
                {
                    /* There is no communication flag for z: wrapped z data
                     * is always handled locally.
                     */
                    fz += pmegrids->nc[ZZ];
                    oz -= fft_nz;
                }
                /* Only the z offset and size of thread cell (0,0,fz) are used here */
                pmegrid_g = &pmegrids->grid_th[fz];
                oz       += pmegrid_g->offset[ZZ];
                tz1       = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);

                if (sx == 0 && sy == 0 && sz == 0)
                {
                    /* We have already added our local contribution
                     * before calling this routine, so skip it here.
                     */
                    continue;
                }

                /* Index of the thread grid that holds the contribution */
                thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;

                pmegrid_f = &pmegrids->grid_th[thread_f];

                grid_th = pmegrid_f->grid;

                nsx = pmegrid_f->s[XX];
                nsy = pmegrid_f->s[YY];
                nsz = pmegrid_f->s[ZZ];

#ifdef DEBUG_PME_REDUCE
                printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
                       pme->nodeid, thread, thread_f,
                       pme->pmegrid_start_ix,
                       pme->pmegrid_start_iy,
                       pme->pmegrid_start_iz,
                       sx, sy, sz,
                       offx-ox, tx1-ox, offx, tx1,
                       offy-oy, ty1-oy, offy, ty1,
                       offz-oz, tz1-oz, offz, tz1);
#endif

                if (!(bCommX || bCommY))
                {
                    /* Copy from the thread local grid to the node grid */
                    for (x = offx; x < tx1; x++)
                    {
                        for (y = offy; y < ty1; y++)
                        {
                            /* i0t includes -oz, so that index z in the
                             * loop below maps to z - oz in the source grid.
                             */
                            i0  = (x*fft_my + y)*fft_mz;
                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
                            for (z = offz; z < tz1; z++)
                            {
                                fftgrid[i0+z] += grid_th[i0t+z];
                            }
                        }
                    }
                }
                else
                {
                    /* The order of this conditional decides
                     * where the corner volume gets stored with x+y decomp.
                     */
                    if (bCommY)
                    {
                        commbuf = commbuf_y;
                        buf_my  = ty1 - offy;
                        if (bCommX)
                        {
                            /* We index commbuf modulo the local grid size */
                            commbuf += buf_my*fft_nx*fft_nz;

                            bClearBuf   = bClearBufXY;
                            bClearBufXY = FALSE;
                        }
                        else
                        {
                            bClearBuf  = bClearBufY;
                            bClearBufY = FALSE;
                        }
                    }
                    else
                    {
                        commbuf    = commbuf_x;
                        buf_my     = fft_ny;
                        bClearBuf  = bClearBufX;
                        bClearBufX = FALSE;
                    }

                    /* Copy to the communication buffer */
                    for (x = offx; x < tx1; x++)
                    {
                        for (y = offy; y < ty1; y++)
                        {
                            i0  = (x*buf_my + y)*fft_nz;
                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;

                            if (bClearBuf)
                            {
                                /* First access of commbuf, initialize it */
                                for (z = offz; z < tz1; z++)
                                {
                                    commbuf[i0+z]  = grid_th[i0t+z];
                                }
                            }
                            else
                            {
                                for (z = offz; z < tz1; z++)
                                {
                                    commbuf[i0+z] += grid_th[i0t+z];
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
4059
4060
/* Exchange the overlap buffers between neighboring PME ranks and add the
 * received data to fftgrid.
 *
 * pme        : PME data holding the overlap communication setup and buffers
 * fftgrid    : local FFT grid to which the received contributions are added
 * grid_index : index of the FFT setup used to query the local grid limits
 *
 * The data in overlap[1] is communicated first, in one or more pulses.
 * Part of what is received there is added to the send buffer of overlap[0],
 * which is communicated afterwards in a single pulse.
 */
static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
{
    ivec local_fft_ndata, local_fft_offset, local_fft_size;
    pme_overlap_t *overlap;
    int  send_index0, send_nindex;
    int  recv_nindex;
#ifdef GMX_MPI
    MPI_Status stat;
#endif
    int  send_size_y, recv_size_y;
    int  ipulse, send_id, recv_id, datasize, gridsize, size_yx;
    real *sendptr, *recvptr;
    int  x, y, z, indg, indb;

    /* Note that this routine is only used for forward communication.
     * Since the force gathering, unlike the coefficient spreading,
     * can be trivially parallelized over the particles,
     * the backwards process is much simpler and can use the "old"
     * communication setup.
     */

    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);

    if (pme->nnodes_minor > 1)
    {
        /* Minor dimension */
        overlap = &pme->overlap[1];

        /* size_yx is the number of extra x-planes that travel along with
         * the data of this dimension and that are destined for the send
         * buffer of overlap[0].
         */
        if (pme->nnodes_major > 1)
        {
            size_yx = pme->overlap[0].comm_data[0].send_nindex;
        }
        else
        {
            size_yx = 0;
        }
        datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];

        send_size_y = overlap->send_size;

        for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
        {
            send_id       = overlap->send_id[ipulse];
            recv_id       = overlap->recv_id[ipulse];
            /* Offset of this pulse relative to the start of the first pulse */
            send_index0   =
                overlap->comm_data[ipulse].send_index0 -
                overlap->comm_data[0].send_index0;
            /* NOTE(review): send_nindex is assigned but not read in this loop */
            send_nindex   = overlap->comm_data[ipulse].send_nindex;
            /* We don't use recv_index0, as we always receive starting at 0 */
            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
            recv_size_y   = overlap->comm_data[ipulse].recv_size;

            sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
            recvptr = overlap->recvbuf;

#ifdef GMX_MPI
            MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
                         send_id, ipulse,
                         recvptr, recv_size_y*datasize, GMX_MPI_REAL,
                         recv_id, ipulse,
                         overlap->mpi_comm, &stat);
#endif

            /* Add the received data to the local FFT grid. The grid is
             * indexed with the strides in local_fft_size, the buffer with
             * recv_size_y and local_fft_ndata[ZZ].
             */
            for (x = 0; x < local_fft_ndata[XX]; x++)
            {
                for (y = 0; y < recv_nindex; y++)
                {
                    indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
                    indb = (x*recv_size_y        + y)*local_fft_ndata[ZZ];
                    for (z = 0; z < local_fft_ndata[ZZ]; z++)
                    {
                        fftgrid[indg+z] += recvptr[indb+z];
                    }
                }
            }

            if (pme->nnodes_major > 1)
            {
                /* Copy from the received buffer to the send buffer for dim 0 */
                /* The extra planes are stored in the received buffer after
                 * the local_fft_ndata[XX] planes that were summed above.
                 */
                sendptr = pme->overlap[0].sendbuf;
                for (x = 0; x < size_yx; x++)
                {
                    for (y = 0; y < recv_nindex; y++)
                    {
                        indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
                        indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
                        for (z = 0; z < local_fft_ndata[ZZ]; z++)
                        {
                            sendptr[indg+z] += recvptr[indb+z];
                        }
                    }
                }
            }
        }
    }

    /* We only support a single pulse here.
     * This is not a severe limitation, as this code is only used
     * with OpenMP and with OpenMP the (PME) domains can be larger.
     */
    if (pme->nnodes_major > 1)
    {
        /* Major dimension */
        overlap = &pme->overlap[0];

        datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
        /* NOTE(review): gridsize is assigned but not read in this function */
        gridsize = local_fft_size[YY] *local_fft_size[ZZ];

        ipulse = 0;

        send_id       = overlap->send_id[ipulse];
        recv_id       = overlap->recv_id[ipulse];
        send_nindex   = overlap->comm_data[ipulse].send_nindex;
        /* We don't use recv_index0, as we always receive starting at 0 */
        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;

        sendptr = overlap->sendbuf;
        recvptr = overlap->recvbuf;

        if (debug != NULL)
        {
            fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
                    send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
        }

#ifdef GMX_MPI
        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
                     send_id, ipulse,
                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
                     recv_id, ipulse,
                     overlap->mpi_comm, &stat);
#endif

        /* Add the received x-planes to the start of the local FFT grid */
        for (x = 0; x < recv_nindex; x++)
        {
            for (y = 0; y < local_fft_ndata[YY]; y++)
            {
                indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
                indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
                for (z = 0; z < local_fft_ndata[ZZ]; z++)
                {
                    fftgrid[indg+z] += recvptr[indb+z];
                }
            }
        }
    }
}
4211
4212
/* Compute the interpolation indices and B-splines for the atoms in atc
 * and/or spread their coefficients on the grid, using pme->nthread
 * OpenMP threads.
 *
 * pme          : PME data
 * atc          : atom communication data with the atoms to operate on
 * grids        : spreading grids; dereferenced whenever bSpread is set
 * bCalcSplines : when set, calc_interpolation_idx and make_bsplines are called
 * bSpread      : when set, the coefficients are spread on the grid
 * fftgrid      : FFT grid that receives the thread grid data when
 *                pme->bUseThreads is set
 * bDoSplines   : passed on to make_bsplines
 * grid_index   : index of the grid, passed on to the routines called here
 *
 * When both bSpread and pme->bUseThreads are set, the overlap between the
 * thread grids is reduced afterwards and, with more than one PME rank,
 * the overlap between ranks is communicated with sum_fftgrid_dd.
 */
static void spread_on_grid(gmx_pme_t pme,
                           pme_atomcomm_t *atc, pmegrids_t *grids,
                           gmx_bool bCalcSplines, gmx_bool bSpread,
                           real *fftgrid, gmx_bool bDoSplines, int grid_index)
{
    int nthread, thread;
#ifdef PME_TIME_THREADS
    /* Cycle counters for the three phases, accumulated over all calls.
     * Note that ct1a and cs1a are used under PME_TIME_SPREAD below, so that
     * macro requires PME_TIME_THREADS; cs1a has room for 6 threads only.
     */
    gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
    static double cs1     = 0, cs2 = 0, cs3 = 0;
    static double cs1a[6] = {0, 0, 0, 0, 0, 0};
    static int cnt        = 0;
#endif

    nthread = pme->nthread;
    assert(nthread > 0);

#ifdef PME_TIME_THREADS
    c1 = omp_cyc_start();
#endif
    if (bCalcSplines)
    {
#pragma omp parallel for num_threads(nthread) schedule(static)
        for (thread = 0; thread < nthread; thread++)
        {
            int start, end;

            /* Each thread handles a contiguous range of the atoms */
            start = atc->n* thread   /nthread;
            end   = atc->n*(thread+1)/nthread;

            /* Compute fftgrid index for all atoms,
             * with help of some extra variables.
             */
            calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
        }
    }
#ifdef PME_TIME_THREADS
    c1   = omp_cyc_end(c1);
    cs1 += (double)c1;
#endif

#ifdef PME_TIME_THREADS
    c2 = omp_cyc_start();
#endif
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (thread = 0; thread < nthread; thread++)
    {
        splinedata_t *spline;
        pmegrid_t *grid = NULL;

        /* make local bsplines  */
        if (grids == NULL || !pme->bUseThreads)
        {
            /* Without thread grids all atoms use spline data 0 and,
             * when spreading, the single grid in grids.
             * NOTE(review): this branch also runs when grids == NULL and
             * then takes &grids->grid if bSpread is set - confirm callers
             * never combine the two.
             */
            spline = &atc->spline[0];

            spline->n = atc->n;

            if (bSpread)
            {
                grid = &grids->grid;
            }
        }
        else
        {
            spline = &atc->spline[thread];

            if (grids->nthread == 1)
            {
                /* One thread, we operate on all coefficients */
                spline->n = atc->n;
            }
            else
            {
                /* Get the indices our thread should operate on */
                make_thread_local_ind(atc, thread, spline);
            }

            grid = &grids->grid_th[thread];
        }

        if (bCalcSplines)
        {
            make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
                          atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
        }

        if (bSpread)
        {
            /* put local atoms on grid. */
#ifdef PME_TIME_SPREAD
            ct1a = omp_cyc_start();
#endif
            spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);

            if (pme->bUseThreads)
            {
                copy_local_grid(pme, grids, grid_index, thread, fftgrid);
            }
#ifdef PME_TIME_SPREAD
            ct1a          = omp_cyc_end(ct1a);
            cs1a[thread] += (double)ct1a;
#endif
        }
    }
#ifdef PME_TIME_THREADS
    c2   = omp_cyc_end(c2);
    cs2 += (double)c2;
#endif

    if (bSpread && pme->bUseThreads)
    {
#ifdef PME_TIME_THREADS
        c3 = omp_cyc_start();
#endif
        /* This runs after the spreading loop above has completed,
         * so every thread grid is complete before it is read here.
         */
#pragma omp parallel for num_threads(grids->nthread) schedule(static)
        for (thread = 0; thread < grids->nthread; thread++)
        {
            reduce_threadgrid_overlap(pme, grids, thread,
                                      fftgrid,
                                      pme->overlap[0].sendbuf,
                                      pme->overlap[1].sendbuf,
                                      grid_index);
        }
#ifdef PME_TIME_THREADS
        c3   = omp_cyc_end(c3);
        cs3 += (double)c3;
#endif

        if (pme->nnodes > 1)
        {
            /* Communicate the overlapping part of the fftgrid.
             * For this communication call we need to check pme->bUseThreads
             * to have all ranks communicate here, regardless of pme->nthread.
             */
            sum_fftgrid_dd(pme, fftgrid, grid_index);
        }
    }

#ifdef PME_TIME_THREADS
    /* Print the accumulated timings once every 20 calls */
    cnt++;
    if (cnt % 20 == 0)
    {
        printf("idx %.2f spread %.2f red %.2f",
               cs1*1e-9, cs2*1e-9, cs3*1e-9);
#ifdef PME_TIME_SPREAD
        for (thread = 0; thread < nthread; thread++)
        {
            printf(" %.2f", cs1a[thread]*1e-9);
        }
#endif
        printf("\n");
    }
#endif
}
4366
4367
4368 static void dump_grid(FILE *fp,
4369                       int sx, int sy, int sz, int nx, int ny, int nz,
4370                       int my, int mz, const real *g)
4371 {
4372     int x, y, z;
4373
4374     for (x = 0; x < nx; x++)
4375     {
4376         for (y = 0; y < ny; y++)
4377         {
4378             for (z = 0; z < nz; z++)
4379             {
4380                 fprintf(fp, "%2d %2d %2d %6.3f\n",
4381                         sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
4382             }
4383         }
4384     }
4385 }
4386
4387 static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
4388 {
4389     ivec local_fft_ndata, local_fft_offset, local_fft_size;
4390
4391     gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
4392                                    local_fft_ndata,
4393                                    local_fft_offset,
4394                                    local_fft_size);
4395
4396     dump_grid(stderr,
4397               pme->pmegrid_start_ix,
4398               pme->pmegrid_start_iy,
4399               pme->pmegrid_start_iz,
4400               pme->pmegrid_nx-pme->pme_order+1,
4401               pme->pmegrid_ny-pme->pme_order+1,
4402               pme->pmegrid_nz-pme->pme_order+1,
4403               local_fft_size[YY],
4404               local_fft_size[ZZ],
4405               fftgrid);
4406 }
4407
4408
4409 void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
4410 {
4411     pme_atomcomm_t *atc;
4412     pmegrids_t *grid;
4413
4414     if (pme->nnodes > 1)
4415     {
4416         gmx_incons("gmx_pme_calc_energy called in parallel");
4417     }
4418     if (pme->bFEP_q > 1)
4419     {
4420         gmx_incons("gmx_pme_calc_energy with free energy");
4421     }
4422
4423     atc            = &pme->atc_energy;
4424     atc->nthread   = 1;
4425     if (atc->spline == NULL)
4426     {
4427         snew(atc->spline, atc->nthread);
4428     }
4429     atc->nslab     = 1;
4430     atc->bSpread   = TRUE;
4431     atc->pme_order = pme->pme_order;
4432     atc->n         = n;
4433     pme_realloc_atomcomm_things(atc);
4434     atc->x           = x;
4435     atc->coefficient = q;
4436
4437     /* We only use the A-charges grid */
4438     grid = &pme->pmegrid[PME_GRID_QA];
4439
4440     /* Only calculate the spline coefficients, don't actually spread */
4441     spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
4442
4443     *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
4444 }
4445
4446
4447 static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
4448                                    gmx_walltime_accounting_t walltime_accounting,
4449                                    t_nrnb *nrnb, t_inputrec *ir,
4450                                    gmx_int64_t step)
4451 {
4452     /* Reset all the counters related to performance over the run */
4453     wallcycle_stop(wcycle, ewcRUN);
4454     wallcycle_reset_all(wcycle);
4455     init_nrnb(nrnb);
4456     if (ir->nsteps >= 0)
4457     {
4458         /* ir->nsteps is not used here, but we update it for consistency */
4459         ir->nsteps -= step - ir->init_step;
4460     }
4461     ir->init_step = step;
4462     wallcycle_start(wcycle, ewcRUN);
4463     walltime_accounting_start(walltime_accounting);
4464 }
4465
4466
4467 static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
4468                                ivec grid_size,
4469                                t_commrec *cr, t_inputrec *ir,
4470                                gmx_pme_t *pme_ret)
4471 {
4472     int ind;
4473     gmx_pme_t pme = NULL;
4474
4475     ind = 0;
4476     while (ind < *npmedata)
4477     {
4478         pme = (*pmedata)[ind];
4479         if (pme->nkx == grid_size[XX] &&
4480             pme->nky == grid_size[YY] &&
4481             pme->nkz == grid_size[ZZ])
4482         {
4483             *pme_ret = pme;
4484
4485             return;
4486         }
4487
4488         ind++;
4489     }
4490
4491     (*npmedata)++;
4492     srenew(*pmedata, *npmedata);
4493
4494     /* Generate a new PME data structure, copying part of the old pointers */
4495     gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
4496
4497     *pme_ret = (*pmedata)[ind];
4498 }
4499
4500 int gmx_pmeonly(gmx_pme_t pme,
4501                 t_commrec *cr,    t_nrnb *mynrnb,
4502                 gmx_wallcycle_t wcycle,
4503                 gmx_walltime_accounting_t walltime_accounting,
4504                 real ewaldcoeff_q, real ewaldcoeff_lj,
4505                 t_inputrec *ir)
4506 {
4507     int npmedata;
4508     gmx_pme_t *pmedata;
4509     gmx_pme_pp_t pme_pp;
4510     int  ret;
4511     int  natoms;
4512     matrix box;
4513     rvec *x_pp      = NULL, *f_pp = NULL;
4514     real *chargeA   = NULL, *chargeB = NULL;
4515     real *c6A       = NULL, *c6B = NULL;
4516     real *sigmaA    = NULL, *sigmaB = NULL;
4517     real lambda_q   = 0;
4518     real lambda_lj  = 0;
4519     int  maxshift_x = 0, maxshift_y = 0;
4520     real energy_q, energy_lj, dvdlambda_q, dvdlambda_lj;
4521     matrix vir_q, vir_lj;
4522     float cycles;
4523     int  count;
4524     gmx_bool bEnerVir;
4525     int pme_flags;
4526     gmx_int64_t step, step_rel;
4527     ivec grid_switch;
4528
4529     /* This data will only use with PME tuning, i.e. switching PME grids */
4530     npmedata = 1;
4531     snew(pmedata, npmedata);
4532     pmedata[0] = pme;
4533
4534     pme_pp = gmx_pme_pp_init(cr);
4535
4536     init_nrnb(mynrnb);
4537
4538     count = 0;
4539     do /****** this is a quasi-loop over time steps! */
4540     {
4541         /* The reason for having a loop here is PME grid tuning/switching */
4542         do
4543         {
4544             /* Domain decomposition */
4545             ret = gmx_pme_recv_coeffs_coords(pme_pp,
4546                                              &natoms,
4547                                              &chargeA, &chargeB,
4548                                              &c6A, &c6B,
4549                                              &sigmaA, &sigmaB,
4550                                              box, &x_pp, &f_pp,
4551                                              &maxshift_x, &maxshift_y,
4552                                              &pme->bFEP_q, &pme->bFEP_lj,
4553                                              &lambda_q, &lambda_lj,
4554                                              &bEnerVir,
4555                                              &pme_flags,
4556                                              &step,
4557                                              grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj);
4558
4559             if (ret == pmerecvqxSWITCHGRID)
4560             {
4561                 /* Switch the PME grid to grid_switch */
4562                 gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
4563             }
4564
4565             if (ret == pmerecvqxRESETCOUNTERS)
4566             {
4567                 /* Reset the cycle and flop counters */
4568                 reset_pmeonly_counters(wcycle, walltime_accounting, mynrnb, ir, step);
4569             }
4570         }
4571         while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
4572
4573         if (ret == pmerecvqxFINISH)
4574         {
4575             /* We should stop: break out of the loop */
4576             break;
4577         }
4578
4579         step_rel = step - ir->init_step;
4580
4581         if (count == 0)
4582         {
4583             wallcycle_start(wcycle, ewcRUN);
4584             walltime_accounting_start(walltime_accounting);
4585         }
4586
4587         wallcycle_start(wcycle, ewcPMEMESH);
4588
4589         dvdlambda_q  = 0;
4590         dvdlambda_lj = 0;
4591         clear_mat(vir_q);
4592         clear_mat(vir_lj);
4593
4594         gmx_pme_do(pme, 0, natoms, x_pp, f_pp,
4595                    chargeA, chargeB, c6A, c6B, sigmaA, sigmaB, box,
4596                    cr, maxshift_x, maxshift_y, mynrnb, wcycle,
4597                    vir_q, ewaldcoeff_q, vir_lj, ewaldcoeff_lj,
4598                    &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
4599                    pme_flags | GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
4600
4601         cycles = wallcycle_stop(wcycle, ewcPMEMESH);
4602
4603         gmx_pme_send_force_vir_ener(pme_pp,
4604                                     f_pp, vir_q, energy_q, vir_lj, energy_lj,
4605                                     dvdlambda_q, dvdlambda_lj, cycles);
4606
4607         count++;
4608     } /***** end of quasi-loop, we stop with the break above */
4609     while (TRUE);
4610
4611     walltime_accounting_end(walltime_accounting);
4612
4613     return 0;
4614 }
4615
4616 static void
4617 calc_initial_lb_coeffs(gmx_pme_t pme, real *local_c6, real *local_sigma)
4618 {
4619     int  i;
4620
4621     for (i = 0; i < pme->atc[0].n; ++i)
4622     {
4623         real sigma4;
4624
4625         sigma4                     = local_sigma[i];
4626         sigma4                     = sigma4*sigma4;
4627         sigma4                     = sigma4*sigma4;
4628         pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
4629     }
4630 }
4631
4632 static void
4633 calc_next_lb_coeffs(gmx_pme_t pme, real *local_sigma)
4634 {
4635     int  i;
4636
4637     for (i = 0; i < pme->atc[0].n; ++i)
4638     {
4639         pme->atc[0].coefficient[i] *= local_sigma[i];
4640     }
4641 }
4642
4643 static void
4644 do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
4645                      gmx_bool bFirst, rvec x[], real *data)
4646 {
4647     int      d;
4648     pme_atomcomm_t *atc;
4649     atc = &pme->atc[0];
4650
4651     for (d = pme->ndecompdim - 1; d >= 0; d--)
4652     {
4653         int             n_d;
4654         rvec           *x_d;
4655         real           *param_d;
4656
4657         if (d == pme->ndecompdim - 1)
4658         {
4659             n_d     = homenr;
4660             x_d     = x + start;
4661             param_d = data;
4662         }
4663         else
4664         {
4665             n_d     = pme->atc[d + 1].n;
4666             x_d     = atc->x;
4667             param_d = atc->coefficient;
4668         }
4669         atc      = &pme->atc[d];
4670         atc->npd = n_d;
4671         if (atc->npd > atc->pd_nalloc)
4672         {
4673             atc->pd_nalloc = over_alloc_dd(atc->npd);
4674             srenew(atc->pd, atc->pd_nalloc);
4675         }
4676         pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
4677         where();
4678         /* Redistribute x (only once) and qA/c6A or qB/c6B */
4679         if (DOMAINDECOMP(cr))
4680         {
4681             dd_pmeredist_pos_coeffs(pme, n_d, bFirst, x_d, param_d, atc);
4682         }
4683     }
4684 }
4685
4686 int gmx_pme_do(gmx_pme_t pme,
4687                int start,       int homenr,
4688                rvec x[],        rvec f[],
4689                real *chargeA,   real *chargeB,
4690                real *c6A,       real *c6B,
4691                real *sigmaA,    real *sigmaB,
4692                matrix box, t_commrec *cr,
4693                int  maxshift_x, int maxshift_y,
4694                t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
4695                matrix vir_q,      real ewaldcoeff_q,
4696                matrix vir_lj,   real ewaldcoeff_lj,
4697                real *energy_q,  real *energy_lj,
4698                real lambda_q, real lambda_lj,
4699                real *dvdlambda_q, real *dvdlambda_lj,
4700                int flags)
4701 {
4702     int     d, i, j, k, ntot, npme, grid_index, max_grid_index;
4703     int     nx, ny, nz;
4704     int     n_d, local_ny;
4705     pme_atomcomm_t *atc = NULL;
4706     pmegrids_t *pmegrid = NULL;
4707     real    *grid       = NULL;
4708     real    *ptr;
4709     rvec    *x_d, *f_d;
4710     real    *coefficient = NULL;
4711     real    energy_AB[4];
4712     matrix  vir_AB[4];
4713     real    scale, lambda;
4714     gmx_bool bClearF;
4715     gmx_parallel_3dfft_t pfft_setup;
4716     real *  fftgrid;
4717     t_complex * cfftgrid;
4718     int     thread;
4719     gmx_bool bFirst, bDoSplines;
4720     int fep_state;
4721     int fep_states_lj           = pme->bFEP_lj ? 2 : 1;
4722     const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4723     const gmx_bool bCalcF       = flags & GMX_PME_CALC_F;
4724
4725     assert(pme->nnodes > 0);
4726     assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4727
4728     if (pme->nnodes > 1)
4729     {
4730         atc      = &pme->atc[0];
4731         atc->npd = homenr;
4732         if (atc->npd > atc->pd_nalloc)
4733         {
4734             atc->pd_nalloc = over_alloc_dd(atc->npd);
4735             srenew(atc->pd, atc->pd_nalloc);
4736         }
4737         for (d = pme->ndecompdim-1; d >= 0; d--)
4738         {
4739             atc           = &pme->atc[d];
4740             atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4741         }
4742     }
4743     else
4744     {
4745         atc = &pme->atc[0];
4746         /* This could be necessary for TPI */
4747         pme->atc[0].n = homenr;
4748         if (DOMAINDECOMP(cr))
4749         {
4750             pme_realloc_atomcomm_things(atc);
4751         }
4752         atc->x = x;
4753         atc->f = f;
4754     }
4755
4756     m_inv_ur0(box, pme->recipbox);
4757     bFirst = TRUE;
4758
4759     /* For simplicity, we construct the splines for all particles if
4760      * more than one PME calculations is needed. Some optimization
4761      * could be done by keeping track of which atoms have splines
4762      * constructed, and construct new splines on each pass for atoms
4763      * that don't yet have them.
4764      */
4765
4766     bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
4767
4768     /* We need a maximum of four separate PME calculations:
4769      * grid_index=0: Coulomb PME with charges from state A
4770      * grid_index=1: Coulomb PME with charges from state B
4771      * grid_index=2: LJ PME with C6 from state A
4772      * grid_index=3: LJ PME with C6 from state B
4773      * For Lorentz-Berthelot combination rules, a separate loop is used to
4774      * calculate all the terms
4775      */
4776
4777     /* If we are doing LJ-PME with LB, we only do Q here */
4778     max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
4779
4780     for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
4781     {
4782         /* Check if we should do calculations at this grid_index
4783          * If grid_index is odd we should be doing FEP
4784          * If grid_index < 2 we should be doing electrostatic PME
4785          * If grid_index >= 2 we should be doing LJ-PME
4786          */
4787         if ((grid_index <  DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
4788                                     (grid_index == 1 && !pme->bFEP_q))) ||
4789             (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
4790                                     (grid_index == 3 && !pme->bFEP_lj))))
4791         {
4792             continue;
4793         }
4794         /* Unpack structure */
4795         pmegrid    = &pme->pmegrid[grid_index];
4796         fftgrid    = pme->fftgrid[grid_index];
4797         cfftgrid   = pme->cfftgrid[grid_index];
4798         pfft_setup = pme->pfft_setup[grid_index];
4799         switch (grid_index)
4800         {
4801             case 0: coefficient = chargeA + start; break;
4802             case 1: coefficient = chargeB + start; break;
4803             case 2: coefficient = c6A + start; break;
4804             case 3: coefficient = c6B + start; break;
4805         }
4806
4807         grid = pmegrid->grid.grid;
4808
4809         if (debug)
4810         {
4811             fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
4812                     cr->nnodes, cr->nodeid);
4813             fprintf(debug, "Grid = %p\n", (void*)grid);
4814             if (grid == NULL)
4815             {
4816                 gmx_fatal(FARGS, "No grid!");
4817             }
4818         }
4819         where();
4820
4821         if (pme->nnodes == 1)
4822         {
4823             atc->coefficient = coefficient;
4824         }
4825         else
4826         {
4827             wallcycle_start(wcycle, ewcPME_REDISTXF);
4828             do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
4829             where();
4830
4831             wallcycle_stop(wcycle, ewcPME_REDISTXF);
4832         }
4833
4834         if (debug)
4835         {
4836             fprintf(debug, "Node= %6d, pme local particles=%6d\n",
4837                     cr->nodeid, atc->n);
4838         }
4839
4840         if (flags & GMX_PME_SPREAD)
4841         {
4842             wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4843
4844             /* Spread the coefficients on a grid */
4845             spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
4846
4847             if (bFirst)
4848             {
4849                 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
4850             }
4851             inc_nrnb(nrnb, eNR_SPREADBSP,
4852                      pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4853
4854             if (!pme->bUseThreads)
4855             {
4856                 wrap_periodic_pmegrid(pme, grid);
4857
4858                 /* sum contributions to local grid from other nodes */
4859 #ifdef GMX_MPI
4860                 if (pme->nnodes > 1)
4861                 {
4862                     gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
4863                     where();
4864                 }
4865 #endif
4866
4867                 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
4868             }
4869
4870             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4871
4872             /*
4873                dump_local_fftgrid(pme,fftgrid);
4874                exit(0);
4875              */
4876         }
4877
4878         /* Here we start a large thread parallel region */
4879 #pragma omp parallel num_threads(pme->nthread) private(thread)
4880         {
4881             thread = gmx_omp_get_thread_num();
4882             if (flags & GMX_PME_SOLVE)
4883             {
4884                 int loop_count;
4885
4886                 /* do 3d-fft */
4887                 if (thread == 0)
4888                 {
4889                     wallcycle_start(wcycle, ewcPME_FFT);
4890                 }
4891                 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
4892                                            thread, wcycle);
4893                 if (thread == 0)
4894                 {
4895                     wallcycle_stop(wcycle, ewcPME_FFT);
4896                 }
4897                 where();
4898
4899                 /* solve in k-space for our local cells */
4900                 if (thread == 0)
4901                 {
4902                     wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
4903                 }
4904                 if (grid_index < DO_Q)
4905                 {
4906                     loop_count =
4907                         solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
4908                                       box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4909                                       bCalcEnerVir,
4910                                       pme->nthread, thread);
4911                 }
4912                 else
4913                 {
4914                     loop_count =
4915                         solve_pme_lj_yzx(pme, &cfftgrid, FALSE, ewaldcoeff_lj,
4916                                          box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4917                                          bCalcEnerVir,
4918                                          pme->nthread, thread);
4919                 }
4920
4921                 if (thread == 0)
4922                 {
4923                     wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
4924                     where();
4925                     inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
4926                 }
4927             }
4928
4929             if (bCalcF)
4930             {
4931                 /* do 3d-invfft */
4932                 if (thread == 0)
4933                 {
4934                     where();
4935                     wallcycle_start(wcycle, ewcPME_FFT);
4936                 }
4937                 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
4938                                            thread, wcycle);
4939                 if (thread == 0)
4940                 {
4941                     wallcycle_stop(wcycle, ewcPME_FFT);
4942
4943                     where();
4944
4945                     if (pme->nodeid == 0)
4946                     {
4947                         ntot  = pme->nkx*pme->nky*pme->nkz;
4948                         npme  = ntot*log((real)ntot)/log(2.0);
4949                         inc_nrnb(nrnb, eNR_FFT, 2*npme);
4950                     }
4951
4952                     wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4953                 }
4954
4955                 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
4956             }
4957         }
4958         /* End of thread parallel section.
4959          * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4960          */
4961
4962         if (bCalcF)
4963         {
4964             /* distribute local grid to all nodes */
4965 #ifdef GMX_MPI
4966             if (pme->nnodes > 1)
4967             {
4968                 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
4969             }
4970 #endif
4971             where();
4972
4973             unwrap_periodic_pmegrid(pme, grid);
4974
4975             /* interpolate forces for our local atoms */
4976
4977             where();
4978
4979             /* If we are running without parallelization,
4980              * atc->f is the actual force array, not a buffer,
4981              * therefore we should not clear it.
4982              */
4983             lambda  = grid_index < DO_Q ? lambda_q : lambda_lj;
4984             bClearF = (bFirst && PAR(cr));
4985 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4986             for (thread = 0; thread < pme->nthread; thread++)
4987             {
4988                 gather_f_bsplines(pme, grid, bClearF, atc,
4989                                   &atc->spline[thread],
4990                                   pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
4991             }
4992
4993             where();
4994
4995             inc_nrnb(nrnb, eNR_GATHERFBSP,
4996                      pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4997             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4998         }
4999
5000         if (bCalcEnerVir)
5001         {
5002             /* This should only be called on the master thread
5003              * and after the threads have synchronized.
5004              */
5005             if (grid_index < 2)
5006             {
5007                 get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
5008             }
5009             else
5010             {
5011                 get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
5012             }
5013         }
5014         bFirst = FALSE;
5015     } /* of grid_index-loop */
5016
5017     /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
5018      * seven terms. */
5019
5020     if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
5021     {
5022         /* Loop over A- and B-state if we are doing FEP */
5023         for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
5024         {
5025             real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
5026             if (pme->nnodes == 1)
5027             {
5028                 if (pme->lb_buf1 == NULL)
5029                 {
5030                     pme->lb_buf_nalloc = pme->atc[0].n;
5031                     snew(pme->lb_buf1, pme->lb_buf_nalloc);
5032                 }
5033                 pme->atc[0].coefficient = pme->lb_buf1;
5034                 switch (fep_state)
5035                 {
5036                     case 0:
5037                         local_c6      = c6A;
5038                         local_sigma   = sigmaA;
5039                         break;
5040                     case 1:
5041                         local_c6      = c6B;
5042                         local_sigma   = sigmaB;
5043                         break;
5044                     default:
5045                         gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
5046                 }
5047             }
5048             else
5049             {
5050                 atc = &pme->atc[0];
5051                 switch (fep_state)
5052                 {
5053                     case 0:
5054                         RedistC6      = c6A;
5055                         RedistSigma   = sigmaA;
5056                         break;
5057                     case 1:
5058                         RedistC6      = c6B;
5059                         RedistSigma   = sigmaB;
5060                         break;
5061                     default:
5062                         gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
5063                 }
5064                 wallcycle_start(wcycle, ewcPME_REDISTXF);
5065
5066                 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
5067                 if (pme->lb_buf_nalloc < atc->n)
5068                 {
5069                     pme->lb_buf_nalloc = atc->nalloc;
5070                     srenew(pme->lb_buf1, pme->lb_buf_nalloc);
5071                     srenew(pme->lb_buf2, pme->lb_buf_nalloc);
5072                 }
5073                 local_c6 = pme->lb_buf1;
5074                 for (i = 0; i < atc->n; ++i)
5075                 {
5076                     local_c6[i] = atc->coefficient[i];
5077                 }
5078                 where();
5079
5080                 do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
5081                 local_sigma = pme->lb_buf2;
5082                 for (i = 0; i < atc->n; ++i)
5083                 {
5084                     local_sigma[i] = atc->coefficient[i];
5085                 }
5086                 where();
5087
5088                 wallcycle_stop(wcycle, ewcPME_REDISTXF);
5089             }
5090             calc_initial_lb_coeffs(pme, local_c6, local_sigma);
5091
5092             /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
5093             for (grid_index = 2; grid_index < 9; ++grid_index)
5094             {
5095                 /* Unpack structure */
5096                 pmegrid    = &pme->pmegrid[grid_index];
5097                 fftgrid    = pme->fftgrid[grid_index];
5098                 cfftgrid   = pme->cfftgrid[grid_index];
5099                 pfft_setup = pme->pfft_setup[grid_index];
5100                 calc_next_lb_coeffs(pme, local_sigma);
5101                 grid = pmegrid->grid.grid;
5102                 where();
5103
5104                 if (flags & GMX_PME_SPREAD)
5105                 {
5106                     wallcycle_start(wcycle, ewcPME_SPREADGATHER);
5107                     /* Spread the c6 on a grid */
5108                     spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
5109
5110                     if (bFirst)
5111                     {
5112                         inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
5113                     }
5114
5115                     inc_nrnb(nrnb, eNR_SPREADBSP,
5116                              pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
5117                     if (pme->nthread == 1)
5118                     {
5119                         wrap_periodic_pmegrid(pme, grid);
5120                         /* sum contributions to local grid from other nodes */
5121 #ifdef GMX_MPI
5122                         if (pme->nnodes > 1)
5123                         {
5124                             gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
5125                             where();
5126                         }
5127 #endif
5128                         copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
5129                     }
5130                     wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
5131                 }
5132                 /*Here we start a large thread parallel region*/
5133 #pragma omp parallel num_threads(pme->nthread) private(thread)
5134                 {
5135                     thread = gmx_omp_get_thread_num();
5136                     if (flags & GMX_PME_SOLVE)
5137                     {
5138                         /* do 3d-fft */
5139                         if (thread == 0)
5140                         {
5141                             wallcycle_start(wcycle, ewcPME_FFT);
5142                         }
5143
5144                         gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
5145                                                    thread, wcycle);
5146                         if (thread == 0)
5147                         {
5148                             wallcycle_stop(wcycle, ewcPME_FFT);
5149                         }
5150                         where();
5151                     }
5152                 }
5153                 bFirst = FALSE;
5154             }
5155             if (flags & GMX_PME_SOLVE)
5156             {
5157                 /* solve in k-space for our local cells */
5158 #pragma omp parallel num_threads(pme->nthread) private(thread)
5159                 {
5160                     int loop_count;
5161                     thread = gmx_omp_get_thread_num();
5162                     if (thread == 0)
5163                     {
5164                         wallcycle_start(wcycle, ewcLJPME);
5165                     }
5166
5167                     loop_count =
5168                         solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
5169                                          box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
5170                                          bCalcEnerVir,
5171                                          pme->nthread, thread);
5172                     if (thread == 0)
5173                     {
5174                         wallcycle_stop(wcycle, ewcLJPME);
5175                         where();
5176                         inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
5177                     }
5178                 }
5179             }
5180
5181             if (bCalcEnerVir)
5182             {
5183                 /* This should only be called on the master thread and
5184                  * after the threads have synchronized.
5185                  */
5186                 get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
5187             }
5188
5189             if (bCalcF)
5190             {
5191                 bFirst = !(flags & GMX_PME_DO_COULOMB);
5192                 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
5193                 for (grid_index = 8; grid_index >= 2; --grid_index)
5194                 {
5195                     /* Unpack structure */
5196                     pmegrid    = &pme->pmegrid[grid_index];
5197                     fftgrid    = pme->fftgrid[grid_index];
5198                     cfftgrid   = pme->cfftgrid[grid_index];
5199                     pfft_setup = pme->pfft_setup[grid_index];
5200                     grid       = pmegrid->grid.grid;
5201                     calc_next_lb_coeffs(pme, local_sigma);
5202                     where();
5203 #pragma omp parallel num_threads(pme->nthread) private(thread)
5204                     {
5205                         thread = gmx_omp_get_thread_num();
5206                         /* do 3d-invfft */
5207                         if (thread == 0)
5208                         {
5209                             where();
5210                             wallcycle_start(wcycle, ewcPME_FFT);
5211                         }
5212
5213                         gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
5214                                                    thread, wcycle);
5215                         if (thread == 0)
5216                         {
5217                             wallcycle_stop(wcycle, ewcPME_FFT);
5218
5219                             where();
5220
5221                             if (pme->nodeid == 0)
5222                             {
5223                                 ntot  = pme->nkx*pme->nky*pme->nkz;
5224                                 npme  = ntot*log((real)ntot)/log(2.0);
5225                                 inc_nrnb(nrnb, eNR_FFT, 2*npme);
5226                             }
5227                             wallcycle_start(wcycle, ewcPME_SPREADGATHER);
5228                         }
5229
5230                         copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
5231
5232                     } /*#pragma omp parallel*/
5233
5234                     /* distribute local grid to all nodes */
5235 #ifdef GMX_MPI
5236                     if (pme->nnodes > 1)
5237                     {
5238                         gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
5239                     }
5240 #endif
5241                     where();
5242
5243                     unwrap_periodic_pmegrid(pme, grid);
5244
5245                     /* interpolate forces for our local atoms */
5246                     where();
5247                     bClearF = (bFirst && PAR(cr));
5248                     scale   = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
5249                     scale  *= lb_scale_factor[grid_index-2];
5250 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
5251                     for (thread = 0; thread < pme->nthread; thread++)
5252                     {
5253                         gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
5254                                           &pme->atc[0].spline[thread],
5255                                           scale);
5256                     }
5257                     where();
5258
5259                     inc_nrnb(nrnb, eNR_GATHERFBSP,
5260                              pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
5261                     wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
5262
5263                     bFirst = FALSE;
5264                 } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
5265             }     /* if (bCalcF) */
5266         }         /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
5267     }             /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
5268
5269     if (bCalcF && pme->nnodes > 1)
5270     {
5271         wallcycle_start(wcycle, ewcPME_REDISTXF);
5272         for (d = 0; d < pme->ndecompdim; d++)
5273         {
5274             atc = &pme->atc[d];
5275             if (d == pme->ndecompdim - 1)
5276             {
5277                 n_d = homenr;
5278                 f_d = f + start;
5279             }
5280             else
5281             {
5282                 n_d = pme->atc[d+1].n;
5283                 f_d = pme->atc[d+1].f;
5284             }
5285             if (DOMAINDECOMP(cr))
5286             {
5287                 dd_pmeredist_f(pme, atc, n_d, f_d,
5288                                d == pme->ndecompdim-1 && pme->bPPnode);
5289             }
5290         }
5291
5292         wallcycle_stop(wcycle, ewcPME_REDISTXF);
5293     }
5294     where();
5295
5296     if (bCalcEnerVir)
5297     {
5298         if (flags & GMX_PME_DO_COULOMB)
5299         {
5300             if (!pme->bFEP_q)
5301             {
5302                 *energy_q = energy_AB[0];
5303                 m_add(vir_q, vir_AB[0], vir_q);
5304             }
5305             else
5306             {
5307                 *energy_q       = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
5308                 *dvdlambda_q   += energy_AB[1] - energy_AB[0];
5309                 for (i = 0; i < DIM; i++)
5310                 {
5311                     for (j = 0; j < DIM; j++)
5312                     {
5313                         vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
5314                             lambda_q*vir_AB[1][i][j];
5315                     }
5316                 }
5317             }
5318             if (debug)
5319             {
5320                 fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
5321             }
5322         }
5323         else
5324         {
5325             *energy_q = 0;
5326         }
5327
5328         if (flags & GMX_PME_DO_LJ)
5329         {
5330             if (!pme->bFEP_lj)
5331             {
5332                 *energy_lj = energy_AB[2];
5333                 m_add(vir_lj, vir_AB[2], vir_lj);
5334             }
5335             else
5336             {
5337                 *energy_lj     = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
5338                 *dvdlambda_lj += energy_AB[3] - energy_AB[2];
5339                 for (i = 0; i < DIM; i++)
5340                 {
5341                     for (j = 0; j < DIM; j++)
5342                     {
5343                         vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
5344                     }
5345                 }
5346             }
5347             if (debug)
5348             {
5349                 fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
5350             }
5351         }
5352         else
5353         {
5354             *energy_lj = 0;
5355         }
5356     }
5357     return 0;
5358 }