src/gromacs/mdlib/pme.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /* IMPORTANT FOR DEVELOPERS:
  38  *
  39  * Triclinic pme stuff isn't entirely trivial, and we've experienced
  40  * some bugs during development (many of them due to me). To avoid
  41  * this in the future, please check the following things if you make
  42  * changes in this file:
  43  *
  44  * 1. You should obtain identical (at least to the PME precision)
  45  *    energies, forces, and virial for
  46  *    a rectangular box and a triclinic one where the z (or y) axis is
  47  *    tilted a whole box side. For instance you could use these boxes:
  48  *
  49  *    rectangular       triclinic
  50  *     2  0  0           2  0  0
  51  *     0  2  0           0  2  0
  52  *     0  0  6           2  2  6
  53  *
  54  * 2. You should check the energy conservation in a triclinic box.
  55  *
  56  * It might seem an overkill, but better safe than sorry.
  57  * /Erik 001109
  58  */
  59
  60 #include "config.h"
  61
  62 #include <assert.h>
  63 #include <math.h>
  64 #include <stdio.h>
  65 #include <stdlib.h>
  66 #include <string.h>
  67
  68 #include "typedefs.h"
  69 #include "txtdump.h"
  70 #include "gromacs/math/vec.h"
  71 #include "gromacs/utility/smalloc.h"
  72 #include "coulomb.h"
  73 #include "gromacs/utility/fatalerror.h"
  74 #include "pme.h"
  75 #include "network.h"
  76 #include "gromacs/math/units.h"
  77 #include "nrnb.h"
  78 #include "macros.h"
  79
  80 #include "gromacs/legacyheaders/types/commrec.h"
  81 #include "gromacs/fft/parallel_3dfft.h"
  82 #include "gromacs/utility/futil.h"
  83 #include "gromacs/fileio/pdbio.h"
  84 #include "gromacs/math/gmxcomplex.h"
  85 #include "gromacs/timing/cyclecounter.h"
  86 #include "gromacs/timing/wallcycle.h"
  87 #include "gromacs/utility/gmxmpi.h"
  88 #include "gromacs/utility/gmxomp.h"
  89
  90 /* Include the SIMD macro file and then check for support */
  91 #include "gromacs/simd/simd.h"
  92 #include "gromacs/simd/simd_math.h"
  93 #ifdef GMX_SIMD_HAVE_REAL
  94 /* Turn on arbitrary width SIMD intrinsics for PME solve */
  95 #    define PME_SIMD_SOLVE
  96 #endif
  97
  98 #define PME_GRID_QA    0 /* Gridindex for A-state for Q */
  99 #define PME_GRID_C6A   2 /* Gridindex for A-state for LJ */
 100 #define DO_Q           2 /* Electrostatic grids have index q<2 */
 101 #define DO_Q_AND_LJ    4 /* non-LB LJ grids have index 2 <= q < 4 */
 102 #define DO_Q_AND_LJ_LB 9 /* With LB rules we need a total of 2+7 grids */
 103
 104 /* Pascal triangle coefficients scaled with (1/2)^6 for LJ-PME with LB-rules */
 105 const real lb_scale_factor[] = {
 106     1.0/64, 6.0/64, 15.0/64, 20.0/64,
 107     15.0/64, 6.0/64, 1.0/64
 108 };
 109
 110 /* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
 111 const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
 112
 113 /* Check if we have 4-wide SIMD macro support */
 114 #if (defined GMX_SIMD4_HAVE_REAL)
 115 /* Do PME spread and gather with 4-wide SIMD.
 116  * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
 117  */
 118 #    define PME_SIMD4_SPREAD_GATHER
 119
 120 #    if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
 121 /* With PME-order=4 on x86, unaligned load+store is slightly faster
 122  * than doubling all SIMD operations when using aligned load+store.
 123  */
 124 #        define PME_SIMD4_UNALIGNED
 125 #    endif
 126 #endif
 127
 128 #define DFT_TOL 1e-7
 129 /* #define PRT_FORCE */
 130 /* conditions for on the fly time-measurement */
 131 /* #define TAKETIME (step > 1 && timesteps < 10) */
 132 #define TAKETIME FALSE
 133
 134 /* #define PME_TIME_THREADS */
 135
 136 #ifdef GMX_DOUBLE
 137 #define mpi_type MPI_DOUBLE
 138 #else
 139 #define mpi_type MPI_FLOAT
 140 #endif
 141
 142 #ifdef PME_SIMD4_SPREAD_GATHER
 143 #    define SIMD4_ALIGNMENT  (GMX_SIMD4_WIDTH*sizeof(real))
 144 #else
 145 /* We can use any alignment, apart from 0, so we use 4 reals */
 146 #    define SIMD4_ALIGNMENT  (4*sizeof(real))
 147 #endif
 148
 149 /* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
 150  * to preserve alignment.
 151  */
 152 #define GMX_CACHE_SEP 64
 153
 154 /* We only define a maximum to be able to use local arrays without allocation.
 155  * An order larger than 12 should never be needed, even for test cases.
 156  * If needed it can be changed here.
 157  */
 158 #define PME_ORDER_MAX 12
 159
 160 /* Internal datastructures */
 161 typedef struct {
 162     int send_index0;
 163     int send_nindex;
 164     int recv_index0;
 165     int recv_nindex;
 166     int recv_size;   /* Receive buffer width, used with OpenMP */
 167 } pme_grid_comm_t;
 168
 169 typedef struct {
 170 #ifdef GMX_MPI
 171     MPI_Comm         mpi_comm;
 172 #endif
 173     int              nnodes, nodeid;
 174     int             *s2g0;
 175     int             *s2g1;
 176     int              noverlap_nodes;
 177     int             *send_id, *recv_id;
 178     int              send_size; /* Send buffer width, used with OpenMP */
 179     pme_grid_comm_t *comm_data;
 180     real            *sendbuf;
 181     real            *recvbuf;
 182 } pme_overlap_t;
 183
 184 typedef struct {
 185     int *n;      /* Cumulative counts of the number of particles per thread */
 186     int  nalloc; /* Allocation size of i */
 187     int *i;      /* Particle indices ordered on thread index (n) */
 188 } thread_plist_t;
 189
 190 typedef struct {
 191     int      *thread_one;
 192     int       n;
 193     int      *ind;
 194     splinevec theta;
 195     real     *ptr_theta_z;
 196     splinevec dtheta;
 197     real     *ptr_dtheta_z;
 198 } splinedata_t;
 199
 200 typedef struct {
 201     int      dimind;        /* The index of the dimension, 0=x, 1=y */
 202     int      nslab;
 203     int      nodeid;
 204 #ifdef GMX_MPI
 205     MPI_Comm mpi_comm;
 206 #endif
 207
 208     int     *node_dest;     /* The nodes to send x and q to with DD */
 209     int     *node_src;      /* The nodes to receive x and q from with DD */
 210     int     *buf_index;     /* Index for commnode into the buffers */
 211
 212     int      maxshift;
 213
 214     int      npd;
 215     int      pd_nalloc;
 216     int     *pd;
 217     int     *count;         /* The number of atoms to send to each node */
 218     int    **count_thread;
 219     int     *rcount;        /* The number of atoms to receive */
 220
 221     int      n;
 222     int      nalloc;
 223     rvec    *x;
 224     real    *coefficient;
 225     rvec    *f;
 226     gmx_bool bSpread;       /* These coordinates are used for spreading */
 227     int      pme_order;
 228     ivec    *idx;
 229     rvec    *fractx;            /* Fractional coordinate relative to
 230                                  * the lower cell boundary
 231                                  */
 232     int             nthread;
 233     int            *thread_idx; /* Which thread should spread which coefficient */
 234     thread_plist_t *thread_plist;
 235     splinedata_t   *spline;
 236 } pme_atomcomm_t;
 237
 238 #define FLBS  3
 239 #define FLBSZ 4
 240
 241 typedef struct {
 242     ivec  ci;     /* The spatial location of this grid         */
 243     ivec  n;      /* The used size of *grid, including order-1 */
 244     ivec  offset; /* The grid offset from the full node grid   */
 245     int   order;  /* PME spreading order                       */
 246     ivec  s;      /* The allocated size of *grid, s >= n       */
 247     real *grid;   /* The grid local thread, size n             */
 248 } pmegrid_t;
 249
 250 typedef struct {
 251     pmegrid_t  grid;         /* The full node grid (non thread-local)            */
 252     int        nthread;      /* The number of threads operating on this grid     */
 253     ivec       nc;           /* The local spatial decomposition over the threads */
 254     pmegrid_t *grid_th;      /* Array of grids for each thread                   */
 255     real      *grid_all;     /* Allocated array for the grids in *grid_th        */
 256     int      **g2t;          /* The grid to thread index                         */
 257     ivec       nthread_comm; /* The number of threads to communicate with        */
 258 } pmegrids_t;
 259
 260 typedef struct {
 261 #ifdef PME_SIMD4_SPREAD_GATHER
 262     /* Masks for 4-wide SIMD aligned spreading and gathering */
 263     gmx_simd4_bool_t mask_S0[6], mask_S1[6];
 264 #else
 265     int              dummy; /* C89 requires that struct has at least one member */
 266 #endif
 267 } pme_spline_work_t;
 268
 269 typedef struct {
 270     /* work data for solve_pme */
 271     int      nalloc;
 272     real *   mhx;
 273     real *   mhy;
 274     real *   mhz;
 275     real *   m2;
 276     real *   denom;
 277     real *   tmp1_alloc;
 278     real *   tmp1;
 279     real *   tmp2;
 280     real *   eterm;
 281     real *   m2inv;
 282
 283     real     energy_q;
 284     matrix   vir_q;
 285     real     energy_lj;
 286     matrix   vir_lj;
 287 } pme_work_t;
 288
 289 typedef struct gmx_pme {
 290     int           ndecompdim; /* The number of decomposition dimensions */
 291     int           nodeid;     /* Our nodeid in mpi->mpi_comm */
 292     int           nodeid_major;
 293     int           nodeid_minor;
 294     int           nnodes;    /* The number of nodes doing PME */
 295     int           nnodes_major;
 296     int           nnodes_minor;
 297
 298     MPI_Comm      mpi_comm;
 299     MPI_Comm      mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
 300 #ifdef GMX_MPI
 301     MPI_Datatype  rvec_mpi;      /* the pme vector's MPI type */
 302 #endif
 303
 304     gmx_bool   bUseThreads;   /* Does any of the PME ranks have nthread>1 ?  */
 305     int        nthread;       /* The number of threads doing PME on our rank */
 306
 307     gmx_bool   bPPnode;       /* Node also does particle-particle forces */
 308     gmx_bool   bFEP;          /* Compute Free energy contribution */
 309     gmx_bool   bFEP_q;
 310     gmx_bool   bFEP_lj;
 311     int        nkx, nky, nkz; /* Grid dimensions */
 312     gmx_bool   bP3M;          /* Do P3M: optimize the influence function */
 313     int        pme_order;
 314     real       epsilon_r;
 315
 316     int        ljpme_combination_rule;  /* Type of combination rule in LJ-PME */
 317
 318     int        ngrids;                  /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups*/
 319
 320     pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
 321                                          * includes overlap Grid indices are ordered as
 322                                          * follows:
 323                                          * 0: Coloumb PME, state A
 324                                          * 1: Coloumb PME, state B
 325                                          * 2-8: LJ-PME
 326                                          * This can probably be done in a better way
 327                                          * but this simple hack works for now
 328                                          */
 329     /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
 330     int        pmegrid_nx, pmegrid_ny, pmegrid_nz;
 331     /* pmegrid_nz might be larger than strictly necessary to ensure
 332      * memory alignment, pmegrid_nz_base gives the real base size.
 333      */
 334     int     pmegrid_nz_base;
 335     /* The local PME grid starting indices */
 336     int     pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
 337
 338     /* Work data for spreading and gathering */
 339     pme_spline_work_t     *spline_work;
 340
 341     real                 **fftgrid; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
 342     /* inside the interpolation grid, but separate for 2D PME decomp. */
 343     int                    fftgrid_nx, fftgrid_ny, fftgrid_nz;
 344
 345     t_complex            **cfftgrid;  /* Grids for complex FFT data */
 346
 347     int                    cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
 348
 349     gmx_parallel_3dfft_t  *pfft_setup;
 350
 351     int                   *nnx, *nny, *nnz;
 352     real                  *fshx, *fshy, *fshz;
 353
 354     pme_atomcomm_t         atc[2]; /* Indexed on decomposition index */
 355     matrix                 recipbox;
 356     splinevec              bsp_mod;
 357     /* Buffers to store data for local atoms for L-B combination rule
 358      * calculations in LJ-PME. lb_buf1 stores either the coefficients
 359      * for spreading/gathering (in serial), or the C6 coefficient for
 360      * local atoms (in parallel).  lb_buf2 is only used in parallel,
 361      * and stores the sigma values for local atoms. */
 362     real                 *lb_buf1, *lb_buf2;
 363     int                   lb_buf_nalloc; /* Allocation size for the above buffers. */
 364
 365     pme_overlap_t         overlap[2];    /* Indexed on dimension, 0=x, 1=y */
 366
 367     pme_atomcomm_t        atc_energy;    /* Only for gmx_pme_calc_energy */
 368
 369     rvec                 *bufv;          /* Communication buffer */
 370     real                 *bufr;          /* Communication buffer */
 371     int                   buf_nalloc;    /* The communication buffer size */
 372
 373     /* thread local work data for solve_pme */
 374     pme_work_t *work;
 375
 376     /* Work data for sum_qgrid */
 377     real *   sum_qgrid_tmp;
 378     real *   sum_qgrid_dd_tmp;
 379 } t_gmx_pme;
 380
 381 static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
 382                                    int start, int grid_index, int end, int thread)
 383 {
 384     int             i;
 385     int            *idxptr, tix, tiy, tiz;
 386     real           *xptr, *fptr, tx, ty, tz;
 387     real            rxx, ryx, ryy, rzx, rzy, rzz;
 388     int             nx, ny, nz;
 389     int             start_ix, start_iy, start_iz;
 390     int            *g2tx, *g2ty, *g2tz;
 391     gmx_bool        bThreads;
 392     int            *thread_idx = NULL;
 393     thread_plist_t *tpl        = NULL;
 394     int            *tpl_n      = NULL;
 395     int             thread_i;
 396
 397     nx  = pme->nkx;
 398     ny  = pme->nky;
 399     nz  = pme->nkz;
 400
 401     start_ix = pme->pmegrid_start_ix;
 402     start_iy = pme->pmegrid_start_iy;
 403     start_iz = pme->pmegrid_start_iz;
 404
 405     rxx = pme->recipbox[XX][XX];
 406     ryx = pme->recipbox[YY][XX];
 407     ryy = pme->recipbox[YY][YY];
 408     rzx = pme->recipbox[ZZ][XX];
 409     rzy = pme->recipbox[ZZ][YY];
 410     rzz = pme->recipbox[ZZ][ZZ];
 411
 412     g2tx = pme->pmegrid[grid_index].g2t[XX];
 413     g2ty = pme->pmegrid[grid_index].g2t[YY];
 414     g2tz = pme->pmegrid[grid_index].g2t[ZZ];
 415
 416     bThreads = (atc->nthread > 1);
 417     if (bThreads)
 418     {
 419         thread_idx = atc->thread_idx;
 420
 421         tpl   = &atc->thread_plist[thread];
 422         tpl_n = tpl->n;
 423         for (i = 0; i < atc->nthread; i++)
 424         {
 425             tpl_n[i] = 0;
 426         }
 427     }
 428
 429     for (i = start; i < end; i++)
 430     {
 431         xptr   = atc->x[i];
 432         idxptr = atc->idx[i];
 433         fptr   = atc->fractx[i];
 434
 435         /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
 436         tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
 437         ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
 438         tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
 439
 440         tix = (int)(tx);
 441         tiy = (int)(ty);
 442         tiz = (int)(tz);
 443
 444         /* Because decomposition only occurs in x and y,
 445          * we never have a fraction correction in z.
 446          */
 447         fptr[XX] = tx - tix + pme->fshx[tix];
 448         fptr[YY] = ty - tiy + pme->fshy[tiy];
 449         fptr[ZZ] = tz - tiz;
 450
 451         idxptr[XX] = pme->nnx[tix];
 452         idxptr[YY] = pme->nny[tiy];
 453         idxptr[ZZ] = pme->nnz[tiz];
 454
 455 #ifdef DEBUG
 456         range_check(idxptr[XX], 0, pme->pmegrid_nx);
 457         range_check(idxptr[YY], 0, pme->pmegrid_ny);
 458         range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
 459 #endif
 460
 461         if (bThreads)
 462         {
 463             thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
 464             thread_idx[i] = thread_i;
 465             tpl_n[thread_i]++;
 466         }
 467     }
 468
 469     if (bThreads)
 470     {
 471         /* Make a list of particle indices sorted on thread */
 472
 473         /* Get the cumulative count */
 474         for (i = 1; i < atc->nthread; i++)
 475         {
 476             tpl_n[i] += tpl_n[i-1];
 477         }
 478         /* The current implementation distributes particles equally
 479          * over the threads, so we could actually allocate for that
 480          * in pme_realloc_atomcomm_things.
 481          */
 482         if (tpl_n[atc->nthread-1] > tpl->nalloc)
 483         {
 484             tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
 485             srenew(tpl->i, tpl->nalloc);
 486         }
 487         /* Set tpl_n to the cumulative start */
 488         for (i = atc->nthread-1; i >= 1; i--)
 489         {
 490             tpl_n[i] = tpl_n[i-1];
 491         }
 492         tpl_n[0] = 0;
 493
 494         /* Fill our thread local array with indices sorted on thread */
 495         for (i = start; i < end; i++)
 496         {
 497             tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
 498         }
 499         /* Now tpl_n contains the cummulative count again */
 500     }
 501 }
 502
 503 static void make_thread_local_ind(pme_atomcomm_t *atc,
 504                                   int thread, splinedata_t *spline)
 505 {
 506     int             n, t, i, start, end;
 507     thread_plist_t *tpl;
 508
 509     /* Combine the indices made by each thread into one index */
 510
 511     n     = 0;
 512     start = 0;
 513     for (t = 0; t < atc->nthread; t++)
 514     {
 515         tpl = &atc->thread_plist[t];
 516         /* Copy our part (start - end) from the list of thread t */
 517         if (thread > 0)
 518         {
 519             start = tpl->n[thread-1];
 520         }
 521         end = tpl->n[thread];
 522         for (i = start; i < end; i++)
 523         {
 524             spline->ind[n++] = tpl->i[i];
 525         }
 526     }
 527
 528     spline->n = n;
 529 }
 530
 531
 532 static void pme_calc_pidx(int start, int end,
 533                           matrix recipbox, rvec x[],
 534                           pme_atomcomm_t *atc, int *count)
 535 {
 536     int   nslab, i;
 537     int   si;
 538     real *xptr, s;
 539     real  rxx, ryx, rzx, ryy, rzy;
 540     int  *pd;
 541
 542     /* Calculate PME task index (pidx) for each grid index.
 543      * Here we always assign equally sized slabs to each node
 544      * for load balancing reasons (the PME grid spacing is not used).
 545      */
 546
 547     nslab = atc->nslab;
 548     pd    = atc->pd;
 549
 550     /* Reset the count */
 551     for (i = 0; i < nslab; i++)
 552     {
 553         count[i] = 0;
 554     }
 555
 556     if (atc->dimind == 0)
 557     {
 558         rxx = recipbox[XX][XX];
 559         ryx = recipbox[YY][XX];
 560         rzx = recipbox[ZZ][XX];
 561         /* Calculate the node index in x-dimension */
 562         for (i = start; i < end; i++)
 563         {
 564             xptr   = x[i];
 565             /* Fractional coordinates along box vectors */
 566             s     = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
 567             si    = (int)(s + 2*nslab) % nslab;
 568             pd[i] = si;
 569             count[si]++;
 570         }
 571     }
 572     else
 573     {
 574         ryy = recipbox[YY][YY];
 575         rzy = recipbox[ZZ][YY];
 576         /* Calculate the node index in y-dimension */
 577         for (i = start; i < end; i++)
 578         {
 579             xptr   = x[i];
 580             /* Fractional coordinates along box vectors */
 581             s     = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
 582             si    = (int)(s + 2*nslab) % nslab;
 583             pd[i] = si;
 584             count[si]++;
 585         }
 586     }
 587 }
 588
 589 static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
 590                                   pme_atomcomm_t *atc)
 591 {
 592     int nthread, thread, slab;
 593
 594     nthread = atc->nthread;
 595
 596 #pragma omp parallel for num_threads(nthread) schedule(static)
 597     for (thread = 0; thread < nthread; thread++)
 598     {
 599         pme_calc_pidx(natoms* thread   /nthread,
 600                       natoms*(thread+1)/nthread,
 601                       recipbox, x, atc, atc->count_thread[thread]);
 602     }
 603     /* Non-parallel reduction, since nslab is small */
 604
 605     for (thread = 1; thread < nthread; thread++)
 606     {
 607         for (slab = 0; slab < atc->nslab; slab++)
 608         {
 609             atc->count_thread[0][slab] += atc->count_thread[thread][slab];
 610         }
 611     }
 612 }
 613
 614 static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
 615 {
 616     const int padding = 4;
 617     int       i;
 618
 619     srenew(th[XX], nalloc);
 620     srenew(th[YY], nalloc);
 621     /* In z we add padding, this is only required for the aligned SIMD code */
 622     sfree_aligned(*ptr_z);
 623     snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT);
 624     th[ZZ] = *ptr_z + padding;
 625
 626     for (i = 0; i < padding; i++)
 627     {
 628         (*ptr_z)[               i] = 0;
 629         (*ptr_z)[padding+nalloc+i] = 0;
 630     }
 631 }
 632
 633 static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
 634 {
 635     int i, d;
 636
 637     srenew(spline->ind, atc->nalloc);
 638     /* Initialize the index to identity so it works without threads */
 639     for (i = 0; i < atc->nalloc; i++)
 640     {
 641         spline->ind[i] = i;
 642     }
 643
 644     realloc_splinevec(spline->theta, &spline->ptr_theta_z,
 645                       atc->pme_order*atc->nalloc);
 646     realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
 647                       atc->pme_order*atc->nalloc);
 648 }
 649
 650 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
 651 {
 652     int nalloc_old, i, j, nalloc_tpl;
 653
 654     /* We have to avoid a NULL pointer for atc->x to avoid
 655      * possible fatal errors in MPI routines.
 656      */
 657     if (atc->n > atc->nalloc || atc->nalloc == 0)
 658     {
 659         nalloc_old  = atc->nalloc;
 660         atc->nalloc = over_alloc_dd(max(atc->n, 1));
 661
 662         if (atc->nslab > 1)
 663         {
 664             srenew(atc->x, atc->nalloc);
 665             srenew(atc->coefficient, atc->nalloc);
 666             srenew(atc->f, atc->nalloc);
 667             for (i = nalloc_old; i < atc->nalloc; i++)
 668             {
 669                 clear_rvec(atc->f[i]);
 670             }
 671         }
 672         if (atc->bSpread)
 673         {
 674             srenew(atc->fractx, atc->nalloc);
 675             srenew(atc->idx, atc->nalloc);
 676
 677             if (atc->nthread > 1)
 678             {
 679                 srenew(atc->thread_idx, atc->nalloc);
 680             }
 681
 682             for (i = 0; i < atc->nthread; i++)
 683             {
 684                 pme_realloc_splinedata(&atc->spline[i], atc);
 685             }
 686         }
 687     }
 688 }
 689
 690 static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused *atc,
 691                             gmx_bool gmx_unused bBackward, int gmx_unused shift,
 692                             void gmx_unused *buf_s, int gmx_unused nbyte_s,
 693                             void gmx_unused *buf_r, int gmx_unused nbyte_r)
 694 {
 695 #ifdef GMX_MPI
 696     int        dest, src;
 697     MPI_Status stat;
 698
 699     if (bBackward == FALSE)
 700     {
 701         dest = atc->node_dest[shift];
 702         src  = atc->node_src[shift];
 703     }
 704     else
 705     {
 706         dest = atc->node_src[shift];
 707         src  = atc->node_dest[shift];
 708     }
 709
 710     if (nbyte_s > 0 && nbyte_r > 0)
 711     {
 712         MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
 713                      dest, shift,
 714                      buf_r, nbyte_r, MPI_BYTE,
 715                      src, shift,
 716                      atc->mpi_comm, &stat);
 717     }
 718     else if (nbyte_s > 0)
 719     {
 720         MPI_Send(buf_s, nbyte_s, MPI_BYTE,
 721                  dest, shift,
 722                  atc->mpi_comm);
 723     }
 724     else if (nbyte_r > 0)
 725     {
 726         MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
 727                  src, shift,
 728                  atc->mpi_comm, &stat);
 729     }
 730 #endif
 731 }
 732
 733 static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
 734                                     int n, gmx_bool bX, rvec *x, real *data,
 735                                     pme_atomcomm_t *atc)
 736 {
 737     int *commnode, *buf_index;
 738     int  nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
 739
 740     commnode  = atc->node_dest;
 741     buf_index = atc->buf_index;
 742
 743     nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
 744
 745     nsend = 0;
 746     for (i = 0; i < nnodes_comm; i++)
 747     {
 748         buf_index[commnode[i]] = nsend;
 749         nsend                 += atc->count[commnode[i]];
 750     }
 751     if (bX)
 752     {
 753         if (atc->count[atc->nodeid] + nsend != n)
 754         {
 755             gmx_fatal(FARGS, "%d particles communicated to PME rank %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
 756                       "This usually means that your system is not well equilibrated.",
 757                       n - (atc->count[atc->nodeid] + nsend),
 758                       pme->nodeid, 'x'+atc->dimind);
 759         }
 760
 761         if (nsend > pme->buf_nalloc)
 762         {
 763             pme->buf_nalloc = over_alloc_dd(nsend);
 764             srenew(pme->bufv, pme->buf_nalloc);
 765             srenew(pme->bufr, pme->buf_nalloc);
 766         }
 767
 768         atc->n = atc->count[atc->nodeid];
 769         for (i = 0; i < nnodes_comm; i++)
 770         {
 771             scount = atc->count[commnode[i]];
 772             /* Communicate the count */
 773             if (debug)
 774             {
 775                 fprintf(debug, "dimind %d PME rank %d send to rank %d: %d\n",
 776                         atc->dimind, atc->nodeid, commnode[i], scount);
 777             }
 778             pme_dd_sendrecv(atc, FALSE, i,
 779                             &scount, sizeof(int),
 780                             &atc->rcount[i], sizeof(int));
 781             atc->n += atc->rcount[i];
 782         }
 783
 784         pme_realloc_atomcomm_things(atc);
 785     }
 786
 787     local_pos = 0;
 788     for (i = 0; i < n; i++)
 789     {
 790         node = atc->pd[i];
 791         if (node == atc->nodeid)
 792         {
 793             /* Copy direct to the receive buffer */
 794             if (bX)
 795             {
 796                 copy_rvec(x[i], atc->x[local_pos]);
 797             }
 798             atc->coefficient[local_pos] = data[i];
 799             local_pos++;
 800         }
 801         else
 802         {
 803             /* Copy to the send buffer */
 804             if (bX)
 805             {
 806                 copy_rvec(x[i], pme->bufv[buf_index[node]]);
 807             }
 808             pme->bufr[buf_index[node]] = data[i];
 809             buf_index[node]++;
 810         }
 811     }
 812
 813     buf_pos = 0;
 814     for (i = 0; i < nnodes_comm; i++)
 815     {
 816         scount = atc->count[commnode[i]];
 817         rcount = atc->rcount[i];
 818         if (scount > 0 || rcount > 0)
 819         {
 820             if (bX)
 821             {
 822                 /* Communicate the coordinates */
 823                 pme_dd_sendrecv(atc, FALSE, i,
 824                                 pme->bufv[buf_pos], scount*sizeof(rvec),
 825                                 atc->x[local_pos], rcount*sizeof(rvec));
 826             }
 827             /* Communicate the coefficients */
 828             pme_dd_sendrecv(atc, FALSE, i,
 829                             pme->bufr+buf_pos, scount*sizeof(real),
 830                             atc->coefficient+local_pos, rcount*sizeof(real));
 831             buf_pos   += scount;
 832             local_pos += atc->rcount[i];
 833         }
 834     }
 835 }
 836
 837 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
 838                            int n, rvec *f,
 839                            gmx_bool bAddF)
 840 {
 841     int *commnode, *buf_index;
 842     int  nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
 843
 844     commnode  = atc->node_dest;
 845     buf_index = atc->buf_index;
 846
 847     nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
 848
 849     local_pos = atc->count[atc->nodeid];
 850     buf_pos   = 0;
 851     for (i = 0; i < nnodes_comm; i++)
 852     {
 853         scount = atc->rcount[i];
 854         rcount = atc->count[commnode[i]];
 855         if (scount > 0 || rcount > 0)
 856         {
 857             /* Communicate the forces */
 858             pme_dd_sendrecv(atc, TRUE, i,
 859                             atc->f[local_pos], scount*sizeof(rvec),
 860                             pme->bufv[buf_pos], rcount*sizeof(rvec));
 861             local_pos += scount;
 862         }
 863         buf_index[commnode[i]] = buf_pos;
 864         buf_pos               += rcount;
 865     }
 866
 867     local_pos = 0;
 868     if (bAddF)
 869     {
 870         for (i = 0; i < n; i++)
 871         {
 872             node = atc->pd[i];
 873             if (node == atc->nodeid)
 874             {
 875                 /* Add from the local force array */
 876                 rvec_inc(f[i], atc->f[local_pos]);
 877                 local_pos++;
 878             }
 879             else
 880             {
 881                 /* Add from the receive buffer */
 882                 rvec_inc(f[i], pme->bufv[buf_index[node]]);
 883                 buf_index[node]++;
 884             }
 885         }
 886     }
 887     else
 888     {
 889         for (i = 0; i < n; i++)
 890         {
 891             node = atc->pd[i];
 892             if (node == atc->nodeid)
 893             {
 894                 /* Copy from the local force array */
 895                 copy_rvec(atc->f[local_pos], f[i]);
 896                 local_pos++;
 897             }
 898             else
 899             {
 900                 /* Copy from the receive buffer */
 901                 copy_rvec(pme->bufv[buf_index[node]], f[i]);
 902                 buf_index[node]++;
 903             }
 904         }
 905     }
 906 }
 907
 908 #ifdef GMX_MPI
 909 static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 910 {
 911     pme_overlap_t *overlap;
 912     int            send_index0, send_nindex;
 913     int            recv_index0, recv_nindex;
 914     MPI_Status     stat;
 915     int            i, j, k, ix, iy, iz, icnt;
 916     int            ipulse, send_id, recv_id, datasize;
 917     real          *p;
 918     real          *sendptr, *recvptr;
 919
 920     /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
 921     overlap = &pme->overlap[1];
 922
 923     for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
 924     {
 925         /* Since we have already (un)wrapped the overlap in the z-dimension,
 926          * we only have to communicate 0 to nkz (not pmegrid_nz).
 927          */
 928         if (direction == GMX_SUM_GRID_FORWARD)
 929         {
 930             send_id       = overlap->send_id[ipulse];
 931             recv_id       = overlap->recv_id[ipulse];
 932             send_index0   = overlap->comm_data[ipulse].send_index0;
 933             send_nindex   = overlap->comm_data[ipulse].send_nindex;
 934             recv_index0   = overlap->comm_data[ipulse].recv_index0;
 935             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 936         }
 937         else
 938         {
 939             send_id       = overlap->recv_id[ipulse];
 940             recv_id       = overlap->send_id[ipulse];
 941             send_index0   = overlap->comm_data[ipulse].recv_index0;
 942             send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 943             recv_index0   = overlap->comm_data[ipulse].send_index0;
 944             recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 945         }
 946
 947         /* Copy data to contiguous send buffer */
 948         if (debug)
 949         {
 950             fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
 951                     pme->nodeid, overlap->nodeid, send_id,
 952                     pme->pmegrid_start_iy,
 953                     send_index0-pme->pmegrid_start_iy,
 954                     send_index0-pme->pmegrid_start_iy+send_nindex);
 955         }
 956         icnt = 0;
 957         for (i = 0; i < pme->pmegrid_nx; i++)
 958         {
 959             ix = i;
 960             for (j = 0; j < send_nindex; j++)
 961             {
 962                 iy = j + send_index0 - pme->pmegrid_start_iy;
 963                 for (k = 0; k < pme->nkz; k++)
 964                 {
 965                     iz = k;
 966                     overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
 967                 }
 968             }
 969         }
 970
 971         datasize      = pme->pmegrid_nx * pme->nkz;
 972
 973         MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
 974                      send_id, ipulse,
 975                      overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
 976                      recv_id, ipulse,
 977                      overlap->mpi_comm, &stat);
 978
 979         /* Get data from contiguous recv buffer */
 980         if (debug)
 981         {
 982             fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
 983                     pme->nodeid, overlap->nodeid, recv_id,
 984                     pme->pmegrid_start_iy,
 985                     recv_index0-pme->pmegrid_start_iy,
 986                     recv_index0-pme->pmegrid_start_iy+recv_nindex);
 987         }
 988         icnt = 0;
 989         for (i = 0; i < pme->pmegrid_nx; i++)
 990         {
 991             ix = i;
 992             for (j = 0; j < recv_nindex; j++)
 993             {
 994                 iy = j + recv_index0 - pme->pmegrid_start_iy;
 995                 for (k = 0; k < pme->nkz; k++)
 996                 {
 997                     iz = k;
 998                     if (direction == GMX_SUM_GRID_FORWARD)
 999                     {
1000                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
1001                     }
1002                     else
1003                     {
1004                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
1005                     }
1006                 }
1007             }
1008         }
1009     }
1010
1011     /* Major dimension is easier, no copying required,
1012      * but we might have to sum to separate array.
1013      * Since we don't copy, we have to communicate up to pmegrid_nz,
1014      * not nkz as for the minor direction.
1015      */
1016     overlap = &pme->overlap[0];
1017
1018     for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
1019     {
1020         if (direction == GMX_SUM_GRID_FORWARD)
1021         {
1022             send_id       = overlap->send_id[ipulse];
1023             recv_id       = overlap->recv_id[ipulse];
1024             send_index0   = overlap->comm_data[ipulse].send_index0;
1025             send_nindex   = overlap->comm_data[ipulse].send_nindex;
1026             recv_index0   = overlap->comm_data[ipulse].recv_index0;
1027             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
1028             recvptr       = overlap->recvbuf;
1029         }
1030         else
1031         {
1032             send_id       = overlap->recv_id[ipulse];
1033             recv_id       = overlap->send_id[ipulse];
1034             send_index0   = overlap->comm_data[ipulse].recv_index0;
1035             send_nindex   = overlap->comm_data[ipulse].recv_nindex;
1036             recv_index0   = overlap->comm_data[ipulse].send_index0;
1037             recv_nindex   = overlap->comm_data[ipulse].send_nindex;
1038             recvptr       = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1039         }
1040
1041         sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1042         datasize      = pme->pmegrid_ny * pme->pmegrid_nz;
1043
1044         if (debug)
1045         {
1046             fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
1047                     pme->nodeid, overlap->nodeid, send_id,
1048                     pme->pmegrid_start_ix,
1049                     send_index0-pme->pmegrid_start_ix,
1050                     send_index0-pme->pmegrid_start_ix+send_nindex);
1051             fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
1052                     pme->nodeid, overlap->nodeid, recv_id,
1053                     pme->pmegrid_start_ix,
1054                     recv_index0-pme->pmegrid_start_ix,
1055                     recv_index0-pme->pmegrid_start_ix+recv_nindex);
1056         }
1057
1058         MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
1059                      send_id, ipulse,
1060                      recvptr, recv_nindex*datasize, GMX_MPI_REAL,
1061                      recv_id, ipulse,
1062                      overlap->mpi_comm, &stat);
1063
1064         /* ADD data from contiguous recv buffer */
1065         if (direction == GMX_SUM_GRID_FORWARD)
1066         {
1067             p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1068             for (i = 0; i < recv_nindex*datasize; i++)
1069             {
1070                 p[i] += overlap->recvbuf[i];
1071             }
1072         }
1073     }
1074 }
1075 #endif
1076
1077
1078 static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid, int grid_index)
1079 {
1080     ivec    local_fft_ndata, local_fft_offset, local_fft_size;
1081     ivec    local_pme_size;
1082     int     i, ix, iy, iz;
1083     int     pmeidx, fftidx;
1084
1085     /* Dimensions should be identical for A/B grid, so we just use A here */
1086     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
1087                                    local_fft_ndata,
1088                                    local_fft_offset,
1089                                    local_fft_size);
1090
1091     local_pme_size[0] = pme->pmegrid_nx;
1092     local_pme_size[1] = pme->pmegrid_ny;
1093     local_pme_size[2] = pme->pmegrid_nz;
1094
1095     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1096        the offset is identical, and the PME grid always has more data (due to overlap)
1097      */
1098     {
1099 #ifdef DEBUG_PME
1100         FILE *fp, *fp2;
1101         char  fn[STRLEN];
1102         real  val;
1103         sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
1104         fp = gmx_ffopen(fn, "w");
1105         sprintf(fn, "pmegrid%d.txt", pme->nodeid);
1106         fp2 = gmx_ffopen(fn, "w");
1107 #endif
1108
1109         for (ix = 0; ix < local_fft_ndata[XX]; ix++)
1110         {
1111             for (iy = 0; iy < local_fft_ndata[YY]; iy++)
1112             {
1113                 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1114                 {
1115                     pmeidx          = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
1116                     fftidx          = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
1117                     fftgrid[fftidx] = pmegrid[pmeidx];
1118 #ifdef DEBUG_PME
1119                     val = 100*pmegrid[pmeidx];
1120                     if (pmegrid[pmeidx] != 0)
1121                     {
1122                         gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
1123                                                  5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
1124                     }
1125                     if (pmegrid[pmeidx] != 0)
1126                     {
1127                         fprintf(fp2, "%-12s  %5d  %5d  %5d  %12.5e\n",
1128                                 "qgrid",
1129                                 pme->pmegrid_start_ix + ix,
1130                                 pme->pmegrid_start_iy + iy,
1131                                 pme->pmegrid_start_iz + iz,
1132                                 pmegrid[pmeidx]);
1133                     }
1134 #endif
1135                 }
1136             }
1137         }
1138 #ifdef DEBUG_PME
1139         gmx_ffclose(fp);
1140         gmx_ffclose(fp2);
1141 #endif
1142     }
1143     return 0;
1144 }
1145
1146
1147 static gmx_cycles_t omp_cyc_start()
1148 {
1149     return gmx_cycles_read();
1150 }
1151
1152 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1153 {
1154     return gmx_cycles_read() - c;
1155 }
1156
1157
1158 static int copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid, int grid_index,
1159                                    int nthread, int thread)
1160 {
1161     ivec          local_fft_ndata, local_fft_offset, local_fft_size;
1162     ivec          local_pme_size;
1163     int           ixy0, ixy1, ixy, ix, iy, iz;
1164     int           pmeidx, fftidx;
1165 #ifdef PME_TIME_THREADS
1166     gmx_cycles_t  c1;
1167     static double cs1 = 0;
1168     static int    cnt = 0;
1169 #endif
1170
1171 #ifdef PME_TIME_THREADS
1172     c1 = omp_cyc_start();
1173 #endif
1174     /* Dimensions should be identical for A/B grid, so we just use A here */
1175     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
1176                                    local_fft_ndata,
1177                                    local_fft_offset,
1178                                    local_fft_size);
1179
1180     local_pme_size[0] = pme->pmegrid_nx;
1181     local_pme_size[1] = pme->pmegrid_ny;
1182     local_pme_size[2] = pme->pmegrid_nz;
1183
1184     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1185        the offset is identical, and the PME grid always has more data (due to overlap)
1186      */
1187     ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1188     ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1189
1190     for (ixy = ixy0; ixy < ixy1; ixy++)
1191     {
1192         ix = ixy/local_fft_ndata[YY];
1193         iy = ixy - ix*local_fft_ndata[YY];
1194
1195         pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
1196         fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
1197         for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1198         {
1199             pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1200         }
1201     }
1202
1203 #ifdef PME_TIME_THREADS
1204     c1   = omp_cyc_end(c1);
1205     cs1 += (double)c1;
1206     cnt++;
1207     if (cnt % 20 == 0)
1208     {
1209         printf("copy %.2f\n", cs1*1e-9);
1210     }
1211 #endif
1212
1213     return 0;
1214 }
1215
1216
1217 static void wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1218 {
1219     int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
1220
1221     nx = pme->nkx;
1222     ny = pme->nky;
1223     nz = pme->nkz;
1224
1225     pnx = pme->pmegrid_nx;
1226     pny = pme->pmegrid_ny;
1227     pnz = pme->pmegrid_nz;
1228
1229     overlap = pme->pme_order - 1;
1230
1231     /* Add periodic overlap in z */
1232     for (ix = 0; ix < pme->pmegrid_nx; ix++)
1233     {
1234         for (iy = 0; iy < pme->pmegrid_ny; iy++)
1235         {
1236             for (iz = 0; iz < overlap; iz++)
1237             {
1238                 pmegrid[(ix*pny+iy)*pnz+iz] +=
1239                     pmegrid[(ix*pny+iy)*pnz+nz+iz];
1240             }
1241         }
1242     }
1243
1244     if (pme->nnodes_minor == 1)
1245     {
1246         for (ix = 0; ix < pme->pmegrid_nx; ix++)
1247         {
1248             for (iy = 0; iy < overlap; iy++)
1249             {
1250                 for (iz = 0; iz < nz; iz++)
1251                 {
1252                     pmegrid[(ix*pny+iy)*pnz+iz] +=
1253                         pmegrid[(ix*pny+ny+iy)*pnz+iz];
1254                 }
1255             }
1256         }
1257     }
1258
1259     if (pme->nnodes_major == 1)
1260     {
1261         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1262
1263         for (ix = 0; ix < overlap; ix++)
1264         {
1265             for (iy = 0; iy < ny_x; iy++)
1266             {
1267                 for (iz = 0; iz < nz; iz++)
1268                 {
1269                     pmegrid[(ix*pny+iy)*pnz+iz] +=
1270                         pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1271                 }
1272             }
1273         }
1274     }
1275 }
1276
1277
1278 static void unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1279 {
1280     int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
1281
1282     nx = pme->nkx;
1283     ny = pme->nky;
1284     nz = pme->nkz;
1285
1286     pnx = pme->pmegrid_nx;
1287     pny = pme->pmegrid_ny;
1288     pnz = pme->pmegrid_nz;
1289
1290     overlap = pme->pme_order - 1;
1291
1292     if (pme->nnodes_major == 1)
1293     {
1294         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1295
1296         for (ix = 0; ix < overlap; ix++)
1297         {
1298             int iy, iz;
1299
1300             for (iy = 0; iy < ny_x; iy++)
1301             {
1302                 for (iz = 0; iz < nz; iz++)
1303                 {
1304                     pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1305                         pmegrid[(ix*pny+iy)*pnz+iz];
1306                 }
1307             }
1308         }
1309     }
1310
1311     if (pme->nnodes_minor == 1)
1312     {
1313 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1314         for (ix = 0; ix < pme->pmegrid_nx; ix++)
1315         {
1316             int iy, iz;
1317
1318             for (iy = 0; iy < overlap; iy++)
1319             {
1320                 for (iz = 0; iz < nz; iz++)
1321                 {
1322                     pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1323                         pmegrid[(ix*pny+iy)*pnz+iz];
1324                 }
1325             }
1326         }
1327     }
1328
1329     /* Copy periodic overlap in z */
1330 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1331     for (ix = 0; ix < pme->pmegrid_nx; ix++)
1332     {
1333         int iy, iz;
1334
1335         for (iy = 0; iy < pme->pmegrid_ny; iy++)
1336         {
1337             for (iz = 0; iz < overlap; iz++)
1338             {
1339                 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1340                     pmegrid[(ix*pny+iy)*pnz+iz];
1341             }
1342         }
1343     }
1344 }
1345
1346
1347 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1348 #define DO_BSPLINE(order)                            \
1349     for (ithx = 0; (ithx < order); ithx++)                    \
1350     {                                                    \
1351         index_x = (i0+ithx)*pny*pnz;                     \
1352         valx    = coefficient*thx[ithx];                          \
1353                                                      \
1354         for (ithy = 0; (ithy < order); ithy++)                \
1355         {                                                \
1356             valxy    = valx*thy[ithy];                   \
1357             index_xy = index_x+(j0+ithy)*pnz;            \
1358                                                      \
1359             for (ithz = 0; (ithz < order); ithz++)            \
1360             {                                            \
1361                 index_xyz        = index_xy+(k0+ithz);   \
1362                 grid[index_xyz] += valxy*thz[ithz];      \
1363             }                                            \
1364         }                                                \
1365     }
1366
1367
1368 static void spread_coefficients_bsplines_thread(pmegrid_t                    *pmegrid,
1369                                                 pme_atomcomm_t               *atc,
1370                                                 splinedata_t                 *spline,
1371                                                 pme_spline_work_t gmx_unused *work)
1372 {
1373
1374     /* spread coefficients from home atoms to local grid */
1375     real          *grid;
1376     pme_overlap_t *ol;
1377     int            b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
1378     int       *    idxptr;
1379     int            order, norder, index_x, index_xy, index_xyz;
1380     real           valx, valxy, coefficient;
1381     real          *thx, *thy, *thz;
1382     int            localsize, bndsize;
1383     int            pnx, pny, pnz, ndatatot;
1384     int            offx, offy, offz;
1385
1386 #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
1387     real           thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
1388
1389     thz_aligned = gmx_simd4_align_r(thz_buffer);
1390 #endif
1391
1392     pnx = pmegrid->s[XX];
1393     pny = pmegrid->s[YY];
1394     pnz = pmegrid->s[ZZ];
1395
1396     offx = pmegrid->offset[XX];
1397     offy = pmegrid->offset[YY];
1398     offz = pmegrid->offset[ZZ];
1399
1400     ndatatot = pnx*pny*pnz;
1401     grid     = pmegrid->grid;
1402     for (i = 0; i < ndatatot; i++)
1403     {
1404         grid[i] = 0;
1405     }
1406
1407     order = pmegrid->order;
1408
1409     for (nn = 0; nn < spline->n; nn++)
1410     {
1411         n           = spline->ind[nn];
1412         coefficient = atc->coefficient[n];
1413
1414         if (coefficient != 0)
1415         {
1416             idxptr = atc->idx[n];
1417             norder = nn*order;
1418
1419             i0   = idxptr[XX] - offx;
1420             j0   = idxptr[YY] - offy;
1421             k0   = idxptr[ZZ] - offz;
1422
1423             thx = spline->theta[XX] + norder;
1424             thy = spline->theta[YY] + norder;
1425             thz = spline->theta[ZZ] + norder;
1426
1427             switch (order)
1428             {
1429                 case 4:
1430 #ifdef PME_SIMD4_SPREAD_GATHER
1431 #ifdef PME_SIMD4_UNALIGNED
1432 #define PME_SPREAD_SIMD4_ORDER4
1433 #else
1434 #define PME_SPREAD_SIMD4_ALIGNED
1435 #define PME_ORDER 4
1436 #endif
1437 #include "pme_simd4.h"
1438 #else
1439                     DO_BSPLINE(4);
1440 #endif
1441                     break;
1442                 case 5:
1443 #ifdef PME_SIMD4_SPREAD_GATHER
1444 #define PME_SPREAD_SIMD4_ALIGNED
1445 #define PME_ORDER 5
1446 #include "pme_simd4.h"
1447 #else
1448                     DO_BSPLINE(5);
1449 #endif
1450                     break;
1451                 default:
1452                     DO_BSPLINE(order);
1453                     break;
1454             }
1455         }
1456     }
1457 }
1458
1459 static void set_grid_alignment(int gmx_unused *pmegrid_nz, int gmx_unused pme_order)
1460 {
1461 #ifdef PME_SIMD4_SPREAD_GATHER
1462     if (pme_order == 5
1463 #ifndef PME_SIMD4_UNALIGNED
1464         || pme_order == 4
1465 #endif
1466         )
1467     {
1468         /* Round nz up to a multiple of 4 to ensure alignment */
1469         *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
1470     }
1471 #endif
1472 }
1473
1474 static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
1475 {
1476 #ifdef PME_SIMD4_SPREAD_GATHER
1477 #ifndef PME_SIMD4_UNALIGNED
1478     if (pme_order == 4)
1479     {
1480         /* Add extra elements to ensured aligned operations do not go
1481          * beyond the allocated grid size.
1482          * Note that for pme_order=5, the pme grid z-size alignment
1483          * ensures that we will not go beyond the grid size.
1484          */
1485         *gridsize += 4;
1486     }
1487 #endif
1488 #endif
1489 }
1490
1491 static void pmegrid_init(pmegrid_t *grid,
1492                          int cx, int cy, int cz,
1493                          int x0, int y0, int z0,
1494                          int x1, int y1, int z1,
1495                          gmx_bool set_alignment,
1496                          int pme_order,
1497                          real *ptr)
1498 {
1499     int nz, gridsize;
1500
1501     grid->ci[XX]     = cx;
1502     grid->ci[YY]     = cy;
1503     grid->ci[ZZ]     = cz;
1504     grid->offset[XX] = x0;
1505     grid->offset[YY] = y0;
1506     grid->offset[ZZ] = z0;
1507     grid->n[XX]      = x1 - x0 + pme_order - 1;
1508     grid->n[YY]      = y1 - y0 + pme_order - 1;
1509     grid->n[ZZ]      = z1 - z0 + pme_order - 1;
1510     copy_ivec(grid->n, grid->s);
1511
1512     nz = grid->s[ZZ];
1513     set_grid_alignment(&nz, pme_order);
1514     if (set_alignment)
1515     {
1516         grid->s[ZZ] = nz;
1517     }
1518     else if (nz != grid->s[ZZ])
1519     {
1520         gmx_incons("pmegrid_init call with an unaligned z size");
1521     }
1522
1523     grid->order = pme_order;
1524     if (ptr == NULL)
1525     {
1526         gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
1527         set_gridsize_alignment(&gridsize, pme_order);
1528         snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
1529     }
1530     else
1531     {
1532         grid->grid = ptr;
1533     }
1534 }
1535
1536 static int div_round_up(int enumerator, int denominator)
1537 {
1538     return (enumerator + denominator - 1)/denominator;
1539 }
1540
1541 static void make_subgrid_division(const ivec n, int ovl, int nthread,
1542                                   ivec nsub)
1543 {
1544     int gsize_opt, gsize;
1545     int nsx, nsy, nsz;
1546     char *env;
1547
1548     gsize_opt = -1;
1549     for (nsx = 1; nsx <= nthread; nsx++)
1550     {
1551         if (nthread % nsx == 0)
1552         {
1553             for (nsy = 1; nsy <= nthread; nsy++)
1554             {
1555                 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1556                 {
1557                     nsz = nthread/(nsx*nsy);
1558
1559                     /* Determine the number of grid points per thread */
1560                     gsize =
1561                         (div_round_up(n[XX], nsx) + ovl)*
1562                         (div_round_up(n[YY], nsy) + ovl)*
1563                         (div_round_up(n[ZZ], nsz) + ovl);
1564
1565                     /* Minimize the number of grids points per thread
1566                      * and, secondarily, the number of cuts in minor dimensions.
1567                      */
1568                     if (gsize_opt == -1 ||
1569                         gsize < gsize_opt ||
1570                         (gsize == gsize_opt &&
1571                          (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
1572                     {
1573                         nsub[XX]  = nsx;
1574                         nsub[YY]  = nsy;
1575                         nsub[ZZ]  = nsz;
1576                         gsize_opt = gsize;
1577                     }
1578                 }
1579             }
1580         }
1581     }
1582
1583     env = getenv("GMX_PME_THREAD_DIVISION");
1584     if (env != NULL)
1585     {
1586         sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
1587     }
1588
1589     if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
1590     {
1591         gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
1592     }
1593 }
1594
1595 static void pmegrids_init(pmegrids_t *grids,
1596                           int nx, int ny, int nz, int nz_base,
1597                           int pme_order,
1598                           gmx_bool bUseThreads,
1599                           int nthread,
1600                           int overlap_x,
1601                           int overlap_y)
1602 {
1603     ivec n, n_base, g0, g1;
1604     int t, x, y, z, d, i, tfac;
1605     int max_comm_lines = -1;
1606
1607     n[XX] = nx - (pme_order - 1);
1608     n[YY] = ny - (pme_order - 1);
1609     n[ZZ] = nz - (pme_order - 1);
1610
1611     copy_ivec(n, n_base);
1612     n_base[ZZ] = nz_base;
1613
1614     pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
1615                  NULL);
1616
1617     grids->nthread = nthread;
1618
1619     make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
1620
1621     if (bUseThreads)
1622     {
1623         ivec nst;
1624         int gridsize;
1625
1626         for (d = 0; d < DIM; d++)
1627         {
1628             nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
1629         }
1630         set_grid_alignment(&nst[ZZ], pme_order);
1631
1632         if (debug)
1633         {
1634             fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
1635                     grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
1636             fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1637                     nx, ny, nz,
1638                     nst[XX], nst[YY], nst[ZZ]);
1639         }
1640
1641         snew(grids->grid_th, grids->nthread);
1642         t        = 0;
1643         gridsize = nst[XX]*nst[YY]*nst[ZZ];
1644         set_gridsize_alignment(&gridsize, pme_order);
1645         snew_aligned(grids->grid_all,
1646                      grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
1647                      SIMD4_ALIGNMENT);
1648
1649         for (x = 0; x < grids->nc[XX]; x++)
1650         {
1651             for (y = 0; y < grids->nc[YY]; y++)
1652             {
1653                 for (z = 0; z < grids->nc[ZZ]; z++)
1654                 {
1655                     pmegrid_init(&grids->grid_th[t],
1656                                  x, y, z,
1657                                  (n[XX]*(x  ))/grids->nc[XX],
1658                                  (n[YY]*(y  ))/grids->nc[YY],
1659                                  (n[ZZ]*(z  ))/grids->nc[ZZ],
1660                                  (n[XX]*(x+1))/grids->nc[XX],
1661                                  (n[YY]*(y+1))/grids->nc[YY],
1662                                  (n[ZZ]*(z+1))/grids->nc[ZZ],
1663                                  TRUE,
1664                                  pme_order,
1665                                  grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
1666                     t++;
1667                 }
1668             }
1669         }
1670     }
1671     else
1672     {
1673         grids->grid_th = NULL;
1674     }
1675
1676     snew(grids->g2t, DIM);
1677     tfac = 1;
1678     for (d = DIM-1; d >= 0; d--)
1679     {
1680         snew(grids->g2t[d], n[d]);
1681         t = 0;
1682         for (i = 0; i < n[d]; i++)
1683         {
1684             /* The second check should match the parameters
1685              * of the pmegrid_init call above.
1686              */
1687             while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
1688             {
1689                 t++;
1690             }
1691             grids->g2t[d][i] = t*tfac;
1692         }
1693
1694         tfac *= grids->nc[d];
1695
1696         switch (d)
1697         {
1698             case XX: max_comm_lines = overlap_x;     break;
1699             case YY: max_comm_lines = overlap_y;     break;
1700             case ZZ: max_comm_lines = pme_order - 1; break;
1701         }
1702         grids->nthread_comm[d] = 0;
1703         while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
1704                grids->nthread_comm[d] < grids->nc[d])
1705         {
1706             grids->nthread_comm[d]++;
1707         }
1708         if (debug != NULL)
1709         {
1710             fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
1711                     'x'+d, grids->nthread_comm[d]);
1712         }
1713         /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1714          * work, but this is not a problematic restriction.
1715          */
1716         if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
1717         {
1718             gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
1719         }
1720     }
1721 }
1722
1723
1724 static void pmegrids_destroy(pmegrids_t *grids)
1725 {
1726     int t;
1727
1728     if (grids->grid.grid != NULL)
1729     {
1730         sfree(grids->grid.grid);
1731
1732         if (grids->nthread > 0)
1733         {
1734             for (t = 0; t < grids->nthread; t++)
1735             {
1736                 sfree(grids->grid_th[t].grid);
1737             }
1738             sfree(grids->grid_th);
1739         }
1740     }
1741 }
1742
1743
1744 static void realloc_work(pme_work_t *work, int nkx)
1745 {
1746     int simd_width;
1747
1748     if (nkx > work->nalloc)
1749     {
1750         work->nalloc = nkx;
1751         srenew(work->mhx, work->nalloc);
1752         srenew(work->mhy, work->nalloc);
1753         srenew(work->mhz, work->nalloc);
1754         srenew(work->m2, work->nalloc);
1755         /* Allocate an aligned pointer for SIMD operations, including extra
1756          * elements at the end for padding.
1757          */
1758 #ifdef PME_SIMD_SOLVE
1759         simd_width = GMX_SIMD_REAL_WIDTH;
1760 #else
1761         /* We can use any alignment, apart from 0, so we use 4 */
1762         simd_width = 4;
1763 #endif
1764         sfree_aligned(work->denom);
1765         sfree_aligned(work->tmp1);
1766         sfree_aligned(work->tmp2);
1767         sfree_aligned(work->eterm);
1768         snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
1769         snew_aligned(work->tmp1,  work->nalloc+simd_width, simd_width*sizeof(real));
1770         snew_aligned(work->tmp2,  work->nalloc+simd_width, simd_width*sizeof(real));
1771         snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
1772         srenew(work->m2inv, work->nalloc);
1773     }
1774 }
1775
1776
1777 static void free_work(pme_work_t *work)
1778 {
1779     sfree(work->mhx);
1780     sfree(work->mhy);
1781     sfree(work->mhz);
1782     sfree(work->m2);
1783     sfree_aligned(work->denom);
1784     sfree_aligned(work->tmp1);
1785     sfree_aligned(work->tmp2);
1786     sfree_aligned(work->eterm);
1787     sfree(work->m2inv);
1788 }
1789
1790
1791 #if defined PME_SIMD_SOLVE
1792 /* Calculate exponentials through SIMD */
1793 gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1794 {
1795     {
1796         const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
1797         gmx_simd_real_t f_simd;
1798         gmx_simd_real_t lu;
1799         gmx_simd_real_t tmp_d1, d_inv, tmp_r, tmp_e;
1800         int kx;
1801         f_simd = gmx_simd_set1_r(f);
1802         /* We only need to calculate from start. But since start is 0 or 1
1803          * and we want to use aligned loads/stores, we always start from 0.
1804          */
1805         for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
1806         {
1807             tmp_d1   = gmx_simd_load_r(d_aligned+kx);
1808             d_inv    = gmx_simd_inv_r(tmp_d1);
1809             tmp_r    = gmx_simd_load_r(r_aligned+kx);
1810             tmp_r    = gmx_simd_exp_r(tmp_r);
1811             tmp_e    = gmx_simd_mul_r(f_simd, d_inv);
1812             tmp_e    = gmx_simd_mul_r(tmp_e, tmp_r);
1813             gmx_simd_store_r(e_aligned+kx, tmp_e);
1814         }
1815     }
1816 }
1817 #else
1818 gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
1819 {
1820     int kx;
1821     for (kx = start; kx < end; kx++)
1822     {
1823         d[kx] = 1.0/d[kx];
1824     }
1825     for (kx = start; kx < end; kx++)
1826     {
1827         r[kx] = exp(r[kx]);
1828     }
1829     for (kx = start; kx < end; kx++)
1830     {
1831         e[kx] = f*r[kx]*d[kx];
1832     }
1833 }
1834 #endif
1835
1836 #if defined PME_SIMD_SOLVE
1837 /* Calculate exponentials through SIMD */
1838 gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
1839 {
1840     gmx_simd_real_t tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
1841     const gmx_simd_real_t sqr_PI = gmx_simd_sqrt_r(gmx_simd_set1_r(M_PI));
1842     int kx;
1843     for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
1844     {
1845         /* We only need to calculate from start. But since start is 0 or 1
1846          * and we want to use aligned loads/stores, we always start from 0.
1847          */
1848         tmp_d = gmx_simd_load_r(d_aligned+kx);
1849         d_inv = gmx_simd_inv_r(tmp_d);
1850         gmx_simd_store_r(d_aligned+kx, d_inv);
1851         tmp_r = gmx_simd_load_r(r_aligned+kx);
1852         tmp_r = gmx_simd_exp_r(tmp_r);
1853         gmx_simd_store_r(r_aligned+kx, tmp_r);
1854         tmp_mk  = gmx_simd_load_r(factor_aligned+kx);
1855         tmp_fac = gmx_simd_mul_r(sqr_PI, gmx_simd_mul_r(tmp_mk, gmx_simd_erfc_r(tmp_mk)));
1856         gmx_simd_store_r(factor_aligned+kx, tmp_fac);
1857     }
1858 }
1859 #else
1860 gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
1861 {
1862     int kx;
1863     real mk;
1864     for (kx = start; kx < end; kx++)
1865     {
1866         d[kx] = 1.0/d[kx];
1867     }
1868
1869     for (kx = start; kx < end; kx++)
1870     {
1871         r[kx] = exp(r[kx]);
1872     }
1873
1874     for (kx = start; kx < end; kx++)
1875     {
1876         mk       = tmp2[kx];
1877         tmp2[kx] = sqrt(M_PI)*mk*gmx_erfc(mk);
1878     }
1879 }
1880 #endif
1881
1882 static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
1883                          real ewaldcoeff, real vol,
1884                          gmx_bool bEnerVir,
1885                          int nthread, int thread)
1886 {
1887     /* do recip sum over local cells in grid */
1888     /* y major, z middle, x minor or continuous */
1889     t_complex *p0;
1890     int     kx, ky, kz, maxkx, maxky, maxkz;
1891     int     nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
1892     real    mx, my, mz;
1893     real    factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1894     real    ets2, struct2, vfactor, ets2vf;
1895     real    d1, d2, energy = 0;
1896     real    by, bz;
1897     real    virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
1898     real    rxx, ryx, ryy, rzx, rzy, rzz;
1899     pme_work_t *work;
1900     real    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
1901     real    mhxk, mhyk, mhzk, m2k;
1902     real    corner_fac;
1903     ivec    complex_order;
1904     ivec    local_ndata, local_offset, local_size;
1905     real    elfac;
1906
1907     elfac = ONE_4PI_EPS0/pme->epsilon_r;
1908
1909     nx = pme->nkx;
1910     ny = pme->nky;
1911     nz = pme->nkz;
1912
1913     /* Dimensions should be identical for A/B grid, so we just use A here */
1914     gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA],
1915                                       complex_order,
1916                                       local_ndata,
1917                                       local_offset,
1918                                       local_size);
1919
1920     rxx = pme->recipbox[XX][XX];
1921     ryx = pme->recipbox[YY][XX];
1922     ryy = pme->recipbox[YY][YY];
1923     rzx = pme->recipbox[ZZ][XX];
1924     rzy = pme->recipbox[ZZ][YY];
1925     rzz = pme->recipbox[ZZ][ZZ];
1926
1927     maxkx = (nx+1)/2;
1928     maxky = (ny+1)/2;
1929     maxkz = nz/2+1;
1930
1931     work  = &pme->work[thread];
1932     mhx   = work->mhx;
1933     mhy   = work->mhy;
1934     mhz   = work->mhz;
1935     m2    = work->m2;
1936     denom = work->denom;
1937     tmp1  = work->tmp1;
1938     eterm = work->eterm;
1939     m2inv = work->m2inv;
1940
1941     iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
1942     iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
1943
1944     for (iyz = iyz0; iyz < iyz1; iyz++)
1945     {
1946         iy = iyz/local_ndata[ZZ];
1947         iz = iyz - iy*local_ndata[ZZ];
1948
1949         ky = iy + local_offset[YY];
1950
1951         if (ky < maxky)
1952         {
1953             my = ky;
1954         }
1955         else
1956         {
1957             my = (ky - ny);
1958         }
1959
1960         by = M_PI*vol*pme->bsp_mod[YY][ky];
1961
1962         kz = iz + local_offset[ZZ];
1963
1964         mz = kz;
1965
1966         bz = pme->bsp_mod[ZZ][kz];
1967
1968         /* 0.5 correction for corner points */
1969         corner_fac = 1;
1970         if (kz == 0 || kz == (nz+1)/2)
1971         {
1972             corner_fac = 0.5;
1973         }
1974
1975         p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
1976
1977         /* We should skip the k-space point (0,0,0) */
1978         /* Note that since here x is the minor index, local_offset[XX]=0 */
1979         if (local_offset[XX] > 0 || ky > 0 || kz > 0)
1980         {
1981             kxstart = local_offset[XX];
1982         }
1983         else
1984         {
1985             kxstart = local_offset[XX] + 1;
1986             p0++;
1987         }
1988         kxend = local_offset[XX] + local_ndata[XX];
1989
1990         if (bEnerVir)
1991         {
1992             /* More expensive inner loop, especially because of the storage
1993              * of the mh elements in array's.
1994              * Because x is the minor grid index, all mh elements
1995              * depend on kx for triclinic unit cells.
1996              */
1997
1998             /* Two explicit loops to avoid a conditional inside the loop */
1999             for (kx = kxstart; kx < maxkx; kx++)
2000             {
2001                 mx = kx;
2002
2003                 mhxk      = mx * rxx;
2004                 mhyk      = mx * ryx + my * ryy;
2005                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2006                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2007                 mhx[kx]   = mhxk;
2008                 mhy[kx]   = mhyk;
2009                 mhz[kx]   = mhzk;
2010                 m2[kx]    = m2k;
2011                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2012                 tmp1[kx]  = -factor*m2k;
2013             }
2014
2015             for (kx = maxkx; kx < kxend; kx++)
2016             {
2017                 mx = (kx - nx);
2018
2019                 mhxk      = mx * rxx;
2020                 mhyk      = mx * ryx + my * ryy;
2021                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2022                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2023                 mhx[kx]   = mhxk;
2024                 mhy[kx]   = mhyk;
2025                 mhz[kx]   = mhzk;
2026                 m2[kx]    = m2k;
2027                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2028                 tmp1[kx]  = -factor*m2k;
2029             }
2030
2031             for (kx = kxstart; kx < kxend; kx++)
2032             {
2033                 m2inv[kx] = 1.0/m2[kx];
2034             }
2035
2036             calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
2037
2038             for (kx = kxstart; kx < kxend; kx++, p0++)
2039             {
2040                 d1      = p0->re;
2041                 d2      = p0->im;
2042
2043                 p0->re  = d1*eterm[kx];
2044                 p0->im  = d2*eterm[kx];
2045
2046                 struct2 = 2.0*(d1*d1+d2*d2);
2047
2048                 tmp1[kx] = eterm[kx]*struct2;
2049             }
2050
2051             for (kx = kxstart; kx < kxend; kx++)
2052             {
2053                 ets2     = corner_fac*tmp1[kx];
2054                 vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2055                 energy  += ets2;
2056
2057                 ets2vf   = ets2*vfactor;
2058                 virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
2059                 virxy   += ets2vf*mhx[kx]*mhy[kx];
2060                 virxz   += ets2vf*mhx[kx]*mhz[kx];
2061                 viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
2062                 viryz   += ets2vf*mhy[kx]*mhz[kx];
2063                 virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
2064             }
2065         }
2066         else
2067         {
2068             /* We don't need to calculate the energy and the virial.
2069              * In this case the triclinic overhead is small.
2070              */
2071
2072             /* Two explicit loops to avoid a conditional inside the loop */
2073
2074             for (kx = kxstart; kx < maxkx; kx++)
2075             {
2076                 mx = kx;
2077
2078                 mhxk      = mx * rxx;
2079                 mhyk      = mx * ryx + my * ryy;
2080                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2081                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2082                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2083                 tmp1[kx]  = -factor*m2k;
2084             }
2085
2086             for (kx = maxkx; kx < kxend; kx++)
2087             {
2088                 mx = (kx - nx);
2089
2090                 mhxk      = mx * rxx;
2091                 mhyk      = mx * ryx + my * ryy;
2092                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2093                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2094                 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2095                 tmp1[kx]  = -factor*m2k;
2096             }
2097
2098             calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
2099
2100             for (kx = kxstart; kx < kxend; kx++, p0++)
2101             {
2102                 d1      = p0->re;
2103                 d2      = p0->im;
2104
2105                 p0->re  = d1*eterm[kx];
2106                 p0->im  = d2*eterm[kx];
2107             }
2108         }
2109     }
2110
2111     if (bEnerVir)
2112     {
2113         /* Update virial with local values.
2114          * The virial is symmetric by definition.
2115          * this virial seems ok for isotropic scaling, but I'm
2116          * experiencing problems on semiisotropic membranes.
2117          * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2118          */
2119         work->vir_q[XX][XX] = 0.25*virxx;
2120         work->vir_q[YY][YY] = 0.25*viryy;
2121         work->vir_q[ZZ][ZZ] = 0.25*virzz;
2122         work->vir_q[XX][YY] = work->vir_q[YY][XX] = 0.25*virxy;
2123         work->vir_q[XX][ZZ] = work->vir_q[ZZ][XX] = 0.25*virxz;
2124         work->vir_q[YY][ZZ] = work->vir_q[ZZ][YY] = 0.25*viryz;
2125
2126         /* This energy should be corrected for a charged system */
2127         work->energy_q = 0.5*energy;
2128     }
2129
2130     /* Return the loop count */
2131     return local_ndata[YY]*local_ndata[XX];
2132 }
2133
2134 static int solve_pme_lj_yzx(gmx_pme_t pme, t_complex **grid, gmx_bool bLB,
2135                             real ewaldcoeff, real vol,
2136                             gmx_bool bEnerVir, int nthread, int thread)
2137 {
2138     /* do recip sum over local cells in grid */
2139     /* y major, z middle, x minor or continuous */
2140     int     ig, gcount;
2141     int     kx, ky, kz, maxkx, maxky, maxkz;
2142     int     nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
2143     real    mx, my, mz;
2144     real    factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
2145     real    ets2, ets2vf;
2146     real    eterm, vterm, d1, d2, energy = 0;
2147     real    by, bz;
2148     real    virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
2149     real    rxx, ryx, ryy, rzx, rzy, rzz;
2150     real    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
2151     real    mhxk, mhyk, mhzk, m2k;
2152     real    mk;
2153     pme_work_t *work;
2154     real    corner_fac;
2155     ivec    complex_order;
2156     ivec    local_ndata, local_offset, local_size;
2157     nx = pme->nkx;
2158     ny = pme->nky;
2159     nz = pme->nkz;
2160
2161     /* Dimensions should be identical for A/B grid, so we just use A here */
2162     gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A],
2163                                       complex_order,
2164                                       local_ndata,
2165                                       local_offset,
2166                                       local_size);
2167     rxx = pme->recipbox[XX][XX];
2168     ryx = pme->recipbox[YY][XX];
2169     ryy = pme->recipbox[YY][YY];
2170     rzx = pme->recipbox[ZZ][XX];
2171     rzy = pme->recipbox[ZZ][YY];
2172     rzz = pme->recipbox[ZZ][ZZ];
2173
2174     maxkx = (nx+1)/2;
2175     maxky = (ny+1)/2;
2176     maxkz = nz/2+1;
2177
2178     work  = &pme->work[thread];
2179     mhx   = work->mhx;
2180     mhy   = work->mhy;
2181     mhz   = work->mhz;
2182     m2    = work->m2;
2183     denom = work->denom;
2184     tmp1  = work->tmp1;
2185     tmp2  = work->tmp2;
2186
2187     iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
2188     iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
2189
2190     for (iyz = iyz0; iyz < iyz1; iyz++)
2191     {
2192         iy = iyz/local_ndata[ZZ];
2193         iz = iyz - iy*local_ndata[ZZ];
2194
2195         ky = iy + local_offset[YY];
2196
2197         if (ky < maxky)
2198         {
2199             my = ky;
2200         }
2201         else
2202         {
2203             my = (ky - ny);
2204         }
2205
2206         by = 3.0*vol*pme->bsp_mod[YY][ky]
2207             / (M_PI*sqrt(M_PI)*ewaldcoeff*ewaldcoeff*ewaldcoeff);
2208
2209         kz = iz + local_offset[ZZ];
2210
2211         mz = kz;
2212
2213         bz = pme->bsp_mod[ZZ][kz];
2214
2215         /* 0.5 correction for corner points */
2216         corner_fac = 1;
2217         if (kz == 0 || kz == (nz+1)/2)
2218         {
2219             corner_fac = 0.5;
2220         }
2221
2222         kxstart = local_offset[XX];
2223         kxend   = local_offset[XX] + local_ndata[XX];
2224         if (bEnerVir)
2225         {
2226             /* More expensive inner loop, especially because of the
2227              * storage of the mh elements in array's.  Because x is the
2228              * minor grid index, all mh elements depend on kx for
2229              * triclinic unit cells.
2230              */
2231
2232             /* Two explicit loops to avoid a conditional inside the loop */
2233             for (kx = kxstart; kx < maxkx; kx++)
2234             {
2235                 mx = kx;
2236
2237                 mhxk      = mx * rxx;
2238                 mhyk      = mx * ryx + my * ryy;
2239                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2240                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2241                 mhx[kx]   = mhxk;
2242                 mhy[kx]   = mhyk;
2243                 mhz[kx]   = mhzk;
2244                 m2[kx]    = m2k;
2245                 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
2246                 tmp1[kx]  = -factor*m2k;
2247                 tmp2[kx]  = sqrt(factor*m2k);
2248             }
2249
2250             for (kx = maxkx; kx < kxend; kx++)
2251             {
2252                 mx = (kx - nx);
2253
2254                 mhxk      = mx * rxx;
2255                 mhyk      = mx * ryx + my * ryy;
2256                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2257                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2258                 mhx[kx]   = mhxk;
2259                 mhy[kx]   = mhyk;
2260                 mhz[kx]   = mhzk;
2261                 m2[kx]    = m2k;
2262                 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
2263                 tmp1[kx]  = -factor*m2k;
2264                 tmp2[kx]  = sqrt(factor*m2k);
2265             }
2266
2267             calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
2268
2269             for (kx = kxstart; kx < kxend; kx++)
2270             {
2271                 m2k   = factor*m2[kx];
2272                 eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
2273                           + 2.0*m2k*tmp2[kx]);
2274                 vterm    = 3.0*(-tmp1[kx] + tmp2[kx]);
2275                 tmp1[kx] = eterm*denom[kx];
2276                 tmp2[kx] = vterm*denom[kx];
2277             }
2278
2279             if (!bLB)
2280             {
2281                 t_complex *p0;
2282                 real       struct2;
2283
2284                 p0 = grid[0] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2285                 for (kx = kxstart; kx < kxend; kx++, p0++)
2286                 {
2287                     d1      = p0->re;
2288                     d2      = p0->im;
2289
2290                     eterm   = tmp1[kx];
2291                     vterm   = tmp2[kx];
2292                     p0->re  = d1*eterm;
2293                     p0->im  = d2*eterm;
2294
2295                     struct2 = 2.0*(d1*d1+d2*d2);
2296
2297                     tmp1[kx] = eterm*struct2;
2298                     tmp2[kx] = vterm*struct2;
2299                 }
2300             }
2301             else
2302             {
2303                 real *struct2 = denom;
2304                 real  str2;
2305
2306                 for (kx = kxstart; kx < kxend; kx++)
2307                 {
2308                     struct2[kx] = 0.0;
2309                 }
2310                 /* Due to symmetry we only need to calculate 4 of the 7 terms */
2311                 for (ig = 0; ig <= 3; ++ig)
2312                 {
2313                     t_complex *p0, *p1;
2314                     real       scale;
2315
2316                     p0    = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2317                     p1    = grid[6-ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2318                     scale = 2.0*lb_scale_factor_symm[ig];
2319                     for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
2320                     {
2321                         struct2[kx] += scale*(p0->re*p1->re + p0->im*p1->im);
2322                     }
2323
2324                 }
2325                 for (ig = 0; ig <= 6; ++ig)
2326                 {
2327                     t_complex *p0;
2328
2329                     p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2330                     for (kx = kxstart; kx < kxend; kx++, p0++)
2331                     {
2332                         d1     = p0->re;
2333                         d2     = p0->im;
2334
2335                         eterm  = tmp1[kx];
2336                         p0->re = d1*eterm;
2337                         p0->im = d2*eterm;
2338                     }
2339                 }
2340                 for (kx = kxstart; kx < kxend; kx++)
2341                 {
2342                     eterm    = tmp1[kx];
2343                     vterm    = tmp2[kx];
2344                     str2     = struct2[kx];
2345                     tmp1[kx] = eterm*str2;
2346                     tmp2[kx] = vterm*str2;
2347                 }
2348             }
2349
2350             for (kx = kxstart; kx < kxend; kx++)
2351             {
2352                 ets2     = corner_fac*tmp1[kx];
2353                 vterm    = 2.0*factor*tmp2[kx];
2354                 energy  += ets2;
2355                 ets2vf   = corner_fac*vterm;
2356                 virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
2357                 virxy   += ets2vf*mhx[kx]*mhy[kx];
2358                 virxz   += ets2vf*mhx[kx]*mhz[kx];
2359                 viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
2360                 viryz   += ets2vf*mhy[kx]*mhz[kx];
2361                 virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
2362             }
2363         }
2364         else
2365         {
2366             /* We don't need to calculate the energy and the virial.
2367              *  In this case the triclinic overhead is small.
2368              */
2369
2370             /* Two explicit loops to avoid a conditional inside the loop */
2371
2372             for (kx = kxstart; kx < maxkx; kx++)
2373             {
2374                 mx = kx;
2375
2376                 mhxk      = mx * rxx;
2377                 mhyk      = mx * ryx + my * ryy;
2378                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2379                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2380                 m2[kx]    = m2k;
2381                 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
2382                 tmp1[kx]  = -factor*m2k;
2383                 tmp2[kx]  = sqrt(factor*m2k);
2384             }
2385
2386             for (kx = maxkx; kx < kxend; kx++)
2387             {
2388                 mx = (kx - nx);
2389
2390                 mhxk      = mx * rxx;
2391                 mhyk      = mx * ryx + my * ryy;
2392                 mhzk      = mx * rzx + my * rzy + mz * rzz;
2393                 m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2394                 m2[kx]    = m2k;
2395                 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
2396                 tmp1[kx]  = -factor*m2k;
2397                 tmp2[kx]  = sqrt(factor*m2k);
2398             }
2399
2400             calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
2401
2402             for (kx = kxstart; kx < kxend; kx++)
2403             {
2404                 m2k    = factor*m2[kx];
2405                 eterm  = -((1.0 - 2.0*m2k)*tmp1[kx]
2406                            + 2.0*m2k*tmp2[kx]);
2407                 tmp1[kx] = eterm*denom[kx];
2408             }
2409             gcount = (bLB ? 7 : 1);
2410             for (ig = 0; ig < gcount; ++ig)
2411             {
2412                 t_complex *p0;
2413
2414                 p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2415                 for (kx = kxstart; kx < kxend; kx++, p0++)
2416                 {
2417                     d1      = p0->re;
2418                     d2      = p0->im;
2419
2420                     eterm   = tmp1[kx];
2421
2422                     p0->re  = d1*eterm;
2423                     p0->im  = d2*eterm;
2424                 }
2425             }
2426         }
2427     }
2428     if (bEnerVir)
2429     {
2430         work->vir_lj[XX][XX] = 0.25*virxx;
2431         work->vir_lj[YY][YY] = 0.25*viryy;
2432         work->vir_lj[ZZ][ZZ] = 0.25*virzz;
2433         work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25*virxy;
2434         work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25*virxz;
2435         work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25*viryz;
2436
2437         /* This energy should be corrected for a charged system */
2438         work->energy_lj = 0.5*energy;
2439     }
2440     /* Return the loop count */
2441     return local_ndata[YY]*local_ndata[XX];
2442 }
2443
2444 static void get_pme_ener_vir_q(const gmx_pme_t pme, int nthread,
2445                                real *mesh_energy, matrix vir)
2446 {
2447     /* This function sums output over threads and should therefore
2448      * only be called after thread synchronization.
2449      */
2450     int thread;
2451
2452     *mesh_energy = pme->work[0].energy_q;
2453     copy_mat(pme->work[0].vir_q, vir);
2454
2455     for (thread = 1; thread < nthread; thread++)
2456     {
2457         *mesh_energy += pme->work[thread].energy_q;
2458         m_add(vir, pme->work[thread].vir_q, vir);
2459     }
2460 }
2461
2462 static void get_pme_ener_vir_lj(const gmx_pme_t pme, int nthread,
2463                                 real *mesh_energy, matrix vir)
2464 {
2465     /* This function sums output over threads and should therefore
2466      * only be called after thread synchronization.
2467      */
2468     int thread;
2469
2470     *mesh_energy = pme->work[0].energy_lj;
2471     copy_mat(pme->work[0].vir_lj, vir);
2472
2473     for (thread = 1; thread < nthread; thread++)
2474     {
2475         *mesh_energy += pme->work[thread].energy_lj;
2476         m_add(vir, pme->work[thread].vir_lj, vir);
2477     }
2478 }
2479
2480
2481 #define DO_FSPLINE(order)                      \
2482     for (ithx = 0; (ithx < order); ithx++)              \
2483     {                                              \
2484         index_x = (i0+ithx)*pny*pnz;               \
2485         tx      = thx[ithx];                       \
2486         dx      = dthx[ithx];                      \
2487                                                \
2488         for (ithy = 0; (ithy < order); ithy++)          \
2489         {                                          \
2490             index_xy = index_x+(j0+ithy)*pnz;      \
2491             ty       = thy[ithy];                  \
2492             dy       = dthy[ithy];                 \
2493             fxy1     = fz1 = 0;                    \
2494                                                \
2495             for (ithz = 0; (ithz < order); ithz++)      \
2496             {                                      \
2497                 gval  = grid[index_xy+(k0+ithz)];  \
2498                 fxy1 += thz[ithz]*gval;            \
2499                 fz1  += dthz[ithz]*gval;           \
2500             }                                      \
2501             fx += dx*ty*fxy1;                      \
2502             fy += tx*dy*fxy1;                      \
2503             fz += tx*ty*fz1;                       \
2504         }                                          \
2505     }
2506
2507
2508 static void gather_f_bsplines(gmx_pme_t pme, real *grid,
2509                               gmx_bool bClearF, pme_atomcomm_t *atc,
2510                               splinedata_t *spline,
2511                               real scale)
2512 {
2513     /* sum forces for local particles */
2514     int     nn, n, ithx, ithy, ithz, i0, j0, k0;
2515     int     index_x, index_xy;
2516     int     nx, ny, nz, pnx, pny, pnz;
2517     int *   idxptr;
2518     real    tx, ty, dx, dy, coefficient;
2519     real    fx, fy, fz, gval;
2520     real    fxy1, fz1;
2521     real    *thx, *thy, *thz, *dthx, *dthy, *dthz;
2522     int     norder;
2523     real    rxx, ryx, ryy, rzx, rzy, rzz;
2524     int     order;
2525
2526     pme_spline_work_t *work;
2527
2528 #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
2529     real           thz_buffer[GMX_SIMD4_WIDTH*3],  *thz_aligned;
2530     real           dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;
2531
2532     thz_aligned  = gmx_simd4_align_r(thz_buffer);
2533     dthz_aligned = gmx_simd4_align_r(dthz_buffer);
2534 #endif
2535
2536     work = pme->spline_work;
2537
2538     order = pme->pme_order;
2539     thx   = spline->theta[XX];
2540     thy   = spline->theta[YY];
2541     thz   = spline->theta[ZZ];
2542     dthx  = spline->dtheta[XX];
2543     dthy  = spline->dtheta[YY];
2544     dthz  = spline->dtheta[ZZ];
2545     nx    = pme->nkx;
2546     ny    = pme->nky;
2547     nz    = pme->nkz;
2548     pnx   = pme->pmegrid_nx;
2549     pny   = pme->pmegrid_ny;
2550     pnz   = pme->pmegrid_nz;
2551
2552     rxx   = pme->recipbox[XX][XX];
2553     ryx   = pme->recipbox[YY][XX];
2554     ryy   = pme->recipbox[YY][YY];
2555     rzx   = pme->recipbox[ZZ][XX];
2556     rzy   = pme->recipbox[ZZ][YY];
2557     rzz   = pme->recipbox[ZZ][ZZ];
2558
2559     for (nn = 0; nn < spline->n; nn++)
2560     {
2561         n           = spline->ind[nn];
2562         coefficient = scale*atc->coefficient[n];
2563
2564         if (bClearF)
2565         {
2566             atc->f[n][XX] = 0;
2567             atc->f[n][YY] = 0;
2568             atc->f[n][ZZ] = 0;
2569         }
2570         if (coefficient != 0)
2571         {
2572             fx     = 0;
2573             fy     = 0;
2574             fz     = 0;
2575             idxptr = atc->idx[n];
2576             norder = nn*order;
2577
2578             i0   = idxptr[XX];
2579             j0   = idxptr[YY];
2580             k0   = idxptr[ZZ];
2581
2582             /* Pointer arithmetic alert, next six statements */
2583             thx  = spline->theta[XX] + norder;
2584             thy  = spline->theta[YY] + norder;
2585             thz  = spline->theta[ZZ] + norder;
2586             dthx = spline->dtheta[XX] + norder;
2587             dthy = spline->dtheta[YY] + norder;
2588             dthz = spline->dtheta[ZZ] + norder;
2589
2590             switch (order)
2591             {
2592                 case 4:
2593 #ifdef PME_SIMD4_SPREAD_GATHER
2594 #ifdef PME_SIMD4_UNALIGNED
2595 #define PME_GATHER_F_SIMD4_ORDER4
2596 #else
2597 #define PME_GATHER_F_SIMD4_ALIGNED
2598 #define PME_ORDER 4
2599 #endif
2600 #include "pme_simd4.h"
2601 #else
2602                     DO_FSPLINE(4);
2603 #endif
2604                     break;
2605                 case 5:
2606 #ifdef PME_SIMD4_SPREAD_GATHER
2607 #define PME_GATHER_F_SIMD4_ALIGNED
2608 #define PME_ORDER 5
2609 #include "pme_simd4.h"
2610 #else
2611                     DO_FSPLINE(5);
2612 #endif
2613                     break;
2614                 default:
2615                     DO_FSPLINE(order);
2616                     break;
2617             }
2618
2619             atc->f[n][XX] += -coefficient*( fx*nx*rxx );
2620             atc->f[n][YY] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
2621             atc->f[n][ZZ] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
2622         }
2623     }
2624     /* Since the energy and not forces are interpolated
2625      * the net force might not be exactly zero.
2626      * This can be solved by also interpolating F, but
2627      * that comes at a cost.
2628      * A better hack is to remove the net force every
2629      * step, but that must be done at a higher level
2630      * since this routine doesn't see all atoms if running
2631      * in parallel. Don't know how important it is?  EL 990726
2632      */
2633 }
2634
2635
2636 static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
2637                                    pme_atomcomm_t *atc)
2638 {
2639     splinedata_t *spline;
2640     int     n, ithx, ithy, ithz, i0, j0, k0;
2641     int     index_x, index_xy;
2642     int *   idxptr;
2643     real    energy, pot, tx, ty, coefficient, gval;
2644     real    *thx, *thy, *thz;
2645     int     norder;
2646     int     order;
2647
2648     spline = &atc->spline[0];
2649
2650     order = pme->pme_order;
2651
2652     energy = 0;
2653     for (n = 0; (n < atc->n); n++)
2654     {
2655         coefficient      = atc->coefficient[n];
2656
2657         if (coefficient != 0)
2658         {
2659             idxptr = atc->idx[n];
2660             norder = n*order;
2661
2662             i0   = idxptr[XX];
2663             j0   = idxptr[YY];
2664             k0   = idxptr[ZZ];
2665
2666             /* Pointer arithmetic alert, next three statements */
2667             thx  = spline->theta[XX] + norder;
2668             thy  = spline->theta[YY] + norder;
2669             thz  = spline->theta[ZZ] + norder;
2670
2671             pot = 0;
2672             for (ithx = 0; (ithx < order); ithx++)
2673             {
2674                 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2675                 tx      = thx[ithx];
2676
2677                 for (ithy = 0; (ithy < order); ithy++)
2678                 {
2679                     index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2680                     ty       = thy[ithy];
2681
2682                     for (ithz = 0; (ithz < order); ithz++)
2683                     {
2684                         gval  = grid[index_xy+(k0+ithz)];
2685                         pot  += tx*ty*thz[ithz]*gval;
2686                     }
2687
2688                 }
2689             }
2690
2691             energy += pot*coefficient;
2692         }
2693     }
2694
2695     return energy;
2696 }
2697
2698 /* Macro to force loop unrolling by fixing order.
2699  * This gives a significant performance gain.
2700  */
2701 #define CALC_SPLINE(order)                     \
2702     {                                              \
2703         int j, k, l;                                 \
2704         real dr, div;                               \
2705         real data[PME_ORDER_MAX];                  \
2706         real ddata[PME_ORDER_MAX];                 \
2707                                                \
2708         for (j = 0; (j < DIM); j++)                     \
2709         {                                          \
2710             dr  = xptr[j];                         \
2711                                                \
2712             /* dr is relative offset from lower cell limit */ \
2713             data[order-1] = 0;                     \
2714             data[1]       = dr;                          \
2715             data[0]       = 1 - dr;                      \
2716                                                \
2717             for (k = 3; (k < order); k++)               \
2718             {                                      \
2719                 div       = 1.0/(k - 1.0);               \
2720                 data[k-1] = div*dr*data[k-2];      \
2721                 for (l = 1; (l < (k-1)); l++)           \
2722                 {                                  \
2723                     data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2724                                        data[k-l-1]);                \
2725                 }                                  \
2726                 data[0] = div*(1-dr)*data[0];      \
2727             }                                      \
2728             /* differentiate */                    \
2729             ddata[0] = -data[0];                   \
2730             for (k = 1; (k < order); k++)               \
2731             {                                      \
2732                 ddata[k] = data[k-1] - data[k];    \
2733             }                                      \
2734                                                \
2735             div           = 1.0/(order - 1);                 \
2736             data[order-1] = div*dr*data[order-2];  \
2737             for (l = 1; (l < (order-1)); l++)           \
2738             {                                      \
2739                 data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
2740                                        (order-l-dr)*data[order-l-1]); \
2741             }                                      \
2742             data[0] = div*(1 - dr)*data[0];        \
2743                                                \
2744             for (k = 0; k < order; k++)                 \
2745             {                                      \
2746                 theta[j][i*order+k]  = data[k];    \
2747                 dtheta[j][i*order+k] = ddata[k];   \
2748             }                                      \
2749         }                                          \
2750     }
2751
2752 void make_bsplines(splinevec theta, splinevec dtheta, int order,
2753                    rvec fractx[], int nr, int ind[], real coefficient[],
2754                    gmx_bool bDoSplines)
2755 {
2756     /* construct splines for local atoms */
2757     int  i, ii;
2758     real *xptr;
2759
2760     for (i = 0; i < nr; i++)
2761     {
2762         /* With free energy we do not use the coefficient check.
2763          * In most cases this will be more efficient than calling make_bsplines
2764          * twice, since usually more than half the particles have non-zero coefficients.
2765          */
2766         ii = ind[i];
2767         if (bDoSplines || coefficient[ii] != 0.0)
2768         {
2769             xptr = fractx[ii];
2770             switch (order)
2771             {
2772                 case 4:  CALC_SPLINE(4);     break;
2773                 case 5:  CALC_SPLINE(5);     break;
2774                 default: CALC_SPLINE(order); break;
2775             }
2776         }
2777     }
2778 }
2779
2780
2781 void make_dft_mod(real *mod, real *data, int ndata)
2782 {
2783     int i, j;
2784     real sc, ss, arg;
2785
2786     for (i = 0; i < ndata; i++)
2787     {
2788         sc = ss = 0;
2789         for (j = 0; j < ndata; j++)
2790         {
2791             arg = (2.0*M_PI*i*j)/ndata;
2792             sc += data[j]*cos(arg);
2793             ss += data[j]*sin(arg);
2794         }
2795         mod[i] = sc*sc+ss*ss;
2796     }
2797     for (i = 0; i < ndata; i++)
2798     {
2799         if (mod[i] < 1e-7)
2800         {
2801             mod[i] = (mod[i-1]+mod[i+1])*0.5;
2802         }
2803     }
2804 }
2805
2806
2807 static void make_bspline_moduli(splinevec bsp_mod,
2808                                 int nx, int ny, int nz, int order)
2809 {
2810     int nmax = max(nx, max(ny, nz));
2811     real *data, *ddata, *bsp_data;
2812     int i, k, l;
2813     real div;
2814
2815     snew(data, order);
2816     snew(ddata, order);
2817     snew(bsp_data, nmax);
2818
2819     data[order-1] = 0;
2820     data[1]       = 0;
2821     data[0]       = 1;
2822
2823     for (k = 3; k < order; k++)
2824     {
2825         div       = 1.0/(k-1.0);
2826         data[k-1] = 0;
2827         for (l = 1; l < (k-1); l++)
2828         {
2829             data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2830         }
2831         data[0] = div*data[0];
2832     }
2833     /* differentiate */
2834     ddata[0] = -data[0];
2835     for (k = 1; k < order; k++)
2836     {
2837         ddata[k] = data[k-1]-data[k];
2838     }
2839     div           = 1.0/(order-1);
2840     data[order-1] = 0;
2841     for (l = 1; l < (order-1); l++)
2842     {
2843         data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2844     }
2845     data[0] = div*data[0];
2846
2847     for (i = 0; i < nmax; i++)
2848     {
2849         bsp_data[i] = 0;
2850     }
2851     for (i = 1; i <= order; i++)
2852     {
2853         bsp_data[i] = data[i-1];
2854     }
2855
2856     make_dft_mod(bsp_mod[XX], bsp_data, nx);
2857     make_dft_mod(bsp_mod[YY], bsp_data, ny);
2858     make_dft_mod(bsp_mod[ZZ], bsp_data, nz);
2859
2860     sfree(data);
2861     sfree(ddata);
2862     sfree(bsp_data);
2863 }
2864
2865
2866 /* Return the P3M optimal influence function */
2867 static double do_p3m_influence(double z, int order)
2868 {
2869     double z2, z4;
2870
2871     z2 = z*z;
2872     z4 = z2*z2;
2873
2874     /* The formula and most constants can be found in:
2875      * Ballenegger et al., JCTC 8, 936 (2012)
2876      */
2877     switch (order)
2878     {
2879         case 2:
2880             return 1.0 - 2.0*z2/3.0;
2881             break;
2882         case 3:
2883             return 1.0 - z2 + 2.0*z4/15.0;
2884             break;
2885         case 4:
2886             return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
2887             break;
2888         case 5:
2889             return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
2890             break;
2891         case 6:
2892             return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
2893             break;
2894         case 7:
2895             return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
2896         case 8:
2897             return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
2898             break;
2899     }
2900
2901     return 0.0;
2902 }
2903
2904 /* Calculate the P3M B-spline moduli for one dimension */
2905 static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
2906 {
2907     double zarg, zai, sinzai, infl;
2908     int    maxk, i;
2909
2910     if (order > 8)
2911     {
2912         gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
2913     }
2914
2915     zarg = M_PI/n;
2916
2917     maxk = (n + 1)/2;
2918
2919     for (i = -maxk; i < 0; i++)
2920     {
2921         zai          = zarg*i;
2922         sinzai       = sin(zai);
2923         infl         = do_p3m_influence(sinzai, order);
2924         bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
2925     }
2926     bsp_mod[0] = 1.0;
2927     for (i = 1; i < maxk; i++)
2928     {
2929         zai        = zarg*i;
2930         sinzai     = sin(zai);
2931         infl       = do_p3m_influence(sinzai, order);
2932         bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
2933     }
2934 }
2935
2936 /* Calculate the P3M B-spline moduli */
2937 static void make_p3m_bspline_moduli(splinevec bsp_mod,
2938                                     int nx, int ny, int nz, int order)
2939 {
2940     make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order);
2941     make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order);
2942     make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order);
2943 }
2944
2945
2946 static void setup_coordinate_communication(pme_atomcomm_t *atc)
2947 {
2948     int nslab, n, i;
2949     int fw, bw;
2950
2951     nslab = atc->nslab;
2952
2953     n = 0;
2954     for (i = 1; i <= nslab/2; i++)
2955     {
2956         fw = (atc->nodeid + i) % nslab;
2957         bw = (atc->nodeid - i + nslab) % nslab;
2958         if (n < nslab - 1)
2959         {
2960             atc->node_dest[n] = fw;
2961             atc->node_src[n]  = bw;
2962             n++;
2963         }
2964         if (n < nslab - 1)
2965         {
2966             atc->node_dest[n] = bw;
2967             atc->node_src[n]  = fw;
2968             n++;
2969         }
2970     }
2971 }
2972
2973 int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
2974 {
2975     int thread, i;
2976
2977     if (NULL != log)
2978     {
2979         fprintf(log, "Destroying PME data structures.\n");
2980     }
2981
2982     sfree((*pmedata)->nnx);
2983     sfree((*pmedata)->nny);
2984     sfree((*pmedata)->nnz);
2985
2986     for (i = 0; i < (*pmedata)->ngrids; ++i)
2987     {
2988         pmegrids_destroy(&(*pmedata)->pmegrid[i]);
2989         sfree((*pmedata)->fftgrid[i]);
2990         sfree((*pmedata)->cfftgrid[i]);
2991         gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]);
2992     }
2993
2994     sfree((*pmedata)->lb_buf1);
2995     sfree((*pmedata)->lb_buf2);
2996
2997     for (thread = 0; thread < (*pmedata)->nthread; thread++)
2998     {
2999         free_work(&(*pmedata)->work[thread]);
3000     }
3001     sfree((*pmedata)->work);
3002
3003     sfree(*pmedata);
3004     *pmedata = NULL;
3005
3006     return 0;
3007 }
3008
3009 static int mult_up(int n, int f)
3010 {
3011     return ((n + f - 1)/f)*f;
3012 }
3013
3014
3015 static double pme_load_imbalance(gmx_pme_t pme)
3016 {
3017     int    nma, nmi;
3018     double n1, n2, n3;
3019
3020     nma = pme->nnodes_major;
3021     nmi = pme->nnodes_minor;
3022
3023     n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
3024     n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
3025     n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
3026
3027     /* pme_solve is roughly double the cost of an fft */
3028
3029     return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
3030 }
3031
3032 static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc,
3033                           int dimind, gmx_bool bSpread)
3034 {
3035     int nk, k, s, thread;
3036
3037     atc->dimind    = dimind;
3038     atc->nslab     = 1;
3039     atc->nodeid    = 0;
3040     atc->pd_nalloc = 0;
3041 #ifdef GMX_MPI
3042     if (pme->nnodes > 1)
3043     {
3044         atc->mpi_comm = pme->mpi_comm_d[dimind];
3045         MPI_Comm_size(atc->mpi_comm, &atc->nslab);
3046         MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
3047     }
3048     if (debug)
3049     {
3050         fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
3051     }
3052 #endif
3053
3054     atc->bSpread   = bSpread;
3055     atc->pme_order = pme->pme_order;
3056
3057     if (atc->nslab > 1)
3058     {
3059         snew(atc->node_dest, atc->nslab);
3060         snew(atc->node_src, atc->nslab);
3061         setup_coordinate_communication(atc);
3062
3063         snew(atc->count_thread, pme->nthread);
3064         for (thread = 0; thread < pme->nthread; thread++)
3065         {
3066             snew(atc->count_thread[thread], atc->nslab);
3067         }
3068         atc->count = atc->count_thread[0];
3069         snew(atc->rcount, atc->nslab);
3070         snew(atc->buf_index, atc->nslab);
3071     }
3072
3073     atc->nthread = pme->nthread;
3074     if (atc->nthread > 1)
3075     {
3076         snew(atc->thread_plist, atc->nthread);
3077     }
3078     snew(atc->spline, atc->nthread);
3079     for (thread = 0; thread < atc->nthread; thread++)
3080     {
3081         if (atc->nthread > 1)
3082         {
3083             snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
3084             atc->thread_plist[thread].n += GMX_CACHE_SEP;
3085         }
3086         snew(atc->spline[thread].thread_one, pme->nthread);
3087         atc->spline[thread].thread_one[thread] = 1;
3088     }
3089 }
3090
3091 static void
3092 init_overlap_comm(pme_overlap_t *  ol,
3093                   int              norder,
3094 #ifdef GMX_MPI
3095                   MPI_Comm         comm,
3096 #endif
3097                   int              nnodes,
3098                   int              nodeid,
3099                   int              ndata,
3100                   int              commplainsize)
3101 {
3102     int lbnd, rbnd, maxlr, b, i;
3103     int exten;
3104     int nn, nk;
3105     pme_grid_comm_t *pgc;
3106     gmx_bool bCont;
3107     int fft_start, fft_end, send_index1, recv_index1;
3108 #ifdef GMX_MPI
3109     MPI_Status stat;
3110
3111     ol->mpi_comm = comm;
3112 #endif
3113
3114     ol->nnodes = nnodes;
3115     ol->nodeid = nodeid;
3116
3117     /* Linear translation of the PME grid won't affect reciprocal space
3118      * calculations, so to optimize we only interpolate "upwards",
3119      * which also means we only have to consider overlap in one direction.
3120      * I.e., particles on this node might also be spread to grid indices
3121      * that belong to higher nodes (modulo nnodes)
3122      */
3123
3124     snew(ol->s2g0, ol->nnodes+1);
3125     snew(ol->s2g1, ol->nnodes);
3126     if (debug)
3127     {
3128         fprintf(debug, "PME slab boundaries:");
3129     }
3130     for (i = 0; i < nnodes; i++)
3131     {
3132         /* s2g0 the local interpolation grid start.
3133          * s2g1 the local interpolation grid end.
3134          * Because grid overlap communication only goes forward,
3135          * the grid the slabs for fft's should be rounded down.
3136          */
3137         ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
3138         ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
3139
3140         if (debug)
3141         {
3142             fprintf(debug, "  %3d %3d", ol->s2g0[i], ol->s2g1[i]);
3143         }
3144     }
3145     ol->s2g0[nnodes] = ndata;
3146     if (debug)
3147     {
3148         fprintf(debug, "\n");
3149     }
3150
3151     /* Determine with how many nodes we need to communicate the grid overlap */
3152     b = 0;
3153     do
3154     {
3155         b++;
3156         bCont = FALSE;
3157         for (i = 0; i < nnodes; i++)
3158         {
3159             if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
3160                 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
3161             {
3162                 bCont = TRUE;
3163             }
3164         }
3165     }
3166     while (bCont && b < nnodes);
3167     ol->noverlap_nodes = b - 1;
3168
3169     snew(ol->send_id, ol->noverlap_nodes);
3170     snew(ol->recv_id, ol->noverlap_nodes);
3171     for (b = 0; b < ol->noverlap_nodes; b++)
3172     {
3173         ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
3174         ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
3175     }
3176     snew(ol->comm_data, ol->noverlap_nodes);
3177
3178     ol->send_size = 0;
3179     for (b = 0; b < ol->noverlap_nodes; b++)
3180     {
3181         pgc = &ol->comm_data[b];
3182         /* Send */
3183         fft_start        = ol->s2g0[ol->send_id[b]];
3184         fft_end          = ol->s2g0[ol->send_id[b]+1];
3185         if (ol->send_id[b] < nodeid)
3186         {
3187             fft_start += ndata;
3188             fft_end   += ndata;
3189         }
3190         send_index1       = ol->s2g1[nodeid];
3191         send_index1       = min(send_index1, fft_end);
3192         pgc->send_index0  = fft_start;
3193         pgc->send_nindex  = max(0, send_index1 - pgc->send_index0);
3194         ol->send_size    += pgc->send_nindex;
3195
3196         /* We always start receiving to the first index of our slab */
3197         fft_start        = ol->s2g0[ol->nodeid];
3198         fft_end          = ol->s2g0[ol->nodeid+1];
3199         recv_index1      = ol->s2g1[ol->recv_id[b]];
3200         if (ol->recv_id[b] > nodeid)
3201         {
3202             recv_index1 -= ndata;
3203         }
3204         recv_index1      = min(recv_index1, fft_end);
3205         pgc->recv_index0 = fft_start;
3206         pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
3207     }
3208
3209 #ifdef GMX_MPI
3210     /* Communicate the buffer sizes to receive */
3211     for (b = 0; b < ol->noverlap_nodes; b++)
3212     {
3213         MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
3214                      &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
3215                      ol->mpi_comm, &stat);
3216     }
3217 #endif
3218
3219     /* For non-divisible grid we need pme_order iso pme_order-1 */
3220     snew(ol->sendbuf, norder*commplainsize);
3221     snew(ol->recvbuf, norder*commplainsize);
3222 }
3223
3224 static void
3225 make_gridindex5_to_localindex(int n, int local_start, int local_range,
3226                               int **global_to_local,
3227                               real **fraction_shift)
3228 {
3229     int i;
3230     int * gtl;
3231     real * fsh;
3232
3233     snew(gtl, 5*n);
3234     snew(fsh, 5*n);
3235     for (i = 0; (i < 5*n); i++)
3236     {
3237         /* Determine the global to local grid index */
3238         gtl[i] = (i - local_start + n) % n;
3239         /* For coordinates that fall within the local grid the fraction
3240          * is correct, we don't need to shift it.
3241          */
3242         fsh[i] = 0;
3243         if (local_range < n)
3244         {
3245             /* Due to rounding issues i could be 1 beyond the lower or
3246              * upper boundary of the local grid. Correct the index for this.
3247              * If we shift the index, we need to shift the fraction by
3248              * the same amount in the other direction to not affect
3249              * the weights.
3250              * Note that due to this shifting the weights at the end of
3251              * the spline might change, but that will only involve values
3252              * between zero and values close to the precision of a real,
3253              * which is anyhow the accuracy of the whole mesh calculation.
3254              */
3255             /* With local_range=0 we should not change i=local_start */
3256             if (i % n != local_start)
3257             {
3258                 if (gtl[i] == n-1)
3259                 {
3260                     gtl[i] = 0;
3261                     fsh[i] = -1;
3262                 }
3263                 else if (gtl[i] == local_range)
3264                 {
3265                     gtl[i] = local_range - 1;
3266                     fsh[i] = 1;
3267                 }
3268             }
3269         }
3270     }
3271
3272     *global_to_local = gtl;
3273     *fraction_shift  = fsh;
3274 }
3275
3276 static pme_spline_work_t *make_pme_spline_work(int gmx_unused order)
3277 {
3278     pme_spline_work_t *work;
3279
3280 #ifdef PME_SIMD4_SPREAD_GATHER
3281     real             tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
3282     gmx_simd4_real_t zero_S;
3283     gmx_simd4_real_t real_mask_S0, real_mask_S1;
3284     int              of, i;
3285
3286     snew_aligned(work, 1, SIMD4_ALIGNMENT);
3287
3288     tmp_aligned = gmx_simd4_align_r(tmp);
3289
3290     zero_S = gmx_simd4_setzero_r();
3291
3292     /* Generate bit masks to mask out the unused grid entries,
3293      * as we only operate on order of the 8 grid entries that are
3294      * load into 2 SIMD registers.
3295      */
3296     for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
3297     {
3298         for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
3299         {
3300             tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
3301         }
3302         real_mask_S0      = gmx_simd4_load_r(tmp_aligned);
3303         real_mask_S1      = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
3304         work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
3305         work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
3306     }
3307 #else
3308     work = NULL;
3309 #endif
3310
3311     return work;
3312 }
3313
3314 void gmx_pme_check_restrictions(int pme_order,
3315                                 int nkx, int nky, int nkz,
3316                                 int nnodes_major,
3317                                 int nnodes_minor,
3318                                 gmx_bool bUseThreads,
3319                                 gmx_bool bFatal,
3320                                 gmx_bool *bValidSettings)
3321 {
3322     if (pme_order > PME_ORDER_MAX)
3323     {
3324         if (!bFatal)
3325         {
3326             *bValidSettings = FALSE;
3327             return;
3328         }
3329         gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3330                   pme_order, PME_ORDER_MAX);
3331     }
3332
3333     if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
3334         nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
3335         nkz <= pme_order)
3336     {
3337         if (!bFatal)
3338         {
3339             *bValidSettings = FALSE;
3340             return;
3341         }
3342         gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
3343                   pme_order);
3344     }
3345
3346     /* Check for a limitation of the (current) sum_fftgrid_dd code.
3347      * We only allow multiple communication pulses in dim 1, not in dim 0.
3348      */
3349     if (bUseThreads && (nkx < nnodes_major*pme_order &&
3350                         nkx != nnodes_major*(pme_order - 1)))
3351     {
3352         if (!bFatal)
3353         {
3354             *bValidSettings = FALSE;
3355             return;
3356         }
3357         gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
3358                   nkx/(double)nnodes_major, pme_order);
3359     }
3360
3361     if (bValidSettings != NULL)
3362     {
3363         *bValidSettings = TRUE;
3364     }
3365
3366     return;
3367 }
3368
3369 int gmx_pme_init(gmx_pme_t *         pmedata,
3370                  t_commrec *         cr,
3371                  int                 nnodes_major,
3372                  int                 nnodes_minor,
3373                  t_inputrec *        ir,
3374                  int                 homenr,
3375                  gmx_bool            bFreeEnergy_q,
3376                  gmx_bool            bFreeEnergy_lj,
3377                  gmx_bool            bReproducible,
3378                  int                 nthread)
3379 {
3380     gmx_pme_t pme = NULL;
3381
3382     int  use_threads, sum_use_threads, i;
3383     ivec ndata;
3384
3385     if (debug)
3386     {
3387         fprintf(debug, "Creating PME data structures.\n");
3388     }
3389     snew(pme, 1);
3390
3391     pme->sum_qgrid_tmp       = NULL;
3392     pme->sum_qgrid_dd_tmp    = NULL;
3393     pme->buf_nalloc          = 0;
3394
3395     pme->nnodes              = 1;
3396     pme->bPPnode             = TRUE;
3397
3398     pme->nnodes_major        = nnodes_major;
3399     pme->nnodes_minor        = nnodes_minor;
3400
3401 #ifdef GMX_MPI
3402     if (nnodes_major*nnodes_minor > 1)
3403     {
3404         pme->mpi_comm = cr->mpi_comm_mygroup;
3405
3406         MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
3407         MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
3408         if (pme->nnodes != nnodes_major*nnodes_minor)
3409         {
3410             gmx_incons("PME rank count mismatch");
3411         }
3412     }
3413     else
3414     {
3415         pme->mpi_comm = MPI_COMM_NULL;
3416     }
3417 #endif
3418
3419     if (pme->nnodes == 1)
3420     {
3421 #ifdef GMX_MPI
3422         pme->mpi_comm_d[0] = MPI_COMM_NULL;
3423         pme->mpi_comm_d[1] = MPI_COMM_NULL;
3424 #endif
3425         pme->ndecompdim   = 0;
3426         pme->nodeid_major = 0;
3427         pme->nodeid_minor = 0;
3428 #ifdef GMX_MPI
3429         pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
3430 #endif
3431     }
3432     else
3433     {
3434         if (nnodes_minor == 1)
3435         {
3436 #ifdef GMX_MPI
3437             pme->mpi_comm_d[0] = pme->mpi_comm;
3438             pme->mpi_comm_d[1] = MPI_COMM_NULL;
3439 #endif
3440             pme->ndecompdim   = 1;
3441             pme->nodeid_major = pme->nodeid;
3442             pme->nodeid_minor = 0;
3443
3444         }
3445         else if (nnodes_major == 1)
3446         {
3447 #ifdef GMX_MPI
3448             pme->mpi_comm_d[0] = MPI_COMM_NULL;
3449             pme->mpi_comm_d[1] = pme->mpi_comm;
3450 #endif
3451             pme->ndecompdim   = 1;
3452             pme->nodeid_major = 0;
3453             pme->nodeid_minor = pme->nodeid;
3454         }
3455         else
3456         {
3457             if (pme->nnodes % nnodes_major != 0)
3458             {
3459                 gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
3460             }
3461             pme->ndecompdim = 2;
3462
3463 #ifdef GMX_MPI
3464             MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
3465                            pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
3466             MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
3467                            pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
3468
3469             MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
3470             MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
3471             MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
3472             MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
3473 #endif
3474         }
3475         pme->bPPnode = (cr->duty & DUTY_PP);
3476     }
3477
3478     pme->nthread = nthread;
3479
3480     /* Check if any of the PME MPI ranks uses threads */
3481     use_threads = (pme->nthread > 1 ? 1 : 0);
3482 #ifdef GMX_MPI
3483     if (pme->nnodes > 1)
3484     {
3485         MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
3486                       MPI_SUM, pme->mpi_comm);
3487     }
3488     else
3489 #endif
3490     {
3491         sum_use_threads = use_threads;
3492     }
3493     pme->bUseThreads = (sum_use_threads > 0);
3494
3495     if (ir->ePBC == epbcSCREW)
3496     {
3497         gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
3498     }
3499
3500     pme->bFEP_q      = ((ir->efep != efepNO) && bFreeEnergy_q);
3501     pme->bFEP_lj     = ((ir->efep != efepNO) && bFreeEnergy_lj);
3502     pme->bFEP        = (pme->bFEP_q || pme->bFEP_lj);
3503     pme->nkx         = ir->nkx;
3504     pme->nky         = ir->nky;
3505     pme->nkz         = ir->nkz;
3506     pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
3507     pme->pme_order   = ir->pme_order;
3508
3509     /* Always constant electrostatics coefficients */
3510     pme->epsilon_r   = ir->epsilon_r;
3511
3512     /* Always constant LJ coefficients */
3513     pme->ljpme_combination_rule = ir->ljpme_combination_rule;
3514
3515     /* If we violate restrictions, generate a fatal error here */
3516     gmx_pme_check_restrictions(pme->pme_order,
3517                                pme->nkx, pme->nky, pme->nkz,
3518                                pme->nnodes_major,
3519                                pme->nnodes_minor,
3520                                pme->bUseThreads,
3521                                TRUE,
3522                                NULL);
3523
3524     if (pme->nnodes > 1)
3525     {
3526         double imbal;
3527
3528 #ifdef GMX_MPI
3529         MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3530         MPI_Type_commit(&(pme->rvec_mpi));
3531 #endif
3532
3533         /* Note that the coefficient spreading and force gathering, which usually
3534          * takes about the same amount of time as FFT+solve_pme,
3535          * is always fully load balanced
3536          * (unless the coefficient distribution is inhomogeneous).
3537          */
3538
3539         imbal = pme_load_imbalance(pme);
3540         if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3541         {
3542             fprintf(stderr,
3543                     "\n"
3544                     "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3545                     "      For optimal PME load balancing\n"
3546                     "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
3547                     "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
3548                     "\n",
3549                     (int)((imbal-1)*100 + 0.5),
3550                     pme->nkx, pme->nky, pme->nnodes_major,
3551                     pme->nky, pme->nkz, pme->nnodes_minor);
3552         }
3553     }
3554
3555     /* For non-divisible grid we need pme_order iso pme_order-1 */
3556     /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3557      * y is always copied through a buffer: we don't need padding in z,
3558      * but we do need the overlap in x because of the communication order.
3559      */
3560     init_overlap_comm(&pme->overlap[0], pme->pme_order,
3561 #ifdef GMX_MPI
3562                       pme->mpi_comm_d[0],
3563 #endif
3564                       pme->nnodes_major, pme->nodeid_major,
3565                       pme->nkx,
3566                       (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3567
3568     /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
3569      * We do this with an offset buffer of equal size, so we need to allocate
3570      * extra for the offset. That's what the (+1)*pme->nkz is for.
3571      */
3572     init_overlap_comm(&pme->overlap[1], pme->pme_order,
3573 #ifdef GMX_MPI
3574                       pme->mpi_comm_d[1],
3575 #endif
3576                       pme->nnodes_minor, pme->nodeid_minor,
3577                       pme->nky,
3578                       (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
3579
3580     /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
3581      * Note that gmx_pme_check_restrictions checked for this already.
3582      */
3583     if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
3584     {
3585         gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
3586     }
3587
3588     snew(pme->bsp_mod[XX], pme->nkx);
3589     snew(pme->bsp_mod[YY], pme->nky);
3590     snew(pme->bsp_mod[ZZ], pme->nkz);
3591
3592     /* The required size of the interpolation grid, including overlap.
3593      * The allocated size (pmegrid_n?) might be slightly larger.
3594      */
3595     pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3596         pme->overlap[0].s2g0[pme->nodeid_major];
3597     pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3598         pme->overlap[1].s2g0[pme->nodeid_minor];
3599     pme->pmegrid_nz_base = pme->nkz;
3600     pme->pmegrid_nz      = pme->pmegrid_nz_base + pme->pme_order - 1;
3601     set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
3602
3603     pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3604     pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3605     pme->pmegrid_start_iz = 0;
3606
3607     make_gridindex5_to_localindex(pme->nkx,
3608                                   pme->pmegrid_start_ix,
3609                                   pme->pmegrid_nx - (pme->pme_order-1),
3610                                   &pme->nnx, &pme->fshx);
3611     make_gridindex5_to_localindex(pme->nky,
3612                                   pme->pmegrid_start_iy,
3613                                   pme->pmegrid_ny - (pme->pme_order-1),
3614                                   &pme->nny, &pme->fshy);
3615     make_gridindex5_to_localindex(pme->nkz,
3616                                   pme->pmegrid_start_iz,
3617                                   pme->pmegrid_nz_base,
3618                                   &pme->nnz, &pme->fshz);
3619
3620     pme->spline_work = make_pme_spline_work(pme->pme_order);
3621
3622     ndata[0]    = pme->nkx;
3623     ndata[1]    = pme->nky;
3624     ndata[2]    = pme->nkz;
3625     /* It doesn't matter if we allocate too many grids here,
3626      * we only allocate and use the ones we need.
3627      */
3628     if (EVDW_PME(ir->vdwtype))
3629     {
3630         pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
3631     }
3632     else
3633     {
3634         pme->ngrids = DO_Q;
3635     }
3636     snew(pme->fftgrid, pme->ngrids);
3637     snew(pme->cfftgrid, pme->ngrids);
3638     snew(pme->pfft_setup, pme->ngrids);
3639
3640     for (i = 0; i < pme->ngrids; ++i)
3641     {
3642         if ((i <  DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
3643                                                        bFreeEnergy_q)) ||
3644             (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
3645                                                     bFreeEnergy_lj ||
3646                                                     ir->ljpme_combination_rule == eljpmeLB)))
3647         {
3648             pmegrids_init(&pme->pmegrid[i],
3649                           pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3650                           pme->pmegrid_nz_base,
3651                           pme->pme_order,
3652                           pme->bUseThreads,
3653                           pme->nthread,
3654                           pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3655                           pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3656             /* This routine will allocate the grid data to fit the FFTs */
3657             gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
3658                                     &pme->fftgrid[i], &pme->cfftgrid[i],
3659                                     pme->mpi_comm_d,
3660                                     bReproducible, pme->nthread);
3661
3662         }
3663     }
3664
3665     if (!pme->bP3M)
3666     {
3667         /* Use plain SPME B-spline interpolation */
3668         make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3669     }
3670     else
3671     {
3672         /* Use the P3M grid-optimized influence function */
3673         make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3674     }
3675
3676     /* Use atc[0] for spreading */
3677     init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
3678     if (pme->ndecompdim >= 2)
3679     {
3680         init_atomcomm(pme, &pme->atc[1], 1, FALSE);
3681     }
3682
3683     if (pme->nnodes == 1)
3684     {
3685         pme->atc[0].n = homenr;
3686         pme_realloc_atomcomm_things(&pme->atc[0]);
3687     }
3688
3689     pme->lb_buf1       = NULL;
3690     pme->lb_buf2       = NULL;
3691     pme->lb_buf_nalloc = 0;
3692
3693     {
3694         int thread;
3695
3696         /* Use fft5d, order after FFT is y major, z, x minor */
3697
3698         snew(pme->work, pme->nthread);
3699         for (thread = 0; thread < pme->nthread; thread++)
3700         {
3701             realloc_work(&pme->work[thread], pme->nkx);
3702         }
3703     }
3704
3705     *pmedata = pme;
3706
3707     return 0;
3708 }
3709
3710 static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
3711 {
3712     int d, t;
3713
3714     for (d = 0; d < DIM; d++)
3715     {
3716         if (new->grid.n[d] > old->grid.n[d])
3717         {
3718             return;
3719         }
3720     }
3721
3722     sfree_aligned(new->grid.grid);
3723     new->grid.grid = old->grid.grid;
3724
3725     if (new->grid_th != NULL && new->nthread == old->nthread)
3726     {
3727         sfree_aligned(new->grid_all);
3728         for (t = 0; t < new->nthread; t++)
3729         {
3730             new->grid_th[t].grid = old->grid_th[t].grid;
3731         }
3732     }
3733 }
3734
3735 int gmx_pme_reinit(gmx_pme_t *         pmedata,
3736                    t_commrec *         cr,
3737                    gmx_pme_t           pme_src,
3738                    const t_inputrec *  ir,
3739                    ivec                grid_size)
3740 {
3741     t_inputrec irc;
3742     int homenr;
3743     int ret;
3744
3745     irc     = *ir;
3746     irc.nkx = grid_size[XX];
3747     irc.nky = grid_size[YY];
3748     irc.nkz = grid_size[ZZ];
3749
3750     if (pme_src->nnodes == 1)
3751     {
3752         homenr = pme_src->atc[0].n;
3753     }
3754     else
3755     {
3756         homenr = -1;
3757     }
3758
3759     ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
3760                        &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, pme_src->nthread);
3761
3762     if (ret == 0)
3763     {
3764         /* We can easily reuse the allocated pme grids in pme_src */
3765         reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
3766         /* We would like to reuse the fft grids, but that's harder */
3767     }
3768
3769     return ret;
3770 }
3771
3772
3773 static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids,
3774                             int grid_index, int thread, real *fftgrid)
3775 {
3776     ivec local_fft_ndata, local_fft_offset, local_fft_size;
3777     int  fft_my, fft_mz;
3778     int  nsx, nsy, nsz;
3779     ivec nf;
3780     int  offx, offy, offz, x, y, z, i0, i0t;
3781     int  d;
3782     pmegrid_t *pmegrid;
3783     real *grid_th;
3784
3785     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
3786                                    local_fft_ndata,
3787                                    local_fft_offset,
3788                                    local_fft_size);
3789     fft_my = local_fft_size[YY];
3790     fft_mz = local_fft_size[ZZ];
3791
3792     pmegrid = &pmegrids->grid_th[thread];
3793
3794     nsx = pmegrid->s[XX];
3795     nsy = pmegrid->s[YY];
3796     nsz = pmegrid->s[ZZ];
3797
3798     for (d = 0; d < DIM; d++)
3799     {
3800         nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3801                     local_fft_ndata[d] - pmegrid->offset[d]);
3802     }
3803
3804     offx = pmegrid->offset[XX];
3805     offy = pmegrid->offset[YY];
3806     offz = pmegrid->offset[ZZ];
3807
3808     /* Directly copy the non-overlapping parts of the local grids.
3809      * This also initializes the full grid.
3810      */
3811     grid_th = pmegrid->grid;
3812     for (x = 0; x < nf[XX]; x++)
3813     {
3814         for (y = 0; y < nf[YY]; y++)
3815         {
3816             i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3817             i0t = (x*nsy + y)*nsz;
3818             for (z = 0; z < nf[ZZ]; z++)
3819             {
3820                 fftgrid[i0+z] = grid_th[i0t+z];
3821             }
3822         }
3823     }
3824 }
3825
3826 static void
3827 reduce_threadgrid_overlap(gmx_pme_t pme,
3828                           const pmegrids_t *pmegrids, int thread,
3829                           real *fftgrid, real *commbuf_x, real *commbuf_y,
3830                           int grid_index)
3831 {
3832     ivec local_fft_ndata, local_fft_offset, local_fft_size;
3833     int  fft_nx, fft_ny, fft_nz;
3834     int  fft_my, fft_mz;
3835     int  buf_my = -1;
3836     int  nsx, nsy, nsz;
3837     ivec ne;
3838     int  offx, offy, offz, x, y, z, i0, i0t;
3839     int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
3840     gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
3841     gmx_bool bCommX, bCommY;
3842     int  d;
3843     int  thread_f;
3844     const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
3845     const real *grid_th;
3846     real *commbuf = NULL;
3847
3848     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
3849                                    local_fft_ndata,
3850                                    local_fft_offset,
3851                                    local_fft_size);
3852     fft_nx = local_fft_ndata[XX];
3853     fft_ny = local_fft_ndata[YY];
3854     fft_nz = local_fft_ndata[ZZ];
3855
3856     fft_my = local_fft_size[YY];
3857     fft_mz = local_fft_size[ZZ];
3858
3859     /* This routine is called when all thread have finished spreading.
3860      * Here each thread sums grid contributions calculated by other threads
3861      * to the thread local grid volume.
3862      * To minimize the number of grid copying operations,
3863      * this routines sums immediately from the pmegrid to the fftgrid.
3864      */
3865
3866     /* Determine which part of the full node grid we should operate on,
3867      * this is our thread local part of the full grid.
3868      */
3869     pmegrid = &pmegrids->grid_th[thread];
3870
3871     for (d = 0; d < DIM; d++)
3872     {
3873         ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3874                     local_fft_ndata[d]);
3875     }
3876
3877     offx = pmegrid->offset[XX];
3878     offy = pmegrid->offset[YY];
3879     offz = pmegrid->offset[ZZ];
3880
3881
3882     bClearBufX  = TRUE;
3883     bClearBufY  = TRUE;
3884     bClearBufXY = TRUE;
3885
3886     /* Now loop over all the thread data blocks that contribute
3887      * to the grid region we (our thread) are operating on.
3888      */
3889     /* Note that fft_nx/y is equal to the number of grid points
3890      * between the first point of our node grid and the one of the next node.
3891      */
3892     for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
3893     {
3894         fx     = pmegrid->ci[XX] + sx;
3895         ox     = 0;
3896         bCommX = FALSE;
3897         if (fx < 0)
3898         {
3899             fx    += pmegrids->nc[XX];
3900             ox    -= fft_nx;
3901             bCommX = (pme->nnodes_major > 1);
3902         }
3903         pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
3904         ox       += pmegrid_g->offset[XX];
3905         /* Determine the end of our part of the source grid */
3906         tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
3907
3908         for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
3909         {
3910             fy     = pmegrid->ci[YY] + sy;
3911             oy     = 0;
3912             bCommY = FALSE;
3913             if (fy < 0)
3914             {
3915                 fy    += pmegrids->nc[YY];
3916                 oy    -= fft_ny;
3917                 bCommY = (pme->nnodes_minor > 1);
3918             }
3919             pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
3920             oy       += pmegrid_g->offset[YY];
3921             /* Determine the end of our part of the source grid */
3922             ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
3923
3924             for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
3925             {
3926                 fz = pmegrid->ci[ZZ] + sz;
3927                 oz = 0;
3928                 if (fz < 0)
3929                 {
3930                     fz += pmegrids->nc[ZZ];
3931                     oz -= fft_nz;
3932                 }
3933                 pmegrid_g = &pmegrids->grid_th[fz];
3934                 oz       += pmegrid_g->offset[ZZ];
3935                 tz1       = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
3936
3937                 if (sx == 0 && sy == 0 && sz == 0)
3938                 {
3939                     /* We have already added our local contribution
3940                      * before calling this routine, so skip it here.
3941                      */
3942                     continue;
3943                 }
3944
3945                 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
3946
3947                 pmegrid_f = &pmegrids->grid_th[thread_f];
3948
3949                 grid_th = pmegrid_f->grid;
3950
3951                 nsx = pmegrid_f->s[XX];
3952                 nsy = pmegrid_f->s[YY];
3953                 nsz = pmegrid_f->s[ZZ];
3954
3955 #ifdef DEBUG_PME_REDUCE
3956                 printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3957                        pme->nodeid, thread, thread_f,
3958                        pme->pmegrid_start_ix,
3959                        pme->pmegrid_start_iy,
3960                        pme->pmegrid_start_iz,
3961                        sx, sy, sz,
3962                        offx-ox, tx1-ox, offx, tx1,
3963                        offy-oy, ty1-oy, offy, ty1,
3964                        offz-oz, tz1-oz, offz, tz1);
3965 #endif
3966
3967                 if (!(bCommX || bCommY))
3968                 {
3969                     /* Copy from the thread local grid to the node grid */
3970                     for (x = offx; x < tx1; x++)
3971                     {
3972                         for (y = offy; y < ty1; y++)
3973                         {
3974                             i0  = (x*fft_my + y)*fft_mz;
3975                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3976                             for (z = offz; z < tz1; z++)
3977                             {
3978                                 fftgrid[i0+z] += grid_th[i0t+z];
3979                             }
3980                         }
3981                     }
3982                 }
3983                 else
3984                 {
3985                     /* The order of this conditional decides
3986                      * where the corner volume gets stored with x+y decomp.
3987                      */
3988                     if (bCommY)
3989                     {
3990                         commbuf = commbuf_y;
3991                         /* The y-size of the communication buffer is order-1 */
3992                         buf_my  = pmegrid->order - 1;
3993                         if (bCommX)
3994                         {
3995                             /* We index commbuf modulo the local grid size */
3996                             commbuf += buf_my*fft_nx*fft_nz;
3997
3998                             bClearBuf   = bClearBufXY;
3999                             bClearBufXY = FALSE;
4000                         }
4001                         else
4002                         {
4003                             bClearBuf  = bClearBufY;
4004                             bClearBufY = FALSE;
4005                         }
4006                     }
4007                     else
4008                     {
4009                         commbuf    = commbuf_x;
4010                         buf_my     = fft_ny;
4011                         bClearBuf  = bClearBufX;
4012                         bClearBufX = FALSE;
4013                     }
4014
4015                     /* Copy to the communication buffer */
4016                     for (x = offx; x < tx1; x++)
4017                     {
4018                         for (y = offy; y < ty1; y++)
4019                         {
4020                             i0  = (x*buf_my + y)*fft_nz;
4021                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
4022
4023                             if (bClearBuf)
4024                             {
4025                                 /* First access of commbuf, initialize it */
4026                                 for (z = offz; z < tz1; z++)
4027                                 {
4028                                     commbuf[i0+z]  = grid_th[i0t+z];
4029                                 }
4030                             }
4031                             else
4032                             {
4033                                 for (z = offz; z < tz1; z++)
4034                                 {
4035                                     commbuf[i0+z] += grid_th[i0t+z];
4036                                 }
4037                             }
4038                         }
4039                     }
4040                 }
4041             }
4042         }
4043     }
4044 }
4045
4046
4047 static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
4048 {
4049     ivec local_fft_ndata, local_fft_offset, local_fft_size;
4050     pme_overlap_t *overlap;
4051     int  send_index0, send_nindex;
4052     int  recv_nindex;
4053 #ifdef GMX_MPI
4054     MPI_Status stat;
4055 #endif
4056     int  send_size_y, recv_size_y;
4057     int  ipulse, send_id, recv_id, datasize, gridsize, size_yx;
4058     real *sendptr, *recvptr;
4059     int  x, y, z, indg, indb;
4060
4061     /* Note that this routine is only used for forward communication.
4062      * Since the force gathering, unlike the coefficient spreading,
4063      * can be trivially parallelized over the particles,
4064      * the backwards process is much simpler and can use the "old"
4065      * communication setup.
4066      */
4067
4068     gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
4069                                    local_fft_ndata,
4070                                    local_fft_offset,
4071                                    local_fft_size);
4072
4073     if (pme->nnodes_minor > 1)
4074     {
4075         /* Major dimension */
4076         overlap = &pme->overlap[1];
4077
4078         if (pme->nnodes_major > 1)
4079         {
4080             size_yx = pme->overlap[0].comm_data[0].send_nindex;
4081         }
4082         else
4083         {
4084             size_yx = 0;
4085         }
4086         datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
4087
4088         send_size_y = overlap->send_size;
4089
4090         for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
4091         {
4092             send_id       = overlap->send_id[ipulse];
4093             recv_id       = overlap->recv_id[ipulse];
4094             send_index0   =
4095                 overlap->comm_data[ipulse].send_index0 -
4096                 overlap->comm_data[0].send_index0;
4097             send_nindex   = overlap->comm_data[ipulse].send_nindex;
4098             /* We don't use recv_index0, as we always receive starting at 0 */
4099             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
4100             recv_size_y   = overlap->comm_data[ipulse].recv_size;
4101
4102             sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
4103             recvptr = overlap->recvbuf;
4104
4105 #ifdef GMX_MPI
4106             MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
4107                          send_id, ipulse,
4108                          recvptr, recv_size_y*datasize, GMX_MPI_REAL,
4109                          recv_id, ipulse,
4110                          overlap->mpi_comm, &stat);
4111 #endif
4112
4113             for (x = 0; x < local_fft_ndata[XX]; x++)
4114             {
4115                 for (y = 0; y < recv_nindex; y++)
4116                 {
4117                     indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
4118                     indb = (x*recv_size_y        + y)*local_fft_ndata[ZZ];
4119                     for (z = 0; z < local_fft_ndata[ZZ]; z++)
4120                     {
4121                         fftgrid[indg+z] += recvptr[indb+z];
4122                     }
4123                 }
4124             }
4125
4126             if (pme->nnodes_major > 1)
4127             {
4128                 /* Copy from the received buffer to the send buffer for dim 0 */
4129                 sendptr = pme->overlap[0].sendbuf;
4130                 for (x = 0; x < size_yx; x++)
4131                 {
4132                     for (y = 0; y < recv_nindex; y++)
4133                     {
4134                         indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
4135                         indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
4136                         for (z = 0; z < local_fft_ndata[ZZ]; z++)
4137                         {
4138                             sendptr[indg+z] += recvptr[indb+z];
4139                         }
4140                     }
4141                 }
4142             }
4143         }
4144     }
4145
4146     /* We only support a single pulse here.
4147      * This is not a severe limitation, as this code is only used
4148      * with OpenMP and with OpenMP the (PME) domains can be larger.
4149      */
4150     if (pme->nnodes_major > 1)
4151     {
4152         /* Major dimension */
4153         overlap = &pme->overlap[0];
4154
4155         datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
4156         gridsize = local_fft_size[YY] *local_fft_size[ZZ];
4157
4158         ipulse = 0;
4159
4160         send_id       = overlap->send_id[ipulse];
4161         recv_id       = overlap->recv_id[ipulse];
4162         send_nindex   = overlap->comm_data[ipulse].send_nindex;
4163         /* We don't use recv_index0, as we always receive starting at 0 */
4164         recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
4165
4166         sendptr = overlap->sendbuf;
4167         recvptr = overlap->recvbuf;
4168
4169         if (debug != NULL)
4170         {
4171             fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
4172                     send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
4173         }
4174
4175 #ifdef GMX_MPI
4176         MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
4177                      send_id, ipulse,
4178                      recvptr, recv_nindex*datasize, GMX_MPI_REAL,
4179                      recv_id, ipulse,
4180                      overlap->mpi_comm, &stat);
4181 #endif
4182
4183         for (x = 0; x < recv_nindex; x++)
4184         {
4185             for (y = 0; y < local_fft_ndata[YY]; y++)
4186             {
4187                 indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
4188                 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
4189                 for (z = 0; z < local_fft_ndata[ZZ]; z++)
4190                 {
4191                     fftgrid[indg+z] += recvptr[indb+z];
4192                 }
4193             }
4194         }
4195     }
4196 }
4197
4198
4199 static void spread_on_grid(gmx_pme_t pme,
4200                            pme_atomcomm_t *atc, pmegrids_t *grids,
4201                            gmx_bool bCalcSplines, gmx_bool bSpread,
4202                            real *fftgrid, gmx_bool bDoSplines, int grid_index)
4203 {
4204     int nthread, thread;
4205 #ifdef PME_TIME_THREADS
4206     gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
4207     static double cs1     = 0, cs2 = 0, cs3 = 0;
4208     static double cs1a[6] = {0, 0, 0, 0, 0, 0};
4209     static int cnt        = 0;
4210 #endif
4211
4212     nthread = pme->nthread;
4213     assert(nthread > 0);
4214
4215 #ifdef PME_TIME_THREADS
4216     c1 = omp_cyc_start();
4217 #endif
4218     if (bCalcSplines)
4219     {
4220 #pragma omp parallel for num_threads(nthread) schedule(static)
4221         for (thread = 0; thread < nthread; thread++)
4222         {
4223             int start, end;
4224
4225             start = atc->n* thread   /nthread;
4226             end   = atc->n*(thread+1)/nthread;
4227
4228             /* Compute fftgrid index for all atoms,
4229              * with help of some extra variables.
4230              */
4231             calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
4232         }
4233     }
4234 #ifdef PME_TIME_THREADS
4235     c1   = omp_cyc_end(c1);
4236     cs1 += (double)c1;
4237 #endif
4238
4239 #ifdef PME_TIME_THREADS
4240     c2 = omp_cyc_start();
4241 #endif
4242 #pragma omp parallel for num_threads(nthread) schedule(static)
4243     for (thread = 0; thread < nthread; thread++)
4244     {
4245         splinedata_t *spline;
4246         pmegrid_t *grid = NULL;
4247
4248         /* make local bsplines  */
4249         if (grids == NULL || !pme->bUseThreads)
4250         {
4251             spline = &atc->spline[0];
4252
4253             spline->n = atc->n;
4254
4255             if (bSpread)
4256             {
4257                 grid = &grids->grid;
4258             }
4259         }
4260         else
4261         {
4262             spline = &atc->spline[thread];
4263
4264             if (grids->nthread == 1)
4265             {
4266                 /* One thread, we operate on all coefficients */
4267                 spline->n = atc->n;
4268             }
4269             else
4270             {
4271                 /* Get the indices our thread should operate on */
4272                 make_thread_local_ind(atc, thread, spline);
4273             }
4274
4275             grid = &grids->grid_th[thread];
4276         }
4277
4278         if (bCalcSplines)
4279         {
4280             make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
4281                           atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
4282         }
4283
4284         if (bSpread)
4285         {
4286             /* put local atoms on grid. */
4287 #ifdef PME_TIME_SPREAD
4288             ct1a = omp_cyc_start();
4289 #endif
4290             spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
4291
4292             if (pme->bUseThreads)
4293             {
4294                 copy_local_grid(pme, grids, grid_index, thread, fftgrid);
4295             }
4296 #ifdef PME_TIME_SPREAD
4297             ct1a          = omp_cyc_end(ct1a);
4298             cs1a[thread] += (double)ct1a;
4299 #endif
4300         }
4301     }
4302 #ifdef PME_TIME_THREADS
4303     c2   = omp_cyc_end(c2);
4304     cs2 += (double)c2;
4305 #endif
4306
4307     if (bSpread && pme->bUseThreads)
4308     {
4309 #ifdef PME_TIME_THREADS
4310         c3 = omp_cyc_start();
4311 #endif
4312 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
4313         for (thread = 0; thread < grids->nthread; thread++)
4314         {
4315             reduce_threadgrid_overlap(pme, grids, thread,
4316                                       fftgrid,
4317                                       pme->overlap[0].sendbuf,
4318                                       pme->overlap[1].sendbuf,
4319                                       grid_index);
4320         }
4321 #ifdef PME_TIME_THREADS
4322         c3   = omp_cyc_end(c3);
4323         cs3 += (double)c3;
4324 #endif
4325
4326         if (pme->nnodes > 1)
4327         {
4328             /* Communicate the overlapping part of the fftgrid.
4329              * For this communication call we need to check pme->bUseThreads
4330              * to have all ranks communicate here, regardless of pme->nthread.
4331              */
4332             sum_fftgrid_dd(pme, fftgrid, grid_index);
4333         }
4334     }
4335
4336 #ifdef PME_TIME_THREADS
4337     cnt++;
4338     if (cnt % 20 == 0)
4339     {
4340         printf("idx %.2f spread %.2f red %.2f",
4341                cs1*1e-9, cs2*1e-9, cs3*1e-9);
4342 #ifdef PME_TIME_SPREAD
4343         for (thread = 0; thread < nthread; thread++)
4344         {
4345             printf(" %.2f", cs1a[thread]*1e-9);
4346         }
4347 #endif
4348         printf("\n");
4349     }
4350 #endif
4351 }
4352
4353
4354 static void dump_grid(FILE *fp,
4355                       int sx, int sy, int sz, int nx, int ny, int nz,
4356                       int my, int mz, const real *g)
4357 {
4358     int x, y, z;
4359
4360     for (x = 0; x < nx; x++)
4361     {
4362         for (y = 0; y < ny; y++)
4363         {
4364             for (z = 0; z < nz; z++)
4365             {
4366                 fprintf(fp, "%2d %2d %2d %6.3f\n",
4367                         sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
4368             }
4369         }
4370     }
4371 }
4372
4373 static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
4374 {
4375     ivec local_fft_ndata, local_fft_offset, local_fft_size;
4376
4377     gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
4378                                    local_fft_ndata,
4379                                    local_fft_offset,
4380                                    local_fft_size);
4381
4382     dump_grid(stderr,
4383               pme->pmegrid_start_ix,
4384               pme->pmegrid_start_iy,
4385               pme->pmegrid_start_iz,
4386               pme->pmegrid_nx-pme->pme_order+1,
4387               pme->pmegrid_ny-pme->pme_order+1,
4388               pme->pmegrid_nz-pme->pme_order+1,
4389               local_fft_size[YY],
4390               local_fft_size[ZZ],
4391               fftgrid);
4392 }
4393
4394
4395 void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
4396 {
4397     pme_atomcomm_t *atc;
4398     pmegrids_t *grid;
4399
4400     if (pme->nnodes > 1)
4401     {
4402         gmx_incons("gmx_pme_calc_energy called in parallel");
4403     }
4404     if (pme->bFEP_q > 1)
4405     {
4406         gmx_incons("gmx_pme_calc_energy with free energy");
4407     }
4408
4409     atc            = &pme->atc_energy;
4410     atc->nthread   = 1;
4411     if (atc->spline == NULL)
4412     {
4413         snew(atc->spline, atc->nthread);
4414     }
4415     atc->nslab     = 1;
4416     atc->bSpread   = TRUE;
4417     atc->pme_order = pme->pme_order;
4418     atc->n         = n;
4419     pme_realloc_atomcomm_things(atc);
4420     atc->x           = x;
4421     atc->coefficient = q;
4422
4423     /* We only use the A-charges grid */
4424     grid = &pme->pmegrid[PME_GRID_QA];
4425
4426     /* Only calculate the spline coefficients, don't actually spread */
4427     spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
4428
4429     *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
4430 }
4431
4432
4433 static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
4434                                    gmx_walltime_accounting_t walltime_accounting,
4435                                    t_nrnb *nrnb, t_inputrec *ir,
4436                                    gmx_int64_t step)
4437 {
4438     /* Reset all the counters related to performance over the run */
4439     wallcycle_stop(wcycle, ewcRUN);
4440     wallcycle_reset_all(wcycle);
4441     init_nrnb(nrnb);
4442     if (ir->nsteps >= 0)
4443     {
4444         /* ir->nsteps is not used here, but we update it for consistency */
4445         ir->nsteps -= step - ir->init_step;
4446     }
4447     ir->init_step = step;
4448     wallcycle_start(wcycle, ewcRUN);
4449     walltime_accounting_start(walltime_accounting);
4450 }
4451
4452
4453 static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
4454                                ivec grid_size,
4455                                t_commrec *cr, t_inputrec *ir,
4456                                gmx_pme_t *pme_ret)
4457 {
4458     int ind;
4459     gmx_pme_t pme = NULL;
4460
4461     ind = 0;
4462     while (ind < *npmedata)
4463     {
4464         pme = (*pmedata)[ind];
4465         if (pme->nkx == grid_size[XX] &&
4466             pme->nky == grid_size[YY] &&
4467             pme->nkz == grid_size[ZZ])
4468         {
4469             *pme_ret = pme;
4470
4471             return;
4472         }
4473
4474         ind++;
4475     }
4476
4477     (*npmedata)++;
4478     srenew(*pmedata, *npmedata);
4479
4480     /* Generate a new PME data structure, copying part of the old pointers */
4481     gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
4482
4483     *pme_ret = (*pmedata)[ind];
4484 }
4485
4486 int gmx_pmeonly(gmx_pme_t pme,
4487                 t_commrec *cr,    t_nrnb *mynrnb,
4488                 gmx_wallcycle_t wcycle,
4489                 gmx_walltime_accounting_t walltime_accounting,
4490                 real ewaldcoeff_q, real ewaldcoeff_lj,
4491                 t_inputrec *ir)
4492 {
4493     int npmedata;
4494     gmx_pme_t *pmedata;
4495     gmx_pme_pp_t pme_pp;
4496     int  ret;
4497     int  natoms;
4498     matrix box;
4499     rvec *x_pp      = NULL, *f_pp = NULL;
4500     real *chargeA   = NULL, *chargeB = NULL;
4501     real *c6A       = NULL, *c6B = NULL;
4502     real *sigmaA    = NULL, *sigmaB = NULL;
4503     real lambda_q   = 0;
4504     real lambda_lj  = 0;
4505     int  maxshift_x = 0, maxshift_y = 0;
4506     real energy_q, energy_lj, dvdlambda_q, dvdlambda_lj;
4507     matrix vir_q, vir_lj;
4508     float cycles;
4509     int  count;
4510     gmx_bool bEnerVir;
4511     int pme_flags;
4512     gmx_int64_t step, step_rel;
4513     ivec grid_switch;
4514
4515     /* This data will only use with PME tuning, i.e. switching PME grids */
4516     npmedata = 1;
4517     snew(pmedata, npmedata);
4518     pmedata[0] = pme;
4519
4520     pme_pp = gmx_pme_pp_init(cr);
4521
4522     init_nrnb(mynrnb);
4523
4524     count = 0;
4525     do /****** this is a quasi-loop over time steps! */
4526     {
4527         /* The reason for having a loop here is PME grid tuning/switching */
4528         do
4529         {
4530             /* Domain decomposition */
4531             ret = gmx_pme_recv_coeffs_coords(pme_pp,
4532                                              &natoms,
4533                                              &chargeA, &chargeB,
4534                                              &c6A, &c6B,
4535                                              &sigmaA, &sigmaB,
4536                                              box, &x_pp, &f_pp,
4537                                              &maxshift_x, &maxshift_y,
4538                                              &pme->bFEP_q, &pme->bFEP_lj,
4539                                              &lambda_q, &lambda_lj,
4540                                              &bEnerVir,
4541                                              &pme_flags,
4542                                              &step,
4543                                              grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj);
4544
4545             if (ret == pmerecvqxSWITCHGRID)
4546             {
4547                 /* Switch the PME grid to grid_switch */
4548                 gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
4549             }
4550
4551             if (ret == pmerecvqxRESETCOUNTERS)
4552             {
4553                 /* Reset the cycle and flop counters */
4554                 reset_pmeonly_counters(wcycle, walltime_accounting, mynrnb, ir, step);
4555             }
4556         }
4557         while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
4558
4559         if (ret == pmerecvqxFINISH)
4560         {
4561             /* We should stop: break out of the loop */
4562             break;
4563         }
4564
4565         step_rel = step - ir->init_step;
4566
4567         if (count == 0)
4568         {
4569             wallcycle_start(wcycle, ewcRUN);
4570             walltime_accounting_start(walltime_accounting);
4571         }
4572
4573         wallcycle_start(wcycle, ewcPMEMESH);
4574
4575         dvdlambda_q  = 0;
4576         dvdlambda_lj = 0;
4577         clear_mat(vir_q);
4578         clear_mat(vir_lj);
4579
4580         gmx_pme_do(pme, 0, natoms, x_pp, f_pp,
4581                    chargeA, chargeB, c6A, c6B, sigmaA, sigmaB, box,
4582                    cr, maxshift_x, maxshift_y, mynrnb, wcycle,
4583                    vir_q, ewaldcoeff_q, vir_lj, ewaldcoeff_lj,
4584                    &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
4585                    pme_flags | GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
4586
4587         cycles = wallcycle_stop(wcycle, ewcPMEMESH);
4588
4589         gmx_pme_send_force_vir_ener(pme_pp,
4590                                     f_pp, vir_q, energy_q, vir_lj, energy_lj,
4591                                     dvdlambda_q, dvdlambda_lj, cycles);
4592
4593         count++;
4594     } /***** end of quasi-loop, we stop with the break above */
4595     while (TRUE);
4596
4597     walltime_accounting_end(walltime_accounting);
4598
4599     return 0;
4600 }
4601
4602 static void
4603 calc_initial_lb_coeffs(gmx_pme_t pme, real *local_c6, real *local_sigma)
4604 {
4605     int  i;
4606
4607     for (i = 0; i < pme->atc[0].n; ++i)
4608     {
4609         real sigma4;
4610
4611         sigma4                     = local_sigma[i];
4612         sigma4                     = sigma4*sigma4;
4613         sigma4                     = sigma4*sigma4;
4614         pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
4615     }
4616 }
4617
4618 static void
4619 calc_next_lb_coeffs(gmx_pme_t pme, real *local_sigma)
4620 {
4621     int  i;
4622
4623     for (i = 0; i < pme->atc[0].n; ++i)
4624     {
4625         pme->atc[0].coefficient[i] *= local_sigma[i];
4626     }
4627 }
4628
4629 static void
4630 do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
4631                      gmx_bool bFirst, rvec x[], real *data)
4632 {
4633     int      d;
4634     pme_atomcomm_t *atc;
4635     atc = &pme->atc[0];
4636
4637     for (d = pme->ndecompdim - 1; d >= 0; d--)
4638     {
4639         int             n_d;
4640         rvec           *x_d;
4641         real           *param_d;
4642
4643         if (d == pme->ndecompdim - 1)
4644         {
4645             n_d     = homenr;
4646             x_d     = x + start;
4647             param_d = data;
4648         }
4649         else
4650         {
4651             n_d     = pme->atc[d + 1].n;
4652             x_d     = atc->x;
4653             param_d = atc->coefficient;
4654         }
4655         atc      = &pme->atc[d];
4656         atc->npd = n_d;
4657         if (atc->npd > atc->pd_nalloc)
4658         {
4659             atc->pd_nalloc = over_alloc_dd(atc->npd);
4660             srenew(atc->pd, atc->pd_nalloc);
4661         }
4662         pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
4663         where();
4664         /* Redistribute x (only once) and qA/c6A or qB/c6B */
4665         if (DOMAINDECOMP(cr))
4666         {
4667             dd_pmeredist_pos_coeffs(pme, n_d, bFirst, x_d, param_d, atc);
4668         }
4669     }
4670 }
4671
4672 int gmx_pme_do(gmx_pme_t pme,
4673                int start,       int homenr,
4674                rvec x[],        rvec f[],
4675                real *chargeA,   real *chargeB,
4676                real *c6A,       real *c6B,
4677                real *sigmaA,    real *sigmaB,
4678                matrix box, t_commrec *cr,
4679                int  maxshift_x, int maxshift_y,
4680                t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
4681                matrix vir_q,      real ewaldcoeff_q,
4682                matrix vir_lj,   real ewaldcoeff_lj,
4683                real *energy_q,  real *energy_lj,
4684                real lambda_q, real lambda_lj,
4685                real *dvdlambda_q, real *dvdlambda_lj,
4686                int flags)
4687 {
4688     int     d, i, j, k, ntot, npme, grid_index, max_grid_index;
4689     int     nx, ny, nz;
4690     int     n_d, local_ny;
4691     pme_atomcomm_t *atc = NULL;
4692     pmegrids_t *pmegrid = NULL;
4693     real    *grid       = NULL;
4694     real    *ptr;
4695     rvec    *x_d, *f_d;
4696     real    *coefficient = NULL;
4697     real    energy_AB[4];
4698     matrix  vir_AB[4];
4699     real    scale, lambda;
4700     gmx_bool bClearF;
4701     gmx_parallel_3dfft_t pfft_setup;
4702     real *  fftgrid;
4703     t_complex * cfftgrid;
4704     int     thread;
4705     gmx_bool bFirst, bDoSplines;
4706     int fep_state;
4707     int fep_states_lj           = pme->bFEP_lj ? 2 : 1;
4708     const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4709     const gmx_bool bCalcF       = flags & GMX_PME_CALC_F;
4710
4711     assert(pme->nnodes > 0);
4712     assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4713
4714     if (pme->nnodes > 1)
4715     {
4716         atc      = &pme->atc[0];
4717         atc->npd = homenr;
4718         if (atc->npd > atc->pd_nalloc)
4719         {
4720             atc->pd_nalloc = over_alloc_dd(atc->npd);
4721             srenew(atc->pd, atc->pd_nalloc);
4722         }
4723         for (d = pme->ndecompdim-1; d >= 0; d--)
4724         {
4725             atc           = &pme->atc[d];
4726             atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4727         }
4728     }
4729     else
4730     {
4731         atc = &pme->atc[0];
4732         /* This could be necessary for TPI */
4733         pme->atc[0].n = homenr;
4734         if (DOMAINDECOMP(cr))
4735         {
4736             pme_realloc_atomcomm_things(atc);
4737         }
4738         atc->x = x;
4739         atc->f = f;
4740     }
4741
4742     m_inv_ur0(box, pme->recipbox);
4743     bFirst = TRUE;
4744
4745     /* For simplicity, we construct the splines for all particles if
4746      * more than one PME calculations is needed. Some optimization
4747      * could be done by keeping track of which atoms have splines
4748      * constructed, and construct new splines on each pass for atoms
4749      * that don't yet have them.
4750      */
4751
4752     bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
4753
4754     /* We need a maximum of four separate PME calculations:
4755      * grid_index=0: Coulomb PME with charges from state A
4756      * grid_index=1: Coulomb PME with charges from state B
4757      * grid_index=2: LJ PME with C6 from state A
4758      * grid_index=3: LJ PME with C6 from state B
4759      * For Lorentz-Berthelot combination rules, a separate loop is used to
4760      * calculate all the terms
4761      */
4762
4763     /* If we are doing LJ-PME with LB, we only do Q here */
4764     max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
4765
4766     for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
4767     {
4768         /* Check if we should do calculations at this grid_index
4769          * If grid_index is odd we should be doing FEP
4770          * If grid_index < 2 we should be doing electrostatic PME
4771          * If grid_index >= 2 we should be doing LJ-PME
4772          */
4773         if ((grid_index <  DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
4774                                     (grid_index == 1 && !pme->bFEP_q))) ||
4775             (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
4776                                     (grid_index == 3 && !pme->bFEP_lj))))
4777         {
4778             continue;
4779         }
4780         /* Unpack structure */
4781         pmegrid    = &pme->pmegrid[grid_index];
4782         fftgrid    = pme->fftgrid[grid_index];
4783         cfftgrid   = pme->cfftgrid[grid_index];
4784         pfft_setup = pme->pfft_setup[grid_index];
4785         switch (grid_index)
4786         {
4787             case 0: coefficient = chargeA + start; break;
4788             case 1: coefficient = chargeB + start; break;
4789             case 2: coefficient = c6A + start; break;
4790             case 3: coefficient = c6B + start; break;
4791         }
4792
4793         grid = pmegrid->grid.grid;
4794
4795         if (debug)
4796         {
4797             fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
4798                     cr->nnodes, cr->nodeid);
4799             fprintf(debug, "Grid = %p\n", (void*)grid);
4800             if (grid == NULL)
4801             {
4802                 gmx_fatal(FARGS, "No grid!");
4803             }
4804         }
4805         where();
4806
4807         if (pme->nnodes == 1)
4808         {
4809             atc->coefficient = coefficient;
4810         }
4811         else
4812         {
4813             wallcycle_start(wcycle, ewcPME_REDISTXF);
4814             do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
4815             where();
4816
4817             wallcycle_stop(wcycle, ewcPME_REDISTXF);
4818         }
4819
4820         if (debug)
4821         {
4822             fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
4823                     cr->nodeid, atc->n);
4824         }
4825
4826         if (flags & GMX_PME_SPREAD)
4827         {
4828             wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4829
4830             /* Spread the coefficients on a grid */
4831             spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
4832
4833             if (bFirst)
4834             {
4835                 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
4836             }
4837             inc_nrnb(nrnb, eNR_SPREADBSP,
4838                      pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4839
4840             if (!pme->bUseThreads)
4841             {
4842                 wrap_periodic_pmegrid(pme, grid);
4843
4844                 /* sum contributions to local grid from other nodes */
4845 #ifdef GMX_MPI
4846                 if (pme->nnodes > 1)
4847                 {
4848                     gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
4849                     where();
4850                 }
4851 #endif
4852
4853                 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
4854             }
4855
4856             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4857
4858             /*
4859                dump_local_fftgrid(pme,fftgrid);
4860                exit(0);
4861              */
4862         }
4863
4864         /* Here we start a large thread parallel region */
4865 #pragma omp parallel num_threads(pme->nthread) private(thread)
4866         {
4867             thread = gmx_omp_get_thread_num();
4868             if (flags & GMX_PME_SOLVE)
4869             {
4870                 int loop_count;
4871
4872                 /* do 3d-fft */
4873                 if (thread == 0)
4874                 {
4875                     wallcycle_start(wcycle, ewcPME_FFT);
4876                 }
4877                 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
4878                                            thread, wcycle);
4879                 if (thread == 0)
4880                 {
4881                     wallcycle_stop(wcycle, ewcPME_FFT);
4882                 }
4883                 where();
4884
4885                 /* solve in k-space for our local cells */
4886                 if (thread == 0)
4887                 {
4888                     wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
4889                 }
4890                 if (grid_index < DO_Q)
4891                 {
4892                     loop_count =
4893                         solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
4894                                       box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4895                                       bCalcEnerVir,
4896                                       pme->nthread, thread);
4897                 }
4898                 else
4899                 {
4900                     loop_count =
4901                         solve_pme_lj_yzx(pme, &cfftgrid, FALSE, ewaldcoeff_lj,
4902                                          box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4903                                          bCalcEnerVir,
4904                                          pme->nthread, thread);
4905                 }
4906
4907                 if (thread == 0)
4908                 {
4909                     wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
4910                     where();
4911                     inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
4912                 }
4913             }
4914
4915             if (bCalcF)
4916             {
4917                 /* do 3d-invfft */
4918                 if (thread == 0)
4919                 {
4920                     where();
4921                     wallcycle_start(wcycle, ewcPME_FFT);
4922                 }
4923                 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
4924                                            thread, wcycle);
4925                 if (thread == 0)
4926                 {
4927                     wallcycle_stop(wcycle, ewcPME_FFT);
4928
4929                     where();
4930
4931                     if (pme->nodeid == 0)
4932                     {
4933                         ntot  = pme->nkx*pme->nky*pme->nkz;
4934                         npme  = ntot*log((real)ntot)/log(2.0);
4935                         inc_nrnb(nrnb, eNR_FFT, 2*npme);
4936                     }
4937
4938                     /* Note: this wallcycle region is closed below
4939                        outside an OpenMP region, so take care if
4940                        refactoring code here. */
4941                     wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4942                 }
4943
4944                 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
4945             }
4946         }
4947         /* End of thread parallel section.
4948          * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4949          */
4950
4951         if (bCalcF)
4952         {
4953             /* distribute local grid to all nodes */
4954 #ifdef GMX_MPI
4955             if (pme->nnodes > 1)
4956             {
4957                 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
4958             }
4959 #endif
4960             where();
4961
4962             unwrap_periodic_pmegrid(pme, grid);
4963
4964             /* interpolate forces for our local atoms */
4965
4966             where();
4967
4968             /* If we are running without parallelization,
4969              * atc->f is the actual force array, not a buffer,
4970              * therefore we should not clear it.
4971              */
4972             lambda  = grid_index < DO_Q ? lambda_q : lambda_lj;
4973             bClearF = (bFirst && PAR(cr));
4974 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4975             for (thread = 0; thread < pme->nthread; thread++)
4976             {
4977                 gather_f_bsplines(pme, grid, bClearF, atc,
4978                                   &atc->spline[thread],
4979                                   pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
4980             }
4981
4982             where();
4983
4984             inc_nrnb(nrnb, eNR_GATHERFBSP,
4985                      pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4986             /* Note: this wallcycle region is opened above inside an OpenMP
4987                region, so take care if refactoring code here. */
4988             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4989         }
4990
4991         if (bCalcEnerVir)
4992         {
4993             /* This should only be called on the master thread
4994              * and after the threads have synchronized.
4995              */
4996             if (grid_index < 2)
4997             {
4998                 get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
4999             }
5000             else
5001             {
5002                 get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
5003             }
5004         }
5005         bFirst = FALSE;
5006     } /* of grid_index-loop */
5007
5008     /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
5009      * seven terms. */
5010
5011     if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
5012     {
5013         /* Loop over A- and B-state if we are doing FEP */
5014         for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
5015         {
5016             real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
5017             if (pme->nnodes == 1)
5018             {
5019                 if (pme->lb_buf1 == NULL)
5020                 {
5021                     pme->lb_buf_nalloc = pme->atc[0].n;
5022                     snew(pme->lb_buf1, pme->lb_buf_nalloc);
5023                 }
5024                 pme->atc[0].coefficient = pme->lb_buf1;
5025                 switch (fep_state)
5026                 {
5027                     case 0:
5028                         local_c6      = c6A;
5029                         local_sigma   = sigmaA;
5030                         break;
5031                     case 1:
5032                         local_c6      = c6B;
5033                         local_sigma   = sigmaB;
5034                         break;
5035                     default:
5036                         gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
5037                 }
5038             }
5039             else
5040             {
5041                 atc = &pme->atc[0];
5042                 switch (fep_state)
5043                 {
5044                     case 0:
5045                         RedistC6      = c6A;
5046                         RedistSigma   = sigmaA;
5047                         break;
5048                     case 1:
5049                         RedistC6      = c6B;
5050                         RedistSigma   = sigmaB;
5051                         break;
5052                     default:
5053                         gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
5054                 }
5055                 wallcycle_start(wcycle, ewcPME_REDISTXF);
5056
5057                 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
5058                 if (pme->lb_buf_nalloc < atc->n)
5059                 {
5060                     pme->lb_buf_nalloc = atc->nalloc;
5061                     srenew(pme->lb_buf1, pme->lb_buf_nalloc);
5062                     srenew(pme->lb_buf2, pme->lb_buf_nalloc);
5063                 }
5064                 local_c6 = pme->lb_buf1;
5065                 for (i = 0; i < atc->n; ++i)
5066                 {
5067                     local_c6[i] = atc->coefficient[i];
5068                 }
5069                 where();
5070
5071                 do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
5072                 local_sigma = pme->lb_buf2;
5073                 for (i = 0; i < atc->n; ++i)
5074                 {
5075                     local_sigma[i] = atc->coefficient[i];
5076                 }
5077                 where();
5078
5079                 wallcycle_stop(wcycle, ewcPME_REDISTXF);
5080             }
5081             calc_initial_lb_coeffs(pme, local_c6, local_sigma);
5082
5083             /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
5084             for (grid_index = 2; grid_index < 9; ++grid_index)
5085             {
5086                 /* Unpack structure */
5087                 pmegrid    = &pme->pmegrid[grid_index];
5088                 fftgrid    = pme->fftgrid[grid_index];
5089                 cfftgrid   = pme->cfftgrid[grid_index];
5090                 pfft_setup = pme->pfft_setup[grid_index];
5091                 calc_next_lb_coeffs(pme, local_sigma);
5092                 grid = pmegrid->grid.grid;
5093                 where();
5094
5095                 if (flags & GMX_PME_SPREAD)
5096                 {
5097                     wallcycle_start(wcycle, ewcPME_SPREADGATHER);
5098                     /* Spread the c6 on a grid */
5099                     spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
5100
5101                     if (bFirst)
5102                     {
5103                         inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
5104                     }
5105
5106                     inc_nrnb(nrnb, eNR_SPREADBSP,
5107                              pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
5108                     if (pme->nthread == 1)
5109                     {
5110                         wrap_periodic_pmegrid(pme, grid);
5111                         /* sum contributions to local grid from other nodes */
5112 #ifdef GMX_MPI
5113                         if (pme->nnodes > 1)
5114                         {
5115                             gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
5116                             where();
5117                         }
5118 #endif
5119                         copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
5120                     }
5121                     wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
5122                 }
5123                 /*Here we start a large thread parallel region*/
5124 #pragma omp parallel num_threads(pme->nthread) private(thread)
5125                 {
5126                     thread = gmx_omp_get_thread_num();
5127                     if (flags & GMX_PME_SOLVE)
5128                     {
5129                         /* do 3d-fft */
5130                         if (thread == 0)
5131                         {
5132                             wallcycle_start(wcycle, ewcPME_FFT);
5133                         }
5134
5135                         gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
5136                                                    thread, wcycle);
5137                         if (thread == 0)
5138                         {
5139                             wallcycle_stop(wcycle, ewcPME_FFT);
5140                         }
5141                         where();
5142                     }
5143                 }
5144                 bFirst = FALSE;
5145             }
5146             if (flags & GMX_PME_SOLVE)
5147             {
5148                 /* solve in k-space for our local cells */
5149 #pragma omp parallel num_threads(pme->nthread) private(thread)
5150                 {
5151                     int loop_count;
5152                     thread = gmx_omp_get_thread_num();
5153                     if (thread == 0)
5154                     {
5155                         wallcycle_start(wcycle, ewcLJPME);
5156                     }
5157
5158                     loop_count =
5159                         solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
5160                                          box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
5161                                          bCalcEnerVir,
5162                                          pme->nthread, thread);
5163                     if (thread == 0)
5164                     {
5165                         wallcycle_stop(wcycle, ewcLJPME);
5166                         where();
5167                         inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
5168                     }
5169                 }
5170             }
5171
5172             if (bCalcEnerVir)
5173             {
5174                 /* This should only be called on the master thread and
5175                  * after the threads have synchronized.
5176                  */
5177                 get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
5178             }
5179
5180             if (bCalcF)
5181             {
5182                 bFirst = !(flags & GMX_PME_DO_COULOMB);
5183                 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
5184                 for (grid_index = 8; grid_index >= 2; --grid_index)
5185                 {
5186                     /* Unpack structure */
5187                     pmegrid    = &pme->pmegrid[grid_index];
5188                     fftgrid    = pme->fftgrid[grid_index];
5189                     cfftgrid   = pme->cfftgrid[grid_index];
5190                     pfft_setup = pme->pfft_setup[grid_index];
5191                     grid       = pmegrid->grid.grid;
5192                     calc_next_lb_coeffs(pme, local_sigma);
5193                     where();
5194 #pragma omp parallel num_threads(pme->nthread) private(thread)
5195                     {
5196                         thread = gmx_omp_get_thread_num();
5197                         /* do 3d-invfft */
5198                         if (thread == 0)
5199                         {
5200                             where();
5201                             wallcycle_start(wcycle, ewcPME_FFT);
5202                         }
5203
5204                         gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
5205                                                    thread, wcycle);
5206                         if (thread == 0)
5207                         {
5208                             wallcycle_stop(wcycle, ewcPME_FFT);
5209
5210                             where();
5211
5212                             if (pme->nodeid == 0)
5213                             {
5214                                 ntot  = pme->nkx*pme->nky*pme->nkz;
5215                                 npme  = ntot*log((real)ntot)/log(2.0);
5216                                 inc_nrnb(nrnb, eNR_FFT, 2*npme);
5217                             }
5218                             wallcycle_start(wcycle, ewcPME_SPREADGATHER);
5219                         }
5220
5221                         copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
5222
5223                     } /*#pragma omp parallel*/
5224
5225                     /* distribute local grid to all nodes */
5226 #ifdef GMX_MPI
5227                     if (pme->nnodes > 1)
5228                     {
5229                         gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
5230                     }
5231 #endif
5232                     where();
5233
5234                     unwrap_periodic_pmegrid(pme, grid);
5235
5236                     /* interpolate forces for our local atoms */
5237                     where();
5238                     bClearF = (bFirst && PAR(cr));
5239                     scale   = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
5240                     scale  *= lb_scale_factor[grid_index-2];
5241 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
5242                     for (thread = 0; thread < pme->nthread; thread++)
5243                     {
5244                         gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
5245                                           &pme->atc[0].spline[thread],
5246                                           scale);
5247                     }
5248                     where();
5249
5250                     inc_nrnb(nrnb, eNR_GATHERFBSP,
5251                              pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
5252                     wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
5253
5254                     bFirst = FALSE;
5255                 } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
5256             }     /* if (bCalcF) */
5257         }         /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
5258     }             /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
5259
5260     if (bCalcF && pme->nnodes > 1)
5261     {
5262         wallcycle_start(wcycle, ewcPME_REDISTXF);
5263         for (d = 0; d < pme->ndecompdim; d++)
5264         {
5265             atc = &pme->atc[d];
5266             if (d == pme->ndecompdim - 1)
5267             {
5268                 n_d = homenr;
5269                 f_d = f + start;
5270             }
5271             else
5272             {
5273                 n_d = pme->atc[d+1].n;
5274                 f_d = pme->atc[d+1].f;
5275             }
5276             if (DOMAINDECOMP(cr))
5277             {
5278                 dd_pmeredist_f(pme, atc, n_d, f_d,
5279                                d == pme->ndecompdim-1 && pme->bPPnode);
5280             }
5281         }
5282
5283         wallcycle_stop(wcycle, ewcPME_REDISTXF);
5284     }
5285     where();
5286
5287     if (bCalcEnerVir)
5288     {
5289         if (flags & GMX_PME_DO_COULOMB)
5290         {
5291             if (!pme->bFEP_q)
5292             {
5293                 *energy_q = energy_AB[0];
5294                 m_add(vir_q, vir_AB[0], vir_q);
5295             }
5296             else
5297             {
5298                 *energy_q       = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
5299                 *dvdlambda_q   += energy_AB[1] - energy_AB[0];
5300                 for (i = 0; i < DIM; i++)
5301                 {
5302                     for (j = 0; j < DIM; j++)
5303                     {
5304                         vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
5305                             lambda_q*vir_AB[1][i][j];
5306                     }
5307                 }
5308             }
5309             if (debug)
5310             {
5311                 fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
5312             }
5313         }
5314         else
5315         {
5316             *energy_q = 0;
5317         }
5318
5319         if (flags & GMX_PME_DO_LJ)
5320         {
5321             if (!pme->bFEP_lj)
5322             {
5323                 *energy_lj = energy_AB[2];
5324                 m_add(vir_lj, vir_AB[2], vir_lj);
5325             }
5326             else
5327             {
5328                 *energy_lj     = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
5329                 *dvdlambda_lj += energy_AB[3] - energy_AB[2];
5330                 for (i = 0; i < DIM; i++)
5331                 {
5332                     for (j = 0; j < DIM; j++)
5333                     {
5334                         vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
5335                     }
5336                 }
5337             }
5338             if (debug)
5339             {
5340                 fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
5341             }
5342         }
5343         else
5344         {
5345             *energy_lj = 0;
5346         }
5347     }
5348     return 0;
5349 }