Merge release-4-6 into master
author Roland Schulz <roland@utk.edu>
Wed, 13 Feb 2013 21:05:55 +0000 (16:05 -0500)
committer Roland Schulz <roland@utk.edu>
Wed, 13 Feb 2013 21:08:11 +0000 (16:08 -0500)
Conflicts:
cmake/Findgsl.cmake (deleted)
src/gromacs/legacyheaders/pme.h (manually removed export)
src/gromacs/mdlib/pme_pp.c (trivial)

Change-Id: I716f58e9dbb2c4a4c81993475ea6a98a0d0ba440

1  2 
CMakeLists.txt
src/gromacs/gmxlib/thread_mpi/pthreads.c
src/gromacs/legacyheaders/pme.h
src/gromacs/mdlib/pme.c
src/gromacs/mdlib/pme_pp.c
src/gromacs/mdlib/update.c
src/programs/mdrun/md.c
src/programs/mdrun/pme_loadbal.c

diff --cc CMakeLists.txt
index 4bc19765e386e570ea4dff6207357f2d75c2e508,907f79d773266327bfe6093ba778d437b79677d4..43dffd9933c76488ea62482c4a81b86c7968ff0e
@@@ -453,20 -526,9 +453,20 @@@ IF( WIN32 AND NOT CYGWIN
    ENDIF()
  ENDIF()
  
 +option(GMX_XML "Use libxml2 to parse xml files" ON)
 +if (GMX_XML)
 +  find_package(LibXml2)
 +  set(PKG_XML "")
 +  if(LIBXML2_FOUND)
 +    include_directories(${LIBXML2_INCLUDE_DIR})
 +    set(PKG_XML libxml-2.0)
 +    set(XML_LIBRARIES ${LIBXML2_LIBRARIES})
 +  endif(LIBXML2_FOUND)
 +endif(GMX_XML)
 +
  option(GMX_GSL "Add support for gsl" OFF)
  if (GMX_GSL)
-   find_package(gsl)
+   find_package(GSL)
    set(PKG_GSL "")
    if(GSL_FOUND)
      include_directories(${GSL_INCLUDE_DIR})
index 3af9abbfb6c96d9a5410ff7bd8b750cee81775f8,0000000000000000000000000000000000000000..4643148fdbe8f7c3ffd30dd4eb5ba25499ffd696
mode 100644,000000..100644
--- /dev/null
@@@ -1,169 -1,0 +1,182 @@@
- void gmx_pme_send_switch(t_commrec *cr, ivec grid_size, real ewaldcoeff);
 +/*
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gromacs Runs On Most of All Computer Systems
 + */
 +
 +#ifndef _pme_h
 +#define _pme_h
 +
 +#include <stdio.h>
 +#include "typedefs.h"
 +#include "gmxcomplex.h"
 +#include "gmx_wallcycle.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +typedef real *splinevec[DIM];
 +
 +enum {
 +    GMX_SUM_QGRID_FORWARD, GMX_SUM_QGRID_BACKWARD
 +};
 +
 +int gmx_pme_init(gmx_pme_t *pmedata, t_commrec *cr,
 +                 int nnodes_major, int nnodes_minor,
 +                 t_inputrec *ir, int homenr,
 +                 gmx_bool bFreeEnergy, gmx_bool bReproducible, int nthread);
 +/* Initialize the pme data structures.
 + * A return value of 0 indicates success; non-zero is an error code.
 + */
 +
 +int gmx_pme_reinit(gmx_pme_t *         pmedata,
 +                   t_commrec *         cr,
 +                   gmx_pme_t           pme_src,
 +                   const t_inputrec *  ir,
 +                   ivec                grid_size);
 +/* As gmx_pme_init, but takes most settings, except the grid, from pme_src */
 +
 +int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata);
 +/* Destroy the pme data structures.
 + * A return value of 0 indicates success; non-zero is an error code.
 + */
 +
 +#define GMX_PME_SPREAD_Q      (1<<0)
 +#define GMX_PME_SOLVE         (1<<1)
 +#define GMX_PME_CALC_F        (1<<2)
 +#define GMX_PME_CALC_ENER_VIR (1<<3)
 +/* This forces the grid to be backtransformed even without GMX_PME_CALC_F */
 +#define GMX_PME_CALC_POT      (1<<4)
 +#define GMX_PME_DO_ALL_F  (GMX_PME_SPREAD_Q | GMX_PME_SOLVE | GMX_PME_CALC_F)
 +
 +int gmx_pme_do(gmx_pme_t pme,
 +               int start,       int homenr,
 +               rvec x[],        rvec f[],
 +               real chargeA[],  real chargeB[],
 +               matrix box,      t_commrec *cr,
 +               int  maxshift_x, int maxshift_y,
 +               t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
 +               matrix lrvir,    real ewaldcoeff,
 +               real *energy,    real lambda,
 +               real *dvdlambda, int flags);
 +/* Do a PME calculation for the long-range electrostatics.
 + * The flags, defined above, determine which parts of the calculation are performed.
 + * A return value of 0 indicates success; non-zero is an error code.
 + */
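 +/* A minimal usage sketch (illustrative only; box, cr, nrnb, wcycle
 + * and the other arguments are assumed to exist in the caller): a
 + * typical full PME step computes spreading, solve and forces in one call,
 + *
 + *     flags = GMX_PME_DO_ALL_F | GMX_PME_CALC_ENER_VIR;
 + *     ret   = gmx_pme_do(pme, 0, homenr, x, f, chargeA, chargeB,
 + *                        box, cr, maxshift_x, maxshift_y, nrnb, wcycle,
 + *                        lrvir, ewaldcoeff, &energy, lambda, &dvdl, flags);
 + *
 + * while a potential-only evaluation would pass
 + * GMX_PME_SPREAD_Q | GMX_PME_SOLVE | GMX_PME_CALC_POT instead.
 + */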
 +
 +int gmx_pmeonly(gmx_pme_t pme,
 +                t_commrec *cr,     t_nrnb *mynrnb,
 +                gmx_wallcycle_t wcycle,
 +                real ewaldcoeff,   gmx_bool bGatherOnly,
 +                t_inputrec *ir);
 +/* Called on the nodes that do PME exclusively (as slaves)
 + */
 +
 +void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V);
 +/* Calculate the PME grid energy V for n charges, using the potential
 + * stored in the pme struct by an earlier call to gmx_pme_do with at
 + * least GMX_PME_SPREAD_Q and GMX_PME_SOLVE specified.
 + * Note that the charges are not spread on the grid in the pme struct.
 + * Currently does not work in parallel or with free energy.
 + */
 +
 +/* The following routines are for PME/PP node splitting in pme_pp.c */
 +
 +/* Abstract type for PME <-> PP communication */
 +typedef struct gmx_pme_pp *gmx_pme_pp_t;
 +
 +gmx_pme_pp_t gmx_pme_pp_init(t_commrec *cr);
 +/* Initialize the PME-only side of the PME <-> PP communication */
 +
 +void gmx_pme_send_q(t_commrec *cr,
 +                    gmx_bool bFreeEnergy, real *chargeA, real *chargeB,
 +                    int maxshift_x, int maxshift_y);
 +/* Send the charges and maxshift to our PME-only node. */
 +
 +void gmx_pme_send_x(t_commrec *cr, matrix box, rvec *x,
 +                    gmx_bool bFreeEnergy, real lambda,
 +                    gmx_bool bEnerVir,
 +                    gmx_large_int_t step);
 +/* Send the coordinates to our PME-only node and request a PME calculation */
 +
 +void gmx_pme_send_finish(t_commrec *cr);
 +/* Tell our PME-only node to finish */
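 +
 +/* PP-side call sequence (illustrative sketch; the short-range work
 + * and the surrounding MD loop are assumed): send the charges once per
 + * domain repartitioning, then per step send coordinates and collect
 + * the mesh results,
 + *
 + *     gmx_pme_send_q(cr, bFreeEnergy, chargeA, chargeB,
 + *                    maxshift_x, maxshift_y);
 + *     gmx_pme_send_x(cr, box, x, bFreeEnergy, lambda, bEnerVir, step);
 + *     ... do the short-range work ...
 + *     gmx_pme_receive_f(cr, f, vir, &energy, &dvdl, &cycles);
 + *
 + * and call gmx_pme_send_finish(cr) when the run ends.
 + */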
 +
- /* Receive charges and/or coordinates from the PP-only nodes.
-  * Returns the number of atoms, or -1 when the run is finished.
-  * In the special case of a PME grid size switch request, -2 is returned
-  * and grid_size and *ewaldcoeff are set, which are otherwise not set.
++void gmx_pme_send_switchgrid(t_commrec *cr, ivec grid_size, real ewaldcoeff);
 +/* Tell our PME-only node to switch to a new grid size */
 +
++void gmx_pme_send_resetcounters(t_commrec *cr, gmx_large_int_t step);
++/* Tell our PME-only node to reset all cycle and flop counters */
++
 +void gmx_pme_receive_f(t_commrec *cr,
 +                       rvec f[], matrix vir,
 +                       real *energy, real *dvdlambda,
 +                       float *pme_cycles);
 +/* PP nodes receive the long-range forces from the PME nodes */
 +
++/* Return values for gmx_pme_recv_q_x */
++enum {
++    pmerecvqxX,            /* calculate PME mesh interactions for new x    */
++    pmerecvqxFINISH,       /* the simulation should finish, we should quit */
++    pmerecvqxSWITCHGRID,   /* change the PME grid size                     */
++    pmerecvqxRESETCOUNTERS /* reset the cycle and flop counters            */
++};
++
 +int gmx_pme_recv_q_x(gmx_pme_pp_t pme_pp,
++                     int *natoms,
 +                     real **chargeA, real **chargeB,
 +                     matrix box, rvec **x, rvec **f,
 +                     int *maxshift_x, int *maxshift_y,
 +                     gmx_bool *bFreeEnergy, real *lambda,
 +                     gmx_bool *bEnerVir,
 +                     gmx_large_int_t *step,
 +                     ivec grid_size, real *ewaldcoeff);
++/* With return value:
++ * pmerecvqxX:             all parameters set, chargeA and chargeB can be NULL
++ * pmerecvqxFINISH:        no parameters set
++ * pmerecvqxSWITCHGRID:    only grid_size and *ewaldcoeff are set
++ * pmerecvqxRESETCOUNTERS: *step is set
 + */
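++
++/* A minimal receive loop for a PME-only node (illustrative sketch;
++ * the mesh work itself is omitted, see gmx_pmeonly for the real loop):
++ *
++ *     do
++ *     {
++ *         ret = gmx_pme_recv_q_x(pme_pp, &natoms, &chargeA, &chargeB,
++ *                                box, &x, &f, &maxshift_x, &maxshift_y,
++ *                                &bFreeEnergy, &lambda, &bEnerVir, &step,
++ *                                grid_size, &ewaldcoeff);
++ *         switch (ret)
++ *         {
++ *             case pmerecvqxX:             spread, solve, gather;  break;
++ *             case pmerecvqxSWITCHGRID:    call gmx_pme_reinit;    break;
++ *             case pmerecvqxRESETCOUNTERS: reset cycle counters;   break;
++ *         }
++ *     }
++ *     while (ret != pmerecvqxFINISH);
++ */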
 +
 +void gmx_pme_send_force_vir_ener(gmx_pme_pp_t pme_pp,
 +                                 rvec *f, matrix vir,
 +                                 real energy, real dvdlambda,
 +                                 float cycles);
 +/* Send the PME mesh force, virial and energy to the PP-only nodes */
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
index 51cc32c7a545ebc13cb9b9da80affae73a4e68d6,0000000000000000000000000000000000000000..6927eace8018b719b2da49435e6770c491654aa6
mode 100644,000000..100644
--- /dev/null
@@@ -1,4616 -1,0 +1,4621 @@@
-                                    t_nrnb *nrnb, t_inputrec *ir, gmx_large_int_t step_rel)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +/* IMPORTANT FOR DEVELOPERS:
 + *
 + * Triclinic pme stuff isn't entirely trivial, and we've experienced
 + * some bugs during development (many of them due to me). To avoid
 + * this in the future, please check the following things if you make
 + * changes in this file:
 + *
 + * 1. You should obtain identical (at least to the PME precision)
 + *    energies, forces, and virial for
 + *    a rectangular box and a triclinic one where the z (or y) axis is
 + *    tilted a whole box side. For instance you could use these boxes:
 + *
 + *    rectangular       triclinic
 + *     2  0  0           2  0  0
 + *     0  2  0           0  2  0
 + *     0  0  6           2  2  6
 + *
 + * 2. You should check the energy conservation in a triclinic box.
 + *
 + * It might seem like overkill, but better safe than sorry.
 + * /Erik 001109
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#include <stdio.h>
 +#include <string.h>
 +#include <math.h>
 +#include <assert.h>
 +#include "typedefs.h"
 +#include "txtdump.h"
 +#include "vec.h"
 +#include "gmxcomplex.h"
 +#include "smalloc.h"
 +#include "futil.h"
 +#include "coulomb.h"
 +#include "gmx_fatal.h"
 +#include "pme.h"
 +#include "network.h"
 +#include "physics.h"
 +#include "nrnb.h"
 +#include "copyrite.h"
 +#include "gmx_wallcycle.h"
 +#include "gmx_parallel_3dfft.h"
 +#include "pdbio.h"
 +#include "gmx_cyclecounter.h"
 +#include "gmx_omp.h"
 +#include "macros.h"
 +
 +/* Single precision, with SSE2 or higher available */
 +#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
 +
 +#include "gmx_x86_simd_single.h"
 +
 +#define PME_SSE
 +/* Some old AMD processors could have problems with unaligned loads+stores */
 +#ifndef GMX_FAHCORE
 +#define PME_SSE_UNALIGNED
 +#endif
 +#endif
 +
 +#define DFT_TOL 1e-7
 +/* #define PRT_FORCE */
 +/* Conditions for on-the-fly time measurement */
 +/* #define TAKETIME (step > 1 && timesteps < 10) */
 +#define TAKETIME FALSE
 +
 +/* #define PME_TIME_THREADS */
 +
 +#ifdef GMX_DOUBLE
 +#define mpi_type MPI_DOUBLE
 +#else
 +#define mpi_type MPI_FLOAT
 +#endif
 +
 +/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
 +#define GMX_CACHE_SEP 64
 +
 +/* We only define a maximum to be able to use local arrays without allocation.
 + * An order larger than 12 should never be needed, even for test cases.
 + * If needed it can be changed here.
 + */
 +#define PME_ORDER_MAX 12
 +
 +/* Internal datastructures */
 +typedef struct {
 +    int send_index0;
 +    int send_nindex;
 +    int recv_index0;
 +    int recv_nindex;
 +    int recv_size;   /* Receive buffer width, used with OpenMP */
 +} pme_grid_comm_t;
 +
 +typedef struct {
 +#ifdef GMX_MPI
 +    MPI_Comm         mpi_comm;
 +#endif
 +    int              nnodes, nodeid;
 +    int             *s2g0;
 +    int             *s2g1;
 +    int              noverlap_nodes;
 +    int             *send_id, *recv_id;
 +    int              send_size; /* Send buffer width, used with OpenMP */
 +    pme_grid_comm_t *comm_data;
 +    real            *sendbuf;
 +    real            *recvbuf;
 +} pme_overlap_t;
 +
 +typedef struct {
 +    int *n;      /* Cumulative counts of the number of particles per thread */
 +    int  nalloc; /* Allocation size of i */
 +    int *i;      /* Particle indices ordered on thread index (n) */
 +} thread_plist_t;
 +
 +typedef struct {
 +    int      *thread_one;
 +    int       n;
 +    int      *ind;
 +    splinevec theta;
 +    real     *ptr_theta_z;
 +    splinevec dtheta;
 +    real     *ptr_dtheta_z;
 +} splinedata_t;
 +
 +typedef struct {
 +    int      dimind;        /* The index of the dimension, 0=x, 1=y */
 +    int      nslab;
 +    int      nodeid;
 +#ifdef GMX_MPI
 +    MPI_Comm mpi_comm;
 +#endif
 +
 +    int     *node_dest;     /* The nodes to send x and q to with DD */
 +    int     *node_src;      /* The nodes to receive x and q from with DD */
 +    int     *buf_index;     /* Index for commnode into the buffers */
 +
 +    int      maxshift;
 +
 +    int      npd;
 +    int      pd_nalloc;
 +    int     *pd;
 +    int     *count;         /* The number of atoms to send to each node */
 +    int    **count_thread;
 +    int     *rcount;        /* The number of atoms to receive */
 +
 +    int      n;
 +    int      nalloc;
 +    rvec    *x;
 +    real    *q;
 +    rvec    *f;
 +    gmx_bool bSpread;       /* These coordinates are used for spreading */
 +    int      pme_order;
 +    ivec    *idx;
 +    rvec    *fractx;            /* Fractional coordinate relative to the
 +                                 * lower cell boundary
 +                                 */
 +    int             nthread;
 +    int            *thread_idx; /* Which thread should spread which charge */
 +    thread_plist_t *thread_plist;
 +    splinedata_t   *spline;
 +} pme_atomcomm_t;
 +
 +#define FLBS  3
 +#define FLBSZ 4
 +
 +typedef struct {
 +    ivec  ci;     /* The spatial location of this grid         */
 +    ivec  n;      /* The used size of *grid, including order-1 */
 +    ivec  offset; /* The grid offset from the full node grid   */
 +    int   order;  /* PME spreading order                       */
 +    ivec  s;      /* The allocated size of *grid, s >= n       */
 +    real *grid;   /* The local thread's grid, size n           */
 +} pmegrid_t;
 +
 +typedef struct {
 +    pmegrid_t  grid;         /* The full node grid (non thread-local)            */
 +    int        nthread;      /* The number of threads operating on this grid     */
 +    ivec       nc;           /* The local spatial decomposition over the threads */
 +    pmegrid_t *grid_th;      /* Array of grids for each thread                   */
 +    real      *grid_all;     /* Allocated array for the grids in *grid_th        */
 +    int      **g2t;          /* The grid to thread index                         */
 +    ivec       nthread_comm; /* The number of threads to communicate with        */
 +} pmegrids_t;
 +
 +
 +typedef struct {
 +#ifdef PME_SSE
 +    /* Masks for SSE aligned spreading and gathering */
 +    __m128 mask_SSE0[6], mask_SSE1[6];
 +#else
 +    int    dummy; /* C89 requires that a struct has at least one member */
 +#endif
 +} pme_spline_work_t;
 +
 +typedef struct {
 +    /* work data for solve_pme */
 +    int      nalloc;
 +    real *   mhx;
 +    real *   mhy;
 +    real *   mhz;
 +    real *   m2;
 +    real *   denom;
 +    real *   tmp1_alloc;
 +    real *   tmp1;
 +    real *   eterm;
 +    real *   m2inv;
 +
 +    real     energy;
 +    matrix   vir;
 +} pme_work_t;
 +
 +typedef struct gmx_pme {
 +    int           ndecompdim; /* The number of decomposition dimensions */
 +    int           nodeid;     /* Our nodeid in mpi->mpi_comm */
 +    int           nodeid_major;
 +    int           nodeid_minor;
 +    int           nnodes;    /* The number of nodes doing PME */
 +    int           nnodes_major;
 +    int           nnodes_minor;
 +
 +    MPI_Comm      mpi_comm;
 +    MPI_Comm      mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
 +#ifdef GMX_MPI
 +    MPI_Datatype  rvec_mpi;      /* the pme vector's MPI type */
 +#endif
 +
 +    int        nthread;       /* The number of threads doing PME */
 +
 +    gmx_bool   bPPnode;       /* Node also does particle-particle forces */
 +    gmx_bool   bFEP;          /* Compute Free energy contribution */
 +    int        nkx, nky, nkz; /* Grid dimensions */
 +    gmx_bool   bP3M;          /* Do P3M: optimize the influence function */
 +    int        pme_order;
 +    real       epsilon_r;
 +
 +    pmegrids_t pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
 +    pmegrids_t pmegridB;
 +    /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
 +    int        pmegrid_nx, pmegrid_ny, pmegrid_nz;
 +    /* pmegrid_nz might be larger than strictly necessary to ensure
 +     * memory alignment, pmegrid_nz_base gives the real base size.
 +     */
 +    int     pmegrid_nz_base;
 +    /* The local PME grid starting indices */
 +    int     pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
 +
 +    /* Work data for spreading and gathering */
 +    pme_spline_work_t    *spline_work;
 +
 +    real                 *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
 +    real                 *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
 +    int                   fftgrid_nx, fftgrid_ny, fftgrid_nz;
 +
 +    t_complex            *cfftgridA;  /* Grids for complex FFT data */
 +    t_complex            *cfftgridB;
 +    int                   cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
 +
 +    gmx_parallel_3dfft_t  pfft_setupA;
 +    gmx_parallel_3dfft_t  pfft_setupB;
 +
 +    int                  *nnx, *nny, *nnz;
 +    real                 *fshx, *fshy, *fshz;
 +
 +    pme_atomcomm_t        atc[2]; /* Indexed on decomposition index */
 +    matrix                recipbox;
 +    splinevec             bsp_mod;
 +
 +    pme_overlap_t         overlap[2]; /* Indexed on dimension, 0=x, 1=y */
 +
 +    pme_atomcomm_t        atc_energy; /* Only for gmx_pme_calc_energy */
 +
 +    rvec                 *bufv;       /* Communication buffer */
 +    real                 *bufr;       /* Communication buffer */
 +    int                   buf_nalloc; /* The communication buffer size */
 +
 +    /* thread local work data for solve_pme */
 +    pme_work_t *work;
 +
 +    /* Work data for PME_redist */
 +    gmx_bool redist_init;
 +    int *    scounts;
 +    int *    rcounts;
 +    int *    sdispls;
 +    int *    rdispls;
 +    int *    sidx;
 +    int *    idxa;
 +    real *   redist_buf;
 +    int      redist_buf_nalloc;
 +
 +    /* Work data for sum_qgrid */
 +    real *   sum_qgrid_tmp;
 +    real *   sum_qgrid_dd_tmp;
 +} t_gmx_pme;
 +
 +
 +static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
 +                                   int start, int end, int thread)
 +{
 +    int             i;
 +    int            *idxptr, tix, tiy, tiz;
 +    real           *xptr, *fptr, tx, ty, tz;
 +    real            rxx, ryx, ryy, rzx, rzy, rzz;
 +    int             nx, ny, nz;
 +    int             start_ix, start_iy, start_iz;
 +    int            *g2tx, *g2ty, *g2tz;
 +    gmx_bool        bThreads;
 +    int            *thread_idx = NULL;
 +    thread_plist_t *tpl        = NULL;
 +    int            *tpl_n      = NULL;
 +    int             thread_i;
 +
 +    nx  = pme->nkx;
 +    ny  = pme->nky;
 +    nz  = pme->nkz;
 +
 +    start_ix = pme->pmegrid_start_ix;
 +    start_iy = pme->pmegrid_start_iy;
 +    start_iz = pme->pmegrid_start_iz;
 +
 +    rxx = pme->recipbox[XX][XX];
 +    ryx = pme->recipbox[YY][XX];
 +    ryy = pme->recipbox[YY][YY];
 +    rzx = pme->recipbox[ZZ][XX];
 +    rzy = pme->recipbox[ZZ][YY];
 +    rzz = pme->recipbox[ZZ][ZZ];
 +
 +    g2tx = pme->pmegridA.g2t[XX];
 +    g2ty = pme->pmegridA.g2t[YY];
 +    g2tz = pme->pmegridA.g2t[ZZ];
 +
 +    bThreads = (atc->nthread > 1);
 +    if (bThreads)
 +    {
 +        thread_idx = atc->thread_idx;
 +
 +        tpl   = &atc->thread_plist[thread];
 +        tpl_n = tpl->n;
 +        for (i = 0; i < atc->nthread; i++)
 +        {
 +            tpl_n[i] = 0;
 +        }
 +    }
 +
 +    for (i = start; i < end; i++)
 +    {
 +        xptr   = atc->x[i];
 +        idxptr = atc->idx[i];
 +        fptr   = atc->fractx[i];
 +
 +        /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
 +        tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
 +        ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
 +        tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
 +
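 +        /* Example: a particle slightly outside the box at fractional
 +         * coordinate -0.1 gives tx = nx*(-0.1 + 2.0) = 1.9*nx, still
 +         * positive, so the (int) truncation below never rounds a
 +         * negative value toward zero; the wrap back onto the grid is
 +         * done by the nnx/nny/nnz index tables.
 +         */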
 +        tix = (int)(tx);
 +        tiy = (int)(ty);
 +        tiz = (int)(tz);
 +
 +        /* Because decomposition only occurs in x and y,
 +         * we never have a fraction correction in z.
 +         */
 +        fptr[XX] = tx - tix + pme->fshx[tix];
 +        fptr[YY] = ty - tiy + pme->fshy[tiy];
 +        fptr[ZZ] = tz - tiz;
 +
 +        idxptr[XX] = pme->nnx[tix];
 +        idxptr[YY] = pme->nny[tiy];
 +        idxptr[ZZ] = pme->nnz[tiz];
 +
 +#ifdef DEBUG
 +        range_check(idxptr[XX], 0, pme->pmegrid_nx);
 +        range_check(idxptr[YY], 0, pme->pmegrid_ny);
 +        range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
 +#endif
 +
 +        if (bThreads)
 +        {
 +            thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
 +            thread_idx[i] = thread_i;
 +            tpl_n[thread_i]++;
 +        }
 +    }
 +
 +    if (bThreads)
 +    {
 +        /* Make a list of particle indices sorted on thread */
 +
 +        /* Get the cumulative count */
 +        for (i = 1; i < atc->nthread; i++)
 +        {
 +            tpl_n[i] += tpl_n[i-1];
 +        }
 +        /* The current implementation distributes particles equally
 +         * over the threads, so we could actually allocate for that
 +         * in pme_realloc_atomcomm_things.
 +         */
 +        if (tpl_n[atc->nthread-1] > tpl->nalloc)
 +        {
 +            tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
 +            srenew(tpl->i, tpl->nalloc);
 +        }
 +        /* Set tpl_n to the cumulative start */
 +        for (i = atc->nthread-1; i >= 1; i--)
 +        {
 +            tpl_n[i] = tpl_n[i-1];
 +        }
 +        tpl_n[0] = 0;
 +
 +        /* Fill our thread local array with indices sorted on thread */
 +        for (i = start; i < end; i++)
 +        {
 +            tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
 +        }
 +        /* Now tpl_n contains the cumulative count again */
 +    }
 +}
 +
 +static void make_thread_local_ind(pme_atomcomm_t *atc,
 +                                  int thread, splinedata_t *spline)
 +{
 +    int             n, t, i, start, end;
 +    thread_plist_t *tpl;
 +
 +    /* Combine the indices made by each thread into one index */
 +
 +    n     = 0;
 +    start = 0;
 +    for (t = 0; t < atc->nthread; t++)
 +    {
 +        tpl = &atc->thread_plist[t];
 +        /* Copy our part (start - end) from the list of thread t */
 +        if (thread > 0)
 +        {
 +            start = tpl->n[thread-1];
 +        }
 +        end = tpl->n[thread];
 +        for (i = start; i < end; i++)
 +        {
 +            spline->ind[n++] = tpl->i[i];
 +        }
 +    }
 +
 +    spline->n = n;
 +}
 +
 +
 +static void pme_calc_pidx(int start, int end,
 +                          matrix recipbox, rvec x[],
 +                          pme_atomcomm_t *atc, int *count)
 +{
 +    int   nslab, i;
 +    int   si;
 +    real *xptr, s;
 +    real  rxx, ryx, rzx, ryy, rzy;
 +    int  *pd;
 +
 +    /* Calculate the PME task index (pidx) for each particle.
 +     * Here we always assign equally sized slabs to each node
 +     * for load balancing reasons (the PME grid spacing is not used).
 +     */
 +
 +    nslab = atc->nslab;
 +    pd    = atc->pd;
 +
 +    /* Reset the count */
 +    for (i = 0; i < nslab; i++)
 +    {
 +        count[i] = 0;
 +    }
 +
 +    if (atc->dimind == 0)
 +    {
 +        rxx = recipbox[XX][XX];
 +        ryx = recipbox[YY][XX];
 +        rzx = recipbox[ZZ][XX];
 +        /* Calculate the node index in x-dimension */
 +        for (i = start; i < end; i++)
 +        {
 +            xptr   = x[i];
 +            /* Fractional coordinates along box vectors */
 +            s     = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
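 +            /* Adding 2*nslab before truncating keeps the argument
 +             * positive for particles slightly outside the unit cell,
 +             * so the modulo always lands in [0,nslab). */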
 +            si    = (int)(s + 2*nslab) % nslab;
 +            pd[i] = si;
 +            count[si]++;
 +        }
 +    }
 +    else
 +    {
 +        ryy = recipbox[YY][YY];
 +        rzy = recipbox[ZZ][YY];
 +        /* Calculate the node index in y-dimension */
 +        for (i = start; i < end; i++)
 +        {
 +            xptr   = x[i];
 +            /* Fractional coordinates along box vectors */
 +            s     = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
 +            si    = (int)(s + 2*nslab) % nslab;
 +            pd[i] = si;
 +            count[si]++;
 +        }
 +    }
 +}
 +
 +static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
 +                                  pme_atomcomm_t *atc)
 +{
 +    int nthread, thread, slab;
 +
 +    nthread = atc->nthread;
 +
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        pme_calc_pidx(natoms* thread   /nthread,
 +                      natoms*(thread+1)/nthread,
 +                      recipbox, x, atc, atc->count_thread[thread]);
 +    }
 +    /* Non-parallel reduction, since nslab is small */
 +
 +    for (thread = 1; thread < nthread; thread++)
 +    {
 +        for (slab = 0; slab < atc->nslab; slab++)
 +        {
 +            atc->count_thread[0][slab] += atc->count_thread[thread][slab];
 +        }
 +    }
 +}
 +
 +static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
 +{
 +    const int padding = 4;
 +    int       i;
 +
 +    srenew(th[XX], nalloc);
 +    srenew(th[YY], nalloc);
 +    /* In z we add padding, this is only required for the aligned SSE code */
 +    srenew(*ptr_z, nalloc+2*padding);
 +    th[ZZ] = *ptr_z + padding;
 +
 +    for (i = 0; i < padding; i++)
 +    {
 +        (*ptr_z)[               i] = 0;
 +        (*ptr_z)[padding+nalloc+i] = 0;
 +    }
 +}
 +
 +static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
 +{
 +    int i, d;
 +
 +    srenew(spline->ind, atc->nalloc);
 +    /* Initialize the index to identity so it works without threads */
 +    for (i = 0; i < atc->nalloc; i++)
 +    {
 +        spline->ind[i] = i;
 +    }
 +
 +    realloc_splinevec(spline->theta, &spline->ptr_theta_z,
 +                      atc->pme_order*atc->nalloc);
 +    realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
 +                      atc->pme_order*atc->nalloc);
 +}
 +
 +static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
 +{
 +    int nalloc_old, i, j, nalloc_tpl;
 +
 +    /* We must avoid a NULL pointer for atc->x, since it can
 +     * cause fatal errors in MPI routines.
 +     */
 +    if (atc->n > atc->nalloc || atc->nalloc == 0)
 +    {
 +        nalloc_old  = atc->nalloc;
 +        atc->nalloc = over_alloc_dd(max(atc->n, 1));
 +
 +        if (atc->nslab > 1)
 +        {
 +            srenew(atc->x, atc->nalloc);
 +            srenew(atc->q, atc->nalloc);
 +            srenew(atc->f, atc->nalloc);
 +            for (i = nalloc_old; i < atc->nalloc; i++)
 +            {
 +                clear_rvec(atc->f[i]);
 +            }
 +        }
 +        if (atc->bSpread)
 +        {
 +            srenew(atc->fractx, atc->nalloc);
 +            srenew(atc->idx, atc->nalloc);
 +
 +            if (atc->nthread > 1)
 +            {
 +                srenew(atc->thread_idx, atc->nalloc);
 +            }
 +
 +            for (i = 0; i < atc->nthread; i++)
 +            {
 +                pme_realloc_splinedata(&atc->spline[i], atc);
 +            }
 +        }
 +    }
 +}
 +
 +static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
 +                         int n, gmx_bool bXF, rvec *x_f, real *charge,
 +                         pme_atomcomm_t *atc)
 +/* Redistribute particle data for PME calculation */
 +/* domain decomposition by x coordinate           */
 +{
 +    int *idxa;
 +    int  i, ii;
 +
 +    if (FALSE == pme->redist_init)
 +    {
 +        snew(pme->scounts, atc->nslab);
 +        snew(pme->rcounts, atc->nslab);
 +        snew(pme->sdispls, atc->nslab);
 +        snew(pme->rdispls, atc->nslab);
 +        snew(pme->sidx, atc->nslab);
 +        pme->redist_init = TRUE;
 +    }
 +    if (n > pme->redist_buf_nalloc)
 +    {
 +        pme->redist_buf_nalloc = over_alloc_dd(n);
 +        srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
 +    }
 +
 +    pme->idxa = atc->pd;
 +
 +#ifdef GMX_MPI
 +    if (forw && bXF)
 +    {
 +        /* forward, redistribution from pp to pme */
 +
 +        /* Calculate send counts and exchange them with other nodes */
 +        for (i = 0; (i < atc->nslab); i++)
 +        {
 +            pme->scounts[i] = 0;
 +        }
 +        for (i = 0; (i < n); i++)
 +        {
 +            pme->scounts[pme->idxa[i]]++;
 +        }
 +        MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
 +
 +        /* Calculate send and receive displacements and index into send
 +           buffer */
 +        pme->sdispls[0] = 0;
 +        pme->rdispls[0] = 0;
 +        pme->sidx[0]    = 0;
 +        for (i = 1; i < atc->nslab; i++)
 +        {
 +            pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
 +            pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
 +            pme->sidx[i]    = pme->sdispls[i];
 +        }
 +        /* Total # of particles to be received */
 +        atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
 +
 +        pme_realloc_atomcomm_things(atc);
 +
 +        /* Copy particle coordinates into send buffer and exchange */
 +        for (i = 0; (i < n); i++)
 +        {
 +            ii = DIM*pme->sidx[pme->idxa[i]];
 +            pme->sidx[pme->idxa[i]]++;
 +            pme->redist_buf[ii+XX] = x_f[i][XX];
 +            pme->redist_buf[ii+YY] = x_f[i][YY];
 +            pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
 +        }
 +        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
 +                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
 +                      pme->rvec_mpi, atc->mpi_comm);
 +    }
 +    if (forw)
 +    {
 +        /* Copy charge into send buffer and exchange */
 +        for (i = 0; i < atc->nslab; i++)
 +        {
 +            pme->sidx[i] = pme->sdispls[i];
 +        }
 +        for (i = 0; (i < n); i++)
 +        {
 +            ii = pme->sidx[pme->idxa[i]];
 +            pme->sidx[pme->idxa[i]]++;
 +            pme->redist_buf[ii] = charge[i];
 +        }
 +        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
 +                      atc->q, pme->rcounts, pme->rdispls, mpi_type,
 +                      atc->mpi_comm);
 +    }
 +    else   /* backward, redistribution from pme to pp */
 +    {
 +        MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
 +                      pme->redist_buf, pme->scounts, pme->sdispls,
 +                      pme->rvec_mpi, atc->mpi_comm);
 +
 +        /* Copy data from receive buffer */
 +        for (i = 0; i < atc->nslab; i++)
 +        {
 +            pme->sidx[i] = pme->sdispls[i];
 +        }
 +        for (i = 0; (i < n); i++)
 +        {
 +            ii          = DIM*pme->sidx[pme->idxa[i]];
 +            x_f[i][XX] += pme->redist_buf[ii+XX];
 +            x_f[i][YY] += pme->redist_buf[ii+YY];
 +            x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
 +            pme->sidx[pme->idxa[i]]++;
 +        }
 +    }
 +#endif
 +}
 +
 +static void pme_dd_sendrecv(pme_atomcomm_t *atc,
 +                            gmx_bool bBackward, int shift,
 +                            void *buf_s, int nbyte_s,
 +                            void *buf_r, int nbyte_r)
 +{
 +#ifdef GMX_MPI
 +    int        dest, src;
 +    MPI_Status stat;
 +
 +    if (bBackward == FALSE)
 +    {
 +        dest = atc->node_dest[shift];
 +        src  = atc->node_src[shift];
 +    }
 +    else
 +    {
 +        dest = atc->node_src[shift];
 +        src  = atc->node_dest[shift];
 +    }
 +
 +    if (nbyte_s > 0 && nbyte_r > 0)
 +    {
 +        MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
 +                     dest, shift,
 +                     buf_r, nbyte_r, MPI_BYTE,
 +                     src, shift,
 +                     atc->mpi_comm, &stat);
 +    }
 +    else if (nbyte_s > 0)
 +    {
 +        MPI_Send(buf_s, nbyte_s, MPI_BYTE,
 +                 dest, shift,
 +                 atc->mpi_comm);
 +    }
 +    else if (nbyte_r > 0)
 +    {
 +        MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
 +                 src, shift,
 +                 atc->mpi_comm, &stat);
 +    }
 +#endif
 +}
 +
 +static void dd_pmeredist_x_q(gmx_pme_t pme,
 +                             int n, gmx_bool bX, rvec *x, real *charge,
 +                             pme_atomcomm_t *atc)
 +{
 +    int *commnode, *buf_index;
 +    int  nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
 +
 +    commnode  = atc->node_dest;
 +    buf_index = atc->buf_index;
 +
 +    nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
 +
 +    nsend = 0;
 +    for (i = 0; i < nnodes_comm; i++)
 +    {
 +        buf_index[commnode[i]] = nsend;
 +        nsend                 += atc->count[commnode[i]];
 +    }
 +    if (bX)
 +    {
 +        if (atc->count[atc->nodeid] + nsend != n)
 +        {
 +            gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
 +                      "This usually means that your system is not well equilibrated.",
 +                      n - (atc->count[atc->nodeid] + nsend),
 +                      pme->nodeid, 'x'+atc->dimind);
 +        }
 +
 +        if (nsend > pme->buf_nalloc)
 +        {
 +            pme->buf_nalloc = over_alloc_dd(nsend);
 +            srenew(pme->bufv, pme->buf_nalloc);
 +            srenew(pme->bufr, pme->buf_nalloc);
 +        }
 +
 +        atc->n = atc->count[atc->nodeid];
 +        for (i = 0; i < nnodes_comm; i++)
 +        {
 +            scount = atc->count[commnode[i]];
 +            /* Communicate the count */
 +            if (debug)
 +            {
 +                fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
 +                        atc->dimind, atc->nodeid, commnode[i], scount);
 +            }
 +            pme_dd_sendrecv(atc, FALSE, i,
 +                            &scount, sizeof(int),
 +                            &atc->rcount[i], sizeof(int));
 +            atc->n += atc->rcount[i];
 +        }
 +
 +        pme_realloc_atomcomm_things(atc);
 +    }
 +
 +    local_pos = 0;
 +    for (i = 0; i < n; i++)
 +    {
 +        node = atc->pd[i];
 +        if (node == atc->nodeid)
 +        {
 +            /* Copy direct to the receive buffer */
 +            if (bX)
 +            {
 +                copy_rvec(x[i], atc->x[local_pos]);
 +            }
 +            atc->q[local_pos] = charge[i];
 +            local_pos++;
 +        }
 +        else
 +        {
 +            /* Copy to the send buffer */
 +            if (bX)
 +            {
 +                copy_rvec(x[i], pme->bufv[buf_index[node]]);
 +            }
 +            pme->bufr[buf_index[node]] = charge[i];
 +            buf_index[node]++;
 +        }
 +    }
 +
 +    buf_pos = 0;
 +    for (i = 0; i < nnodes_comm; i++)
 +    {
 +        scount = atc->count[commnode[i]];
 +        rcount = atc->rcount[i];
 +        if (scount > 0 || rcount > 0)
 +        {
 +            if (bX)
 +            {
 +                /* Communicate the coordinates */
 +                pme_dd_sendrecv(atc, FALSE, i,
 +                                pme->bufv[buf_pos], scount*sizeof(rvec),
 +                                atc->x[local_pos], rcount*sizeof(rvec));
 +            }
 +            /* Communicate the charges */
 +            pme_dd_sendrecv(atc, FALSE, i,
 +                            pme->bufr+buf_pos, scount*sizeof(real),
 +                            atc->q+local_pos, rcount*sizeof(real));
 +            buf_pos   += scount;
 +            local_pos += atc->rcount[i];
 +        }
 +    }
 +}
 +
 +static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
 +                           int n, rvec *f,
 +                           gmx_bool bAddF)
 +{
 +    int *commnode, *buf_index;
 +    int  nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
 +
 +    commnode  = atc->node_dest;
 +    buf_index = atc->buf_index;
 +
 +    nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
 +
 +    local_pos = atc->count[atc->nodeid];
 +    buf_pos   = 0;
 +    for (i = 0; i < nnodes_comm; i++)
 +    {
 +        scount = atc->rcount[i];
 +        rcount = atc->count[commnode[i]];
 +        if (scount > 0 || rcount > 0)
 +        {
 +            /* Communicate the forces */
 +            pme_dd_sendrecv(atc, TRUE, i,
 +                            atc->f[local_pos], scount*sizeof(rvec),
 +                            pme->bufv[buf_pos], rcount*sizeof(rvec));
 +            local_pos += scount;
 +        }
 +        buf_index[commnode[i]] = buf_pos;
 +        buf_pos               += rcount;
 +    }
 +
 +    local_pos = 0;
 +    if (bAddF)
 +    {
 +        for (i = 0; i < n; i++)
 +        {
 +            node = atc->pd[i];
 +            if (node == atc->nodeid)
 +            {
 +                /* Add from the local force array */
 +                rvec_inc(f[i], atc->f[local_pos]);
 +                local_pos++;
 +            }
 +            else
 +            {
 +                /* Add from the receive buffer */
 +                rvec_inc(f[i], pme->bufv[buf_index[node]]);
 +                buf_index[node]++;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for (i = 0; i < n; i++)
 +        {
 +            node = atc->pd[i];
 +            if (node == atc->nodeid)
 +            {
 +                /* Copy from the local force array */
 +                copy_rvec(atc->f[local_pos], f[i]);
 +                local_pos++;
 +            }
 +            else
 +            {
 +                /* Copy from the receive buffer */
 +                copy_rvec(pme->bufv[buf_index[node]], f[i]);
 +                buf_index[node]++;
 +            }
 +        }
 +    }
 +}
 +
 +#ifdef GMX_MPI
 +static void
 +gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 +{
 +    pme_overlap_t *overlap;
 +    int            send_index0, send_nindex;
 +    int            recv_index0, recv_nindex;
 +    MPI_Status     stat;
 +    int            i, j, k, ix, iy, iz, icnt;
 +    int            ipulse, send_id, recv_id, datasize;
 +    real          *p;
 +    real          *sendptr, *recvptr;
 +
 +    /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
 +    overlap = &pme->overlap[1];
 +
 +    for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
 +    {
 +        /* Since we have already (un)wrapped the overlap in the z-dimension,
 +         * we only have to communicate 0 to nkz (not pmegrid_nz).
 +         */
 +        if (direction == GMX_SUM_QGRID_FORWARD)
 +        {
 +            send_id       = overlap->send_id[ipulse];
 +            recv_id       = overlap->recv_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].send_index0;
 +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].recv_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +        }
 +        else
 +        {
 +            send_id       = overlap->recv_id[ipulse];
 +            recv_id       = overlap->send_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].recv_index0;
 +            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].send_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 +        }
 +
 +        /* Copy data to contiguous send buffer */
 +        if (debug)
 +        {
 +            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid, overlap->nodeid, send_id,
 +                    pme->pmegrid_start_iy,
 +                    send_index0-pme->pmegrid_start_iy,
 +                    send_index0-pme->pmegrid_start_iy+send_nindex);
 +        }
 +        icnt = 0;
 +        for (i = 0; i < pme->pmegrid_nx; i++)
 +        {
 +            ix = i;
 +            for (j = 0; j < send_nindex; j++)
 +            {
 +                iy = j + send_index0 - pme->pmegrid_start_iy;
 +                for (k = 0; k < pme->nkz; k++)
 +                {
 +                    iz = k;
 +                    overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
 +                }
 +            }
 +        }
 +
 +        datasize      = pme->pmegrid_nx * pme->nkz;
 +
 +        MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
 +                     send_id, ipulse,
 +                     overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
 +                     recv_id, ipulse,
 +                     overlap->mpi_comm, &stat);
 +
 +        /* Get data from contiguous recv buffer */
 +        if (debug)
 +        {
 +            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid, overlap->nodeid, recv_id,
 +                    pme->pmegrid_start_iy,
 +                    recv_index0-pme->pmegrid_start_iy,
 +                    recv_index0-pme->pmegrid_start_iy+recv_nindex);
 +        }
 +        icnt = 0;
 +        for (i = 0; i < pme->pmegrid_nx; i++)
 +        {
 +            ix = i;
 +            for (j = 0; j < recv_nindex; j++)
 +            {
 +                iy = j + recv_index0 - pme->pmegrid_start_iy;
 +                for (k = 0; k < pme->nkz; k++)
 +                {
 +                    iz = k;
 +                    if (direction == GMX_SUM_QGRID_FORWARD)
 +                    {
 +                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
 +                    }
 +                    else
 +                    {
 +                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Major dimension is easier, no copying required,
 +     * but we might have to sum to separate array.
 +     * Since we don't copy, we have to communicate up to pmegrid_nz,
 +     * not nkz as for the minor direction.
 +     */
 +    overlap = &pme->overlap[0];
 +
 +    for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
 +    {
 +        if (direction == GMX_SUM_QGRID_FORWARD)
 +        {
 +            send_id       = overlap->send_id[ipulse];
 +            recv_id       = overlap->recv_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].send_index0;
 +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].recv_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recvptr       = overlap->recvbuf;
 +        }
 +        else
 +        {
 +            send_id       = overlap->recv_id[ipulse];
 +            recv_id       = overlap->send_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].recv_index0;
 +            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].send_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recvptr       = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +        }
 +
 +        sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +        datasize      = pme->pmegrid_ny * pme->pmegrid_nz;
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid, overlap->nodeid, send_id,
 +                    pme->pmegrid_start_ix,
 +                    send_index0-pme->pmegrid_start_ix,
 +                    send_index0-pme->pmegrid_start_ix+send_nindex);
 +            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid, overlap->nodeid, recv_id,
 +                    pme->pmegrid_start_ix,
 +                    recv_index0-pme->pmegrid_start_ix,
 +                    recv_index0-pme->pmegrid_start_ix+recv_nindex);
 +        }
 +
 +        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
 +                     send_id, ipulse,
 +                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
 +                     recv_id, ipulse,
 +                     overlap->mpi_comm, &stat);
 +
 +        /* ADD data from contiguous recv buffer */
 +        if (direction == GMX_SUM_QGRID_FORWARD)
 +        {
 +            p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +            for (i = 0; i < recv_nindex*datasize; i++)
 +            {
 +                p[i] += overlap->recvbuf[i];
 +            }
 +        }
 +    }
 +}
 +#endif
 +
 +
 +static int
 +copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
 +{
 +    ivec    local_fft_ndata, local_fft_offset, local_fft_size;
 +    ivec    local_pme_size;
 +    int     i, ix, iy, iz;
 +    int     pmeidx, fftidx;
 +
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    local_pme_size[0] = pme->pmegrid_nx;
 +    local_pme_size[1] = pme->pmegrid_ny;
 +    local_pme_size[2] = pme->pmegrid_nz;
 +
 +    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
 +       the offset is identical, and the PME grid always has more data (due to overlap)
 +     */
 +    {
 +#ifdef DEBUG_PME
 +        FILE *fp, *fp2;
 +        char  fn[STRLEN], format[STRLEN];
 +        real  val;
 +        sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
 +        fp = ffopen(fn, "w");
 +        sprintf(fn, "pmegrid%d.txt", pme->nodeid);
 +        fp2 = ffopen(fn, "w");
 +        sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
 +#endif
 +
 +        for (ix = 0; ix < local_fft_ndata[XX]; ix++)
 +        {
 +            for (iy = 0; iy < local_fft_ndata[YY]; iy++)
 +            {
 +                for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
 +                {
 +                    pmeidx          = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
 +                    fftidx          = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
 +                    fftgrid[fftidx] = pmegrid[pmeidx];
 +#ifdef DEBUG_PME
 +                    val = 100*pmegrid[pmeidx];
 +                    if (pmegrid[pmeidx] != 0)
 +                    {
 +                        fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
 +                                5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
 +                    }
 +                    if (pmegrid[pmeidx] != 0)
 +                    {
 +                        fprintf(fp2, "%-12s  %5d  %5d  %5d  %12.5e\n",
 +                                "qgrid",
 +                                pme->pmegrid_start_ix + ix,
 +                                pme->pmegrid_start_iy + iy,
 +                                pme->pmegrid_start_iz + iz,
 +                                pmegrid[pmeidx]);
 +                    }
 +#endif
 +                }
 +            }
 +        }
 +#ifdef DEBUG_PME
 +        ffclose(fp);
 +        ffclose(fp2);
 +#endif
 +    }
 +    return 0;
 +}
 +
 +
 +static gmx_cycles_t omp_cyc_start()
 +{
 +    return gmx_cycles_read();
 +}
 +
 +static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
 +{
 +    return gmx_cycles_read() - c;
 +}
 +
 +
 +static int
 +copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
 +                        int nthread, int thread)
 +{
 +    ivec          local_fft_ndata, local_fft_offset, local_fft_size;
 +    ivec          local_pme_size;
 +    int           ixy0, ixy1, ixy, ix, iy, iz;
 +    int           pmeidx, fftidx;
 +#ifdef PME_TIME_THREADS
 +    gmx_cycles_t  c1;
 +    static double cs1 = 0;
 +    static int    cnt = 0;
 +#endif
 +
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_start();
 +#endif
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    local_pme_size[0] = pme->pmegrid_nx;
 +    local_pme_size[1] = pme->pmegrid_ny;
 +    local_pme_size[2] = pme->pmegrid_nz;
 +
 +    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
 +       the offset is identical, and the PME grid always has more data (due to overlap)
 +     */
 +    ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
 +    ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
 +
 +    for (ixy = ixy0; ixy < ixy1; ixy++)
 +    {
 +        ix = ixy/local_fft_ndata[YY];
 +        iy = ixy - ix*local_fft_ndata[YY];
 +
 +        pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
 +        fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
 +        for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
 +        {
 +            pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
 +        }
 +    }
 +
 +#ifdef PME_TIME_THREADS
 +    c1   = omp_cyc_end(c1);
 +    cs1 += (double)c1;
 +    cnt++;
 +    if (cnt % 20 == 0)
 +    {
 +        printf("copy %.2f\n", cs1*1e-9);
 +    }
 +#endif
 +
 +    return 0;
 +}
 +
 +
 +static void
 +wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 +{
 +    int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
 +
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    pnx = pme->pmegrid_nx;
 +    pny = pme->pmegrid_ny;
 +    pnz = pme->pmegrid_nz;
 +
 +    overlap = pme->pme_order - 1;
 +
 +    /* Add periodic overlap in z */
 +    for (ix = 0; ix < pme->pmegrid_nx; ix++)
 +    {
 +        for (iy = 0; iy < pme->pmegrid_ny; iy++)
 +        {
 +            for (iz = 0; iz < overlap; iz++)
 +            {
 +                pmegrid[(ix*pny+iy)*pnz+iz] +=
 +                    pmegrid[(ix*pny+iy)*pnz+nz+iz];
 +            }
 +        }
 +    }
 +
 +    if (pme->nnodes_minor == 1)
 +    {
 +        for (ix = 0; ix < pme->pmegrid_nx; ix++)
 +        {
 +            for (iy = 0; iy < overlap; iy++)
 +            {
 +                for (iz = 0; iz < nz; iz++)
 +                {
 +                    pmegrid[(ix*pny+iy)*pnz+iz] +=
 +                        pmegrid[(ix*pny+ny+iy)*pnz+iz];
 +                }
 +            }
 +        }
 +    }
 +
 +    if (pme->nnodes_major == 1)
 +    {
 +        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 +
 +        for (ix = 0; ix < overlap; ix++)
 +        {
 +            for (iy = 0; iy < ny_x; iy++)
 +            {
 +                for (iz = 0; iz < nz; iz++)
 +                {
 +                    pmegrid[(ix*pny+iy)*pnz+iz] +=
 +                        pmegrid[((nx+ix)*pny+iy)*pnz+iz];
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void
 +unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 +{
 +    int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
 +
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    pnx = pme->pmegrid_nx;
 +    pny = pme->pmegrid_ny;
 +    pnz = pme->pmegrid_nz;
 +
 +    overlap = pme->pme_order - 1;
 +
 +    if (pme->nnodes_major == 1)
 +    {
 +        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 +
 +        for (ix = 0; ix < overlap; ix++)
 +        {
 +            int iy, iz;
 +
 +            for (iy = 0; iy < ny_x; iy++)
 +            {
 +                for (iz = 0; iz < nz; iz++)
 +                {
 +                    pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
 +                        pmegrid[(ix*pny+iy)*pnz+iz];
 +                }
 +            }
 +        }
 +    }
 +
 +    if (pme->nnodes_minor == 1)
 +    {
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +        for (ix = 0; ix < pme->pmegrid_nx; ix++)
 +        {
 +            int iy, iz;
 +
 +            for (iy = 0; iy < overlap; iy++)
 +            {
 +                for (iz = 0; iz < nz; iz++)
 +                {
 +                    pmegrid[(ix*pny+ny+iy)*pnz+iz] =
 +                        pmegrid[(ix*pny+iy)*pnz+iz];
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Copy periodic overlap in z */
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +    for (ix = 0; ix < pme->pmegrid_nx; ix++)
 +    {
 +        int iy, iz;
 +
 +        for (iy = 0; iy < pme->pmegrid_ny; iy++)
 +        {
 +            for (iz = 0; iz < overlap; iz++)
 +            {
 +                pmegrid[(ix*pny+iy)*pnz+nz+iz] =
 +                    pmegrid[(ix*pny+iy)*pnz+iz];
 +            }
 +        }
 +    }
 +}
 +
 +static void clear_grid(int nx, int ny, int nz, real *grid,
 +                       ivec fs, int *flag,
 +                       int fx, int fy, int fz,
 +                       int order)
 +{
 +    int nc, ncz;
 +    int fsx, fsy, fsz, gx, gy, gz, g0x, g0y, x, y, z;
 +    int flind;
 +
 +    nc  = 2 + (order - 2)/FLBS;
 +    ncz = 2 + (order - 2)/FLBSZ;
 +
 +    for (fsx = fx; fsx < fx+nc; fsx++)
 +    {
 +        for (fsy = fy; fsy < fy+nc; fsy++)
 +        {
 +            for (fsz = fz; fsz < fz+ncz; fsz++)
 +            {
 +                flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
 +                if (flag[flind] == 0)
 +                {
 +                    gx  = fsx*FLBS;
 +                    gy  = fsy*FLBS;
 +                    gz  = fsz*FLBSZ;
 +                    g0x = (gx*ny + gy)*nz + gz;
 +                    for (x = 0; x < FLBS; x++)
 +                    {
 +                        g0y = g0x;
 +                        for (y = 0; y < FLBS; y++)
 +                        {
 +                            for (z = 0; z < FLBSZ; z++)
 +                            {
 +                                grid[g0y+z] = 0;
 +                            }
 +                            g0y += nz;
 +                        }
 +                        g0x += ny*nz;
 +                    }
 +
 +                    flag[flind] = 1;
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
 +#define DO_BSPLINE(order)                            \
 +    for (ithx = 0; (ithx < order); ithx++)                    \
 +    {                                                    \
 +        index_x = (i0+ithx)*pny*pnz;                     \
 +        valx    = qn*thx[ithx];                          \
 +                                                     \
 +        for (ithy = 0; (ithy < order); ithy++)                \
 +        {                                                \
 +            valxy    = valx*thy[ithy];                   \
 +            index_xy = index_x+(j0+ithy)*pnz;            \
 +                                                     \
 +            for (ithz = 0; (ithz < order); ithz++)            \
 +            {                                            \
 +                index_xyz        = index_xy+(k0+ithz);   \
 +                grid[index_xyz] += valxy*thz[ithz];      \
 +            }                                            \
 +        }                                                \
 +    }
 +
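 +/* DO_BSPLINE adds the tensor-product stencil of one charge to the grid:
 + * grid[i0+ithx][j0+ithy][k0+ithz] += qn*thx[ithx]*thy[ithy]*thz[ithz]
 + * for 0 <= ithx,ithy,ithz < order, with the grid stored flat as
 + * (x*pny + y)*pnz + z.
 + */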
 +
 +static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
 +                                     pme_atomcomm_t *atc, splinedata_t *spline,
 +                                     pme_spline_work_t *work)
 +{
 +
 +    /* spread charges from home atoms to local grid */
 +    real          *grid;
 +    pme_overlap_t *ol;
 +    int            b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
 +    int       *    idxptr;
 +    int            order, norder, index_x, index_xy, index_xyz;
 +    real           valx, valxy, qn;
 +    real          *thx, *thy, *thz;
 +    int            localsize, bndsize;
 +    int            pnx, pny, pnz, ndatatot;
 +    int            offx, offy, offz;
 +
 +    pnx = pmegrid->s[XX];
 +    pny = pmegrid->s[YY];
 +    pnz = pmegrid->s[ZZ];
 +
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
 +    ndatatot = pnx*pny*pnz;
 +    grid     = pmegrid->grid;
 +    for (i = 0; i < ndatatot; i++)
 +    {
 +        grid[i] = 0;
 +    }
 +
 +    order = pmegrid->order;
 +
 +    for (nn = 0; nn < spline->n; nn++)
 +    {
 +        n  = spline->ind[nn];
 +        qn = atc->q[n];
 +
 +        if (qn != 0)
 +        {
 +            idxptr = atc->idx[n];
 +            norder = nn*order;
 +
 +            i0   = idxptr[XX] - offx;
 +            j0   = idxptr[YY] - offy;
 +            k0   = idxptr[ZZ] - offz;
 +
 +            thx = spline->theta[XX] + norder;
 +            thy = spline->theta[YY] + norder;
 +            thz = spline->theta[ZZ] + norder;
 +
 +            switch (order)
 +            {
 +                case 4:
 +#ifdef PME_SSE
 +#ifdef PME_SSE_UNALIGNED
 +#define PME_SPREAD_SSE_ORDER4
 +#else
 +#define PME_SPREAD_SSE_ALIGNED
 +#define PME_ORDER 4
 +#endif
 +#include "pme_sse_single.h"
 +#else
 +                    DO_BSPLINE(4);
 +#endif
 +                    break;
 +                case 5:
 +#ifdef PME_SSE
 +#define PME_SPREAD_SSE_ALIGNED
 +#define PME_ORDER 5
 +#include "pme_sse_single.h"
 +#else
 +                    DO_BSPLINE(5);
 +#endif
 +                    break;
 +                default:
 +                    DO_BSPLINE(order);
 +                    break;
 +            }
 +        }
 +    }
 +}
 +
 +static void set_grid_alignment(int *pmegrid_nz, int pme_order)
 +{
 +#ifdef PME_SSE
 +    if (pme_order == 5
 +#ifndef PME_SSE_UNALIGNED
 +        || pme_order == 4
 +#endif
 +        )
 +    {
 +        /* Round nz up to a multiple of 4 to ensure alignment */
 +        *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
 +    }
 +#endif
 +}
 +
 +static void set_gridsize_alignment(int *gridsize, int pme_order)
 +{
 +#ifdef PME_SSE
 +#ifndef PME_SSE_UNALIGNED
 +    if (pme_order == 4)
 +    {
 +        /* Add extra elements to ensure that aligned operations do not go
 +         * beyond the allocated grid size.
 +         * Note that for pme_order=5, the pme grid z-size alignment
 +         * ensures that we will not go beyond the grid size.
 +         */
 +        *gridsize += 4;
 +    }
 +#endif
 +#endif
 +}
 +
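 +/* Initializes one (sub)grid: ci is the thread cell index, offset the
 + * start in grid points and [x0,x1) etc. the interpolation range; n is
 + * the used size including the pme_order-1 spreading overlap and s the
 + * allocated size, where z may be padded for SSE alignment. If ptr is
 + * NULL the grid storage is allocated here, otherwise ptr is used.
 + */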
 +static void pmegrid_init(pmegrid_t *grid,
 +                         int cx, int cy, int cz,
 +                         int x0, int y0, int z0,
 +                         int x1, int y1, int z1,
 +                         gmx_bool set_alignment,
 +                         int pme_order,
 +                         real *ptr)
 +{
 +    int nz, gridsize;
 +
 +    grid->ci[XX]     = cx;
 +    grid->ci[YY]     = cy;
 +    grid->ci[ZZ]     = cz;
 +    grid->offset[XX] = x0;
 +    grid->offset[YY] = y0;
 +    grid->offset[ZZ] = z0;
 +    grid->n[XX]      = x1 - x0 + pme_order - 1;
 +    grid->n[YY]      = y1 - y0 + pme_order - 1;
 +    grid->n[ZZ]      = z1 - z0 + pme_order - 1;
 +    copy_ivec(grid->n, grid->s);
 +
 +    nz = grid->s[ZZ];
 +    set_grid_alignment(&nz, pme_order);
 +    if (set_alignment)
 +    {
 +        grid->s[ZZ] = nz;
 +    }
 +    else if (nz != grid->s[ZZ])
 +    {
 +        gmx_incons("pmegrid_init call with an unaligned z size");
 +    }
 +
 +    grid->order = pme_order;
 +    if (ptr == NULL)
 +    {
 +        gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
 +        set_gridsize_alignment(&gridsize, pme_order);
 +        snew_aligned(grid->grid, gridsize, 16);
 +    }
 +    else
 +    {
 +        grid->grid = ptr;
 +    }
 +}
 +
 +static int div_round_up(int numerator, int denominator)
 +{
 +    return (numerator + denominator - 1)/denominator;
 +}
 +
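 +/* Chooses a thread subgrid division nsub with nsub[XX]*nsub[YY]*nsub[ZZ]
 + * == nthread that minimizes the per-thread grid volume including overlap.
 + * A minimal worked example, assuming nthread=8, ovl=3 and n=(32,32,32):
 + * the 2x2x2 division costs (16+3)^3 = 6859 points per thread, beating
 + * e.g. 8x1x1 with (4+3)*(32+3)*(32+3) = 8575, so nsub becomes (2,2,2).
 + */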
 +static void make_subgrid_division(const ivec n, int ovl, int nthread,
 +                                  ivec nsub)
 +{
 +    int gsize_opt, gsize;
 +    int nsx, nsy, nsz;
 +    char *env;
 +
 +    gsize_opt = -1;
 +    for (nsx = 1; nsx <= nthread; nsx++)
 +    {
 +        if (nthread % nsx == 0)
 +        {
 +            for (nsy = 1; nsy <= nthread; nsy++)
 +            {
 +                if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
 +                {
 +                    nsz = nthread/(nsx*nsy);
 +
 +                    /* Determine the number of grid points per thread */
 +                    gsize =
 +                        (div_round_up(n[XX], nsx) + ovl)*
 +                        (div_round_up(n[YY], nsy) + ovl)*
 +                        (div_round_up(n[ZZ], nsz) + ovl);
 +
 +                    /* Minimize the number of grid points per thread
 +                     * and, secondarily, the number of cuts in minor dimensions.
 +                     */
 +                    if (gsize_opt == -1 ||
 +                        gsize < gsize_opt ||
 +                        (gsize == gsize_opt &&
 +                         (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
 +                    {
 +                        nsub[XX]  = nsx;
 +                        nsub[YY]  = nsy;
 +                        nsub[ZZ]  = nsz;
 +                        gsize_opt = gsize;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    env = getenv("GMX_PME_THREAD_DIVISION");
 +    if (env != NULL)
 +    {
 +        sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
 +    }
 +
 +    if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
 +    {
 +        gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
 +    }
 +}
 +
 +static void pmegrids_init(pmegrids_t *grids,
 +                          int nx, int ny, int nz, int nz_base,
 +                          int pme_order,
 +                          int nthread,
 +                          int overlap_x,
 +                          int overlap_y)
 +{
 +    ivec n, n_base, g0, g1;
 +    int t, x, y, z, d, i, tfac;
 +    int max_comm_lines = -1;
 +
 +    n[XX] = nx - (pme_order - 1);
 +    n[YY] = ny - (pme_order - 1);
 +    n[ZZ] = nz - (pme_order - 1);
 +
 +    copy_ivec(n, n_base);
 +    n_base[ZZ] = nz_base;
 +
 +    pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
 +                 NULL);
 +
 +    grids->nthread = nthread;
 +
 +    make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
 +
 +    if (grids->nthread > 1)
 +    {
 +        ivec nst;
 +        int gridsize;
 +
 +        for (d = 0; d < DIM; d++)
 +        {
 +            nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
 +        }
 +        set_grid_alignment(&nst[ZZ], pme_order);
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
 +                    grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
 +            fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
 +                    nx, ny, nz,
 +                    nst[XX], nst[YY], nst[ZZ]);
 +        }
 +
 +        snew(grids->grid_th, grids->nthread);
 +        t        = 0;
 +        gridsize = nst[XX]*nst[YY]*nst[ZZ];
 +        set_gridsize_alignment(&gridsize, pme_order);
 +        snew_aligned(grids->grid_all,
 +                     grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
 +                     16);
 +
 +        for (x = 0; x < grids->nc[XX]; x++)
 +        {
 +            for (y = 0; y < grids->nc[YY]; y++)
 +            {
 +                for (z = 0; z < grids->nc[ZZ]; z++)
 +                {
 +                    pmegrid_init(&grids->grid_th[t],
 +                                 x, y, z,
 +                                 (n[XX]*(x  ))/grids->nc[XX],
 +                                 (n[YY]*(y  ))/grids->nc[YY],
 +                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
 +                                 (n[XX]*(x+1))/grids->nc[XX],
 +                                 (n[YY]*(y+1))/grids->nc[YY],
 +                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
 +                                 TRUE,
 +                                 pme_order,
 +                                 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
 +                    t++;
 +                }
 +            }
 +        }
 +    }
 +
 +    snew(grids->g2t, DIM);
 +    tfac = 1;
 +    for (d = DIM-1; d >= 0; d--)
 +    {
 +        snew(grids->g2t[d], n[d]);
 +        t = 0;
 +        for (i = 0; i < n[d]; i++)
 +        {
 +            /* The second check should match the parameters
 +             * of the pmegrid_init call above.
 +             */
 +            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
 +            {
 +                t++;
 +            }
 +            grids->g2t[d][i] = t*tfac;
 +        }
 +
 +        tfac *= grids->nc[d];
 +
 +        switch (d)
 +        {
 +            case XX: max_comm_lines = overlap_x;     break;
 +            case YY: max_comm_lines = overlap_y;     break;
 +            case ZZ: max_comm_lines = pme_order - 1; break;
 +        }
 +        grids->nthread_comm[d] = 0;
 +        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
 +               grids->nthread_comm[d] < grids->nc[d])
 +        {
 +            grids->nthread_comm[d]++;
 +        }
 +        if (debug != NULL)
 +        {
 +            fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
 +                    'x'+d, grids->nthread_comm[d]);
 +        }
 +        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
 +         * work, but this is not a problematic restriction.
 +         */
 +        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
 +        {
 +            gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
 +        }
 +    }
 +}
 +
 +
 +static void pmegrids_destroy(pmegrids_t *grids)
 +{
 +    int t;
 +
 +    if (grids->grid.grid != NULL)
 +    {
 +        sfree(grids->grid.grid);
 +
 +        if (grids->nthread > 0)
 +        {
 +            for (t = 0; t < grids->nthread; t++)
 +            {
 +                sfree(grids->grid_th[t].grid);
 +            }
 +            sfree(grids->grid_th);
 +        }
 +    }
 +}
 +
 +
 +static void realloc_work(pme_work_t *work, int nkx)
 +{
 +    if (nkx > work->nalloc)
 +    {
 +        work->nalloc = nkx;
 +        srenew(work->mhx, work->nalloc);
 +        srenew(work->mhy, work->nalloc);
 +        srenew(work->mhz, work->nalloc);
 +        srenew(work->m2, work->nalloc);
 +        /* Allocate an aligned pointer for SSE operations, including 3 extra
 +         * elements at the end since SSE operates on 4 elements at a time.
 +         */
 +        sfree_aligned(work->denom);
 +        sfree_aligned(work->tmp1);
 +        sfree_aligned(work->eterm);
 +        snew_aligned(work->denom, work->nalloc+3, 16);
 +        snew_aligned(work->tmp1, work->nalloc+3, 16);
 +        snew_aligned(work->eterm, work->nalloc+3, 16);
 +        srenew(work->m2inv, work->nalloc);
 +    }
 +}
 +
 +
 +static void free_work(pme_work_t *work)
 +{
 +    sfree(work->mhx);
 +    sfree(work->mhy);
 +    sfree(work->mhz);
 +    sfree(work->m2);
 +    sfree_aligned(work->denom);
 +    sfree_aligned(work->tmp1);
 +    sfree_aligned(work->eterm);
 +    sfree(work->m2inv);
 +}
 +
 +
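 +/* calc_exponentials computes e[kx] = f*exp(r[kx])/d[kx] for kx in
 + * [start,end). Note that the SSE version starts at the aligned index 0
 + * rather than at start (the extra elements are computed but unused) and
 + * refines _mm_rcp_ps with one Newton-Raphson step to get an accurate
 + * 1/d.
 + */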
 +#ifdef PME_SSE
 +/* Calculate exponentials through SSE in float precision */
 +inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
 +{
 +    {
 +        const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
 +        __m128 f_sse;
 +        __m128 lu;
 +        __m128 tmp_d1, d_inv, tmp_r, tmp_e;
 +        int kx;
 +        f_sse = _mm_load1_ps(&f);
 +        for (kx = 0; kx < end; kx += 4)
 +        {
 +            tmp_d1   = _mm_load_ps(d_aligned+kx);
 +            lu       = _mm_rcp_ps(tmp_d1);
 +            d_inv    = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
 +            tmp_r    = _mm_load_ps(r_aligned+kx);
 +            tmp_r    = gmx_mm_exp_ps(tmp_r);
 +            tmp_e    = _mm_mul_ps(f_sse, d_inv);
 +            tmp_e    = _mm_mul_ps(tmp_e, tmp_r);
 +            _mm_store_ps(e_aligned+kx, tmp_e);
 +        }
 +    }
 +}
 +#else
 +inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
 +{
 +    int kx;
 +    for (kx = start; kx < end; kx++)
 +    {
 +        d[kx] = 1.0/d[kx];
 +    }
 +    for (kx = start; kx < end; kx++)
 +    {
 +        r[kx] = exp(r[kx]);
 +    }
 +    for (kx = start; kx < end; kx++)
 +    {
 +        e[kx] = f*r[kx]*d[kx];
 +    }
 +}
 +#endif
 +
 +
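 +/* Reciprocal-space solve: scales each k-space amplitude by the
 + * influence function eterm(m) = elfac*exp(-pi^2*m^2/ewaldcoeff^2)/denom(m)
 + * and, when bEnerVir is set, also accumulates the mesh energy and
 + * virial. Each thread handles a contiguous range of (y,z) lines of the
 + * y-major, z-middle, x-minor transposed grid.
 + */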
 +static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
 +                         real ewaldcoeff, real vol,
 +                         gmx_bool bEnerVir,
 +                         int nthread, int thread)
 +{
 +    /* do recip sum over local cells in grid */
 +    /* y major, z middle, x minor or continuous */
 +    t_complex *p0;
 +    int     kx, ky, kz, maxkx, maxky, maxkz;
 +    int     nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
 +    real    mx, my, mz;
 +    real    factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
 +    real    ets2, struct2, vfactor, ets2vf;
 +    real    d1, d2, energy = 0;
 +    real    by, bz;
 +    real    virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
 +    real    rxx, ryx, ryy, rzx, rzy, rzz;
 +    pme_work_t *work;
 +    real    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
 +    real    mhxk, mhyk, mhzk, m2k;
 +    real    corner_fac;
 +    ivec    complex_order;
 +    ivec    local_ndata, local_offset, local_size;
 +    real    elfac;
 +
 +    elfac = ONE_4PI_EPS0/pme->epsilon_r;
 +
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
 +                                      complex_order,
 +                                      local_ndata,
 +                                      local_offset,
 +                                      local_size);
 +
 +    rxx = pme->recipbox[XX][XX];
 +    ryx = pme->recipbox[YY][XX];
 +    ryy = pme->recipbox[YY][YY];
 +    rzx = pme->recipbox[ZZ][XX];
 +    rzy = pme->recipbox[ZZ][YY];
 +    rzz = pme->recipbox[ZZ][ZZ];
 +
 +    maxkx = (nx+1)/2;
 +    maxky = (ny+1)/2;
 +    maxkz = nz/2+1;
 +
 +    work  = &pme->work[thread];
 +    mhx   = work->mhx;
 +    mhy   = work->mhy;
 +    mhz   = work->mhz;
 +    m2    = work->m2;
 +    denom = work->denom;
 +    tmp1  = work->tmp1;
 +    eterm = work->eterm;
 +    m2inv = work->m2inv;
 +
 +    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
 +    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
 +
 +    for (iyz = iyz0; iyz < iyz1; iyz++)
 +    {
 +        iy = iyz/local_ndata[ZZ];
 +        iz = iyz - iy*local_ndata[ZZ];
 +
 +        ky = iy + local_offset[YY];
 +
 +        if (ky < maxky)
 +        {
 +            my = ky;
 +        }
 +        else
 +        {
 +            my = (ky - ny);
 +        }
 +
 +        by = M_PI*vol*pme->bsp_mod[YY][ky];
 +
 +        kz = iz + local_offset[ZZ];
 +
 +        mz = kz;
 +
 +        bz = pme->bsp_mod[ZZ][kz];
 +
 +        /* 0.5 correction for corner points */
 +        corner_fac = 1;
 +        if (kz == 0 || kz == (nz+1)/2)
 +        {
 +            corner_fac = 0.5;
 +        }
 +
 +        p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
 +
 +        /* We should skip the k-space point (0,0,0) */
 +        if (local_offset[XX] > 0 || ky > 0 || kz > 0)
 +        {
 +            kxstart = local_offset[XX];
 +        }
 +        else
 +        {
 +            kxstart = local_offset[XX] + 1;
 +            p0++;
 +        }
 +        kxend = local_offset[XX] + local_ndata[XX];
 +
 +        if (bEnerVir)
 +        {
 +            /* More expensive inner loop, especially because of the storage
 +             * of the mh elements in arrays.
 +             * Because x is the minor grid index, all mh elements
 +             * depend on kx for triclinic unit cells.
 +             */
 +
 +            /* Two explicit loops to avoid a conditional inside the loop */
 +            for (kx = kxstart; kx < maxkx; kx++)
 +            {
 +                mx = kx;
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                mhx[kx]   = mhxk;
 +                mhy[kx]   = mhyk;
 +                mhz[kx]   = mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            for (kx = maxkx; kx < kxend; kx++)
 +            {
 +                mx = (kx - nx);
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                mhx[kx]   = mhxk;
 +                mhy[kx]   = mhyk;
 +                mhz[kx]   = mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            for (kx = kxstart; kx < kxend; kx++)
 +            {
 +                m2inv[kx] = 1.0/m2[kx];
 +            }
 +
 +            calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
 +
 +            for (kx = kxstart; kx < kxend; kx++, p0++)
 +            {
 +                d1      = p0->re;
 +                d2      = p0->im;
 +
 +                p0->re  = d1*eterm[kx];
 +                p0->im  = d2*eterm[kx];
 +
 +                struct2 = 2.0*(d1*d1+d2*d2);
 +
 +                tmp1[kx] = eterm[kx]*struct2;
 +            }
 +
 +            for (kx = kxstart; kx < kxend; kx++)
 +            {
 +                ets2     = corner_fac*tmp1[kx];
 +                vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
 +                energy  += ets2;
 +
 +                ets2vf   = ets2*vfactor;
 +                virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
 +                virxy   += ets2vf*mhx[kx]*mhy[kx];
 +                virxz   += ets2vf*mhx[kx]*mhz[kx];
 +                viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
 +                viryz   += ets2vf*mhy[kx]*mhz[kx];
 +                virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
 +            }
 +        }
 +        else
 +        {
 +            /* We don't need to calculate the energy and the virial.
 +             * In this case the triclinic overhead is small.
 +             */
 +
 +            /* Two explicit loops to avoid a conditional inside the loop */
 +
 +            for (kx = kxstart; kx < maxkx; kx++)
 +            {
 +                mx = kx;
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            for (kx = maxkx; kx < kxend; kx++)
 +            {
 +                mx = (kx - nx);
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
 +
 +            for (kx = kxstart; kx < kxend; kx++, p0++)
 +            {
 +                d1      = p0->re;
 +                d2      = p0->im;
 +
 +                p0->re  = d1*eterm[kx];
 +                p0->im  = d2*eterm[kx];
 +            }
 +        }
 +    }
 +
 +    if (bEnerVir)
 +    {
 +        /* Update virial with local values.
 +         * The virial is symmetric by definition.
 +         * This virial seems OK for isotropic scaling, but I'm
 +         * experiencing problems on semi-isotropic membranes.
 +         * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
 +         */
 +        work->vir[XX][XX] = 0.25*virxx;
 +        work->vir[YY][YY] = 0.25*viryy;
 +        work->vir[ZZ][ZZ] = 0.25*virzz;
 +        work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
 +        work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
 +        work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
 +
 +        /* This energy should be corrected for a charged system */
 +        work->energy = 0.5*energy;
 +    }
 +
 +    /* Return the loop count */
 +    return local_ndata[YY]*local_ndata[XX];
 +}
 +
 +static void get_pme_ener_vir(const gmx_pme_t pme, int nthread,
 +                             real *mesh_energy, matrix vir)
 +{
 +    /* This function sums output over threads
 +     * and should therefore only be called after thread synchronization.
 +     */
 +    int thread;
 +
 +    *mesh_energy = pme->work[0].energy;
 +    copy_mat(pme->work[0].vir, vir);
 +
 +    for (thread = 1; thread < nthread; thread++)
 +    {
 +        *mesh_energy += pme->work[thread].energy;
 +        m_add(vir, pme->work[thread].vir, vir);
 +    }
 +}
 +
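 +/* DO_FSPLINE accumulates, for one charge, the gradient of the
 + * interpolated potential in grid units:
 + *   fx += dthx[ithx]*thy[ithy]*thz[ithz]*grid[...]
 + *   fy += thx[ithx]*dthy[ithy]*thz[ithz]*grid[...]
 + *   fz += thx[ithx]*thy[ithy]*dthz[ithz]*grid[...]
 + * The caller converts these to Cartesian forces with the charge and
 + * the reciprocal box vectors.
 + */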
 +#define DO_FSPLINE(order)                      \
 +    for (ithx = 0; (ithx < order); ithx++)              \
 +    {                                              \
 +        index_x = (i0+ithx)*pny*pnz;               \
 +        tx      = thx[ithx];                       \
 +        dx      = dthx[ithx];                      \
 +                                               \
 +        for (ithy = 0; (ithy < order); ithy++)          \
 +        {                                          \
 +            index_xy = index_x+(j0+ithy)*pnz;      \
 +            ty       = thy[ithy];                  \
 +            dy       = dthy[ithy];                 \
 +            fxy1     = fz1 = 0;                    \
 +                                               \
 +            for (ithz = 0; (ithz < order); ithz++)      \
 +            {                                      \
 +                gval  = grid[index_xy+(k0+ithz)];  \
 +                fxy1 += thz[ithz]*gval;            \
 +                fz1  += dthz[ithz]*gval;           \
 +            }                                      \
 +            fx += dx*ty*fxy1;                      \
 +            fy += tx*dy*fxy1;                      \
 +            fz += tx*ty*fz1;                       \
 +        }                                          \
 +    }
 +
 +
 +static void gather_f_bsplines(gmx_pme_t pme, real *grid,
 +                              gmx_bool bClearF, pme_atomcomm_t *atc,
 +                              splinedata_t *spline,
 +                              real scale)
 +{
 +    /* sum forces for local particles */
 +    int     nn, n, ithx, ithy, ithz, i0, j0, k0;
 +    int     index_x, index_xy;
 +    int     nx, ny, nz, pnx, pny, pnz;
 +    int *   idxptr;
 +    real    tx, ty, dx, dy, qn;
 +    real    fx, fy, fz, gval;
 +    real    fxy1, fz1;
 +    real    *thx, *thy, *thz, *dthx, *dthy, *dthz;
 +    int     norder;
 +    real    rxx, ryx, ryy, rzx, rzy, rzz;
 +    int     order;
 +
 +    pme_spline_work_t *work;
 +
 +    work = pme->spline_work;
 +
 +    order = pme->pme_order;
 +    thx   = spline->theta[XX];
 +    thy   = spline->theta[YY];
 +    thz   = spline->theta[ZZ];
 +    dthx  = spline->dtheta[XX];
 +    dthy  = spline->dtheta[YY];
 +    dthz  = spline->dtheta[ZZ];
 +    nx    = pme->nkx;
 +    ny    = pme->nky;
 +    nz    = pme->nkz;
 +    pnx   = pme->pmegrid_nx;
 +    pny   = pme->pmegrid_ny;
 +    pnz   = pme->pmegrid_nz;
 +
 +    rxx   = pme->recipbox[XX][XX];
 +    ryx   = pme->recipbox[YY][XX];
 +    ryy   = pme->recipbox[YY][YY];
 +    rzx   = pme->recipbox[ZZ][XX];
 +    rzy   = pme->recipbox[ZZ][YY];
 +    rzz   = pme->recipbox[ZZ][ZZ];
 +
 +    for (nn = 0; nn < spline->n; nn++)
 +    {
 +        n  = spline->ind[nn];
 +        qn = scale*atc->q[n];
 +
 +        if (bClearF)
 +        {
 +            atc->f[n][XX] = 0;
 +            atc->f[n][YY] = 0;
 +            atc->f[n][ZZ] = 0;
 +        }
 +        if (qn != 0)
 +        {
 +            fx     = 0;
 +            fy     = 0;
 +            fz     = 0;
 +            idxptr = atc->idx[n];
 +            norder = nn*order;
 +
 +            i0   = idxptr[XX];
 +            j0   = idxptr[YY];
 +            k0   = idxptr[ZZ];
 +
 +            /* Pointer arithmetic alert, next six statements */
 +            thx  = spline->theta[XX] + norder;
 +            thy  = spline->theta[YY] + norder;
 +            thz  = spline->theta[ZZ] + norder;
 +            dthx = spline->dtheta[XX] + norder;
 +            dthy = spline->dtheta[YY] + norder;
 +            dthz = spline->dtheta[ZZ] + norder;
 +
 +            switch (order)
 +            {
 +                case 4:
 +#ifdef PME_SSE
 +#ifdef PME_SSE_UNALIGNED
 +#define PME_GATHER_F_SSE_ORDER4
 +#else
 +#define PME_GATHER_F_SSE_ALIGNED
 +#define PME_ORDER 4
 +#endif
 +#include "pme_sse_single.h"
 +#else
 +                    DO_FSPLINE(4);
 +#endif
 +                    break;
 +                case 5:
 +#ifdef PME_SSE
 +#define PME_GATHER_F_SSE_ALIGNED
 +#define PME_ORDER 5
 +#include "pme_sse_single.h"
 +#else
 +                    DO_FSPLINE(5);
 +#endif
 +                    break;
 +                default:
 +                    DO_FSPLINE(order);
 +                    break;
 +            }
 +
 +            atc->f[n][XX] += -qn*( fx*nx*rxx );
 +            atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
 +            atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
 +        }
 +    }
 +    /* Since the energy, and not the forces, is interpolated,
 +     * the net force might not be exactly zero.
 +     * This could be solved by also interpolating F, but
 +     * that comes at a cost.
 +     * A better hack is to remove the net force every
 +     * step, but that must be done at a higher level
 +     * since this routine doesn't see all atoms if running
 +     * in parallel. It is unclear how important this is.  EL 990726
 +     */
 +}
 +
 +
 +static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
 +                                   pme_atomcomm_t *atc)
 +{
 +    splinedata_t *spline;
 +    int     n, ithx, ithy, ithz, i0, j0, k0;
 +    int     index_x, index_xy;
 +    int *   idxptr;
 +    real    energy, pot, tx, ty, qn, gval;
 +    real    *thx, *thy, *thz;
 +    int     norder;
 +    int     order;
 +
 +    spline = &atc->spline[0];
 +
 +    order = pme->pme_order;
 +
 +    energy = 0;
 +    for (n = 0; (n < atc->n); n++)
 +    {
 +        qn      = atc->q[n];
 +
 +        if (qn != 0)
 +        {
 +            idxptr = atc->idx[n];
 +            norder = n*order;
 +
 +            i0   = idxptr[XX];
 +            j0   = idxptr[YY];
 +            k0   = idxptr[ZZ];
 +
 +            /* Pointer arithmetic alert, next three statements */
 +            thx  = spline->theta[XX] + norder;
 +            thy  = spline->theta[YY] + norder;
 +            thz  = spline->theta[ZZ] + norder;
 +
 +            pot = 0;
 +            for (ithx = 0; (ithx < order); ithx++)
 +            {
 +                index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
 +                tx      = thx[ithx];
 +
 +                for (ithy = 0; (ithy < order); ithy++)
 +                {
 +                    index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
 +                    ty       = thy[ithy];
 +
 +                    for (ithz = 0; (ithz < order); ithz++)
 +                    {
 +                        gval  = grid[index_xy+(k0+ithz)];
 +                        pot  += tx*ty*thz[ithz]*gval;
 +                    }
 +
 +                }
 +            }
 +
 +            energy += pot*qn;
 +        }
 +    }
 +
 +    return energy;
 +}
 +
 +/* Macro to force loop unrolling by fixing order.
 + * This gives a significant performance gain.
 + */
 +#define CALC_SPLINE(order)                     \
 +    {                                              \
 +        int j, k, l;                                 \
 +        real dr, div;                               \
 +        real data[PME_ORDER_MAX];                  \
 +        real ddata[PME_ORDER_MAX];                 \
 +                                               \
 +        for (j = 0; (j < DIM); j++)                     \
 +        {                                          \
 +            dr  = xptr[j];                         \
 +                                               \
 +            /* dr is relative offset from lower cell limit */ \
 +            data[order-1] = 0;                     \
 +            data[1]       = dr;                          \
 +            data[0]       = 1 - dr;                      \
 +                                               \
 +            for (k = 3; (k < order); k++)               \
 +            {                                      \
 +                div       = 1.0/(k - 1.0);               \
 +                data[k-1] = div*dr*data[k-2];      \
 +                for (l = 1; (l < (k-1)); l++)           \
 +                {                                  \
 +                    data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
 +                                       data[k-l-1]);                \
 +                }                                  \
 +                data[0] = div*(1-dr)*data[0];      \
 +            }                                      \
 +            /* differentiate */                    \
 +            ddata[0] = -data[0];                   \
 +            for (k = 1; (k < order); k++)               \
 +            {                                      \
 +                ddata[k] = data[k-1] - data[k];    \
 +            }                                      \
 +                                               \
 +            div           = 1.0/(order - 1);                 \
 +            data[order-1] = div*dr*data[order-2];  \
 +            for (l = 1; (l < (order-1)); l++)           \
 +            {                                      \
 +                data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
 +                                       (order-l-dr)*data[order-l-1]); \
 +            }                                      \
 +            data[0] = div*(1 - dr)*data[0];        \
 +                                               \
 +            for (k = 0; k < order; k++)                 \
 +            {                                      \
 +                theta[j][i*order+k]  = data[k];    \
 +                dtheta[j][i*order+k] = ddata[k];   \
 +            }                                      \
 +        }                                          \
 +    }
 +
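 +/* CALC_SPLINE evaluates, per dimension, the order cardinal B-spline
 + * weights at fractional offset dr with the standard recursion (each
 + * order built from the previous one), together with their derivatives
 + * ddata[k] = data[k-1] - data[k].
 + */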
 +void make_bsplines(splinevec theta, splinevec dtheta, int order,
 +                   rvec fractx[], int nr, int ind[], real charge[],
 +                   gmx_bool bFreeEnergy)
 +{
 +    /* construct splines for local atoms */
 +    int  i, ii;
 +    real *xptr;
 +
 +    for (i = 0; i < nr; i++)
 +    {
 +        /* With free energy we do not use the charge check.
 +         * In most cases this will be more efficient than calling make_bsplines
 +         * twice, since usually more than half the particles have charges.
 +         */
 +        ii = ind[i];
 +        if (bFreeEnergy || charge[ii] != 0.0)
 +        {
 +            xptr = fractx[ii];
 +            switch (order)
 +            {
 +                case 4:  CALC_SPLINE(4);     break;
 +                case 5:  CALC_SPLINE(5);     break;
 +                default: CALC_SPLINE(order); break;
 +            }
 +        }
 +    }
 +}
 +
 +
 +void make_dft_mod(real *mod, real *data, int ndata)
 +{
 +    int i, j;
 +    real sc, ss, arg;
 +
 +    for (i = 0; i < ndata; i++)
 +    {
 +        sc = ss = 0;
 +        for (j = 0; j < ndata; j++)
 +        {
 +            arg = (2.0*M_PI*i*j)/ndata;
 +            sc += data[j]*cos(arg);
 +            ss += data[j]*sin(arg);
 +        }
 +        mod[i] = sc*sc+ss*ss;
 +    }
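 +    /* Smooth out near-zero moduli; note that this assumes the first and
 +     * last entries are above the threshold, since mod[i-1] or mod[i+1]
 +     * would otherwise index out of range.
 +     */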
 +    for (i = 0; i < ndata; i++)
 +    {
 +        if (mod[i] < 1e-7)
 +        {
 +            mod[i] = (mod[i-1]+mod[i+1])*0.5;
 +        }
 +    }
 +}
 +
 +
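 +/* Computes |b(m)|^2, the squared modulus of the DFT of the B-spline,
 + * for each dimension; these enter the denominator of the influence
 + * function in solve_pme_yzx via pme->bsp_mod.
 + */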
 +static void make_bspline_moduli(splinevec bsp_mod,
 +                                int nx, int ny, int nz, int order)
 +{
 +    int nmax = max(nx, max(ny, nz));
 +    real *data, *ddata, *bsp_data;
 +    int i, k, l;
 +    real div;
 +
 +    snew(data, order);
 +    snew(ddata, order);
 +    snew(bsp_data, nmax);
 +
 +    data[order-1] = 0;
 +    data[1]       = 0;
 +    data[0]       = 1;
 +
 +    for (k = 3; k < order; k++)
 +    {
 +        div       = 1.0/(k-1.0);
 +        data[k-1] = 0;
 +        for (l = 1; l < (k-1); l++)
 +        {
 +            data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
 +        }
 +        data[0] = div*data[0];
 +    }
 +    /* differentiate */
 +    ddata[0] = -data[0];
 +    for (k = 1; k < order; k++)
 +    {
 +        ddata[k] = data[k-1]-data[k];
 +    }
 +    div           = 1.0/(order-1);
 +    data[order-1] = 0;
 +    for (l = 1; l < (order-1); l++)
 +    {
 +        data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
 +    }
 +    data[0] = div*data[0];
 +
 +    for (i = 0; i < nmax; i++)
 +    {
 +        bsp_data[i] = 0;
 +    }
 +    for (i = 1; i <= order; i++)
 +    {
 +        bsp_data[i] = data[i-1];
 +    }
 +
 +    make_dft_mod(bsp_mod[XX], bsp_data, nx);
 +    make_dft_mod(bsp_mod[YY], bsp_data, ny);
 +    make_dft_mod(bsp_mod[ZZ], bsp_data, nz);
 +
 +    sfree(data);
 +    sfree(ddata);
 +    sfree(bsp_data);
 +}
 +
 +
 +/* Return the P3M optimal influence function */
 +static double do_p3m_influence(double z, int order)
 +{
 +    double z2, z4;
 +
 +    z2 = z*z;
 +    z4 = z2*z2;
 +
 +    /* The formula and most constants can be found in:
 +     * Ballenegger et al., JCTC 8, 936 (2012)
 +     */
 +    switch (order)
 +    {
 +        case 2:
 +            return 1.0 - 2.0*z2/3.0;
 +            break;
 +        case 3:
 +            return 1.0 - z2 + 2.0*z4/15.0;
 +            break;
 +        case 4:
 +            return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
 +            break;
 +        case 5:
 +            return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
 +            break;
 +        case 6:
 +            return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
 +            break;
 +        case 7:
 +            return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
 +            break;
 +        case 8:
 +            return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
 +            break;
 +    }
 +
 +    return 0.0;
 +}
 +
 +/* Calculate the P3M B-spline moduli for one dimension */
 +static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
 +{
 +    double zarg, zai, sinzai, infl;
 +    int    maxk, i;
 +
 +    if (order > 8)
 +    {
 +        gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
 +    }
 +
 +    zarg = M_PI/n;
 +
 +    maxk = (n + 1)/2;
 +
 +    for (i = -maxk; i < 0; i++)
 +    {
 +        zai          = zarg*i;
 +        sinzai       = sin(zai);
 +        infl         = do_p3m_influence(sinzai, order);
 +        bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
 +    }
 +    bsp_mod[0] = 1.0;
 +    for (i = 1; i < maxk; i++)
 +    {
 +        zai        = zarg*i;
 +        sinzai     = sin(zai);
 +        infl       = do_p3m_influence(sinzai, order);
 +        bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
 +    }
 +}
 +
 +/* Calculate the P3M B-spline moduli */
 +static void make_p3m_bspline_moduli(splinevec bsp_mod,
 +                                    int nx, int ny, int nz, int order)
 +{
 +    make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order);
 +    make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order);
 +    make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order);
 +}
 +
 +
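 +/* Orders the communication partners by increasing slab distance,
 + * alternating forward and backward neighbors, so the nearest slabs are
 + * exchanged first.
 + */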
 +static void setup_coordinate_communication(pme_atomcomm_t *atc)
 +{
 +    int nslab, n, i;
 +    int fw, bw;
 +
 +    nslab = atc->nslab;
 +
 +    n = 0;
 +    for (i = 1; i <= nslab/2; i++)
 +    {
 +        fw = (atc->nodeid + i) % nslab;
 +        bw = (atc->nodeid - i + nslab) % nslab;
 +        if (n < nslab - 1)
 +        {
 +            atc->node_dest[n] = fw;
 +            atc->node_src[n]  = bw;
 +            n++;
 +        }
 +        if (n < nslab - 1)
 +        {
 +            atc->node_dest[n] = bw;
 +            atc->node_src[n]  = fw;
 +            n++;
 +        }
 +    }
 +}
 +
 +int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
 +{
 +    int thread;
 +
 +    if (NULL != log)
 +    {
 +        fprintf(log, "Destroying PME data structures.\n");
 +    }
 +
 +    sfree((*pmedata)->nnx);
 +    sfree((*pmedata)->nny);
 +    sfree((*pmedata)->nnz);
 +
 +    pmegrids_destroy(&(*pmedata)->pmegridA);
 +
 +    sfree((*pmedata)->fftgridA);
 +    sfree((*pmedata)->cfftgridA);
 +    gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
 +
 +    if ((*pmedata)->pmegridB.grid.grid != NULL)
 +    {
 +        pmegrids_destroy(&(*pmedata)->pmegridB);
 +        sfree((*pmedata)->fftgridB);
 +        sfree((*pmedata)->cfftgridB);
 +        gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
 +    }
 +    for (thread = 0; thread < (*pmedata)->nthread; thread++)
 +    {
 +        free_work(&(*pmedata)->work[thread]);
 +    }
 +    sfree((*pmedata)->work);
 +
 +    sfree(*pmedata);
 +    *pmedata = NULL;
 +
 +    return 0;
 +}
 +
 +static int mult_up(int n, int f)
 +{
 +    return ((n + f - 1)/f)*f;
 +}
 +
 +
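 +/* Rough estimate of the FFT+solve load imbalance: n1..n3 are the
 + * padded grid volumes of the three parallel layouts, and solve, done
 + * in the x-minor layout counted by n3, is weighted double relative to
 + * an FFT stage; the normalization is only approximate.
 + */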
 +static double pme_load_imbalance(gmx_pme_t pme)
 +{
 +    int    nma, nmi;
 +    double n1, n2, n3;
 +
 +    nma = pme->nnodes_major;
 +    nmi = pme->nnodes_minor;
 +
 +    n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
 +    n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
 +    n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
 +
 +    /* pme_solve is roughly double the cost of an fft */
 +
 +    return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
 +}
 +
 +static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc, t_commrec *cr,
 +                          int dimind, gmx_bool bSpread)
 +{
 +    int nk, k, s, thread;
 +
 +    atc->dimind    = dimind;
 +    atc->nslab     = 1;
 +    atc->nodeid    = 0;
 +    atc->pd_nalloc = 0;
 +#ifdef GMX_MPI
 +    if (pme->nnodes > 1)
 +    {
 +        atc->mpi_comm = pme->mpi_comm_d[dimind];
 +        MPI_Comm_size(atc->mpi_comm, &atc->nslab);
 +        MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
 +    }
 +#endif
 +
 +    atc->bSpread   = bSpread;
 +    atc->pme_order = pme->pme_order;
 +
 +    if (atc->nslab > 1)
 +    {
 +        /* These three allocations are not required for particle decomp. */
 +        snew(atc->node_dest, atc->nslab);
 +        snew(atc->node_src, atc->nslab);
 +        setup_coordinate_communication(atc);
 +
 +        snew(atc->count_thread, pme->nthread);
 +        for (thread = 0; thread < pme->nthread; thread++)
 +        {
 +            snew(atc->count_thread[thread], atc->nslab);
 +        }
 +        atc->count = atc->count_thread[0];
 +        snew(atc->rcount, atc->nslab);
 +        snew(atc->buf_index, atc->nslab);
 +    }
 +
 +    atc->nthread = pme->nthread;
 +    if (atc->nthread > 1)
 +    {
 +        snew(atc->thread_plist, atc->nthread);
 +    }
 +    snew(atc->spline, atc->nthread);
 +    for (thread = 0; thread < atc->nthread; thread++)
 +    {
 +        if (atc->nthread > 1)
 +        {
 +            snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
 +            atc->thread_plist[thread].n += GMX_CACHE_SEP;
 +        }
 +        snew(atc->spline[thread].thread_one, pme->nthread);
 +        atc->spline[thread].thread_one[thread] = 1;
 +    }
 +}
 +
 +static void
 +init_overlap_comm(pme_overlap_t *  ol,
 +                  int              norder,
 +#ifdef GMX_MPI
 +                  MPI_Comm         comm,
 +#endif
 +                  int              nnodes,
 +                  int              nodeid,
 +                  int              ndata,
 +                  int              commplainsize)
 +{
 +    int lbnd, rbnd, maxlr, b, i;
 +    int exten;
 +    int nn, nk;
 +    pme_grid_comm_t *pgc;
 +    gmx_bool bCont;
 +    int fft_start, fft_end, send_index1, recv_index1;
 +#ifdef GMX_MPI
 +    MPI_Status stat;
 +
 +    ol->mpi_comm = comm;
 +#endif
 +
 +    ol->nnodes = nnodes;
 +    ol->nodeid = nodeid;
 +
 +    /* Linear translation of the PME grid won't affect reciprocal space
 +     * calculations, so to optimize we only interpolate "upwards",
 +     * which also means we only have to consider overlap in one direction.
 +     * I.e., particles on this node might also be spread to grid indices
 +     * that belong to higher nodes (modulo nnodes)
 +     */
 +
 +    snew(ol->s2g0, ol->nnodes+1);
 +    snew(ol->s2g1, ol->nnodes);
 +    if (debug)
 +    {
 +        fprintf(debug, "PME slab boundaries:");
 +    }
 +    for (i = 0; i < nnodes; i++)
 +    {
 +        /* s2g0: the local interpolation grid start.
 +         * s2g1: the local interpolation grid end.
 +         * Because grid overlap communication only goes forward,
 +         * the grid slabs for the FFTs should be rounded down.
 +         */
 +        ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
 +        ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "  %3d %3d", ol->s2g0[i], ol->s2g1[i]);
 +        }
 +    }
 +    ol->s2g0[nnodes] = ndata;
 +    if (debug)
 +    {
 +        fprintf(debug, "\n");
 +    }
 +
 +    /* Determine how many nodes we need to communicate the grid overlap with */
 +    b = 0;
 +    do
 +    {
 +        b++;
 +        bCont = FALSE;
 +        for (i = 0; i < nnodes; i++)
 +        {
 +            if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
 +                (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
 +            {
 +                bCont = TRUE;
 +            }
 +        }
 +    }
 +    while (bCont && b < nnodes);
 +    ol->noverlap_nodes = b - 1;
 +
 +    snew(ol->send_id, ol->noverlap_nodes);
 +    snew(ol->recv_id, ol->noverlap_nodes);
 +    for (b = 0; b < ol->noverlap_nodes; b++)
 +    {
 +        ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
 +        ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
 +    }
 +    snew(ol->comm_data, ol->noverlap_nodes);
 +
 +    ol->send_size = 0;
 +    for (b = 0; b < ol->noverlap_nodes; b++)
 +    {
 +        pgc = &ol->comm_data[b];
 +        /* Send */
 +        fft_start        = ol->s2g0[ol->send_id[b]];
 +        fft_end          = ol->s2g0[ol->send_id[b]+1];
 +        if (ol->send_id[b] < nodeid)
 +        {
 +            fft_start += ndata;
 +            fft_end   += ndata;
 +        }
 +        send_index1       = ol->s2g1[nodeid];
 +        send_index1       = min(send_index1, fft_end);
 +        pgc->send_index0  = fft_start;
 +        pgc->send_nindex  = max(0, send_index1 - pgc->send_index0);
 +        ol->send_size    += pgc->send_nindex;
 +
 +        /* We always start receiving to the first index of our slab */
 +        fft_start        = ol->s2g0[ol->nodeid];
 +        fft_end          = ol->s2g0[ol->nodeid+1];
 +        recv_index1      = ol->s2g1[ol->recv_id[b]];
 +        if (ol->recv_id[b] > nodeid)
 +        {
 +            recv_index1 -= ndata;
 +        }
 +        recv_index1      = min(recv_index1, fft_end);
 +        pgc->recv_index0 = fft_start;
 +        pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
 +    }
 +
 +#ifdef GMX_MPI
 +    /* Communicate the buffer sizes to receive */
 +    for (b = 0; b < ol->noverlap_nodes; b++)
 +    {
 +        MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
 +                     &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
 +                     ol->mpi_comm, &stat);
 +    }
 +#endif
 +
 +    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
 +    snew(ol->sendbuf, norder*commplainsize);
 +    snew(ol->recvbuf, norder*commplainsize);
 +}
 +
 +static void
 +make_gridindex5_to_localindex(int n, int local_start, int local_range,
 +                              int **global_to_local,
 +                              real **fraction_shift)
 +{
 +    int i;
 +    int * gtl;
 +    real * fsh;
 +
 +    snew(gtl, 5*n);
 +    snew(fsh, 5*n);
 +    for (i = 0; (i < 5*n); i++)
 +    {
 +        /* Determine the global to local grid index */
 +        gtl[i] = (i - local_start + n) % n;
 +        /* For coordinates that fall within the local grid the fraction
 +         * is correct, we don't need to shift it.
 +         */
 +        fsh[i] = 0;
 +        if (local_range < n)
 +        {
 +            /* Due to rounding issues i could be 1 beyond the lower or
 +             * upper boundary of the local grid. Correct the index for this.
 +             * If we shift the index, we need to shift the fraction by
 +             * the same amount in the other direction to not affect
 +             * the weights.
 +             * Note that due to this shifting the weights at the end of
 +             * the spline might change, but that will only involve values
 +             * between zero and values close to the precision of a real,
 +             * which is anyhow the accuracy of the whole mesh calculation.
 +             */
 +            /* With local_range=0 we should not change i=local_start */
 +            if (i % n != local_start)
 +            {
 +                if (gtl[i] == n-1)
 +                {
 +                    gtl[i] = 0;
 +                    fsh[i] = -1;
 +                }
 +                else if (gtl[i] == local_range)
 +                {
 +                    gtl[i] = local_range - 1;
 +                    fsh[i] = 1;
 +                }
 +            }
 +        }
 +    }
 +
 +    *global_to_local = gtl;
 +    *fraction_shift  = fsh;
 +}
 +
 +static pme_spline_work_t *make_pme_spline_work(int order)
 +{
 +    pme_spline_work_t *work;
 +
 +#ifdef PME_SSE
 +    float  tmp[8];
 +    __m128 zero_SSE;
 +    int    of, i;
 +
 +    snew_aligned(work, 1, 16);
 +
 +    zero_SSE = _mm_setzero_ps();
 +
 +    /* Generate bit masks to mask out the unused grid entries,
 +     * as we only operate on 'order' of the 8 grid entries that are
 +     * loaded into two SSE float registers.
 +     */
 +    for (of = 0; of < 8-(order-1); of++)
 +    {
 +        for (i = 0; i < 8; i++)
 +        {
 +            tmp[i] = (i >= of && i < of+order ? 1 : 0);
 +        }
 +        work->mask_SSE0[of] = _mm_loadu_ps(tmp);
 +        work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
 +        work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE);
 +        work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE);
 +    }
 +#else
 +    work = NULL;
 +#endif
 +
 +    return work;
 +}
 +
 +static void
 +gmx_pme_check_grid_restrictions(FILE *fplog, char dim, int nnodes, int *nk)
 +{
 +    int nk_new;
 +
 +    if (*nk % nnodes != 0)
 +    {
 +        nk_new = nnodes*(*nk/nnodes + 1);
 +
 +        if (2*nk_new >= 3*(*nk))
 +        {
 +            gmx_fatal(FARGS, "The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
 +                      dim, *nk, dim, nnodes, dim);
 +        }
 +
 +        if (fplog != NULL)
 +        {
 +            fprintf(fplog, "\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
 +                    dim, *nk, dim, nnodes, dim, nk_new, dim);
 +        }
 +
 +        *nk = nk_new;
 +    }
 +}
 +
 +int gmx_pme_init(gmx_pme_t *         pmedata,
 +                 t_commrec *         cr,
 +                 int                 nnodes_major,
 +                 int                 nnodes_minor,
 +                 t_inputrec *        ir,
 +                 int                 homenr,
 +                 gmx_bool            bFreeEnergy,
 +                 gmx_bool            bReproducible,
 +                 int                 nthread)
 +{
 +    gmx_pme_t pme = NULL;
 +
 +    pme_atomcomm_t *atc;
 +    ivec ndata;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Creating PME data structures.\n");
 +    }
 +    snew(pme, 1);
 +
 +    pme->redist_init         = FALSE;
 +    pme->sum_qgrid_tmp       = NULL;
 +    pme->sum_qgrid_dd_tmp    = NULL;
 +    pme->buf_nalloc          = 0;
 +    pme->redist_buf_nalloc   = 0;
 +
 +    pme->nnodes              = 1;
 +    pme->bPPnode             = TRUE;
 +
 +    pme->nnodes_major        = nnodes_major;
 +    pme->nnodes_minor        = nnodes_minor;
 +
 +#ifdef GMX_MPI
 +    if (nnodes_major*nnodes_minor > 1)
 +    {
 +        pme->mpi_comm = cr->mpi_comm_mygroup;
 +
 +        MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
 +        MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
 +        if (pme->nnodes != nnodes_major*nnodes_minor)
 +        {
 +            gmx_incons("PME node count mismatch");
 +        }
 +    }
 +    else
 +    {
 +        pme->mpi_comm = MPI_COMM_NULL;
 +    }
 +#endif
 +
 +    if (pme->nnodes == 1)
 +    {
 +#ifdef GMX_MPI
 +        pme->mpi_comm_d[0] = MPI_COMM_NULL;
 +        pme->mpi_comm_d[1] = MPI_COMM_NULL;
 +#endif
 +        pme->ndecompdim   = 0;
 +        pme->nodeid_major = 0;
 +        pme->nodeid_minor = 0;
 +    }
 +    else
 +    {
 +        if (nnodes_minor == 1)
 +        {
 +#ifdef GMX_MPI
 +            pme->mpi_comm_d[0] = pme->mpi_comm;
 +            pme->mpi_comm_d[1] = MPI_COMM_NULL;
 +#endif
 +            pme->ndecompdim   = 1;
 +            pme->nodeid_major = pme->nodeid;
 +            pme->nodeid_minor = 0;
 +
 +        }
 +        else if (nnodes_major == 1)
 +        {
 +#ifdef GMX_MPI
 +            pme->mpi_comm_d[0] = MPI_COMM_NULL;
 +            pme->mpi_comm_d[1] = pme->mpi_comm;
 +#endif
 +            pme->ndecompdim   = 1;
 +            pme->nodeid_major = 0;
 +            pme->nodeid_minor = pme->nodeid;
 +        }
 +        else
 +        {
 +            if (pme->nnodes % nnodes_major != 0)
 +            {
 +                gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
 +            }
 +            pme->ndecompdim = 2;
 +
 +#ifdef GMX_MPI
 +            MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
 +                           pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
 +            MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
 +                           pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
 +
 +            MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
 +            MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
 +            MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
 +            MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
 +#endif
 +        }
 +        pme->bPPnode = (cr->duty & DUTY_PP);
 +    }
 +
 +    pme->nthread = nthread;
 +
 +    if (ir->ePBC == epbcSCREW)
 +    {
 +        gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
 +    }
 +
 +    pme->bFEP        = ((ir->efep != efepNO) && bFreeEnergy);
 +    pme->nkx         = ir->nkx;
 +    pme->nky         = ir->nky;
 +    pme->nkz         = ir->nkz;
 +    pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
 +    pme->pme_order   = ir->pme_order;
 +    pme->epsilon_r   = ir->epsilon_r;
 +
 +    if (pme->pme_order > PME_ORDER_MAX)
 +    {
 +        gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
 +                  pme->pme_order, PME_ORDER_MAX);
 +    }
 +
 +    /* Currently pme.c supports only the fft5d FFT code.
 +     * Therefore the grid always needs to be divisible by nnodes.
 +     * When the old 1D code is also supported again, change this check.
 +     *
 +     * This check should be done before calling gmx_pme_init
 +     * and fplog should be passed instead of stderr.
 +     *
 +       if (pme->ndecompdim >= 2)
 +     */
 +    if (pme->ndecompdim >= 1)
 +    {
 +        /*
 +           gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
 +                                        'x',nnodes_major,&pme->nkx);
 +           gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
 +                                        'y',nnodes_minor,&pme->nky);
 +         */
 +    }
 +
 +    if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
 +        pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
 +        pme->nkz <= pme->pme_order)
 +    {
 +        gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order", pme->pme_order);
 +    }
 +
 +    if (pme->nnodes > 1)
 +    {
 +        double imbal;
 +
 +#ifdef GMX_MPI
 +        MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
 +        MPI_Type_commit(&(pme->rvec_mpi));
 +#endif
 +
 +        /* Note that the charge spreading and force gathering, which usually
 +         * take about the same amount of time as FFT+solve_pme,
 +         * are always fully load balanced
 +         * (unless the charge distribution is inhomogeneous).
 +         */
 +
 +        imbal = pme_load_imbalance(pme);
 +        if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
 +        {
 +            fprintf(stderr,
 +                    "\n"
 +                    "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
 +                    "      For optimal PME load balancing\n"
 +                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
 +                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
 +                    "\n",
 +                    (int)((imbal-1)*100 + 0.5),
 +                    pme->nkx, pme->nky, pme->nnodes_major,
 +                    pme->nky, pme->nkz, pme->nnodes_minor);
 +        }
 +    }
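 +
 +    /* [Editorial sketch, not part of this patch] The imbalance estimate
 +     * used above is, per decomposed dimension, the largest slab over the
 +     * average slab; schematically (the real pme_load_imbalance() is
 +     * defined elsewhere in this file):
 +     *
 +     *   static double slab_imbalance(int nk, int nnodes)
 +     *   {
 +     *       int max_slab = (nk + nnodes - 1)/nnodes;  // ceil(nk/nnodes)
 +     *       return max_slab/(nk/(double)nnodes);
 +     *   }
 +     *
 +     * E.g. nk = 50 on 4 nodes gives slabs of 13,13,13,11, an estimate of
 +     * 13/12.5 = 1.04, i.e. 4% imbalance.
 +     */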
 +
 +    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
 +    /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
 +     * y is always copied through a buffer: we don't need padding in z,
 +     * but we do need the overlap in x because of the communication order.
 +     */
 +    init_overlap_comm(&pme->overlap[0], pme->pme_order,
 +#ifdef GMX_MPI
 +                      pme->mpi_comm_d[0],
 +#endif
 +                      pme->nnodes_major, pme->nodeid_major,
 +                      pme->nkx,
 +                      (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
 +
 +    /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
 +     * We do this with an offset buffer of equal size, so we need to allocate
 +     * extra for the offset. That's what the (+1)*pme->nkz is for.
 +     */
 +    init_overlap_comm(&pme->overlap[1], pme->pme_order,
 +#ifdef GMX_MPI
 +                      pme->mpi_comm_d[1],
 +#endif
 +                      pme->nnodes_minor, pme->nodeid_minor,
 +                      pme->nky,
 +                      (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
 +
 +    /* Check for a limitation of the (current) sum_fftgrid_dd code.
 +     * We only allow multiple communication pulses in dim 1, not in dim 0.
 +     */
 +    if (pme->nthread > 1 && (pme->overlap[0].noverlap_nodes > 1 ||
 +                             pme->nkx < pme->nnodes_major*pme->pme_order))
 +    {
 +        gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x and should be >= pme_order (%d). To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
 +                  pme->nkx/(double)pme->nnodes_major, pme->pme_order);
 +    }
 +
 +    snew(pme->bsp_mod[XX], pme->nkx);
 +    snew(pme->bsp_mod[YY], pme->nky);
 +    snew(pme->bsp_mod[ZZ], pme->nkz);
 +
 +    /* The required size of the interpolation grid, including overlap.
 +     * The allocated size (pmegrid_n?) might be slightly larger.
 +     */
 +    pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
 +        pme->overlap[0].s2g0[pme->nodeid_major];
 +    pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
 +        pme->overlap[1].s2g0[pme->nodeid_minor];
 +    pme->pmegrid_nz_base = pme->nkz;
 +    pme->pmegrid_nz      = pme->pmegrid_nz_base + pme->pme_order - 1;
 +    set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
 +
 +    pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
 +    pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
 +    pme->pmegrid_start_iz = 0;
 +
 +    make_gridindex5_to_localindex(pme->nkx,
 +                                  pme->pmegrid_start_ix,
 +                                  pme->pmegrid_nx - (pme->pme_order-1),
 +                                  &pme->nnx, &pme->fshx);
 +    make_gridindex5_to_localindex(pme->nky,
 +                                  pme->pmegrid_start_iy,
 +                                  pme->pmegrid_ny - (pme->pme_order-1),
 +                                  &pme->nny, &pme->fshy);
 +    make_gridindex5_to_localindex(pme->nkz,
 +                                  pme->pmegrid_start_iz,
 +                                  pme->pmegrid_nz_base,
 +                                  &pme->nnz, &pme->fshz);
 +
 +    pmegrids_init(&pme->pmegridA,
 +                  pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
 +                  pme->pmegrid_nz_base,
 +                  pme->pme_order,
 +                  pme->nthread,
 +                  pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
 +                  pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
 +
 +    pme->spline_work = make_pme_spline_work(pme->pme_order);
 +
 +    ndata[0] = pme->nkx;
 +    ndata[1] = pme->nky;
 +    ndata[2] = pme->nkz;
 +
 +    /* This routine will allocate the grid data to fit the FFTs */
 +    gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
 +                            &pme->fftgridA, &pme->cfftgridA,
 +                            pme->mpi_comm_d,
 +                            pme->overlap[0].s2g0, pme->overlap[1].s2g0,
 +                            bReproducible, pme->nthread);
 +
 +    if (bFreeEnergy)
 +    {
 +        pmegrids_init(&pme->pmegridB,
 +                      pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
 +                      pme->pmegrid_nz_base,
 +                      pme->pme_order,
 +                      pme->nthread,
 +                      pme->nkx % pme->nnodes_major != 0,
 +                      pme->nky % pme->nnodes_minor != 0);
 +
 +        gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
 +                                &pme->fftgridB, &pme->cfftgridB,
 +                                pme->mpi_comm_d,
 +                                pme->overlap[0].s2g0, pme->overlap[1].s2g0,
 +                                bReproducible, pme->nthread);
 +    }
 +    else
 +    {
 +        pme->pmegridB.grid.grid = NULL;
 +        pme->fftgridB           = NULL;
 +        pme->cfftgridB          = NULL;
 +    }
 +
 +    if (!pme->bP3M)
 +    {
 +        /* Use plain SPME B-spline interpolation */
 +        make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
 +    }
 +    else
 +    {
 +        /* Use the P3M grid-optimized influence function */
 +        make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
 +    }
 +
 +    /* Use atc[0] for spreading */
 +    init_atomcomm(pme, &pme->atc[0], cr, nnodes_major > 1 ? 0 : 1, TRUE);
 +    if (pme->ndecompdim >= 2)
 +    {
 +        init_atomcomm(pme, &pme->atc[1], cr, 1, FALSE);
 +    }
 +
 +    if (pme->nnodes == 1)
 +    {
 +        pme->atc[0].n = homenr;
 +        pme_realloc_atomcomm_things(&pme->atc[0]);
 +    }
 +
 +    {
 +        int thread;
 +
 +        /* Use fft5d, order after FFT is y major, z, x minor */
 +
 +        snew(pme->work, pme->nthread);
 +        for (thread = 0; thread < pme->nthread; thread++)
 +        {
 +            realloc_work(&pme->work[thread], pme->nkx);
 +        }
 +    }
 +
 +    *pmedata = pme;
 +
 +    return 0;
 +}
 +
 +static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
 +{
 +    int d, t;
 +
 +    for (d = 0; d < DIM; d++)
 +    {
 +        if (new->grid.n[d] > old->grid.n[d])
 +        {
 +            return;
 +        }
 +    }
 +
 +    sfree_aligned(new->grid.grid);
 +    new->grid.grid = old->grid.grid;
 +
 +    if (new->nthread > 1 && new->nthread == old->nthread)
 +    {
 +        sfree_aligned(new->grid_all);
 +        for (t = 0; t < new->nthread; t++)
 +        {
 +            new->grid_th[t].grid = old->grid_th[t].grid;
 +        }
 +    }
 +}
 +
 +int gmx_pme_reinit(gmx_pme_t *         pmedata,
 +                   t_commrec *         cr,
 +                   gmx_pme_t           pme_src,
 +                   const t_inputrec *  ir,
 +                   ivec                grid_size)
 +{
 +    t_inputrec irc;
 +    int homenr;
 +    int ret;
 +
 +    irc     = *ir;
 +    irc.nkx = grid_size[XX];
 +    irc.nky = grid_size[YY];
 +    irc.nkz = grid_size[ZZ];
 +
 +    if (pme_src->nnodes == 1)
 +    {
 +        homenr = pme_src->atc[0].n;
 +    }
 +    else
 +    {
 +        homenr = -1;
 +    }
 +
 +    ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
 +                       &irc, homenr, pme_src->bFEP, FALSE, pme_src->nthread);
 +
 +    if (ret == 0)
 +    {
 +        /* We can easily reuse the allocated pme grids in pme_src */
 +        reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA);
 +        /* We would like to reuse the fft grids, but that's harder */
 +    }
 +
 +    return ret;
 +}
 +
 +
 +static void copy_local_grid(gmx_pme_t pme,
 +                            pmegrids_t *pmegrids, int thread, real *fftgrid)
 +{
 +    ivec local_fft_ndata, local_fft_offset, local_fft_size;
 +    int  fft_my, fft_mz;
 +    int  nsx, nsy, nsz;
 +    ivec nf;
 +    int  offx, offy, offz, x, y, z, i0, i0t;
 +    int  d;
 +    pmegrid_t *pmegrid;
 +    real *grid_th;
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +    fft_my = local_fft_size[YY];
 +    fft_mz = local_fft_size[ZZ];
 +
 +    pmegrid = &pmegrids->grid_th[thread];
 +
 +    nsx = pmegrid->s[XX];
 +    nsy = pmegrid->s[YY];
 +    nsz = pmegrid->s[ZZ];
 +
 +    for (d = 0; d < DIM; d++)
 +    {
 +        nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
 +                    local_fft_ndata[d] - pmegrid->offset[d]);
 +    }
 +
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
 +    /* Directly copy the non-overlapping parts of the local grids.
 +     * This also initializes the full grid.
 +     */
 +    grid_th = pmegrid->grid;
 +    for (x = 0; x < nf[XX]; x++)
 +    {
 +        for (y = 0; y < nf[YY]; y++)
 +        {
 +            i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
 +            i0t = (x*nsy + y)*nsz;
 +            for (z = 0; z < nf[ZZ]; z++)
 +            {
 +                fftgrid[i0+z] = grid_th[i0t+z];
 +            }
 +        }
 +    }
 +}
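 +
 +/* [Editorial sketch, not part of this patch] The index expressions in
 + * copy_local_grid() are plain row-major 3D addressing:
 + *
 + *   // flat index of (x,y,z) in a grid with allocated dims (mx,my,mz)
 + *   static int idx3(int x, int y, int z, int my, int mz)
 + *   {
 + *       return (x*my + y)*mz + z;
 + *   }
 + *
 + * so i0 is the flat index of (offx+x, offy+y, offz) in the FFT grid and
 + * i0t that of (x, y, 0) in the thread-local grid.
 + */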
 +
 +static void
 +reduce_threadgrid_overlap(gmx_pme_t pme,
 +                          const pmegrids_t *pmegrids, int thread,
 +                          real *fftgrid, real *commbuf_x, real *commbuf_y)
 +{
 +    ivec local_fft_ndata, local_fft_offset, local_fft_size;
 +    int  fft_nx, fft_ny, fft_nz;
 +    int  fft_my, fft_mz;
 +    int  buf_my = -1;
 +    int  nsx, nsy, nsz;
 +    ivec ne;
 +    int  offx, offy, offz, x, y, z, i0, i0t;
 +    int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
 +    gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
 +    gmx_bool bCommX, bCommY;
 +    int  d;
 +    int  thread_f;
 +    const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
 +    const real *grid_th;
 +    real *commbuf = NULL;
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +    fft_nx = local_fft_ndata[XX];
 +    fft_ny = local_fft_ndata[YY];
 +    fft_nz = local_fft_ndata[ZZ];
 +
 +    fft_my = local_fft_size[YY];
 +    fft_mz = local_fft_size[ZZ];
 +
 +    /* This routine is called when all threads have finished spreading.
 +     * Here each thread sums grid contributions calculated by other threads
 +     * into the thread-local grid volume.
 +     * To minimize the number of grid copying operations,
 +     * this routine sums immediately from the pmegrid to the fftgrid.
 +     */
 +
 +    /* Determine which part of the full node grid we should operate on;
 +     * this is our thread-local part of the full grid.
 +     */
 +    pmegrid = &pmegrids->grid_th[thread];
 +
 +    for (d = 0; d < DIM; d++)
 +    {
 +        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
 +                    local_fft_ndata[d]);
 +    }
 +
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
 +
 +    bClearBufX  = TRUE;
 +    bClearBufY  = TRUE;
 +    bClearBufXY = TRUE;
 +
 +    /* Now loop over all the thread data blocks that contribute
 +     * to the grid region we (our thread) are operating on.
 +     */
 +    /* Note that fft_nx/y is equal to the number of grid points
 +     * between the first point of our node grid and that of the next node.
 +     */
 +    for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
 +    {
 +        fx     = pmegrid->ci[XX] + sx;
 +        ox     = 0;
 +        bCommX = FALSE;
 +        if (fx < 0)
 +        {
 +            fx    += pmegrids->nc[XX];
 +            ox    -= fft_nx;
 +            bCommX = (pme->nnodes_major > 1);
 +        }
 +        pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
 +        ox       += pmegrid_g->offset[XX];
 +        if (!bCommX)
 +        {
 +            tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
 +        }
 +        else
 +        {
 +            tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
 +        }
 +
 +        for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
 +        {
 +            fy     = pmegrid->ci[YY] + sy;
 +            oy     = 0;
 +            bCommY = FALSE;
 +            if (fy < 0)
 +            {
 +                fy    += pmegrids->nc[YY];
 +                oy    -= fft_ny;
 +                bCommY = (pme->nnodes_minor > 1);
 +            }
 +            pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
 +            oy       += pmegrid_g->offset[YY];
 +            if (!bCommY)
 +            {
 +                ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
 +            }
 +            else
 +            {
 +                ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
 +            }
 +
 +            for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
 +            {
 +                fz = pmegrid->ci[ZZ] + sz;
 +                oz = 0;
 +                if (fz < 0)
 +                {
 +                    fz += pmegrids->nc[ZZ];
 +                    oz -= fft_nz;
 +                }
 +                pmegrid_g = &pmegrids->grid_th[fz];
 +                oz       += pmegrid_g->offset[ZZ];
 +                tz1       = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
 +
 +                if (sx == 0 && sy == 0 && sz == 0)
 +                {
 +                    /* We have already added our local contribution
 +                     * before calling this routine, so skip it here.
 +                     */
 +                    continue;
 +                }
 +
 +                thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
 +
 +                pmegrid_f = &pmegrids->grid_th[thread_f];
 +
 +                grid_th = pmegrid_f->grid;
 +
 +                nsx = pmegrid_f->s[XX];
 +                nsy = pmegrid_f->s[YY];
 +                nsz = pmegrid_f->s[ZZ];
 +
 +#ifdef DEBUG_PME_REDUCE
 +                printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
 +                       pme->nodeid, thread, thread_f,
 +                       pme->pmegrid_start_ix,
 +                       pme->pmegrid_start_iy,
 +                       pme->pmegrid_start_iz,
 +                       sx, sy, sz,
 +                       offx-ox, tx1-ox, offx, tx1,
 +                       offy-oy, ty1-oy, offy, ty1,
 +                       offz-oz, tz1-oz, offz, tz1);
 +#endif
 +
 +                if (!(bCommX || bCommY))
 +                {
 +                    /* Copy from the thread local grid to the node grid */
 +                    for (x = offx; x < tx1; x++)
 +                    {
 +                        for (y = offy; y < ty1; y++)
 +                        {
 +                            i0  = (x*fft_my + y)*fft_mz;
 +                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
 +                            for (z = offz; z < tz1; z++)
 +                            {
 +                                fftgrid[i0+z] += grid_th[i0t+z];
 +                            }
 +                        }
 +                    }
 +                }
 +                else
 +                {
 +                    /* The order of this conditional decides
 +                     * where the corner volume gets stored with x+y decomp.
 +                     */
 +                    if (bCommY)
 +                    {
 +                        commbuf = commbuf_y;
 +                        buf_my  = ty1 - offy;
 +                        if (bCommX)
 +                        {
 +                            /* We index commbuf modulo the local grid size */
 +                            commbuf += buf_my*fft_nx*fft_nz;
 +
 +                            bClearBuf   = bClearBufXY;
 +                            bClearBufXY = FALSE;
 +                        }
 +                        else
 +                        {
 +                            bClearBuf  = bClearBufY;
 +                            bClearBufY = FALSE;
 +                        }
 +                    }
 +                    else
 +                    {
 +                        commbuf    = commbuf_x;
 +                        buf_my     = fft_ny;
 +                        bClearBuf  = bClearBufX;
 +                        bClearBufX = FALSE;
 +                    }
 +
 +                    /* Copy to the communication buffer */
 +                    for (x = offx; x < tx1; x++)
 +                    {
 +                        for (y = offy; y < ty1; y++)
 +                        {
 +                            i0  = (x*buf_my + y)*fft_nz;
 +                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
 +
 +                            if (bClearBuf)
 +                            {
 +                                /* First access of commbuf, initialize it */
 +                                for (z = offz; z < tz1; z++)
 +                                {
 +                                    commbuf[i0+z]  = grid_th[i0t+z];
 +                                }
 +                            }
 +                            else
 +                            {
 +                                for (z = offz; z < tz1; z++)
 +                                {
 +                                    commbuf[i0+z] += grid_th[i0t+z];
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
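 +
 +/* [Editorial sketch, not part of this patch] The reduction above is an
 + * owner-computes ("pull") scheme: every thread sums the overlapping
 + * contributions of its neighbours into the region it owns, so no locks
 + * are needed. A minimal 1D analogue, with add_overlap() a hypothetical
 + * helper that accumulates the part of a neighbour's padded slab falling
 + * inside slab t:
 + *
 + *   #pragma omp parallel for
 + *   for (t = 0; t < nthread; t++)      // t owns slab t
 + *   {
 + *       for (s = 1; s < order; s++)    // pull from left neighbours
 + *       {
 + *           add_overlap(grid, thread_grid[(t - s + nthread) % nthread], t, s);
 + *       }
 + *   }
 + */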
 +
 +
 +static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
 +{
 +    ivec local_fft_ndata, local_fft_offset, local_fft_size;
 +    pme_overlap_t *overlap;
 +    int  send_index0, send_nindex;
 +    int  recv_nindex;
 +#ifdef GMX_MPI
 +    MPI_Status stat;
 +#endif
 +    int  send_size_y, recv_size_y;
 +    int  ipulse, send_id, recv_id, datasize, gridsize, size_yx;
 +    real *sendptr, *recvptr;
 +    int  x, y, z, indg, indb;
 +
 +    /* Note that this routine is only used for forward communication.
 +     * Since the force gathering, unlike the charge spreading,
 +     * can be trivially parallelized over the particles,
 +     * the backwards process is much simpler and can use the "old"
 +     * communication setup.
 +     */
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    if (pme->nnodes_minor > 1)
 +    {
 +        /* Minor dimension */
 +        overlap = &pme->overlap[1];
 +
 +        if (pme->nnodes_major > 1)
 +        {
 +            size_yx = pme->overlap[0].comm_data[0].send_nindex;
 +        }
 +        else
 +        {
 +            size_yx = 0;
 +        }
 +        datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
 +
 +        send_size_y = overlap->send_size;
 +
 +        for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
 +        {
 +            send_id       = overlap->send_id[ipulse];
 +            recv_id       = overlap->recv_id[ipulse];
 +            send_index0   =
 +                overlap->comm_data[ipulse].send_index0 -
 +                overlap->comm_data[0].send_index0;
 +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            /* We don't use recv_index0, as we always receive starting at 0 */
 +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recv_size_y   = overlap->comm_data[ipulse].recv_size;
 +
 +            sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
 +            recvptr = overlap->recvbuf;
 +
 +#ifdef GMX_MPI
 +            MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
 +                         send_id, ipulse,
 +                         recvptr, recv_size_y*datasize, GMX_MPI_REAL,
 +                         recv_id, ipulse,
 +                         overlap->mpi_comm, &stat);
 +#endif
 +
 +            for (x = 0; x < local_fft_ndata[XX]; x++)
 +            {
 +                for (y = 0; y < recv_nindex; y++)
 +                {
 +                    indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
 +                    indb = (x*recv_size_y        + y)*local_fft_ndata[ZZ];
 +                    for (z = 0; z < local_fft_ndata[ZZ]; z++)
 +                    {
 +                        fftgrid[indg+z] += recvptr[indb+z];
 +                    }
 +                }
 +            }
 +
 +            if (pme->nnodes_major > 1)
 +            {
 +                /* Copy from the received buffer to the send buffer for dim 0 */
 +                sendptr = pme->overlap[0].sendbuf;
 +                for (x = 0; x < size_yx; x++)
 +                {
 +                    for (y = 0; y < recv_nindex; y++)
 +                    {
 +                        indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
 +                        indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
 +                        for (z = 0; z < local_fft_ndata[ZZ]; z++)
 +                        {
 +                            sendptr[indg+z] += recvptr[indb+z];
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* We only support a single pulse here.
 +     * This is not a severe limitation, as this code is only used
 +     * with OpenMP, in which case the (PME) domains can be larger.
 +     */
 +    if (pme->nnodes_major > 1)
 +    {
 +        /* Major dimension */
 +        overlap = &pme->overlap[0];
 +
 +        datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
 +        gridsize = local_fft_size[YY] *local_fft_size[ZZ];
 +
 +        ipulse = 0;
 +
 +        send_id       = overlap->send_id[ipulse];
 +        recv_id       = overlap->recv_id[ipulse];
 +        send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +        /* We don't use recv_index0, as we always receive starting at 0 */
 +        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +
 +        sendptr = overlap->sendbuf;
 +        recvptr = overlap->recvbuf;
 +
 +        if (debug != NULL)
 +        {
 +            fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
 +                    send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
 +        }
 +
 +#ifdef GMX_MPI
 +        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
 +                     send_id, ipulse,
 +                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
 +                     recv_id, ipulse,
 +                     overlap->mpi_comm, &stat);
 +#endif
 +
 +        for (x = 0; x < recv_nindex; x++)
 +        {
 +            for (y = 0; y < local_fft_ndata[YY]; y++)
 +            {
 +                indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
 +                indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
 +                for (z = 0; z < local_fft_ndata[ZZ]; z++)
 +                {
 +                    fftgrid[indg+z] += recvptr[indb+z];
 +                }
 +            }
 +        }
 +    }
 +}
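 +
 +/* [Editorial sketch, not part of this patch] Both exchanges above use
 + * the standard MPI_Sendrecv halo pattern; a self-contained form with
 + * illustrative names:
 + *
 + *   void halo_exchange(real *send, real *recv, int n,
 + *                      int send_id, int recv_id, MPI_Comm comm)
 + *   {
 + *       MPI_Status stat;
 + *       // ship our overlap to send_id while receiving recv_id's overlap
 + *       MPI_Sendrecv(send, n, GMX_MPI_REAL, send_id, 0,
 + *                    recv, n, GMX_MPI_REAL, recv_id, 0, comm, &stat);
 + *   }
 + */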
 +
 +
 +static void spread_on_grid(gmx_pme_t pme,
 +                           pme_atomcomm_t *atc, pmegrids_t *grids,
 +                           gmx_bool bCalcSplines, gmx_bool bSpread,
 +                           real *fftgrid)
 +{
 +    int nthread, thread;
 +#ifdef PME_TIME_THREADS
 +    gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
 +    static double cs1     = 0, cs2 = 0, cs3 = 0;
 +    static double cs1a[6] = {0, 0, 0, 0, 0, 0};
 +    static int cnt        = 0;
 +#endif
 +
 +    nthread = pme->nthread;
 +    assert(nthread > 0);
 +
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_start();
 +#endif
 +    if (bCalcSplines)
 +    {
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +        for (thread = 0; thread < nthread; thread++)
 +        {
 +            int start, end;
 +
 +            start = atc->n* thread   /nthread;
 +            end   = atc->n*(thread+1)/nthread;
 +
 +            /* Compute fftgrid index for all atoms,
 +             * with help of some extra variables.
 +             */
 +            calc_interpolation_idx(pme, atc, start, end, thread);
 +        }
 +    }
 +#ifdef PME_TIME_THREADS
 +    c1   = omp_cyc_end(c1);
 +    cs1 += (double)c1;
 +#endif
 +
 +#ifdef PME_TIME_THREADS
 +    c2 = omp_cyc_start();
 +#endif
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        splinedata_t *spline;
 +        pmegrid_t *grid;
 +
 +        /* make local bsplines  */
 +        if (grids == NULL || grids->nthread == 1)
 +        {
 +            spline = &atc->spline[0];
 +
 +            spline->n = atc->n;
 +
 +            grid = &grids->grid;
 +        }
 +        else
 +        {
 +            spline = &atc->spline[thread];
 +
 +            make_thread_local_ind(atc, thread, spline);
 +
 +            grid = &grids->grid_th[thread];
 +        }
 +
 +        if (bCalcSplines)
 +        {
 +            make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
 +                          atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP);
 +        }
 +
 +        if (bSpread)
 +        {
 +            /* put local atoms on grid. */
 +#ifdef PME_TIME_SPREAD
 +            ct1a = omp_cyc_start();
 +#endif
 +            spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
 +
 +            if (grids->nthread > 1)
 +            {
 +                copy_local_grid(pme, grids, thread, fftgrid);
 +            }
 +#ifdef PME_TIME_SPREAD
 +            ct1a          = omp_cyc_end(ct1a);
 +            cs1a[thread] += (double)ct1a;
 +#endif
 +        }
 +    }
 +#ifdef PME_TIME_THREADS
 +    c2   = omp_cyc_end(c2);
 +    cs2 += (double)c2;
 +#endif
 +
 +    if (bSpread && grids->nthread > 1)
 +    {
 +#ifdef PME_TIME_THREADS
 +        c3 = omp_cyc_start();
 +#endif
 +#pragma omp parallel for num_threads(grids->nthread) schedule(static)
 +        for (thread = 0; thread < grids->nthread; thread++)
 +        {
 +            reduce_threadgrid_overlap(pme, grids, thread,
 +                                      fftgrid,
 +                                      pme->overlap[0].sendbuf,
 +                                      pme->overlap[1].sendbuf);
 +        }
 +#ifdef PME_TIME_THREADS
 +        c3   = omp_cyc_end(c3);
 +        cs3 += (double)c3;
 +#endif
 +
 +        if (pme->nnodes > 1)
 +        {
 +            /* Communicate the overlapping part of the fftgrid */
 +            sum_fftgrid_dd(pme, fftgrid);
 +        }
 +    }
 +
 +#ifdef PME_TIME_THREADS
 +    cnt++;
 +    if (cnt % 20 == 0)
 +    {
 +        printf("idx %.2f spread %.2f red %.2f",
 +               cs1*1e-9, cs2*1e-9, cs3*1e-9);
 +#ifdef PME_TIME_SPREAD
 +        for (thread = 0; thread < nthread; thread++)
 +        {
 +            printf(" %.2f", cs1a[thread]*1e-9);
 +        }
 +#endif
 +        printf("\n");
 +    }
 +#endif
 +}
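 +
 +/* [Editorial sketch, not part of this patch] The atom ranges in
 + * spread_on_grid() use the usual balanced static partition: thread t
 + * gets [n*t/nt, n*(t+1)/nt). In isolation:
 + *
 + *   static void thread_range(int n, int nt, int t, int *start, int *end)
 + *   {
 + *       *start = (n*t)/nt;
 + *       *end   = (n*(t + 1))/nt;
 + *   }
 + *
 + * For n = 10, nt = 4 this yields 2, 3, 2 and 3 atoms per thread, and
 + * the ranges always tile [0, n) exactly.
 + */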
 +
 +
 +static void dump_grid(FILE *fp,
 +                      int sx, int sy, int sz, int nx, int ny, int nz,
 +                      int my, int mz, const real *g)
 +{
 +    int x, y, z;
 +
 +    for (x = 0; x < nx; x++)
 +    {
 +        for (y = 0; y < ny; y++)
 +        {
 +            for (z = 0; z < nz; z++)
 +            {
 +                fprintf(fp, "%2d %2d %2d %6.3f\n",
 +                        sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
 +            }
 +        }
 +    }
 +}
 +
 +static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
 +{
 +    ivec local_fft_ndata, local_fft_offset, local_fft_size;
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    dump_grid(stderr,
 +              pme->pmegrid_start_ix,
 +              pme->pmegrid_start_iy,
 +              pme->pmegrid_start_iz,
 +              pme->pmegrid_nx-pme->pme_order+1,
 +              pme->pmegrid_ny-pme->pme_order+1,
 +              pme->pmegrid_nz-pme->pme_order+1,
 +              local_fft_size[YY],
 +              local_fft_size[ZZ],
 +              fftgrid);
 +}
 +
 +
 +void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
 +{
 +    pme_atomcomm_t *atc;
 +    pmegrids_t *grid;
 +
 +    if (pme->nnodes > 1)
 +    {
 +        gmx_incons("gmx_pme_calc_energy called in parallel");
 +    }
 +    if (pme->bFEP)
 +    {
 +        gmx_incons("gmx_pme_calc_energy with free energy");
 +    }
 +
 +    atc            = &pme->atc_energy;
 +    atc->nthread   = 1;
 +    if (atc->spline == NULL)
 +    {
 +        snew(atc->spline, atc->nthread);
 +    }
 +    atc->nslab     = 1;
 +    atc->bSpread   = TRUE;
 +    atc->pme_order = pme->pme_order;
 +    atc->n         = n;
 +    pme_realloc_atomcomm_things(atc);
 +    atc->x         = x;
 +    atc->q         = q;
 +
 +    /* We only use the A-charges grid */
 +    grid = &pme->pmegridA;
 +
 +    spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
 +
 +    *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
 +}
 +
 +
 +static void reset_pmeonly_counters(t_commrec *cr, gmx_wallcycle_t wcycle,
-     ir->init_step += step_rel;
-     ir->nsteps    -= step_rel;
++                                   t_nrnb *nrnb, t_inputrec *ir,
++                                   gmx_large_int_t step)
 +{
 +    /* Reset all the counters related to performance over the run */
 +    wallcycle_stop(wcycle, ewcRUN);
 +    wallcycle_reset_all(wcycle);
 +    init_nrnb(nrnb);
-             natoms = gmx_pme_recv_q_x(pme_pp,
-                                       &chargeA, &chargeB, box, &x_pp, &f_pp,
-                                       &maxshift_x, &maxshift_y,
-                                       &pme->bFEP, &lambda,
-                                       &bEnerVir,
-                                       &step,
-                                       grid_switch, &ewaldcoeff);
-             if (natoms == -2)
++    if (ir->nsteps >= 0)
++    {
++        /* ir->nsteps is not used here, but we update it for consistency */
++        ir->nsteps -= step - ir->init_step;
++    }
++    ir->init_step = step;
 +    wallcycle_start(wcycle, ewcRUN);
 +}
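 +
 +/* [Editorial note, not part of this patch] Worked example of the step
 + * bookkeeping above: with init_step = 0, nsteps = 100000 and a counter
 + * reset at step = 20000, nsteps becomes 80000 and init_step 20000, so
 + * step_rel = step - init_step starts again from zero.
 + */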
 +
 +
 +static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
 +                               ivec grid_size,
 +                               t_commrec *cr, t_inputrec *ir,
 +                               gmx_pme_t *pme_ret)
 +{
 +    int ind;
 +    gmx_pme_t pme = NULL;
 +
 +    ind = 0;
 +    while (ind < *npmedata)
 +    {
 +        pme = (*pmedata)[ind];
 +        if (pme->nkx == grid_size[XX] &&
 +            pme->nky == grid_size[YY] &&
 +            pme->nkz == grid_size[ZZ])
 +        {
 +            *pme_ret = pme;
 +
 +            return;
 +        }
 +
 +        ind++;
 +    }
 +
 +    (*npmedata)++;
 +    srenew(*pmedata, *npmedata);
 +
 +    /* Generate a new PME data structure, copying part of the old pointers */
 +    gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
 +
 +    *pme_ret = (*pmedata)[ind];
 +}
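 +
 +/* [Editorial sketch, not part of this patch] gmx_pmeonly_switch() is a
 + * linear-search cache keyed on the grid size, grown by one entry on a
 + * miss; the same pattern in miniature (key_equal() and make_obj() are
 + * hypothetical helpers):
 + *
 + *   obj_t *cache_lookup(int *n, obj_t ***arr, key_t key)
 + *   {
 + *       int i;
 + *       for (i = 0; i < *n; i++)
 + *       {
 + *           if (key_equal((*arr)[i]->key, key))
 + *           {
 + *               return (*arr)[i];    // hit: reuse the entry
 + *           }
 + *       }
 + *       (*n)++;
 + *       srenew(*arr, *n);            // miss: grow and create
 + *       (*arr)[*n-1] = make_obj(key);
 + *       return (*arr)[*n-1];
 + *   }
 + */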
 +
 +
 +int gmx_pmeonly(gmx_pme_t pme,
 +                t_commrec *cr,    t_nrnb *nrnb,
 +                gmx_wallcycle_t wcycle,
 +                real ewaldcoeff,  gmx_bool bGatherOnly,
 +                t_inputrec *ir)
 +{
 +    int npmedata;
 +    gmx_pme_t *pmedata;
 +    gmx_pme_pp_t pme_pp;
++    int  ret;
 +    int  natoms;
 +    matrix box;
 +    rvec *x_pp      = NULL, *f_pp = NULL;
 +    real *chargeA   = NULL, *chargeB = NULL;
 +    real lambda     = 0;
 +    int  maxshift_x = 0, maxshift_y = 0;
 +    real energy, dvdlambda;
 +    matrix vir;
 +    float cycles;
 +    int  count;
 +    gmx_bool bEnerVir;
 +    gmx_large_int_t step, step_rel;
 +    ivec grid_switch;
 +
 +    /* This data is only used with PME tuning, i.e. switching PME grids */
 +    npmedata = 1;
 +    snew(pmedata, npmedata);
 +    pmedata[0] = pme;
 +
 +    pme_pp = gmx_pme_pp_init(cr);
 +
 +    init_nrnb(nrnb);
 +
 +    count = 0;
 +    do /****** this is a quasi-loop over time steps! */
 +    {
 +        /* The reason for having a loop here is PME grid tuning/switching */
 +        do
 +        {
 +            /* Domain decomposition */
-         while (natoms == -2);
++            ret = gmx_pme_recv_q_x(pme_pp,
++                                   &natoms,
++                                   &chargeA, &chargeB, box, &x_pp, &f_pp,
++                                   &maxshift_x, &maxshift_y,
++                                   &pme->bFEP, &lambda,
++                                   &bEnerVir,
++                                   &step,
++                                   grid_switch, &ewaldcoeff);
++
++            if (ret == pmerecvqxSWITCHGRID)
 +            {
 +                /* Switch the PME grid to grid_switch */
 +                gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
 +            }
++
++            if (ret == pmerecvqxRESETCOUNTERS)
++            {
++                /* Reset the cycle and flop counters */
++                reset_pmeonly_counters(cr, wcycle, nrnb, ir, step);
++            }
 +        }
-         if (natoms == -1)
++        while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
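 +
 +        /* The inner loop above keeps consuming control-only messages
 +         * (grid switch, counter reset) until coordinates/charges or a
 +         * finish message arrive.
 +         */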
 +
-         if (step_rel == wcycle_get_reset_counters(wcycle))
-         {
-             /* Reset all the counters related to performance over the run */
-             reset_pmeonly_counters(cr, wcycle, nrnb, ir, step_rel);
-             wcycle_set_reset_counters(wcycle, 0);
-         }
++        if (ret == pmerecvqxFINISH)
 +        {
 +            /* We should stop: break out of the loop */
 +            break;
 +        }
 +
 +        step_rel = step - ir->init_step;
 +
 +        if (count == 0)
 +        {
 +            wallcycle_start(wcycle, ewcRUN);
 +        }
 +
 +        wallcycle_start(wcycle, ewcPMEMESH);
 +
 +        dvdlambda = 0;
 +        clear_mat(vir);
 +        gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box,
 +                   cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff,
 +                   &energy, lambda, &dvdlambda,
 +                   GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
 +
 +        cycles = wallcycle_stop(wcycle, ewcPMEMESH);
 +
 +        gmx_pme_send_force_vir_ener(pme_pp,
 +                                    f_pp, vir, energy, dvdlambda,
 +                                    cycles);
 +
 +        count++;
 +    } /***** end of quasi-loop, we stop with the break above */
 +    while (TRUE);
 +
 +    return 0;
 +}
 +
 +int gmx_pme_do(gmx_pme_t pme,
 +               int start,       int homenr,
 +               rvec x[],        rvec f[],
 +               real *chargeA,   real *chargeB,
 +               matrix box, t_commrec *cr,
 +               int  maxshift_x, int maxshift_y,
 +               t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
 +               matrix vir,      real ewaldcoeff,
 +               real *energy,    real lambda,
 +               real *dvdlambda, int flags)
 +{
 +    int     q, d, i, j, ntot, npme;
 +    int     nx, ny, nz;
 +    int     n_d, local_ny;
 +    pme_atomcomm_t *atc = NULL;
 +    pmegrids_t *pmegrid = NULL;
 +    real    *grid       = NULL;
 +    real    *ptr;
 +    rvec    *x_d, *f_d;
 +    real    *charge = NULL, *q_d;
 +    real    energy_AB[2];
 +    matrix  vir_AB[2];
 +    gmx_bool bClearF;
 +    gmx_parallel_3dfft_t pfft_setup;
 +    real *  fftgrid;
 +    t_complex * cfftgrid;
 +    int     thread;
 +    const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
 +    const gmx_bool bCalcF       = flags & GMX_PME_CALC_F;
 +
 +    assert(pme->nnodes > 0);
 +    assert(pme->nnodes == 1 || pme->ndecompdim > 0);
 +
 +    if (pme->nnodes > 1)
 +    {
 +        atc      = &pme->atc[0];
 +        atc->npd = homenr;
 +        if (atc->npd > atc->pd_nalloc)
 +        {
 +            atc->pd_nalloc = over_alloc_dd(atc->npd);
 +            srenew(atc->pd, atc->pd_nalloc);
 +        }
 +        atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
 +    }
 +    else
 +    {
 +        /* This could be necessary for TPI */
 +        pme->atc[0].n = homenr;
 +    }
 +
 +    for (q = 0; q < (pme->bFEP ? 2 : 1); q++)
 +    {
 +        if (q == 0)
 +        {
 +            pmegrid    = &pme->pmegridA;
 +            fftgrid    = pme->fftgridA;
 +            cfftgrid   = pme->cfftgridA;
 +            pfft_setup = pme->pfft_setupA;
 +            charge     = chargeA+start;
 +        }
 +        else
 +        {
 +            pmegrid    = &pme->pmegridB;
 +            fftgrid    = pme->fftgridB;
 +            cfftgrid   = pme->cfftgridB;
 +            pfft_setup = pme->pfft_setupB;
 +            charge     = chargeB+start;
 +        }
 +        grid = pmegrid->grid.grid;
 +        /* Unpack structure */
 +        if (debug)
 +        {
 +            fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
 +                    cr->nnodes, cr->nodeid);
 +            fprintf(debug, "Grid = %p\n", (void*)grid);
 +            if (grid == NULL)
 +            {
 +                gmx_fatal(FARGS, "No grid!");
 +            }
 +        }
 +        where();
 +
 +        m_inv_ur0(box, pme->recipbox);
 +
 +        if (pme->nnodes == 1)
 +        {
 +            atc = &pme->atc[0];
 +            if (DOMAINDECOMP(cr))
 +            {
 +                atc->n = homenr;
 +                pme_realloc_atomcomm_things(atc);
 +            }
 +            atc->x = x;
 +            atc->q = charge;
 +            atc->f = f;
 +        }
 +        else
 +        {
 +            wallcycle_start(wcycle, ewcPME_REDISTXF);
 +            for (d = pme->ndecompdim-1; d >= 0; d--)
 +            {
 +                if (d == pme->ndecompdim-1)
 +                {
 +                    n_d = homenr;
 +                    x_d = x + start;
 +                    q_d = charge;
 +                }
 +                else
 +                {
 +                    n_d = pme->atc[d+1].n;
 +                    x_d = atc->x;
 +                    q_d = atc->q;
 +                }
 +                atc      = &pme->atc[d];
 +                atc->npd = n_d;
 +                if (atc->npd > atc->pd_nalloc)
 +                {
 +                    atc->pd_nalloc = over_alloc_dd(atc->npd);
 +                    srenew(atc->pd, atc->pd_nalloc);
 +                }
 +                atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
 +                pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
 +                where();
 +
 +                /* Redistribute x (only once) and qA or qB */
 +                if (DOMAINDECOMP(cr))
 +                {
 +                    dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc);
 +                }
 +                else
 +                {
 +                    pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc);
 +                }
 +            }
 +            where();
 +
 +            wallcycle_stop(wcycle, ewcPME_REDISTXF);
 +        }
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "Node= %6d, pme local particles=%6d\n",
 +                    cr->nodeid, atc->n);
 +        }
 +
 +        if (flags & GMX_PME_SPREAD_Q)
 +        {
 +            wallcycle_start(wcycle, ewcPME_SPREADGATHER);
 +
 +            /* Spread the charges on a grid */
 +            spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid);
 +
 +            if (q == 0)
 +            {
 +                inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
 +            }
 +            inc_nrnb(nrnb, eNR_SPREADQBSP,
 +                     pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
 +
 +            if (pme->nthread == 1)
 +            {
 +                wrap_periodic_pmegrid(pme, grid);
 +
 +                /* sum contributions to local grid from other nodes */
 +#ifdef GMX_MPI
 +                if (pme->nnodes > 1)
 +                {
 +                    gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
 +                    where();
 +                }
 +#endif
 +
 +                copy_pmegrid_to_fftgrid(pme, grid, fftgrid);
 +            }
 +
 +            wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
 +
 +            /*
 +               dump_local_fftgrid(pme,fftgrid);
 +               exit(0);
 +             */
 +        }
 +
 +        /* Here we start a large thread parallel region */
 +#pragma omp parallel num_threads(pme->nthread) private(thread)
 +        {
 +            thread = gmx_omp_get_thread_num();
 +            if (flags & GMX_PME_SOLVE)
 +            {
 +                int loop_count;
 +
 +                /* do 3d-fft */
 +                if (thread == 0)
 +                {
 +                    wallcycle_start(wcycle, ewcPME_FFT);
 +                }
 +                gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
 +                                           fftgrid, cfftgrid, thread, wcycle);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle, ewcPME_FFT);
 +                }
 +                where();
 +
 +                /* solve in k-space for our local cells */
 +                if (thread == 0)
 +                {
 +                    wallcycle_start(wcycle, ewcPME_SOLVE);
 +                }
 +                loop_count =
 +                    solve_pme_yzx(pme, cfftgrid, ewaldcoeff,
 +                                  box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
 +                                  bCalcEnerVir,
 +                                  pme->nthread, thread);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle, ewcPME_SOLVE);
 +                    where();
 +                    inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
 +                }
 +            }
 +
 +            if (bCalcF)
 +            {
 +                /* do 3d-invfft */
 +                if (thread == 0)
 +                {
 +                    where();
 +                    wallcycle_start(wcycle, ewcPME_FFT);
 +                }
 +                gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
 +                                           cfftgrid, fftgrid, thread, wcycle);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle, ewcPME_FFT);
 +
 +                    where();
 +
 +                    if (pme->nodeid == 0)
 +                    {
 +                        ntot  = pme->nkx*pme->nky*pme->nkz;
 +                        npme  = ntot*log((real)ntot)/log(2.0);
 +                        inc_nrnb(nrnb, eNR_FFT, 2*npme);
 +                    }
 +
 +                    wallcycle_start(wcycle, ewcPME_SPREADGATHER);
 +                }
 +
 +                copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread);
 +            }
 +        }
 +        /* End of thread parallel section.
 +         * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
 +         */
 +
 +        if (bCalcF)
 +        {
 +            /* distribute local grid to all nodes */
 +#ifdef GMX_MPI
 +            if (pme->nnodes > 1)
 +            {
 +                gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
 +            }
 +#endif
 +            where();
 +
 +            unwrap_periodic_pmegrid(pme, grid);
 +
 +            /* interpolate forces for our local atoms */
 +
 +            where();
 +
 +            /* If we are running without parallelization,
 +             * atc->f is the actual force array, not a buffer,
 +             * therefore we should not clear it.
 +             */
 +            bClearF = (q == 0 && PAR(cr));
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +            for (thread = 0; thread < pme->nthread; thread++)
 +            {
 +                gather_f_bsplines(pme, grid, bClearF, atc,
 +                                  &atc->spline[thread],
 +                                  pme->bFEP ? (q == 0 ? 1.0-lambda : lambda) : 1.0);
 +            }
 +
 +            where();
 +
 +            inc_nrnb(nrnb, eNR_GATHERFBSP,
 +                     pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
 +            wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
 +        }
 +
 +        if (bCalcEnerVir)
 +        {
 +            /* This should only be called on the master thread
 +             * and after the threads have synchronized.
 +             */
 +            get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
 +        }
 +    } /* of q-loop */
 +
 +    if (bCalcF && pme->nnodes > 1)
 +    {
 +        wallcycle_start(wcycle, ewcPME_REDISTXF);
 +        for (d = 0; d < pme->ndecompdim; d++)
 +        {
 +            atc = &pme->atc[d];
 +            if (d == pme->ndecompdim - 1)
 +            {
 +                n_d = homenr;
 +                f_d = f + start;
 +            }
 +            else
 +            {
 +                n_d = pme->atc[d+1].n;
 +                f_d = pme->atc[d+1].f;
 +            }
 +            if (DOMAINDECOMP(cr))
 +            {
 +                dd_pmeredist_f(pme, atc, n_d, f_d,
 +                               d == pme->ndecompdim-1 && pme->bPPnode);
 +            }
 +            else
 +            {
 +                pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
 +            }
 +        }
 +
 +        wallcycle_stop(wcycle, ewcPME_REDISTXF);
 +    }
 +    where();
 +
 +    if (bCalcEnerVir)
 +    {
 +        if (!pme->bFEP)
 +        {
 +            *energy = energy_AB[0];
 +            m_add(vir, vir_AB[0], vir);
 +        }
 +        else
 +        {
 +            *energy     = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
 +            *dvdlambda += energy_AB[1] - energy_AB[0];
 +            for (i = 0; i < DIM; i++)
 +            {
 +                for (j = 0; j < DIM; j++)
 +                {
 +                    vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
 +                        lambda*vir_AB[1][i][j];
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        *energy = 0;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "PME mesh energy: %g\n", *energy);
 +    }
 +
 +    return 0;
 +}
index 1baf577d06b1f956e9fe68db7278ce9a0af2d073,0000000000000000000000000000000000000000..a597540351952ef5cfec4217f423905ceef14066
mode 100644,000000..100644
--- /dev/null
@@@ -1,612 -1,0 +1,643 @@@
- #define PP_PME_CHARGE   (1<<0)
- #define PP_PME_CHARGEB  (1<<1)
- #define PP_PME_COORD    (1<<2)
- #define PP_PME_FEP      (1<<3)
- #define PP_PME_ENER_VIR (1<<4)
- #define PP_PME_FINISH   (1<<5)
- #define PP_PME_SWITCH   (1<<6)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +
 +#include <stdio.h>
 +#include <string.h>
 +#include <math.h>
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "vec.h"
 +#include "pme.h"
 +#include "network.h"
 +#include "domdec.h"
 +#include "sighandler.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
- void gmx_pme_send_switch(t_commrec *cr, ivec grid_size, real ewaldcoeff)
++#define PP_PME_CHARGE         (1<<0)
++#define PP_PME_CHARGEB        (1<<1)
++#define PP_PME_COORD          (1<<2)
++#define PP_PME_FEP            (1<<3)
++#define PP_PME_ENER_VIR       (1<<4)
++#define PP_PME_FINISH         (1<<5)
++#define PP_PME_SWITCHGRID     (1<<6)
++#define PP_PME_RESETCOUNTERS  (1<<7)
++
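++/* [Editorial note, not part of this patch] These are plain bit flags, so
++ * one request can carry several at once:
++ *
++ *   int flags = PP_PME_CHARGE | PP_PME_COORD;   // compose
++ *   if (flags & PP_PME_CHARGE) { ... }          // test
++ *
++ * The matching pmerecvqx* return values used in pme.c are assumed to be
++ * an enum declared in pme.h (not shown in this hunk).
++ */
++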
 +
 +#define PME_PP_SIGSTOP        (1<<0)
 +#define PME_PP_SIGSTOPNSS     (1<<1)
 +
 +typedef struct gmx_pme_pp {
 +#ifdef GMX_MPI
 +    MPI_Comm     mpi_comm_mysim;
 +#endif
 +    int          nnode;        /* The number of PP nodes to communicate with */
 +    int         *node;         /* The PP node ranks                          */
 +    int          node_peer;    /* The peer PP node rank                      */
 +    int         *nat;          /* The number of atoms for each PP node       */
 +    int          flags_charge; /* The flags sent along with the last charges */
 +    real        *chargeA;
 +    real        *chargeB;
 +    rvec        *x;
 +    rvec        *f;
 +    int          nalloc;
 +#ifdef GMX_MPI
 +    MPI_Request *req;
 +    MPI_Status  *stat;
 +#endif
 +} t_gmx_pme_pp;
 +
 +typedef struct gmx_pme_comm_n_box {
 +    int             natoms;
 +    matrix          box;
 +    int             maxshift_x;
 +    int             maxshift_y;
 +    real            lambda;
 +    int             flags;
 +    gmx_large_int_t step;
 +    ivec            grid_size;  /* For PME grid tuning */
 +    real            ewaldcoeff; /* For PME grid tuning */
 +} gmx_pme_comm_n_box_t;
 +
 +typedef struct {
 +    matrix          vir;
 +    real            energy;
 +    real            dvdlambda;
 +    float           cycles;
 +    gmx_stop_cond_t stop_cond;
 +} gmx_pme_comm_vir_ene_t;
 +
 +
 +
 +
 +gmx_pme_pp_t gmx_pme_pp_init(t_commrec *cr)
 +{
 +    struct gmx_pme_pp *pme_pp;
 +    int                rank;
 +
 +    snew(pme_pp, 1);
 +
 +#ifdef GMX_MPI
 +    pme_pp->mpi_comm_mysim = cr->mpi_comm_mysim;
 +    MPI_Comm_rank(cr->mpi_comm_mygroup, &rank);
 +    get_pme_ddnodes(cr, rank, &pme_pp->nnode, &pme_pp->node, &pme_pp->node_peer);
 +    snew(pme_pp->nat, pme_pp->nnode);
 +    snew(pme_pp->req, 2*pme_pp->nnode);
 +    snew(pme_pp->stat, 2*pme_pp->nnode);
 +    pme_pp->nalloc       = 0;
 +    pme_pp->flags_charge = 0;
 +#endif
 +
 +    return pme_pp;
 +}
 +
 +/* This should be faster with a real non-blocking MPI implementation */
 +/* #define GMX_PME_DELAYED_WAIT */
 +
 +static void gmx_pme_send_q_x_wait(gmx_domdec_t *dd)
 +{
 +#ifdef GMX_MPI
 +    if (dd->nreq_pme)
 +    {
 +        MPI_Waitall(dd->nreq_pme, dd->req_pme, MPI_STATUSES_IGNORE);
 +        dd->nreq_pme = 0;
 +    }
 +#endif
 +}
 +
 +static void gmx_pme_send_q_x(t_commrec *cr, int flags,
 +                             real *chargeA, real *chargeB,
 +                             matrix box, rvec *x,
 +                             real lambda,
 +                             int maxshift_x, int maxshift_y,
 +                             gmx_large_int_t step)
 +{
 +    gmx_domdec_t         *dd;
 +    gmx_pme_comm_n_box_t *cnb;
 +    int                   n;
 +
 +    dd = cr->dd;
 +    n  = dd->nat_home;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "PP node %d sending to PME node %d: %d%s%s\n",
 +                cr->sim_nodeid, dd->pme_nodeid, n,
 +                flags & PP_PME_CHARGE ? " charges" : "",
 +                flags & PP_PME_COORD  ? " coordinates" : "");
 +    }
 +
 +#ifdef GMX_PME_DELAYED_WAIT
 +    /* We can not use cnb until pending communication has finished */
 +    gmx_pme_send_q_x_wait(dd);
 +#endif
 +
 +    if (dd->pme_receive_vir_ener)
 +    {
 +        /* Peer PP node: communicate all data */
 +        if (dd->cnb == NULL)
 +        {
 +            snew(dd->cnb, 1);
 +        }
 +        cnb = dd->cnb;
 +
 +        cnb->flags      = flags;
 +        cnb->natoms     = n;
 +        cnb->maxshift_x = maxshift_x;
 +        cnb->maxshift_y = maxshift_y;
 +        cnb->lambda     = lambda;
 +        cnb->step       = step;
 +        if (flags & PP_PME_COORD)
 +        {
 +            copy_mat(box, cnb->box);
 +        }
 +#ifdef GMX_MPI
 +        MPI_Isend(cnb, sizeof(*cnb), MPI_BYTE,
 +                  dd->pme_nodeid, 0, cr->mpi_comm_mysim,
 +                  &dd->req_pme[dd->nreq_pme++]);
 +#endif
 +    }
 +    else if (flags & PP_PME_CHARGE)
 +    {
 +#ifdef GMX_MPI
 +        /* Communicate only the number of atoms */
 +        MPI_Isend(&n, sizeof(n), MPI_BYTE,
 +                  dd->pme_nodeid, 0, cr->mpi_comm_mysim,
 +                  &dd->req_pme[dd->nreq_pme++]);
 +#endif
 +    }
 +
 +#ifdef GMX_MPI
 +    if (n > 0)
 +    {
 +        if (flags & PP_PME_CHARGE)
 +        {
 +            MPI_Isend(chargeA, n*sizeof(real), MPI_BYTE,
 +                      dd->pme_nodeid, 1, cr->mpi_comm_mysim,
 +                      &dd->req_pme[dd->nreq_pme++]);
 +        }
 +        if (flags & PP_PME_CHARGEB)
 +        {
 +            MPI_Isend(chargeB, n*sizeof(real), MPI_BYTE,
 +                      dd->pme_nodeid, 2, cr->mpi_comm_mysim,
 +                      &dd->req_pme[dd->nreq_pme++]);
 +        }
 +        if (flags & PP_PME_COORD)
 +        {
 +            MPI_Isend(x[0], n*sizeof(rvec), MPI_BYTE,
 +                      dd->pme_nodeid, 3, cr->mpi_comm_mysim,
 +                      &dd->req_pme[dd->nreq_pme++]);
 +        }
 +    }
 +
 +#ifndef GMX_PME_DELAYED_WAIT
 +    /* Wait for the data to arrive */
 +    /* Note that this wait could be skipped, as x and q will not be
 +     * modified before the next call to gmx_pme_send_q_x or
 +     * gmx_pme_receive_f.
 +     */
 +    gmx_pme_send_q_x_wait(dd);
 +#endif
 +#endif
 +}
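 +
 +/* [Editorial sketch, not part of this patch] gmx_pme_send_q_x() batches
 + * several MPI_Isend calls into dd->req_pme[] and completes them with a
 + * single MPI_Waitall (see gmx_pme_send_q_x_wait() above); in miniature:
 + *
 + *   MPI_Request req[2];
 + *   MPI_Isend(bufA, nA, MPI_BYTE, dest, 1, comm, &req[0]);
 + *   MPI_Isend(bufB, nB, MPI_BYTE, dest, 2, comm, &req[1]);
 + *   MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
 + */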
 +
 +void gmx_pme_send_q(t_commrec *cr,
 +                    gmx_bool bFreeEnergy, real *chargeA, real *chargeB,
 +                    int maxshift_x, int maxshift_y)
 +{
 +    int flags;
 +
 +    flags = PP_PME_CHARGE;
 +    if (bFreeEnergy)
 +    {
 +        flags |= PP_PME_CHARGEB;
 +    }
 +
 +    gmx_pme_send_q_x(cr, flags,
 +                     chargeA, chargeB, NULL, NULL, 0, maxshift_x, maxshift_y, -1);
 +}
 +
 +void gmx_pme_send_x(t_commrec *cr, matrix box, rvec *x,
 +                    gmx_bool bFreeEnergy, real lambda,
 +                    gmx_bool bEnerVir,
 +                    gmx_large_int_t step)
 +{
 +    int flags;
 +
 +    flags = PP_PME_COORD;
 +    if (bFreeEnergy)
 +    {
 +        flags |= PP_PME_FEP;
 +    }
 +    if (bEnerVir)
 +    {
 +        flags |= PP_PME_ENER_VIR;
 +    }
 +
 +    gmx_pme_send_q_x(cr, flags, NULL, NULL, box, x, lambda, 0, 0, step);
 +}
 +
 +void gmx_pme_send_finish(t_commrec *cr)
 +{
 +    int flags;
 +
 +    flags = PP_PME_FINISH;
 +
 +    gmx_pme_send_q_x(cr, flags, NULL, NULL, NULL, NULL, 0, 0, 0, -1);
 +}
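 +
 +/* A minimal sketch of the PP-side calling sequence, assuming a simplified
 + * MD loop (hypothetical driver, for illustration only; the variable names
 + * are placeholders):
 + *
 + *     gmx_pme_send_q(cr, bFE, chargeA, chargeB, mx, my); // after (re)partitioning
 + *     for (step = 0; step < nsteps; step++)
 + *     {
 + *         gmx_pme_send_x(cr, box, x, bFE, lambda, bEnerVir, step);
 + *         ...                           // compute short-range forces locally
 + *         gmx_pme_receive_f(cr, f, vir, &energy, &dvdl, &cycles);
 + *     }
 + *     gmx_pme_send_finish(cr);
 + */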
 +
-         cnb.flags = PP_PME_SWITCH;
++void gmx_pme_send_switchgrid(t_commrec *cr, ivec grid_size, real ewaldcoeff)
 +{
 +#ifdef GMX_MPI
 +    gmx_pme_comm_n_box_t cnb;
 +
++    /* Only let one PP node signal each PME node */
 +    if (cr->dd->pme_receive_vir_ener)
 +    {
-             fprintf(debug, "PME only node receiving:%s%s%s%s\n",
-                     (cnb.flags & PP_PME_CHARGE) ? " charges" : "",
-                     (cnb.flags & PP_PME_COORD ) ? " coordinates" : "",
-                     (cnb.flags & PP_PME_FINISH) ? " finish" : "",
-                     (cnb.flags & PP_PME_SWITCH) ? " switch" : "");
++        cnb.flags = PP_PME_SWITCHGRID;
 +        copy_ivec(grid_size, cnb.grid_size);
 +        cnb.ewaldcoeff = ewaldcoeff;
 +
 +        /* We send this uncommon message blocking to simplify the code */
 +        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE,
 +                 cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim);
 +    }
 +#endif
 +}
 +
++void gmx_pme_send_resetcounters(t_commrec *cr, gmx_large_int_t step)
++{
++#ifdef GMX_MPI
++    gmx_pme_comm_n_box_t cnb;
++
++    /* Only let one PP node signal each PME node */
++    if (cr->dd->pme_receive_vir_ener)
++    {
++        cnb.flags = PP_PME_RESETCOUNTERS;
++        cnb.step  = step;
++
++        /* We send this uncommon message blocking to simplify the code */
++        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE,
++                 cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim);
++    }
++#endif
++}
++
 +int gmx_pme_recv_q_x(struct gmx_pme_pp *pme_pp,
++                     int *natoms,
 +                     real **chargeA, real **chargeB,
 +                     matrix box, rvec **x, rvec **f,
 +                     int *maxshift_x, int *maxshift_y,
 +                     gmx_bool *bFreeEnergy, real *lambda,
 +                     gmx_bool *bEnerVir,
 +                     gmx_large_int_t *step,
 +                     ivec grid_size, real *ewaldcoeff)
 +{
 +    gmx_pme_comm_n_box_t cnb;
 +    int                  nat = 0, q, messages, sender;
 +    real                *charge_pp;
 +
 +    messages = 0;
 +
 +    /* avoid compiler warning about unused variable without MPI support */
 +    cnb.flags = 0;
 +#ifdef GMX_MPI
 +    do
 +    {
 +        /* Receive the send count, box and time step from the peer PP node */
 +        MPI_Recv(&cnb, sizeof(cnb), MPI_BYTE,
 +                 pme_pp->node_peer, 0,
 +                 pme_pp->mpi_comm_mysim, MPI_STATUS_IGNORE);
 +
 +        if (debug)
 +        {
-         if (cnb.flags & PP_PME_SWITCH)
++            fprintf(debug, "PME only node receiving:%s%s%s%s%s\n",
++                    (cnb.flags & PP_PME_CHARGE)        ? " charges" : "",
++                    (cnb.flags & PP_PME_COORD )        ? " coordinates" : "",
++                    (cnb.flags & PP_PME_FINISH)        ? " finish" : "",
++                    (cnb.flags & PP_PME_SWITCHGRID)    ? " switch grid" : "",
++                    (cnb.flags & PP_PME_RESETCOUNTERS) ? " reset counters" : "");
 +        }
 +
-             return -2;
++        if (cnb.flags & PP_PME_SWITCHGRID)
 +        {
 +            /* Special case, receive the new parameters and return */
 +            copy_ivec(cnb.grid_size, grid_size);
 +            *ewaldcoeff = cnb.ewaldcoeff;
 +
-     return ((cnb.flags & PP_PME_FINISH) ? -1 : nat);
++            return pmerecvqxSWITCHGRID;
++        }
++
++        if (cnb.flags & PP_PME_RESETCOUNTERS)
++        {
++            /* Special case, receive the step and return */
++            *step = cnb.step;
++
++            return pmerecvqxRESETCOUNTERS;
 +        }
 +
 +        if (cnb.flags & PP_PME_CHARGE)
 +        {
 +            /* Receive the send counts from the other PP nodes */
 +            for (sender = 0; sender < pme_pp->nnode; sender++)
 +            {
 +                if (pme_pp->node[sender] == pme_pp->node_peer)
 +                {
 +                    pme_pp->nat[sender] = cnb.natoms;
 +                }
 +                else
 +                {
 +                    MPI_Irecv(&(pme_pp->nat[sender]), sizeof(pme_pp->nat[0]),
 +                              MPI_BYTE,
 +                              pme_pp->node[sender], 0,
 +                              pme_pp->mpi_comm_mysim, &pme_pp->req[messages++]);
 +                }
 +            }
 +            MPI_Waitall(messages, pme_pp->req, pme_pp->stat);
 +            messages = 0;
 +
 +            nat = 0;
 +            for (sender = 0; sender < pme_pp->nnode; sender++)
 +            {
 +                nat += pme_pp->nat[sender];
 +            }
 +
 +            if (nat > pme_pp->nalloc)
 +            {
 +                pme_pp->nalloc = over_alloc_dd(nat);
 +                srenew(pme_pp->chargeA, pme_pp->nalloc);
 +                if (cnb.flags & PP_PME_CHARGEB)
 +                {
 +                    srenew(pme_pp->chargeB, pme_pp->nalloc);
 +                }
 +                srenew(pme_pp->x, pme_pp->nalloc);
 +                srenew(pme_pp->f, pme_pp->nalloc);
 +            }
 +
 +            /* maxshift is sent when the charges are sent */
 +            *maxshift_x = cnb.maxshift_x;
 +            *maxshift_y = cnb.maxshift_y;
 +
 +            /* Receive the charges in place */
 +            for (q = 0; q < ((cnb.flags & PP_PME_CHARGEB) ? 2 : 1); q++)
 +            {
 +                if (q == 0)
 +                {
 +                    charge_pp = pme_pp->chargeA;
 +                }
 +                else
 +                {
 +                    charge_pp = pme_pp->chargeB;
 +                }
 +                nat = 0;
 +                for (sender = 0; sender < pme_pp->nnode; sender++)
 +                {
 +                    if (pme_pp->nat[sender] > 0)
 +                    {
 +                        MPI_Irecv(charge_pp+nat,
 +                                  pme_pp->nat[sender]*sizeof(real),
 +                                  MPI_BYTE,
 +                                  pme_pp->node[sender], 1+q,
 +                                  pme_pp->mpi_comm_mysim,
 +                                  &pme_pp->req[messages++]);
 +                        nat += pme_pp->nat[sender];
 +                        if (debug)
 +                        {
 +                            fprintf(debug, "Received from PP node %d: %d "
 +                                    "charges\n",
 +                                    pme_pp->node[sender], pme_pp->nat[sender]);
 +                        }
 +                    }
 +                }
 +            }
 +
 +            pme_pp->flags_charge = cnb.flags;
 +        }
 +
 +        if (cnb.flags & PP_PME_COORD)
 +        {
 +            if (!(pme_pp->flags_charge & PP_PME_CHARGE))
 +            {
 +                gmx_incons("PME-only node received coordinates before charges");
 +            }
 +
 +            /* The box, FE flag and lambda are sent along with the coordinates */
 +            copy_mat(cnb.box, box);
 +            *bFreeEnergy = (cnb.flags & PP_PME_FEP);
 +            *lambda      = cnb.lambda;
 +            *bEnerVir    = (cnb.flags & PP_PME_ENER_VIR);
 +
 +            if (*bFreeEnergy && !(pme_pp->flags_charge & PP_PME_CHARGEB))
 +            {
 +                gmx_incons("PME-only node received free energy request, but "
 +                           "did not receive B-state charges");
 +            }
 +
 +            /* Receive the coordinates in place */
 +            nat = 0;
 +            for (sender = 0; sender < pme_pp->nnode; sender++)
 +            {
 +                if (pme_pp->nat[sender] > 0)
 +                {
 +                    MPI_Irecv(pme_pp->x[nat], pme_pp->nat[sender]*sizeof(rvec),
 +                              MPI_BYTE,
 +                              pme_pp->node[sender], 3,
 +                              pme_pp->mpi_comm_mysim, &pme_pp->req[messages++]);
 +                    nat += pme_pp->nat[sender];
 +                    if (debug)
 +                    {
 +                        fprintf(debug, "Received from PP node %d: %d "
 +                                "coordinates\n",
 +                                pme_pp->node[sender], pme_pp->nat[sender]);
 +                    }
 +                }
 +            }
 +        }
 +
 +        /* Wait for the coordinates and/or charges to arrive */
 +        MPI_Waitall(messages, pme_pp->req, pme_pp->stat);
 +        messages = 0;
 +    }
 +    while (!(cnb.flags & (PP_PME_COORD | PP_PME_FINISH)));
 +
 +    *step = cnb.step;
 +#endif
 +
++    *natoms  = nat;
 +    *chargeA = pme_pp->chargeA;
 +    *chargeB = pme_pp->chargeB;
 +    *x       = pme_pp->x;
 +    *f       = pme_pp->f;
 +
++    return ((cnb.flags & PP_PME_FINISH) ? pmerecvqxFINISH : pmerecvqxX);
 +}
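 +
 +/* A minimal sketch of how a PME-only node drives gmx_pme_recv_q_x(),
 + * dispatching on its return value (illustration only; the real loop is
 + * gmx_pmeonly() in pme.c):
 + *
 + *     do
 + *     {
 + *         ret = gmx_pme_recv_q_x(pme_pp, &natoms, &chargeA, &chargeB, box,
 + *                                &x, &f, &mx, &my, &bFE, &lambda, &bEnerVir,
 + *                                &step, grid_size, &ewaldcoeff);
 + *         if      (ret == pmerecvqxSWITCHGRID)    reinit_pme_for_new_grid();
 + *         else if (ret == pmerecvqxRESETCOUNTERS) reset_cycle_counters(step);
 + *         else if (ret == pmerecvqxX)             do_pme_and_send_forces();
 + *     }
 + *     while (ret != pmerecvqxFINISH);
 + *
 + * The helper names above are placeholders, not real functions.
 + */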
 +
 +static void receive_virial_energy(t_commrec *cr,
 +                                  matrix vir, real *energy, real *dvdlambda,
 +                                  float *pme_cycles)
 +{
 +    gmx_pme_comm_vir_ene_t cve;
 +
 +    if (cr->dd->pme_receive_vir_ener)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug,
 +                    "PP node %d receiving from PME node %d: virial and energy\n",
 +                    cr->sim_nodeid, cr->dd->pme_nodeid);
 +        }
 +#ifdef GMX_MPI
 +        MPI_Recv(&cve, sizeof(cve), MPI_BYTE, cr->dd->pme_nodeid, 1, cr->mpi_comm_mysim,
 +                 MPI_STATUS_IGNORE);
 +#else
 +        memset(&cve, 0, sizeof(cve));
 +#endif
 +
 +        m_add(vir, cve.vir, vir);
 +        *energy     = cve.energy;
 +        *dvdlambda += cve.dvdlambda;
 +        *pme_cycles = cve.cycles;
 +
 +        if (cve.stop_cond != gmx_stop_cond_none)
 +        {
 +            gmx_set_stop_condition(cve.stop_cond);
 +        }
 +    }
 +    else
 +    {
 +        *energy     = 0;
 +        *pme_cycles = 0;
 +    }
 +}
 +
 +void gmx_pme_receive_f(t_commrec *cr,
 +                       rvec f[], matrix vir,
 +                       real *energy, real *dvdlambda,
 +                       float *pme_cycles)
 +{
 +    int natoms, i;
 +
 +#ifdef GMX_PME_DELAYED_WAIT
 +    /* Wait for the x request to finish */
 +    gmx_pme_send_q_x_wait(cr->dd);
 +#endif
 +
 +    natoms = cr->dd->nat_home;
 +
 +    if (natoms > cr->dd->pme_recv_f_alloc)
 +    {
 +        cr->dd->pme_recv_f_alloc = over_alloc_dd(natoms);
 +        srenew(cr->dd->pme_recv_f_buf, cr->dd->pme_recv_f_alloc);
 +    }
 +
 +#ifdef GMX_MPI
 +    MPI_Recv(cr->dd->pme_recv_f_buf[0],
 +             natoms*sizeof(rvec), MPI_BYTE,
 +             cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim,
 +             MPI_STATUS_IGNORE);
 +#endif
 +
 +    for (i = 0; i < natoms; i++)
 +    {
 +        rvec_inc(f[i], cr->dd->pme_recv_f_buf[i]);
 +    }
 +
 +    receive_virial_energy(cr, vir, energy, dvdlambda, pme_cycles);
 +}
 +
 +void gmx_pme_send_force_vir_ener(struct gmx_pme_pp *pme_pp,
 +                                 rvec *f, matrix vir,
 +                                 real energy, real dvdlambda,
 +                                 float cycles)
 +{
 +    gmx_pme_comm_vir_ene_t cve;
 +    int                    messages, ind_start, ind_end, receiver;
 +
 +    cve.cycles = cycles;
 +
 +    /* Now the evaluated forces have to be transferred to the PP nodes */
 +    messages = 0;
 +    ind_end  = 0;
 +    for (receiver = 0; receiver < pme_pp->nnode; receiver++)
 +    {
 +        ind_start = ind_end;
 +        ind_end   = ind_start + pme_pp->nat[receiver];
 +#ifdef GMX_MPI
 +        if (MPI_Isend(f[ind_start], (ind_end-ind_start)*sizeof(rvec), MPI_BYTE,
 +                      pme_pp->node[receiver], 0,
 +                      pme_pp->mpi_comm_mysim, &pme_pp->req[messages++]) != 0)
 +        {
 +            gmx_comm("MPI_Isend failed in do_pmeonly");
 +        }
 +#endif
 +    }
 +
 +    /* send virial and energy to our last PP node */
 +    copy_mat(vir, cve.vir);
 +    cve.energy    = energy;
 +    cve.dvdlambda = dvdlambda;
 +    /* check for the signals to send back to a PP node */
 +    cve.stop_cond = gmx_get_stop_condition();
 +
 +    cve.cycles = cycles;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "PME node sending to PP node %d: virial and energy\n",
 +                pme_pp->node_peer);
 +    }
 +#ifdef GMX_MPI
 +    MPI_Isend(&cve, sizeof(cve), MPI_BYTE,
 +              pme_pp->node_peer, 1,
 +              pme_pp->mpi_comm_mysim, &pme_pp->req[messages++]);
 +
 +    /* Wait for the forces to arrive */
 +    MPI_Waitall(messages, pme_pp->req, pme_pp->stat);
 +#endif
 +}
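 +
 +/* In the reverse (PME->PP) direction the convention is: tag 0 carries the
 + * forces, sent to every PP node, and tag 1 the virial/energy/cycle record
 + * (gmx_pme_comm_vir_ene_t), sent only to the peer PP node; see the matching
 + * receives in gmx_pme_receive_f() and receive_virial_energy() above.
 + */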
index 386e6907245425d3a965176fd23101010faea973,0000000000000000000000000000000000000000..119e8420f096b04f7f76a54364b34776662cba1d
mode 100644,000000..100644
--- /dev/null
@@@ -1,2079 -1,0 +1,2083 @@@
-     if (nrend-start > sd->sd_V_nalloc)
-     {
-         sd->sd_V_nalloc = over_alloc_dd(nrend-start);
-         srenew(sd->sd_V, sd->sd_V_nalloc);
-     }
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +
 +#include <stdio.h>
 +#include <math.h>
 +
 +#include "types/commrec.h"
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "typedefs.h"
 +#include "nrnb.h"
 +#include "physics.h"
 +#include "macros.h"
 +#include "vec.h"
 +#include "main.h"
 +#include "confio.h"
 +#include "update.h"
 +#include "gmx_random.h"
 +#include "futil.h"
 +#include "mshift.h"
 +#include "tgroup.h"
 +#include "force.h"
 +#include "names.h"
 +#include "txtdump.h"
 +#include "mdrun.h"
 +#include "copyrite.h"
 +#include "constr.h"
 +#include "edsam.h"
 +#include "pull.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "gmx_wallcycle.h"
 +#include "gmx_omp_nthreads.h"
 +#include "gmx_omp.h"
 +
 +/* For debugging, start at v(-dt/2) for velocity Verlet -- uncomment next line */
 +/*#define STARTFROMDT2*/
 +
 +typedef struct {
 +    double gdt;
 +    double eph;
 +    double emh;
 +    double em;
 +    double b;
 +    double c;
 +    double d;
 +} gmx_sd_const_t;
 +
 +typedef struct {
 +    real V;
 +    real X;
 +    real Yv;
 +    real Yx;
 +} gmx_sd_sigma_t;
 +
 +typedef struct {
 +    /* The random state for ngaussrand threads.
 +     * Normal thermostats need just 1 random number generator,
 +     * but SD and BD with OpenMP parallelization need 1 for each thread.
 +     */
 +    int             ngaussrand;
 +    gmx_rng_t      *gaussrand;
 +    /* BD stuff */
 +    real           *bd_rf;
 +    /* SD stuff */
 +    gmx_sd_const_t *sdc;
 +    gmx_sd_sigma_t *sdsig;
 +    rvec           *sd_V;
 +    int             sd_V_nalloc;
 +    /* andersen temperature control stuff */
 +    gmx_bool       *randomize_group;
 +    real           *boltzfac;
 +} gmx_stochd_t;
 +
 +typedef struct gmx_update
 +{
 +    gmx_stochd_t *sd;
 +    /* xprime for constraint algorithms */
 +    rvec         *xp;
 +    int           xp_nalloc;
 +
 +    /* variable size arrays for andersen */
 +    gmx_bool *randatom;
 +    int      *randatom_list;
 +    gmx_bool  randatom_list_init;
 +
 +    /* Variables for the deform algorithm */
 +    gmx_large_int_t deformref_step;
 +    matrix          deformref_box;
 +} t_gmx_update;
 +
 +
 +static void do_update_md(int start, int nrend, double dt,
 +                         t_grp_tcstat *tcstat,
 +                         double nh_vxi[],
 +                         gmx_bool bNEMD, t_grp_acc *gstat, rvec accel[],
 +                         ivec nFreeze[],
 +                         real invmass[],
 +                         unsigned short ptype[], unsigned short cFREEZE[],
 +                         unsigned short cACC[], unsigned short cTC[],
 +                         rvec x[], rvec xprime[], rvec v[],
 +                         rvec f[], matrix M,
 +                         gmx_bool bNH, gmx_bool bPR)
 +{
 +    double imass, w_dt;
 +    int    gf = 0, ga = 0, gt = 0;
 +    rvec   vrel;
 +    real   vn, vv, va, vb, vnrel;
 +    real   lg, vxi = 0, u;
 +    int    n, d;
 +
 +    if (bNH || bPR)
 +    {
 +        /* Update with coupling to extended ensembles, used for
 +         * Nose-Hoover and Parrinello-Rahman coupling
 +         * Nose-Hoover uses the reversible leap-frog integrator from
 +         * Holian et al. Phys Rev E 52(3) : 2338, 1995
 +         */
 +        for (n = start; n < nrend; n++)
 +        {
 +            imass = invmass[n];
 +            if (cFREEZE)
 +            {
 +                gf   = cFREEZE[n];
 +            }
 +            if (cACC)
 +            {
 +                ga   = cACC[n];
 +            }
 +            if (cTC)
 +            {
 +                gt   = cTC[n];
 +            }
 +            lg   = tcstat[gt].lambda;
 +            if (bNH)
 +            {
 +                vxi   = nh_vxi[gt];
 +            }
 +            rvec_sub(v[n], gstat[ga].u, vrel);
 +
 +            for (d = 0; d < DIM; d++)
 +            {
 +                if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
 +                {
 +                    vnrel = (lg*vrel[d] + dt*(imass*f[n][d] - 0.5*vxi*vrel[d]
 +                                              - iprod(M[d], vrel)))/(1 + 0.5*vxi*dt);
 +                    /* do not scale the mean velocities u */
 +                    vn             = gstat[ga].u[d] + accel[ga][d]*dt + vnrel;
 +                    v[n][d]        = vn;
 +                    xprime[n][d]   = x[n][d]+vn*dt;
 +                }
 +                else
 +                {
 +                    v[n][d]        = 0.0;
 +                    xprime[n][d]   = x[n][d];
 +                }
 +            }
 +        }
 +    }
 +    else if (cFREEZE != NULL ||
 +             nFreeze[0][XX] || nFreeze[0][YY] || nFreeze[0][ZZ] ||
 +             bNEMD)
 +    {
 +        /* Update with Berendsen/v-rescale coupling and freeze or NEMD */
 +        for (n = start; n < nrend; n++)
 +        {
 +            w_dt = invmass[n]*dt;
 +            if (cFREEZE)
 +            {
 +                gf   = cFREEZE[n];
 +            }
 +            if (cACC)
 +            {
 +                ga   = cACC[n];
 +            }
 +            if (cTC)
 +            {
 +                gt   = cTC[n];
 +            }
 +            lg   = tcstat[gt].lambda;
 +
 +            for (d = 0; d < DIM; d++)
 +            {
 +                vn             = v[n][d];
 +                if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
 +                {
 +                    vv             = lg*vn + f[n][d]*w_dt;
 +
 +                    /* do not scale the mean velocities u */
 +                    u              = gstat[ga].u[d];
 +                    va             = vv + accel[ga][d]*dt;
 +                    vb             = va + (1.0-lg)*u;
 +                    v[n][d]        = vb;
 +                    xprime[n][d]   = x[n][d]+vb*dt;
 +                }
 +                else
 +                {
 +                    v[n][d]        = 0.0;
 +                    xprime[n][d]   = x[n][d];
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* Plain update with Berendsen/v-rescale coupling */
 +        for (n = start; n < nrend; n++)
 +        {
 +            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell))
 +            {
 +                w_dt = invmass[n]*dt;
 +                if (cTC)
 +                {
 +                    gt = cTC[n];
 +                }
 +                lg = tcstat[gt].lambda;
 +
 +                for (d = 0; d < DIM; d++)
 +                {
 +                    vn           = lg*v[n][d] + f[n][d]*w_dt;
 +                    v[n][d]      = vn;
 +                    xprime[n][d] = x[n][d] + vn*dt;
 +                }
 +            }
 +            else
 +            {
 +                for (d = 0; d < DIM; d++)
 +                {
 +                    v[n][d]        = 0.0;
 +                    xprime[n][d]   = x[n][d];
 +                }
 +            }
 +        }
 +    }
 +}
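 +
 +/* In the plain branch above, the leap-frog update per degree of freedom is
 + *
 + *     v(t+dt/2) = lambda * v(t-dt/2) + (f(t)/m) * dt
 + *     x(t+dt)   = x(t) + v(t+dt/2) * dt
 + *
 + * with lambda the temperature-coupling scaling factor tcstat[gt].lambda.
 + * The extended-ensemble branch additionally applies the friction terms
 + * -0.5*vxi*v and -M.v and divides by (1 + 0.5*vxi*dt), following the
 + * reversible integrator of Holian et al. cited above.
 + */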
 +
 +static void do_update_vv_vel(int start, int nrend, double dt,
 +                             t_grp_tcstat *tcstat, t_grp_acc *gstat,
 +                             rvec accel[], ivec nFreeze[], real invmass[],
 +                             unsigned short ptype[], unsigned short cFREEZE[],
 +                             unsigned short cACC[], rvec v[], rvec f[],
 +                             gmx_bool bExtended, real veta, real alpha)
 +{
 +    double imass, w_dt;
 +    int    gf = 0, ga = 0;
 +    rvec   vrel;
 +    real   u, vn, vv, va, vb, vnrel;
 +    int    n, d;
 +    double g, mv1, mv2;
 +
 +    if (bExtended)
 +    {
 +        g        = 0.25*dt*veta*alpha;
 +        mv1      = exp(-g);
 +        mv2      = series_sinhx(g);
 +    }
 +    else
 +    {
 +        mv1      = 1.0;
 +        mv2      = 1.0;
 +    }
 +    for (n = start; n < nrend; n++)
 +    {
 +        w_dt = invmass[n]*dt;
 +        if (cFREEZE)
 +        {
 +            gf   = cFREEZE[n];
 +        }
 +        if (cACC)
 +        {
 +            ga   = cACC[n];
 +        }
 +
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
 +            {
 +                v[n][d]             = mv1*(mv1*v[n][d] + 0.5*(w_dt*mv2*f[n][d]))+0.5*accel[ga][d]*dt;
 +            }
 +            else
 +            {
 +                v[n][d]        = 0.0;
 +            }
 +        }
 +    }
 +} /* do_update_vv_vel */
 +
 +static void do_update_vv_pos(int start, int nrend, double dt,
 +                             t_grp_tcstat *tcstat, t_grp_acc *gstat,
 +                             rvec accel[], ivec nFreeze[], real invmass[],
 +                             unsigned short ptype[], unsigned short cFREEZE[],
 +                             rvec x[], rvec xprime[], rvec v[],
 +                             rvec f[], gmx_bool bExtended, real veta, real alpha)
 +{
 +    double imass, w_dt;
 +    int    gf = 0;
 +    int    n, d;
 +    double g, mr1, mr2;
 +
 +    /* Would it make more sense if Parrinello-Rahman was put here? */
 +    if (bExtended)
 +    {
 +        g        = 0.5*dt*veta;
 +        mr1      = exp(g);
 +        mr2      = series_sinhx(g);
 +    }
 +    else
 +    {
 +        mr1      = 1.0;
 +        mr2      = 1.0;
 +    }
 +
 +    for (n = start; n < nrend; n++)
 +    {
 +
 +        if (cFREEZE)
 +        {
 +            gf   = cFREEZE[n];
 +        }
 +
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
 +            {
 +                xprime[n][d]   = mr1*(mr1*x[n][d]+mr2*dt*v[n][d]);
 +            }
 +            else
 +            {
 +                xprime[n][d]   = x[n][d];
 +            }
 +        }
 +    }
 +} /* do_update_vv_pos */
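 +
 +/* Taken together, do_update_vv_vel() and do_update_vv_pos() implement one
 + * velocity-Verlet sub-step under extended-ensemble (MTTK) box scaling.
 + * With g_v = dt*veta*alpha/4 and g_x = dt*veta/2, and series_sinhx(g)
 + * evaluating the series expansion of sinh(g)/g, the updates above are
 + *
 + *     v' = exp(-2*g_v)*v + exp(-g_v)*(sinh(g_v)/g_v)*(f/m)*dt/2 + a*dt/2
 + *     x' = exp(2*g_x)*x + exp(g_x)*(sinh(g_x)/g_x)*v*dt
 + *
 + * For bExtended == FALSE both scaling factors reduce to 1 and this becomes
 + * plain velocity Verlet.
 + */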
 +
 +static void do_update_visc(int start, int nrend, double dt,
 +                           t_grp_tcstat *tcstat,
 +                           double nh_vxi[],
 +                           real invmass[],
 +                           unsigned short ptype[], unsigned short cTC[],
 +                           rvec x[], rvec xprime[], rvec v[],
 +                           rvec f[], matrix M, matrix box, real
 +                           cos_accel, real vcos,
 +                           gmx_bool bNH, gmx_bool bPR)
 +{
 +    double imass, w_dt;
 +    int    gt = 0;
 +    real   vn, vc;
 +    real   lg, vxi = 0, vv;
 +    real   fac, cosz;
 +    rvec   vrel;
 +    int    n, d;
 +
 +    fac = 2*M_PI/(box[ZZ][ZZ]);
 +
 +    if (bNH || bPR)
 +    {
 +        /* Update with coupling to extended ensembles, used for
 +         * Nose-Hoover and Parrinello-Rahman coupling
 +         */
 +        for (n = start; n < nrend; n++)
 +        {
 +            imass = invmass[n];
 +            if (cTC)
 +            {
 +                gt   = cTC[n];
 +            }
 +            lg   = tcstat[gt].lambda;
 +            cosz = cos(fac*x[n][ZZ]);
 +
 +            copy_rvec(v[n], vrel);
 +
 +            vc            = cosz*vcos;
 +            vrel[XX]     -= vc;
 +            if (bNH)
 +            {
 +                vxi        = nh_vxi[gt];
 +            }
 +            for (d = 0; d < DIM; d++)
 +            {
 +                vn             = v[n][d];
 +
 +                if ((ptype[n] != eptVSite) && (ptype[n] != eptShell))
 +                {
 +                    vn  = (lg*vrel[d] + dt*(imass*f[n][d] - 0.5*vxi*vrel[d]
 +                                            - iprod(M[d], vrel)))/(1 + 0.5*vxi*dt);
 +                    if (d == XX)
 +                    {
 +                        vn += vc + dt*cosz*cos_accel;
 +                    }
 +                    v[n][d]        = vn;
 +                    xprime[n][d]   = x[n][d]+vn*dt;
 +                }
 +                else
 +                {
 +                    xprime[n][d]   = x[n][d];
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* Classic version of update, used with Berendsen coupling */
 +        for (n = start; n < nrend; n++)
 +        {
 +            w_dt = invmass[n]*dt;
 +            if (cTC)
 +            {
 +                gt   = cTC[n];
 +            }
 +            lg   = tcstat[gt].lambda;
 +            cosz = cos(fac*x[n][ZZ]);
 +
 +            for (d = 0; d < DIM; d++)
 +            {
 +                vn             = v[n][d];
 +
 +                if ((ptype[n] != eptVSite) && (ptype[n] != eptShell))
 +                {
 +                    if (d == XX)
 +                    {
 +                        vc           = cosz*vcos;
 +                        /* Do not scale the cosine velocity profile */
 +                        vv           = vc + lg*(vn - vc + f[n][d]*w_dt);
 +                        /* Add the cosine acceleration profile */
 +                        vv          += dt*cosz*cos_accel;
 +                    }
 +                    else
 +                    {
 +                        vv           = lg*(vn + f[n][d]*w_dt);
 +                    }
 +                    v[n][d]        = vv;
 +                    xprime[n][d]   = x[n][d]+vv*dt;
 +                }
 +                else
 +                {
 +                    v[n][d]        = 0.0;
 +                    xprime[n][d]   = x[n][d];
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Allocates and initializes sd->gaussrand[i] for 1 <= i < sd->ngaussrand,
 + * using seeds generated from sd->gaussrand[0].
 + */
 +static void init_multiple_gaussrand(gmx_stochd_t *sd)
 +{
 +    int           ngr, i;
 +    unsigned int *seed;
 +
 +    ngr = sd->ngaussrand;
 +    snew(seed, ngr);
 +
 +    for (i = 1; i < ngr; i++)
 +    {
 +        seed[i] = gmx_rng_uniform_uint32(sd->gaussrand[0]);
 +    }
 +
 +#pragma omp parallel num_threads(ngr)
 +    {
 +        int th;
 +
 +        th = gmx_omp_get_thread_num();
 +        if (th > 0)
 +        {
 +            /* Initialize on each thread to have thread-local memory allocated */
 +            sd->gaussrand[th] = gmx_rng_init(seed[th]);
 +        }
 +    }
 +
 +    sfree(seed);
 +}
 +
 +static gmx_stochd_t *init_stochd(FILE *fplog, t_inputrec *ir, int nthreads)
 +{
 +    gmx_stochd_t   *sd;
 +    gmx_sd_const_t *sdc;
 +    int             ngtc, n, th;
 +    real            y;
 +
 +    snew(sd, 1);
 +
 +    /* Initialize the random number generator for Langevin-type dynamics:
 +     * BD, SD or velocity-rescaling temperature coupling.
 +     */
 +    if (ir->eI == eiBD || EI_SD(ir->eI))
 +    {
 +        sd->ngaussrand = nthreads;
 +    }
 +    else
 +    {
 +        sd->ngaussrand = 1;
 +    }
 +    snew(sd->gaussrand, sd->ngaussrand);
 +
 +    /* Initialize the first random generator */
 +    sd->gaussrand[0] = gmx_rng_init(ir->ld_seed);
 +
 +    if (sd->ngaussrand > 1)
 +    {
 +        /* Initialize the rest of the random number generators,
 +         * using the first one to generate seeds.
 +         */
 +        init_multiple_gaussrand(sd);
 +    }
 +
 +    ngtc = ir->opts.ngtc;
 +
 +    if (ir->eI == eiBD)
 +    {
 +        snew(sd->bd_rf, ngtc);
 +    }
 +    else if (EI_SD(ir->eI))
 +    {
 +        snew(sd->sdc, ngtc);
 +        snew(sd->sdsig, ngtc);
 +
 +        sdc = sd->sdc;
 +        for (n = 0; n < ngtc; n++)
 +        {
 +            if (ir->opts.tau_t[n] > 0)
 +            {
 +                sdc[n].gdt = ir->delta_t/ir->opts.tau_t[n];
 +                sdc[n].eph = exp(sdc[n].gdt/2);
 +                sdc[n].emh = exp(-sdc[n].gdt/2);
 +                sdc[n].em  = exp(-sdc[n].gdt);
 +            }
 +            else
 +            {
 +                /* No friction and noise on this group */
 +                sdc[n].gdt = 0;
 +                sdc[n].eph = 1;
 +                sdc[n].emh = 1;
 +                sdc[n].em  = 1;
 +            }
 +            if (sdc[n].gdt >= 0.05)
 +            {
 +                sdc[n].b = sdc[n].gdt*(sdc[n].eph*sdc[n].eph - 1)
 +                    - 4*(sdc[n].eph - 1)*(sdc[n].eph - 1);
 +                sdc[n].c = sdc[n].gdt - 3 + 4*sdc[n].emh - sdc[n].em;
 +                sdc[n].d = 2 - sdc[n].eph - sdc[n].emh;
 +            }
 +            else
 +            {
 +                y = sdc[n].gdt/2;
 +                /* Seventh order expansions for small y */
 +                sdc[n].b = y*y*y*y*(1/3.0+y*(1/3.0+y*(17/90.0+y*7/9.0)));
 +                sdc[n].c = y*y*y*(2/3.0+y*(-1/2.0+y*(7/30.0+y*(-1/12.0+y*31/1260.0))));
 +                sdc[n].d = y*y*(-1+y*y*(-1/12.0-y*y/360.0));
 +            }
 +            if (debug)
 +            {
 +                fprintf(debug, "SD const tc-grp %d: b %g  c %g  d %g\n",
 +                        n, sdc[n].b, sdc[n].c, sdc[n].d);
 +            }
 +        }
 +    }
 +    else if (ETC_ANDERSEN(ir->etc))
 +    {
 +        int        ngtc;
 +        t_grpopts *opts;
 +        real       reft;
 +
 +        opts = &ir->opts;
 +        ngtc = opts->ngtc;
 +
 +        snew(sd->randomize_group, ngtc);
 +        snew(sd->boltzfac, ngtc);
 +
 +        /* For now, assume that all groups, if randomized, are randomized
 +         * at the same rate, i.e. tau_t is the same, since constraint groups
 +         * don't necessarily match up with temperature groups. This is
 +         * checked in readir.c.
 +         */
 +
 +        for (n = 0; n < ngtc; n++)
 +        {
 +            reft = max(0.0, opts->ref_t[n]);
 +            if ((opts->tau_t[n] > 0) && (reft > 0))  /* tau_t or ref_t = 0 means that no randomization is done */
 +            {
 +                sd->randomize_group[n] = TRUE;
 +                sd->boltzfac[n]        = BOLTZ*opts->ref_t[n];
 +            }
 +            else
 +            {
 +                sd->randomize_group[n] = FALSE;
 +            }
 +        }
 +    }
 +    return sd;
 +}
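 +
 +/* For typical parameters the expansion branch above is the one taken:
 + * with tau_t = 1 ps and delta_t = 0.002 ps, gdt = delta_t/tau_t = 0.002,
 + * well below the 0.05 threshold, so b, c and d come from the seventh-order
 + * expansions in y = gdt/2 rather than the exact exponential expressions.
 + */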
 +
 +void get_stochd_state(gmx_update_t upd, t_state *state)
 +{
 +    /* Note that we only get the state of the first random generator,
 +     * even if there are multiple. This avoids repetition.
 +     */
 +    gmx_rng_get_state(upd->sd->gaussrand[0], state->ld_rng, state->ld_rngi);
 +}
 +
 +void set_stochd_state(gmx_update_t upd, t_state *state)
 +{
 +    gmx_stochd_t *sd;
 +    int           i;
 +
 +    sd = upd->sd;
 +
 +    gmx_rng_set_state(sd->gaussrand[0], state->ld_rng, state->ld_rngi[0]);
 +
 +    if (sd->ngaussrand > 1)
 +    {
 +        /* We only end up here with SD or BD with OpenMP.
 +         * Destroy and reinitialize the rest of the random number generators,
 +         * using seeds generated from the first one.
 +         * Although this doesn't recover the previous state,
 +         * it at least avoids repetition, which is most important.
 +         * Exactly restoring states with all MPI+OpenMP setups is difficult,
 +         * and as the integrator is stochastic to start with, it would not gain us much.
 +         */
 +        for (i = 1; i < sd->ngaussrand; i++)
 +        {
 +            gmx_rng_destroy(sd->gaussrand[i]);
 +        }
 +
 +        init_multiple_gaussrand(sd);
 +    }
 +}
 +
 +gmx_update_t init_update(FILE *fplog, t_inputrec *ir)
 +{
 +    t_gmx_update *upd;
 +
 +    snew(upd, 1);
 +
 +    if (ir->eI == eiBD || EI_SD(ir->eI) || ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc))
 +    {
 +        upd->sd = init_stochd(fplog, ir, gmx_omp_nthreads_get(emntUpdate));
 +    }
 +
 +    upd->xp                 = NULL;
 +    upd->xp_nalloc          = 0;
 +    upd->randatom           = NULL;
 +    upd->randatom_list      = NULL;
 +    upd->randatom_list_init = FALSE; /* we have not yet cleared the data structure at this point */
 +
 +    return upd;
 +}
 +
 +static void do_update_sd1(gmx_stochd_t *sd,
 +                          gmx_rng_t gaussrand,
 +                          int start, int nrend, double dt,
 +                          rvec accel[], ivec nFreeze[],
 +                          real invmass[], unsigned short ptype[],
 +                          unsigned short cFREEZE[], unsigned short cACC[],
 +                          unsigned short cTC[],
 +                          rvec x[], rvec xprime[], rvec v[], rvec f[],
 +                          rvec sd_X[],
 +                          int ngtc, real tau_t[], real ref_t[])
 +{
 +    gmx_sd_const_t *sdc;
 +    gmx_sd_sigma_t *sig;
 +    real            kT;
 +    int             gf = 0, ga = 0, gt = 0;
 +    real            ism, sd_V;
 +    int             n, d;
 +
 +    sdc = sd->sdc;
 +    sig = sd->sdsig;
-     sdc = sd->sdc;
-     sig = sd->sdsig;
-     if (nrend-start > sd->sd_V_nalloc)
-     {
-         sd->sd_V_nalloc = over_alloc_dd(nrend-start);
-         srenew(sd->sd_V, sd->sd_V_nalloc);
-     }
 +
 +    for (n = 0; n < ngtc; n++)
 +    {
 +        kT = BOLTZ*ref_t[n];
 +        /* The mass is accounted for later, since it differs per atom */
 +        sig[n].V  = sqrt(kT*(1 - sdc[n].em*sdc[n].em));
 +    }
 +
 +    for (n = start; n < nrend; n++)
 +    {
 +        ism = sqrt(invmass[n]);
 +        if (cFREEZE)
 +        {
 +            gf  = cFREEZE[n];
 +        }
 +        if (cACC)
 +        {
 +            ga  = cACC[n];
 +        }
 +        if (cTC)
 +        {
 +            gt  = cTC[n];
 +        }
 +
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
 +            {
 +                sd_V = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand);
 +
 +                v[n][d] = v[n][d]*sdc[gt].em
 +                    + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em)
 +                    + sd_V;
 +
 +                xprime[n][d] = x[n][d] + v[n][d]*dt;
 +            }
 +            else
 +            {
 +                v[n][d]      = 0.0;
 +                xprime[n][d] = x[n][d];
 +            }
 +        }
 +    }
 +}
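 +
 +/* The SD1 update above integrates the Langevin equation per degree of
 + * freedom as
 + *
 + *     v' = v*exp(-dt/tau_t) + (f/m + a)*tau_t*(1 - exp(-dt/tau_t)) + dv
 + *     x' = x + v'*dt
 + *
 + * where dv is Gaussian noise with variance (kT/m)*(1 - exp(-2*dt/tau_t)),
 + * so that the velocities relax towards the Maxwell-Boltzmann distribution
 + * at ref_t on the time scale tau_t.
 + */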
 +
++static void check_sd2_work_data_allocation(gmx_stochd_t *sd, int nrend)
++{
++    if (nrend > sd->sd_V_nalloc)
++    {
++        sd->sd_V_nalloc = over_alloc_dd(nrend);
++        srenew(sd->sd_V, sd->sd_V_nalloc);
++    }
++}
++
 +static void do_update_sd2(gmx_stochd_t *sd,
 +                          gmx_rng_t gaussrand,
 +                          gmx_bool bInitStep,
 +                          int start, int nrend,
 +                          rvec accel[], ivec nFreeze[],
 +                          real invmass[], unsigned short ptype[],
 +                          unsigned short cFREEZE[], unsigned short cACC[],
 +                          unsigned short cTC[],
 +                          rvec x[], rvec xprime[], rvec v[], rvec f[],
 +                          rvec sd_X[],
 +                          int ngtc, real tau_t[], real ref_t[],
 +                          gmx_bool bFirstHalf)
 +{
 +    gmx_sd_const_t *sdc;
 +    gmx_sd_sigma_t *sig;
 +    /* The random part of the velocity update, generated in the first
 +     * half of the update, needs to be remembered for the second half.
 +     */
 +    rvec  *sd_V;
 +    real   kT;
 +    int    gf = 0, ga = 0, gt = 0;
 +    real   vn = 0, Vmh, Xmh;
 +    real   ism;
 +    int    n, d;
 +
-                     sd_V[n-start][d] = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand);
++    sdc  = sd->sdc;
++    sig  = sd->sdsig;
 +    sd_V = sd->sd_V;
 +
 +    if (bFirstHalf)
 +    {
 +        for (n = 0; n < ngtc; n++)
 +        {
 +            kT = BOLTZ*ref_t[n];
 +            /* The mass is accounted for later, since it differs per atom */
 +            sig[n].V  = sqrt(kT*(1-sdc[n].em));
 +            sig[n].X  = sqrt(kT*sqr(tau_t[n])*sdc[n].c);
 +            sig[n].Yv = sqrt(kT*sdc[n].b/sdc[n].c);
 +            sig[n].Yx = sqrt(kT*sqr(tau_t[n])*sdc[n].b/(1-sdc[n].em));
 +        }
 +    }
 +
 +    for (n = start; n < nrend; n++)
 +    {
 +        ism = sqrt(invmass[n]);
 +        if (cFREEZE)
 +        {
 +            gf  = cFREEZE[n];
 +        }
 +        if (cACC)
 +        {
 +            ga  = cACC[n];
 +        }
 +        if (cTC)
 +        {
 +            gt  = cTC[n];
 +        }
 +
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if (bFirstHalf)
 +            {
 +                vn             = v[n][d];
 +            }
 +            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
 +            {
 +                if (bFirstHalf)
 +                {
 +                    if (bInitStep)
 +                    {
 +                        sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand);
 +                    }
 +                    Vmh = sd_X[n][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c)
 +                        + ism*sig[gt].Yv*gmx_rng_gaussian_table(gaussrand);
-                         + sd_V[n-start][d] - sdc[gt].em*Vmh;
++                    sd_V[n][d] = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand);
 +
 +                    v[n][d] = vn*sdc[gt].em
 +                        + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em)
-                     Xmh = sd_V[n-start][d]*tau_t[gt]*sdc[gt].d/(sdc[gt].em-1)
++                        + sd_V[n][d] - sdc[gt].em*Vmh;
 +
 +                    xprime[n][d] = x[n][d] + v[n][d]*tau_t[gt]*(sdc[gt].eph - sdc[gt].emh);
 +                }
 +                else
 +                {
 +
 +                    /* Correct the velocities for the constraints.
 +                     * This operation introduces some inaccuracy,
 +                     * since the velocity is determined from differences in coordinates.
 +                     */
 +                    v[n][d] =
 +                        (xprime[n][d] - x[n][d])/(tau_t[gt]*(sdc[gt].eph - sdc[gt].emh));
 +
++                    Xmh = sd_V[n][d]*tau_t[gt]*sdc[gt].d/(sdc[gt].em-1)
 +                        + ism*sig[gt].Yx*gmx_rng_gaussian_table(gaussrand);
 +                    sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand);
 +
 +                    xprime[n][d] += sd_X[n][d] - Xmh;
 +
 +                }
 +            }
 +            else
 +            {
 +                if (bFirstHalf)
 +                {
 +                    v[n][d]        = 0.0;
 +                    xprime[n][d]   = x[n][d];
 +                }
 +            }
 +        }
 +    }
 +}
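 +
 +/* Note: sd_V is indexed by the global atom index n, not by n-start, so
 + * check_sd2_work_data_allocation() must be called with nrend before the
 + * first half of the update. Indexing by n (rather than n-start) lets
 + * multiple update threads share one work array with disjoint writes.
 + */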
 +
 +static void do_update_bd(int start, int nrend, double dt,
 +                         ivec nFreeze[],
 +                         real invmass[], unsigned short ptype[],
 +                         unsigned short cFREEZE[], unsigned short cTC[],
 +                         rvec x[], rvec xprime[], rvec v[],
 +                         rvec f[], real friction_coefficient,
 +                         int ngtc, real tau_t[], real ref_t[],
 +                         real *rf, gmx_rng_t gaussrand)
 +{
 +    /* Note: these appear to be full-step velocities. */
 +    int    gf = 0, gt = 0;
 +    real   vn;
 +    real   invfr = 0;
 +    int    n, d;
 +
 +    if (friction_coefficient != 0)
 +    {
 +        invfr = 1.0/friction_coefficient;
 +        for (n = 0; n < ngtc; n++)
 +        {
 +            rf[n] = sqrt(2.0*BOLTZ*ref_t[n]/(friction_coefficient*dt));
 +        }
 +    }
 +    else
 +    {
 +        for (n = 0; n < ngtc; n++)
 +        {
 +            rf[n] = sqrt(2.0*BOLTZ*ref_t[n]);
 +        }
 +    }
 +    for (n = start; (n < nrend); n++)
 +    {
 +        if (cFREEZE)
 +        {
 +            gf = cFREEZE[n];
 +        }
 +        if (cTC)
 +        {
 +            gt = cTC[n];
 +        }
 +        for (d = 0; (d < DIM); d++)
 +        {
 +            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
 +            {
 +                if (friction_coefficient != 0)
 +                {
 +                    vn = invfr*f[n][d] + rf[gt]*gmx_rng_gaussian_table(gaussrand);
 +                }
 +                else
 +                {
 +                    /* NOTE: invmass = 2/(mass*friction_constant*dt) */
 +                    vn = 0.5*invmass[n]*f[n][d]*dt
 +                        + sqrt(0.5*invmass[n])*rf[gt]*gmx_rng_gaussian_table(gaussrand);
 +                }
 +
 +                v[n][d]      = vn;
 +                xprime[n][d] = x[n][d]+vn*dt;
 +            }
 +            else
 +            {
 +                v[n][d]      = 0.0;
 +                xprime[n][d] = x[n][d];
 +            }
 +        }
 +    }
 +}
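 +
 +/* Brownian dynamics above: with friction coefficient gamma, each step takes
 + *
 + *     x' = x + (f/gamma)*dt + dx,   dx Gaussian with variance 2*kT*dt/gamma
 + *
 + * (the stored "velocity" is just this displacement divided by dt, matching
 + * the full-step note at the top of the function). When gamma == 0, a
 + * per-atom friction m*gamma_m is used instead via the repurposed invmass,
 + * as the NOTE in the code indicates.
 + */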
 +
 +static void dump_it_all(FILE *fp, const char *title,
 +                        int natoms, rvec x[], rvec xp[], rvec v[], rvec f[])
 +{
 +#ifdef DEBUG
 +    if (fp)
 +    {
 +        fprintf(fp, "%s\n", title);
 +        pr_rvecs(fp, 0, "x", x, natoms);
 +        pr_rvecs(fp, 0, "xp", xp, natoms);
 +        pr_rvecs(fp, 0, "v", v, natoms);
 +        pr_rvecs(fp, 0, "f", f, natoms);
 +    }
 +#endif
 +}
 +
 +static void calc_ke_part_normal(rvec v[], t_grpopts *opts, t_mdatoms *md,
 +                                gmx_ekindata_t *ekind, t_nrnb *nrnb, gmx_bool bEkinAveVel,
 +                                gmx_bool bSaveEkinOld)
 +{
 +    int           g;
 +    t_grp_tcstat *tcstat  = ekind->tcstat;
 +    t_grp_acc    *grpstat = ekind->grpstat;
 +    int           nthread, thread;
 +
 +    /* Three main cases: VV with AveVel, VV with AveEkin, and leap-frog with AveEkin.
 +       Leap-frog with AveVel is also an option, but not supported now.
 +       There is an additional case when we are doing iterations.
 +       bEkinAveVel: if TRUE, we sum into ekin; if FALSE, into ekinh.
 +       bSaveEkinOld: if TRUE (when iterating, i.e. bIterate is TRUE), we don't
 +       copy over ekinh_old; if FALSE, we overwrite it.
 +     */
 +
 +    /* Group velocities are calculated in update_ekindata and
 +     * accumulated in accumulate_groups.
 +     * Now compute the partial global and per-group kinetic energies.
 +     */
 +    for (g = 0; (g < opts->ngtc); g++)
 +    {
 +
 +        if (!bSaveEkinOld)
 +        {
 +            copy_mat(tcstat[g].ekinh, tcstat[g].ekinh_old);
 +        }
 +        if (bEkinAveVel)
 +        {
 +            clear_mat(tcstat[g].ekinf);
 +        }
 +        else
 +        {
 +            clear_mat(tcstat[g].ekinh);
 +        }
 +        if (bEkinAveVel)
 +        {
 +            tcstat[g].ekinscalef_nhc = 1.0; /* need to clear this -- logic is complicated! */
 +        }
 +    }
 +    ekind->dekindl_old = ekind->dekindl;
 +
 +    nthread = gmx_omp_nthreads_get(emntUpdate);
 +
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        int     start_t, end_t, n;
 +        int     ga, gt;
 +        rvec    v_corrt;
 +        real    hm;
 +        int     d, m;
 +        matrix *ekin_sum;
 +        real   *dekindl_sum;
 +
 +        start_t = md->start + ((thread+0)*md->homenr)/nthread;
 +        end_t   = md->start + ((thread+1)*md->homenr)/nthread;
 +
 +        ekin_sum    = ekind->ekin_work[thread];
 +        dekindl_sum = &ekind->ekin_work[thread][opts->ngtc][0][0];
 +
 +        for (gt = 0; gt < opts->ngtc; gt++)
 +        {
 +            clear_mat(ekin_sum[gt]);
 +        }
 +
 +        ga = 0;
 +        gt = 0;
 +        for (n = start_t; n < end_t; n++)
 +        {
 +            if (md->cACC)
 +            {
 +                ga = md->cACC[n];
 +            }
 +            if (md->cTC)
 +            {
 +                gt = md->cTC[n];
 +            }
 +            hm   = 0.5*md->massT[n];
 +
 +            for (d = 0; (d < DIM); d++)
 +            {
 +                v_corrt[d]  = v[n][d]  - grpstat[ga].u[d];
 +            }
 +            for (d = 0; (d < DIM); d++)
 +            {
 +                for (m = 0; (m < DIM); m++)
 +                {
 +                    /* if we're computing a full step velocity, v_corrt[d] has v(t).  Otherwise, v(t+dt/2) */
 +                    ekin_sum[gt][m][d] += hm*v_corrt[m]*v_corrt[d];
 +                }
 +            }
 +            if (md->nMassPerturbed && md->bPerturbed[n])
 +            {
 +                *dekindl_sum -=
 +                    0.5*(md->massB[n] - md->massA[n])*iprod(v_corrt, v_corrt);
 +            }
 +        }
 +    }
 +
 +    ekind->dekindl = 0;
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        for (g = 0; g < opts->ngtc; g++)
 +        {
 +            if (bEkinAveVel)
 +            {
 +                m_add(tcstat[g].ekinf, ekind->ekin_work[thread][g],
 +                      tcstat[g].ekinf);
 +            }
 +            else
 +            {
 +                m_add(tcstat[g].ekinh, ekind->ekin_work[thread][g],
 +                      tcstat[g].ekinh);
 +            }
 +        }
 +
 +        ekind->dekindl += ekind->ekin_work[thread][opts->ngtc][0][0];
 +    }
 +
 +    inc_nrnb(nrnb, eNR_EKIN, md->homenr);
 +}
 +
 +static void calc_ke_part_visc(matrix box, rvec x[], rvec v[],
 +                              t_grpopts *opts, t_mdatoms *md,
 +                              gmx_ekindata_t *ekind,
 +                              t_nrnb *nrnb, gmx_bool bEkinAveVel, gmx_bool bSaveEkinOld)
 +{
 +    int           start = md->start, homenr = md->homenr;
 +    int           g, d, n, m, gt = 0;
 +    rvec          v_corrt;
 +    real          hm;
 +    t_grp_tcstat *tcstat = ekind->tcstat;
 +    t_cos_acc    *cosacc = &(ekind->cosacc);
 +    real          dekindl;
 +    real          fac, cosz;
 +    double        mvcos;
 +
 +    for (g = 0; g < opts->ngtc; g++)
 +    {
 +        copy_mat(ekind->tcstat[g].ekinh, ekind->tcstat[g].ekinh_old);
 +        clear_mat(ekind->tcstat[g].ekinh);
 +    }
 +    ekind->dekindl_old = ekind->dekindl;
 +
 +    fac     = 2*M_PI/box[ZZ][ZZ];
 +    mvcos   = 0;
 +    dekindl = 0;
 +    for (n = start; n < start+homenr; n++)
 +    {
 +        if (md->cTC)
 +        {
 +            gt = md->cTC[n];
 +        }
 +        hm   = 0.5*md->massT[n];
 +
 +        /* Note that the times of x and v differ by half a step */
 +        /* MRS -- would have to be changed for VV */
 +        cosz         = cos(fac*x[n][ZZ]);
 +        /* Calculate the amplitude of the new velocity profile */
 +        mvcos       += 2*cosz*md->massT[n]*v[n][XX];
 +
 +        copy_rvec(v[n], v_corrt);
 +        /* Subtract the profile for the kinetic energy */
 +        v_corrt[XX] -= cosz*cosacc->vcos;
 +        for (d = 0; (d < DIM); d++)
 +        {
 +            for (m = 0; (m < DIM); m++)
 +            {
 +                /* if we're computing a full step velocity, v_corrt[d] has v(t).  Otherwise, v(t+dt/2) */
 +                if (bEkinAveVel)
 +                {
 +                    tcstat[gt].ekinf[m][d] += hm*v_corrt[m]*v_corrt[d];
 +                }
 +                else
 +                {
 +                    tcstat[gt].ekinh[m][d] += hm*v_corrt[m]*v_corrt[d];
 +                }
 +            }
 +        }
 +        if (md->nPerturbed && md->bPerturbed[n])
 +        {
 +            dekindl -= 0.5*(md->massB[n] - md->massA[n])*iprod(v_corrt, v_corrt);
 +        }
 +    }
 +    ekind->dekindl = dekindl;
 +    cosacc->mvcos  = mvcos;
 +
 +    inc_nrnb(nrnb, eNR_EKIN, homenr);
 +}
 +
 +void calc_ke_part(t_state *state, t_grpopts *opts, t_mdatoms *md,
 +                  gmx_ekindata_t *ekind, t_nrnb *nrnb, gmx_bool bEkinAveVel, gmx_bool bSaveEkinOld)
 +{
 +    if (ekind->cosacc.cos_accel == 0)
 +    {
 +        calc_ke_part_normal(state->v, opts, md, ekind, nrnb, bEkinAveVel, bSaveEkinOld);
 +    }
 +    else
 +    {
 +        calc_ke_part_visc(state->box, state->x, state->v, opts, md, ekind, nrnb, bEkinAveVel, bSaveEkinOld);
 +    }
 +}
 +
 +extern void init_ekinstate(ekinstate_t *ekinstate, const t_inputrec *ir)
 +{
 +    ekinstate->ekin_n = ir->opts.ngtc;
 +    snew(ekinstate->ekinh, ekinstate->ekin_n);
 +    snew(ekinstate->ekinf, ekinstate->ekin_n);
 +    snew(ekinstate->ekinh_old, ekinstate->ekin_n);
 +    snew(ekinstate->ekinscalef_nhc, ekinstate->ekin_n);
 +    snew(ekinstate->ekinscaleh_nhc, ekinstate->ekin_n);
 +    snew(ekinstate->vscale_nhc, ekinstate->ekin_n);
 +    ekinstate->dekindl = 0;
 +    ekinstate->mvcos   = 0;
 +}
 +
 +void update_ekinstate(ekinstate_t *ekinstate, gmx_ekindata_t *ekind)
 +{
 +    int i;
 +
 +    for (i = 0; i < ekinstate->ekin_n; i++)
 +    {
 +        copy_mat(ekind->tcstat[i].ekinh, ekinstate->ekinh[i]);
 +        copy_mat(ekind->tcstat[i].ekinf, ekinstate->ekinf[i]);
 +        copy_mat(ekind->tcstat[i].ekinh_old, ekinstate->ekinh_old[i]);
 +        ekinstate->ekinscalef_nhc[i] = ekind->tcstat[i].ekinscalef_nhc;
 +        ekinstate->ekinscaleh_nhc[i] = ekind->tcstat[i].ekinscaleh_nhc;
 +        ekinstate->vscale_nhc[i]     = ekind->tcstat[i].vscale_nhc;
 +    }
 +
 +    copy_mat(ekind->ekin, ekinstate->ekin_total);
 +    ekinstate->dekindl = ekind->dekindl;
 +    ekinstate->mvcos   = ekind->cosacc.mvcos;
 +
 +}
 +
 +void restore_ekinstate_from_state(t_commrec *cr,
 +                                  gmx_ekindata_t *ekind, ekinstate_t *ekinstate)
 +{
 +    int i, n;
 +
 +    if (MASTER(cr))
 +    {
 +        for (i = 0; i < ekinstate->ekin_n; i++)
 +        {
 +            copy_mat(ekinstate->ekinh[i], ekind->tcstat[i].ekinh);
 +            copy_mat(ekinstate->ekinf[i], ekind->tcstat[i].ekinf);
 +            copy_mat(ekinstate->ekinh_old[i], ekind->tcstat[i].ekinh_old);
 +            ekind->tcstat[i].ekinscalef_nhc = ekinstate->ekinscalef_nhc[i];
 +            ekind->tcstat[i].ekinscaleh_nhc = ekinstate->ekinscaleh_nhc[i];
 +            ekind->tcstat[i].vscale_nhc     = ekinstate->vscale_nhc[i];
 +        }
 +
 +        copy_mat(ekinstate->ekin_total, ekind->ekin);
 +
 +        ekind->dekindl      = ekinstate->dekindl;
 +        ekind->cosacc.mvcos = ekinstate->mvcos;
 +        n                   = ekinstate->ekin_n;
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        gmx_bcast(sizeof(n), &n, cr);
 +        for (i = 0; i < n; i++)
 +        {
 +            gmx_bcast(DIM*DIM*sizeof(ekind->tcstat[i].ekinh[0][0]),
 +                      ekind->tcstat[i].ekinh[0], cr);
 +            gmx_bcast(DIM*DIM*sizeof(ekind->tcstat[i].ekinf[0][0]),
 +                      ekind->tcstat[i].ekinf[0], cr);
 +            gmx_bcast(DIM*DIM*sizeof(ekind->tcstat[i].ekinh_old[0][0]),
 +                      ekind->tcstat[i].ekinh_old[0], cr);
 +
 +            gmx_bcast(sizeof(ekind->tcstat[i].ekinscalef_nhc),
 +                      &(ekind->tcstat[i].ekinscalef_nhc), cr);
 +            gmx_bcast(sizeof(ekind->tcstat[i].ekinscaleh_nhc),
 +                      &(ekind->tcstat[i].ekinscaleh_nhc), cr);
 +            gmx_bcast(sizeof(ekind->tcstat[i].vscale_nhc),
 +                      &(ekind->tcstat[i].vscale_nhc), cr);
 +        }
 +        gmx_bcast(DIM*DIM*sizeof(ekind->ekin[0][0]),
 +                  ekind->ekin[0], cr);
 +
 +        gmx_bcast(sizeof(ekind->dekindl), &ekind->dekindl, cr);
 +        gmx_bcast(sizeof(ekind->cosacc.mvcos), &ekind->cosacc.mvcos, cr);
 +    }
 +}
 +
 +void set_deform_reference_box(gmx_update_t upd, gmx_large_int_t step, matrix box)
 +{
 +    upd->deformref_step = step;
 +    copy_mat(box, upd->deformref_box);
 +}
 +
 +static void deform(gmx_update_t upd,
 +                   int start, int homenr, rvec x[], matrix box, matrix *scale_tot,
 +                   const t_inputrec *ir, gmx_large_int_t step)
 +{
 +    matrix bnew, invbox, mu;
 +    real   elapsed_time;
 +    int    i, j;
 +
 +    elapsed_time = (step + 1 - upd->deformref_step)*ir->delta_t;
 +    copy_mat(box, bnew);
 +    for (i = 0; i < DIM; i++)
 +    {
 +        for (j = 0; j < DIM; j++)
 +        {
 +            if (ir->deform[i][j] != 0)
 +            {
 +                bnew[i][j] =
 +                    upd->deformref_box[i][j] + elapsed_time*ir->deform[i][j];
 +            }
 +        }
 +    }
 +    /* We correct the off-diagonal elements,
 +     * which can grow indefinitely during shearing,
 +     * so the shifts do not get messed up.
 +     */
 +    for (i = 1; i < DIM; i++)
 +    {
 +        for (j = i-1; j >= 0; j--)
 +        {
 +            while (bnew[i][j] - box[i][j] > 0.5*bnew[j][j])
 +            {
 +                rvec_dec(bnew[i], bnew[j]);
 +            }
 +            while (bnew[i][j] - box[i][j] < -0.5*bnew[j][j])
 +            {
 +                rvec_inc(bnew[i], bnew[j]);
 +            }
 +        }
 +    }
 +    m_inv_ur0(box, invbox);
 +    copy_mat(bnew, box);
 +    mmul_ur0(box, invbox, mu);
 +
 +    for (i = start; i < start+homenr; i++)
 +    {
 +        x[i][XX] = mu[XX][XX]*x[i][XX]+mu[YY][XX]*x[i][YY]+mu[ZZ][XX]*x[i][ZZ];
 +        x[i][YY] = mu[YY][YY]*x[i][YY]+mu[ZZ][YY]*x[i][ZZ];
 +        x[i][ZZ] = mu[ZZ][ZZ]*x[i][ZZ];
 +    }
 +    if (*scale_tot)
 +    {
 +        /* The transposes of the scaling matrices are stored,
 +         * so we need to do matrix multiplication in the inverse order.
 +         */
 +        mmul_ur0(*scale_tot, mu, *scale_tot);
 +    }
 +}
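 +
 +/* deform() above computes the scaling matrix mu = bnew * box^{-1} for the
 + * triangular GROMACS box representation and applies its transpose to the
 + * coordinates, so the atoms follow the affine deformation of the box; since
 + * *scale_tot stores transposed scaling matrices, the accumulation multiplies
 + * in the inverse order, as the in-code comment notes.
 + */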
 +
 +static void combine_forces(int nstcalclr,
 +                           gmx_constr_t constr,
 +                           t_inputrec *ir, t_mdatoms *md, t_idef *idef,
 +                           t_commrec *cr,
 +                           gmx_large_int_t step,
 +                           t_state *state, gmx_bool bMolPBC,
 +                           int start, int nrend,
 +                           rvec f[], rvec f_lr[],
 +                           t_nrnb *nrnb)
 +{
 +    int  i, d, nm1;
 +
 +    /* f contains the short-range forces + the long range forces
 +     * which are stored separately in f_lr.
 +     */
 +
 +    if (constr != NULL && !(ir->eConstrAlg == econtSHAKE && ir->epc == epcNO))
 +    {
 +        /* We need to constrain the LR forces separately: because the SR and
 +         * LR forces enter the update algorithm with different pre-factors,
 +         * the constraint force obtained from the coordinate constraining
 +         * cannot be used for both. Constrain only the additional LR part
 +         * of the force.
 +         */
 +        /* MRS -- need to make sure this works with Trotter integration -- the constraint calls may not be right. */
 +        constrain(NULL, FALSE, FALSE, constr, idef, ir, NULL, cr, step, 0, md,
 +                  state->x, f_lr, f_lr, bMolPBC, state->box, state->lambda[efptBONDED], NULL,
 +                  NULL, NULL, nrnb, econqForce, ir->epc == epcMTTK, state->veta, state->veta);
 +    }
 +
 +    /* Add nstcalclr-1 times the LR force to the sum of both forces
 +     * and store the result in forces_lr.
 +     */
 +    nm1 = nstcalclr - 1;
 +    for (i = start; i < nrend; i++)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            f_lr[i][d] = f[i][d] + nm1*f_lr[i][d];
 +        }
 +    }
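 +    /* With nstcalclr = n, f already holds f_sr + f_lr, so the loop above
 +     * leaves f_lr[i] = f[i] + (n-1)*f_lr[i] = f_sr[i] + n*f_lr[i]: the
 +     * long-range force enters n times as strongly on the (less frequent)
 +     * steps on which it is applied, as twin-range time stepping requires.
 +     */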
 +}
 +
 +void update_tcouple(FILE             *fplog,
 +                    gmx_large_int_t   step,
 +                    t_inputrec       *inputrec,
 +                    t_state          *state,
 +                    gmx_ekindata_t   *ekind,
 +                    gmx_wallcycle_t   wcycle,
 +                    gmx_update_t      upd,
 +                    t_extmass        *MassQ,
 +                    t_mdatoms        *md)
 +
 +{
 +    gmx_bool   bTCouple = FALSE;
 +    real       dttc;
 +    int        i, start, end, homenr, offset;
 +
 +    /* if using vv with trotter decomposition methods, we do this elsewhere in the code */
 +    if (inputrec->etc != etcNO &&
 +        !(IR_NVT_TROTTER(inputrec) || IR_NPT_TROTTER(inputrec) || IR_NPH_TROTTER(inputrec)))
 +    {
 +        /* We should only couple after a step where the energies were
 +           determined (for leap-frog versions) or on a step where the
 +           energies are determined (for velocity Verlet versions). */
 +
 +        if (EI_VV(inputrec->eI))
 +        {
 +            offset = 0;
 +        }
 +        else
 +        {
 +            offset = 1;
 +        }
 +        bTCouple = (inputrec->nsttcouple == 1 ||
 +                    do_per_step(step+inputrec->nsttcouple-offset,
 +                                inputrec->nsttcouple));
 +    }
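 +    /* The offset shifts the coupling step relative to the energy
 +     * evaluation: do_per_step(step+nsttcouple-offset, nsttcouple) holds
 +     * when step % nsttcouple == offset. With nsttcouple = 10, leap-frog
 +     * (offset 1) couples on steps 1, 11, 21, ..., i.e. right after the
 +     * energies were determined, while velocity Verlet (offset 0) couples
 +     * on steps 0, 10, 20, ..., on which they are determined.
 +     */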
 +
 +    if (bTCouple)
 +    {
 +        dttc = inputrec->nsttcouple*inputrec->delta_t;
 +
 +        switch (inputrec->etc)
 +        {
 +            case etcNO:
 +                break;
 +            case etcBERENDSEN:
 +                berendsen_tcoupl(inputrec, ekind, dttc);
 +                break;
 +            case etcNOSEHOOVER:
 +                nosehoover_tcoupl(&(inputrec->opts), ekind, dttc,
 +                                  state->nosehoover_xi, state->nosehoover_vxi, MassQ);
 +                break;
 +            case etcVRESCALE:
 +                vrescale_tcoupl(inputrec, ekind, dttc,
 +                                state->therm_integral, upd->sd->gaussrand[0]);
 +                break;
 +        }
 +        /* rescale in place here */
 +        if (EI_VV(inputrec->eI))
 +        {
 +            rescale_velocities(ekind, md, md->start, md->start+md->homenr, state->v);
 +        }
 +    }
 +    else
 +    {
 +        /* Set the T scaling lambda to 1 to have no scaling */
 +        for (i = 0; (i < inputrec->opts.ngtc); i++)
 +        {
 +            ekind->tcstat[i].lambda = 1.0;
 +        }
 +    }
 +}
 +
 +void update_pcouple(FILE             *fplog,
 +                    gmx_large_int_t   step,
 +                    t_inputrec       *inputrec,
 +                    t_state          *state,
 +                    matrix            pcoupl_mu,
 +                    matrix            M,
 +                    gmx_wallcycle_t   wcycle,
 +                    gmx_update_t      upd,
 +                    gmx_bool          bInitStep)
 +{
 +    gmx_bool   bPCouple = FALSE;
 +    real       dtpc     = 0;
 +    int        i;
 +
 +    /* if using Trotter pressure, we do this in coupling.c, so we leave it false. */
 +    if (inputrec->epc != epcNO && (!(IR_NPT_TROTTER(inputrec) || IR_NPH_TROTTER(inputrec))))
 +    {
 +        /* We should only couple after a step where energies were determined */
 +        bPCouple = (inputrec->nstpcouple == 1 ||
 +                    do_per_step(step+inputrec->nstpcouple-1,
 +                                inputrec->nstpcouple));
 +    }
 +
 +    clear_mat(pcoupl_mu);
 +    for (i = 0; i < DIM; i++)
 +    {
 +        pcoupl_mu[i][i] = 1.0;
 +    }
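 +    /* pcoupl_mu starts out as the identity matrix, so on steps without
 +     * pressure coupling the box and coordinates are left unscaled.
 +     */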
 +
 +    clear_mat(M);
 +
 +    if (bPCouple)
 +    {
 +        dtpc = inputrec->nstpcouple*inputrec->delta_t;
 +
 +        switch (inputrec->epc)
 +        {
 +            /* We can always pcoupl, even if we did not sum the energies
 +             * the previous step, since state->pres_prev is only updated
 +             * when the energies have been summed.
 +             */
 +            case (epcNO):
 +                break;
 +            case (epcBERENDSEN):
 +                if (!bInitStep)
 +                {
 +                    berendsen_pcoupl(fplog, step, inputrec, dtpc, state->pres_prev, state->box,
 +                                     pcoupl_mu);
 +                }
 +                break;
 +            case (epcPARRINELLORAHMAN):
 +                parrinellorahman_pcoupl(fplog, step, inputrec, dtpc, state->pres_prev,
 +                                        state->box, state->box_rel, state->boxv,
 +                                        M, pcoupl_mu, bInitStep);
 +                break;
 +            default:
 +                break;
 +        }
 +    }
 +}
 +
 +static rvec *get_xprime(const t_state *state, gmx_update_t upd)
 +{
 +    if (state->nalloc > upd->xp_nalloc)
 +    {
 +        upd->xp_nalloc = state->nalloc;
 +        srenew(upd->xp, upd->xp_nalloc);
 +    }
 +
 +    return upd->xp;
 +}
 +
 +void update_constraints(FILE             *fplog,
 +                        gmx_large_int_t   step,
 +                        real             *dvdlambda, /* the contribution to be added to the bonded interactions */
 +                        t_inputrec       *inputrec,  /* input record and box stuff    */
 +                        gmx_ekindata_t   *ekind,
 +                        t_mdatoms        *md,
 +                        t_state          *state,
 +                        gmx_bool          bMolPBC,
 +                        t_graph          *graph,
 +                        rvec              force[],   /* forces on home particles */
 +                        t_idef           *idef,
 +                        tensor            vir_part,
 +                        tensor            vir,       /* tensors for virial and ekin, needed for computing */
 +                        t_commrec        *cr,
 +                        t_nrnb           *nrnb,
 +                        gmx_wallcycle_t   wcycle,
 +                        gmx_update_t      upd,
 +                        gmx_constr_t      constr,
 +                        gmx_bool          bInitStep,
 +                        gmx_bool          bFirstHalf,
 +                        gmx_bool          bCalcVir,
 +                        real              vetanew)
 +{
 +    gmx_bool             bExtended, bLastStep, bLog = FALSE, bEner = FALSE, bDoConstr = FALSE;
 +    double               dt;
 +    real                 dt_1;
 +    int                  start, homenr, nrend, i, n, m, g, d;
 +    tensor               vir_con;
 +    rvec                *vbuf, *xprime = NULL;
 +    int                  nth, th;
 +
 +    if (constr)
 +    {
 +        bDoConstr = TRUE;
 +    }
 +    if (bFirstHalf && !EI_VV(inputrec->eI))
 +    {
 +        bDoConstr = FALSE;
 +    }
 +
 +    /* for now, the SD update is here -- though it really seems like it
 +       should be reformulated as a velocity Verlet method, since it has two parts */
 +
 +    start  = md->start;
 +    homenr = md->homenr;
 +    nrend  = start+homenr;
 +
 +    dt   = inputrec->delta_t;
 +    dt_1 = 1.0/dt;
 +
 +    /*
 +     *  Steps (7C, 8C)
 +     *  APPLY CONSTRAINTS:
 +     *  BLOCK SHAKE
 +     *
 +     * When doing PR pressure coupling we have to constrain the
 +     * bonds in each iteration. If we are only using Nose-Hoover tcoupling
 +     * it is enough to do this once though, since the relative velocities
 +     * after this will be normal to the bond vector.
 +     */
 +
 +    if (bDoConstr)
 +    {
 +        /* clear out constraints before applying */
 +        clear_mat(vir_part);
 +
 +        xprime = get_xprime(state, upd);
 +
 +        bLastStep = (step == inputrec->init_step+inputrec->nsteps);
 +        bLog      = (do_per_step(step, inputrec->nstlog) || bLastStep || (step < 0));
 +        bEner     = (do_per_step(step, inputrec->nstenergy) || bLastStep);
 +        /* Constrain the coordinates xprime */
 +        wallcycle_start(wcycle, ewcCONSTR);
 +        if (EI_VV(inputrec->eI) && bFirstHalf)
 +        {
 +            constrain(NULL, bLog, bEner, constr, idef,
 +                      inputrec, ekind, cr, step, 1, md,
 +                      state->x, state->v, state->v,
 +                      bMolPBC, state->box,
 +                      state->lambda[efptBONDED], dvdlambda,
 +                      NULL, bCalcVir ? &vir_con : NULL, nrnb, econqVeloc,
 +                      inputrec->epc == epcMTTK, state->veta, vetanew);
 +        }
 +        else
 +        {
 +            constrain(NULL, bLog, bEner, constr, idef,
 +                      inputrec, ekind, cr, step, 1, md,
 +                      state->x, xprime, NULL,
 +                      bMolPBC, state->box,
 +                      state->lambda[efptBONDED], dvdlambda,
 +                      state->v, bCalcVir ? &vir_con : NULL, nrnb, econqCoord,
 +                      inputrec->epc == epcMTTK, state->veta, state->veta);
 +        }
 +        wallcycle_stop(wcycle, ewcCONSTR);
 +
 +        where();
 +
 +        dump_it_all(fplog, "After Shake",
 +                    state->natoms, state->x, xprime, state->v, force);
 +
 +        if (bCalcVir)
 +        {
 +            if (inputrec->eI == eiSD2)
 +            {
 +                /* A correction factor eph is needed for the SD constraint force */
 +                /* Unfortunately we cannot have proper corrections here
 +                 * for different friction constants, so we use the first one.
 +                 */
 +                for (i = 0; i < DIM; i++)
 +                {
 +                    for (m = 0; m < DIM; m++)
 +                    {
 +                        vir_part[i][m] += upd->sd->sdc[0].eph*vir_con[i][m];
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                m_add(vir_part, vir_con, vir_part);
 +            }
 +            if (debug)
 +            {
 +                pr_rvecs(debug, 0, "constraint virial", vir_part, DIM);
 +            }
 +        }
 +    }
 +
 +    where();
 +    if ((inputrec->eI == eiSD2) && !(bFirstHalf))
 +    {
 +        xprime = get_xprime(state, upd);
 +
 +        nth = gmx_omp_nthreads_get(emntUpdate);
 +
 +#pragma omp parallel for num_threads(nth) schedule(static)
 +        for (th = 0; th < nth; th++)
 +        {
 +            int start_th, end_th;
 +
 +            start_th = start + ((nrend-start)* th   )/nth;
 +            end_th   = start + ((nrend-start)*(th+1))/nth;
 +
 +            /* The second part of the SD integration */
 +            do_update_sd2(upd->sd, upd->sd->gaussrand[th],
 +                          FALSE, start_th, end_th,
 +                          inputrec->opts.acc, inputrec->opts.nFreeze,
 +                          md->invmass, md->ptype,
 +                          md->cFREEZE, md->cACC, md->cTC,
 +                          state->x, xprime, state->v, force, state->sd_X,
 +                          inputrec->opts.ngtc, inputrec->opts.tau_t,
 +                          inputrec->opts.ref_t, FALSE);
 +        }
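 +        /* The per-thread ranges above split [start,nrend) into nth
 +         * contiguous, near-equal blocks; e.g. nrend-start = 10 with
 +         * nth = 3 gives the (relative) ranges [0,3), [3,6) and [6,10),
 +         * so every home atom is updated by exactly one thread.
 +         */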
 +        inc_nrnb(nrnb, eNR_UPDATE, homenr);
 +
 +        if (bDoConstr)
 +        {
 +            /* Constrain the coordinates xprime */
 +            wallcycle_start(wcycle, ewcCONSTR);
 +            constrain(NULL, bLog, bEner, constr, idef,
 +                      inputrec, NULL, cr, step, 1, md,
 +                      state->x, xprime, NULL,
 +                      bMolPBC, state->box,
 +                      state->lambda[efptBONDED], dvdlambda,
 +                      NULL, NULL, nrnb, econqCoord, FALSE, 0, 0);
 +            wallcycle_stop(wcycle, ewcCONSTR);
 +        }
 +    }
 +
 +    /* We must always unshift after updating coordinates; if we did not
 +       shake, x was shifted in do_force. */
 +
 +    if (!(bFirstHalf)) /* in the first half of vv, no shift. */
 +    {
 +        if (graph && (graph->nnodes > 0))
 +        {
 +            unshift_x(graph, state->box, state->x, upd->xp);
 +            if (TRICLINIC(state->box))
 +            {
 +                inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes);
 +            }
 +            else
 +            {
 +                inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes);
 +            }
 +        }
 +        else
 +        {
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntUpdate)) schedule(static)
 +            for (i = start; i < nrend; i++)
 +            {
 +                copy_rvec(upd->xp[i], state->x[i]);
 +            }
 +        }
 +
 +        dump_it_all(fplog, "After unshift",
 +                    state->natoms, state->x, upd->xp, state->v, force);
 +    }
 +/* ############# END the update of velocities and positions ######### */
 +}
 +
 +void update_box(FILE             *fplog,
 +                gmx_large_int_t   step,
 +                t_inputrec       *inputrec,  /* input record and box stuff    */
 +                t_mdatoms        *md,
 +                t_state          *state,
 +                t_graph          *graph,
 +                rvec              force[],   /* forces on home particles */
 +                matrix           *scale_tot,
 +                matrix            pcoupl_mu,
 +                t_nrnb           *nrnb,
 +                gmx_wallcycle_t   wcycle,
 +                gmx_update_t      upd,
 +                gmx_bool          bInitStep,
 +                gmx_bool          bFirstHalf)
 +{
 +    gmx_bool             bExtended, bLastStep, bLog = FALSE, bEner = FALSE;
 +    double               dt;
 +    real                 dt_1;
 +    int                  start, homenr, nrend, i, n, m, g;
 +    tensor               vir_con;
 +
 +    start  = md->start;
 +    homenr = md->homenr;
 +    nrend  = start+homenr;
 +
 +    bExtended =
 +        (inputrec->etc == etcNOSEHOOVER) ||
 +        (inputrec->epc == epcPARRINELLORAHMAN) ||
 +        (inputrec->epc == epcMTTK);
 +
 +    dt = inputrec->delta_t;
 +
 +    where();
 +
 +    /* now update boxes */
 +    switch (inputrec->epc)
 +    {
 +        case (epcNO):
 +            break;
 +        case (epcBERENDSEN):
 +            berendsen_pscale(inputrec, pcoupl_mu, state->box, state->box_rel,
 +                             start, homenr, state->x, md->cFREEZE, nrnb);
 +            break;
 +        case (epcPARRINELLORAHMAN):
 +            /* The box velocities were updated in do_pr_pcoupl in the update
 +             * iteration, but we don't change the box vectors until we get
 +             * here, since we need to be able to shift/unshift above.
 +             */
 +            for (i = 0; i < DIM; i++)
 +            {
 +                for (m = 0; m <= i; m++)
 +                {
 +                    state->box[i][m] += dt*state->boxv[i][m];
 +                }
 +            }
 +            preserve_box_shape(inputrec, state->box_rel, state->box);
 +
 +            /* Scale the coordinates */
 +            for (n = start; (n < start+homenr); n++)
 +            {
 +                tmvmul_ur0(pcoupl_mu, state->x[n], state->x[n]);
 +            }
 +            break;
 +        case (epcMTTK):
 +            switch (inputrec->epct)
 +            {
 +                case (epctISOTROPIC):
 +                    /* DIM * eta = ln V.  so DIM*eta_new = DIM*eta_old + DIM*dt*veta =>
 +                       ln V_new = ln V_old + 3*dt*veta => V_new = V_old*exp(3*dt*veta) =>
 +                       Side length scales as exp(veta*dt) */
 +
 +                    msmul(state->box, exp(state->veta*dt), state->box);
 +
 +                    /* Relate veta to boxv:  veta = d(eta)/dt = (1/DIM)*(1/V) dV/dt.
 +                       If we assume isotropic scaling with a box length
 +                       scaling factor L, then V = L^DIM det(M).  So dV/dt =
 +                       DIM L^(DIM-1) dL/dt det(M), and veta = (1/L) dL/dt.  The
 +                       determinant of B is L^DIM det(M), and the determinant
 +                       of dB/dt is (dL/dt)^DIM det(M).  veta will be
 +                       (det(dB/dt)/det(B))^(1/3).  Then since M =
 +                       B_new*(vol_new)^(1/3), dB/dt_new = (veta_new)*B_new. */
 +
 +                    msmul(state->box, state->veta, state->boxv);
 +                    break;
 +                default:
 +                    break;
 +            }
 +            break;
 +        default:
 +            break;
 +    }
 +
 +    if ((!(IR_NPT_TROTTER(inputrec) || IR_NPH_TROTTER(inputrec))) && scale_tot)
 +    {
 +        /* The transposes of the scaling matrices are stored,
 +         * therefore we need to reverse the order in the multiplication.
 +         */
 +        mmul_ur0(*scale_tot, pcoupl_mu, *scale_tot);
 +    }
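 +    /* Written without the stored transposes this is
 +     * scale_tot^T <- mu^T * scale_tot^T, i.e. the current step's scaling
 +     * is applied after the accumulated product, so *scale_tot tracks the
 +     * total coordinate scaling applied so far.
 +     */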
 +
 +    if (DEFORM(*inputrec))
 +    {
 +        deform(upd, start, homenr, state->x, state->box, scale_tot, inputrec, step);
 +    }
 +    where();
 +    dump_it_all(fplog, "After update",
 +                state->natoms, state->x, upd->xp, state->v, force);
 +}
 +
 +void update_coords(FILE             *fplog,
 +                   gmx_large_int_t   step,
 +                   t_inputrec       *inputrec,  /* input record and box stuff */
 +                   t_mdatoms        *md,
 +                   t_state          *state,
 +                   gmx_bool          bMolPBC,
 +                   rvec             *f,    /* forces on home particles */
 +                   gmx_bool          bDoLR,
 +                   rvec             *f_lr,
 +                   t_fcdata         *fcd,
 +                   gmx_ekindata_t   *ekind,
 +                   matrix            M,
 +                   gmx_wallcycle_t   wcycle,
 +                   gmx_update_t      upd,
 +                   gmx_bool          bInitStep,
 +                   int               UpdatePart,
 +                   t_commrec        *cr, /* these shouldn't be here -- need to think about it */
 +                   t_nrnb           *nrnb,
 +                   gmx_constr_t      constr,
 +                   t_idef           *idef)
 +{
 +    gmx_bool          bNH, bPR, bLastStep, bLog = FALSE, bEner = FALSE;
 +    double            dt, alpha;
 +    real             *imass, *imassin;
 +    rvec             *force;
 +    real              dt_1;
 +    int               start, homenr, nrend, i, j, d, n, m, g;
 +    int               blen0, blen1, iatom, jatom, nshake, nsettle, nconstr, nexpand;
 +    int              *icom = NULL;
 +    tensor            vir_con;
 +    rvec             *vcom, *xcom, *vall, *xall, *xin, *vin, *forcein, *fall, *xpall, *xprimein, *xprime;
 +    int               nth, th;
 +
 +    /* Running the velocity half does nothing except for velocity verlet */
 +    if ((UpdatePart == etrtVELOCITY1 || UpdatePart == etrtVELOCITY2) &&
 +        !EI_VV(inputrec->eI))
 +    {
 +        gmx_incons("update_coords called for velocity without VV integrator");
 +    }
 +
 +    start  = md->start;
 +    homenr = md->homenr;
 +    nrend  = start+homenr;
 +
 +    xprime = get_xprime(state, upd);
 +
 +    dt   = inputrec->delta_t;
 +    dt_1 = 1.0/dt;
 +
 +    /* We need to update the NMR restraint history when time averaging is used */
 +    if (state->flags & (1<<estDISRE_RM3TAV))
 +    {
 +        update_disres_history(fcd, &state->hist);
 +    }
 +    if (state->flags & (1<<estORIRE_DTAV))
 +    {
 +        update_orires_history(fcd, &state->hist);
 +    }
 +
 +
 +    bNH = inputrec->etc == etcNOSEHOOVER;
 +    bPR = ((inputrec->epc == epcPARRINELLORAHMAN) || (inputrec->epc == epcMTTK));
 +
 +    if (bDoLR && inputrec->nstcalclr > 1 && !EI_VV(inputrec->eI))  /* get this working with VV? */
 +    {
 +        /* Store the total force + nstcalclr-1 times the LR force
 +         * in forces_lr, so it can be used in a normal update algorithm
 +         * to produce twin time stepping.
 +         */
 +        /* is this correct in the new construction? MRS */
 +        combine_forces(inputrec->nstcalclr, constr, inputrec, md, idef, cr,
 +                       step, state, bMolPBC,
 +                       start, nrend, f, f_lr, nrnb);
 +        force = f_lr;
 +    }
 +    else
 +    {
 +        force = f;
 +    }
 +
 +    /* ############# START The update of velocities and positions ######### */
 +    where();
 +    dump_it_all(fplog, "Before update",
 +                state->natoms, state->x, xprime, state->v, force);
 +
 +    if (EI_RANDOM(inputrec->eI))
 +    {
 +        /* We still need to take care of generating random seeds properly
 +         * when multi-threading.
 +         */
 +        nth = 1;
 +    }
 +    else
 +    {
 +        nth = gmx_omp_nthreads_get(emntUpdate);
 +    }
 +
++    if (inputrec->eI == eiSD2)
++    {
++        check_sd2_work_data_allocation(upd->sd, nrend);
++    }
++
 +#pragma omp parallel for num_threads(nth) schedule(static) private(alpha)
 +    for (th = 0; th < nth; th++)
 +    {
 +        int start_th, end_th;
 +
 +        start_th = start + ((nrend-start)* th   )/nth;
 +        end_th   = start + ((nrend-start)*(th+1))/nth;
 +
 +        switch (inputrec->eI)
 +        {
 +            case (eiMD):
 +                if (ekind->cosacc.cos_accel == 0)
 +                {
 +                    do_update_md(start_th, end_th, dt,
 +                                 ekind->tcstat, state->nosehoover_vxi,
 +                                 ekind->bNEMD, ekind->grpstat, inputrec->opts.acc,
 +                                 inputrec->opts.nFreeze,
 +                                 md->invmass, md->ptype,
 +                                 md->cFREEZE, md->cACC, md->cTC,
 +                                 state->x, xprime, state->v, force, M,
 +                                 bNH, bPR);
 +                }
 +                else
 +                {
 +                    do_update_visc(start_th, end_th, dt,
 +                                   ekind->tcstat, state->nosehoover_vxi,
 +                                   md->invmass, md->ptype,
 +                                   md->cTC, state->x, xprime, state->v, force, M,
 +                                   state->box,
 +                                   ekind->cosacc.cos_accel,
 +                                   ekind->cosacc.vcos,
 +                                   bNH, bPR);
 +                }
 +                break;
 +            case (eiSD1):
 +                do_update_sd1(upd->sd, upd->sd->gaussrand[th],
 +                              start_th, end_th, dt,
 +                              inputrec->opts.acc, inputrec->opts.nFreeze,
 +                              md->invmass, md->ptype,
 +                              md->cFREEZE, md->cACC, md->cTC,
 +                              state->x, xprime, state->v, force, state->sd_X,
 +                              inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t);
 +                break;
 +            case (eiSD2):
 +                /* The SD update is done in 2 parts, because an extra
 +                 * constraint step is needed; the second part is done
 +                 * in update_constraints().
 +                 */
 +                do_update_sd2(upd->sd, upd->sd->gaussrand[th],
 +                              bInitStep, start_th, end_th,
 +                              inputrec->opts.acc, inputrec->opts.nFreeze,
 +                              md->invmass, md->ptype,
 +                              md->cFREEZE, md->cACC, md->cTC,
 +                              state->x, xprime, state->v, force, state->sd_X,
 +                              inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t,
 +                              TRUE);
 +                break;
 +            case (eiBD):
 +                do_update_bd(start_th, end_th, dt,
 +                             inputrec->opts.nFreeze, md->invmass, md->ptype,
 +                             md->cFREEZE, md->cTC,
 +                             state->x, xprime, state->v, force,
 +                             inputrec->bd_fric,
 +                             inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t,
 +                             upd->sd->bd_rf, upd->sd->gaussrand[th]);
 +                break;
 +            case (eiVV):
 +            case (eiVVAK):
 +                alpha = 1.0 + DIM/((double)inputrec->opts.nrdf[0]); /* assuming barostat coupled to group 0. */
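 +                /* alpha = 1 + DIM/Ndf (of group 0) is the MTTK correction
 +                 * factor for the barostat degrees of freedom; it is passed
 +                 * to the vv velocity/position updates below together with
 +                 * veta and the (bNH || bPR) flag.
 +                 */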
 +                switch (UpdatePart)
 +                {
 +                    case etrtVELOCITY1:
 +                    case etrtVELOCITY2:
 +                        do_update_vv_vel(start_th, end_th, dt,
 +                                         ekind->tcstat, ekind->grpstat,
 +                                         inputrec->opts.acc, inputrec->opts.nFreeze,
 +                                         md->invmass, md->ptype,
 +                                         md->cFREEZE, md->cACC,
 +                                         state->v, force,
 +                                         (bNH || bPR), state->veta, alpha);
 +                        break;
 +                    case etrtPOSITION:
 +                        do_update_vv_pos(start_th, end_th, dt,
 +                                         ekind->tcstat, ekind->grpstat,
 +                                         inputrec->opts.acc, inputrec->opts.nFreeze,
 +                                         md->invmass, md->ptype, md->cFREEZE,
 +                                         state->x, xprime, state->v, force,
 +                                         (bNH || bPR), state->veta, alpha);
 +                        break;
 +                }
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "Don't know how to update coordinates");
 +                break;
 +        }
 +    }
 +
 +}
 +
 +
 +void correct_ekin(FILE *log, int start, int end, rvec v[], rvec vcm, real mass[],
 +                  real tmass, tensor ekin)
 +{
 +    /*
 +     * This is a debugging routine. It should not be called in production code.
 +     *
 +     * The kinetic energy should be calculated according to:
 +     *   Ekin = 1/2 m (v-vcm)^2
 +     * However the correction is not always applied, since vcm may not be
 +     * known in time and we compute
 +     *   Ekin' = 1/2 m v^2 instead.
 +     * This can be corrected afterwards by computing
 +     *   Ekin = Ekin' + 1/2 m ( -2 v vcm + vcm^2)
 +     * or in shorthand:
 +     *   Ekin = Ekin' - m v vcm + 1/2 m vcm^2
 +     */
 +    int    i, j, k;
 +    real   m, tm;
 +    rvec   hvcm, mv;
 +    tensor dekin;
 +
 +    /* Local particles */
 +    clear_rvec(mv);
 +
 +    /* Processor dependent part. */
 +    tm = 0;
 +    for (i = start; (i < end); i++)
 +    {
 +        m      = mass[i];
 +        tm    += m;
 +        for (j = 0; (j < DIM); j++)
 +        {
 +            mv[j] += m*v[i][j];
 +        }
 +    }
 +    /* Shortcut */
 +    svmul(1/tmass, vcm, vcm);
 +    svmul(0.5, vcm, hvcm);
 +    clear_mat(dekin);
 +    for (j = 0; (j < DIM); j++)
 +    {
 +        for (k = 0; (k < DIM); k++)
 +        {
 +            dekin[j][k] += vcm[k]*(tm*hvcm[j]-mv[j]);
 +        }
 +    }
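 +    /* Each element dekin[j][k] = vcm[k]*(tm*hvcm[j] - mv[j]) is the
 +     * tensor form of the shorthand above,
 +     *   Ekin = Ekin' - m v vcm + 1/2 m vcm^2,
 +     * with mv the summed momentum and tm the summed mass of the local
 +     * particles.
 +     */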
 +    pr_rvecs(log, 0, "dekin", dekin, DIM);
 +    pr_rvecs(log, 0, " ekin", ekin, DIM);
 +    fprintf(log, "dekin = %g, ekin = %g  vcm = (%8.4f %8.4f %8.4f)\n",
 +            trace(dekin), trace(ekin), vcm[XX], vcm[YY], vcm[ZZ]);
 +    fprintf(log, "mv = (%8.4f %8.4f %8.4f)\n",
 +            mv[XX], mv[YY], mv[ZZ]);
 +}
 +
 +extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_large_int_t step, t_mdatoms *md, t_state *state, gmx_update_t upd, t_idef *idef, gmx_constr_t constr)
 +{
 +
 +    int  i;
 +    real rate = (ir->delta_t)/ir->opts.tau_t[0];
 +    /* Proceed with Andersen if 1) it is fixed-probability-per-particle
 +       Andersen, or 2) it is massive Andersen and this step is a multiple
 +       of tau_t/delta_t (i.e. of 1/rate). */
 +    if ((ir->etc == etcANDERSEN) || do_per_step(step, (int)(1.0/rate)))
 +    {
 +        srenew(upd->randatom, state->nalloc);
 +        srenew(upd->randatom_list, state->nalloc);
 +        if (upd->randatom_list_init == FALSE)
 +        {
 +            for (i = 0; i < state->nalloc; i++)
 +            {
 +                upd->randatom[i]      = FALSE;
 +                upd->randatom_list[i] = 0;
 +            }
 +            upd->randatom_list_init = TRUE;
 +        }
 +        andersen_tcoupl(ir, md, state, upd->sd->gaussrand[0], rate,
 +                        (ir->etc == etcANDERSEN) ? idef : NULL,
 +                        constr ? get_nblocks(constr) : 0,
 +                        constr ? get_sblock(constr) : NULL,
 +                        upd->randatom, upd->randatom_list,
 +                        upd->sd->randomize_group, upd->sd->boltzfac);
 +        return TRUE;
 +    }
 +    return FALSE;
 +}
index 39a25df690111199fa76aa8c547496a45e229963,0000000000000000000000000000000000000000..292e23d06d61a6281c34cde7ea714c59863dc4ba
mode 100644,000000..100644
--- /dev/null
@@@ -1,2221 -1,0 +1,2226 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "vcm.h"
 +#include "mdebin.h"
 +#include "nrnb.h"
 +#include "calcmu.h"
 +#include "index.h"
 +#include "vsite.h"
 +#include "update.h"
 +#include "ns.h"
 +#include "trnio.h"
 +#include "xtcio.h"
 +#include "mdrun.h"
 +#include "md_support.h"
 +#include "md_logging.h"
 +#include "confio.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "xvgr.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "xmdrun.h"
 +#include "ionize.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "partdec.h"
 +#include "topsort.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "shellfc.h"
 +#include "compute_io.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "txtdump.h"
 +#include "string2.h"
 +#include "pme_loadbal.h"
 +#include "bondf.h"
 +#include "membed.h"
 +#include "types/nlistheuristics.h"
 +#include "types/iteratedconstraints.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +static void reset_all_counters(FILE *fplog, t_commrec *cr,
 +                               gmx_large_int_t step,
 +                               gmx_large_int_t *step_rel, t_inputrec *ir,
 +                               gmx_wallcycle_t wcycle, t_nrnb *nrnb,
 +                               gmx_runtime_t *runtime,
 +                               nbnxn_cuda_ptr_t cu_nbv)
 +{
 +    char sbuf[STEPSTRSIZE];
 +
 +    /* Reset all the counters related to performance over the run */
 +    md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
 +                  gmx_step_str(step, sbuf));
 +
 +    if (cu_nbv)
 +    {
 +        nbnxn_cuda_reset_timings(cu_nbv);
 +    }
 +
 +    wallcycle_stop(wcycle, ewcRUN);
 +    wallcycle_reset_all(wcycle);
 +    if (DOMAINDECOMP(cr))
 +    {
 +        reset_dd_statistics_counters(cr->dd);
 +    }
 +    init_nrnb(nrnb);
 +    ir->init_step += *step_rel;
 +    ir->nsteps    -= *step_rel;
 +    *step_rel      = 0;
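 +    /* Shifting init_step and nsteps by step_rel and zeroing step_rel
 +     * makes all subsequent step accounting start from this point, as if
 +     * the run had just begun; the cycle and flop counters restart below.
 +     */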
 +    wallcycle_start(wcycle, ewcRUN);
 +    runtime_start(runtime);
 +    print_date_and_time(fplog, cr->nodeid, "Restarted time", runtime);
 +}
 +
 +double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 +             const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact,
 +             int nstglobalcomm,
 +             gmx_vsite_t *vsite, gmx_constr_t constr,
 +             int stepout, t_inputrec *ir,
 +             gmx_mtop_t *top_global,
 +             t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +             gmx_edsam_t ed, t_forcerec *fr,
 +             int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed,
 +             real cpt_period, real max_hours,
 +             const char *deviceOptions,
 +             unsigned long Flags,
 +             gmx_runtime_t *runtime)
 +{
 +    gmx_mdoutf_t   *outf;
 +    gmx_large_int_t step, step_rel;
 +    double          run_time;
 +    double          t, t0, lam0[efptNR];
 +    gmx_bool        bGStatEveryStep, bGStat, bCalcVir, bCalcEner;
 +    gmx_bool        bNS, bNStList, bSimAnn, bStopCM, bRerunMD, bNotLastFrame = FALSE,
 +                    bFirstStep, bStateFromCP, bStateFromTPX, bInitStep, bLastStep,
 +                    bBornRadii, bStartingFromCpt;
 +    gmx_bool          bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
 +    gmx_bool          do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE,
 +                      bForceUpdate = FALSE, bCPT;
 +    int               mdof_flags;
 +    gmx_bool          bMasterState;
 +    int               force_flags, cglo_flags;
 +    tensor            force_vir, shake_vir, total_vir, tmp_vir, pres;
 +    int               i, m;
 +    t_trxstatus      *status;
 +    rvec              mu_tot;
 +    t_vcm            *vcm;
 +    t_state          *bufstate = NULL;
 +    matrix           *scale_tot, pcoupl_mu, M, ebox;
 +    gmx_nlheur_t      nlh;
 +    t_trxframe        rerun_fr;
 +    gmx_repl_ex_t     repl_ex = NULL;
 +    int               nchkpt  = 1;
 +    gmx_localtop_t   *top;
 +    t_mdebin         *mdebin = NULL;
 +    df_history_t      df_history;
 +    t_state          *state    = NULL;
 +    rvec             *f_global = NULL;
 +    int               n_xtc    = -1;
 +    rvec             *x_xtc    = NULL;
 +    gmx_enerdata_t   *enerd;
 +    rvec             *f = NULL;
 +    gmx_global_stat_t gstat;
 +    gmx_update_t      upd   = NULL;
 +    t_graph          *graph = NULL;
 +    globsig_t         gs;
 +    gmx_rng_t         mcrng = NULL;
 +    gmx_bool          bFFscan;
 +    gmx_groups_t     *groups;
 +    gmx_ekindata_t   *ekind, *ekind_save;
 +    gmx_shellfc_t     shellfc;
 +    int               count, nconverged = 0;
 +    real              timestep = 0;
 +    double            tcount   = 0;
 +    gmx_bool          bIonize  = FALSE;
 +    gmx_bool          bTCR     = FALSE, bConverged = TRUE, bOK, bSumEkinhOld, bExchanged;
 +    gmx_bool          bAppend;
 +    gmx_bool          bResetCountersHalfMaxH = FALSE;
 +    gmx_bool          bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter;
 +    gmx_bool          bUpdateDoLR;
 +    real              mu_aver = 0, dvdl;
 +    int               a0, a1, gnx = 0, ii;
 +    atom_id          *grpindex = NULL;
 +    char             *grpname;
 +    t_coupl_rec      *tcr     = NULL;
 +    rvec             *xcopy   = NULL, *vcopy = NULL, *cbuf = NULL;
 +    matrix            boxcopy = {{0}}, lastbox;
 +    tensor            tmpvir;
 +    real              fom, oldfom, veta_save, pcurr, scalevir, tracevir;
 +    real              vetanew = 0;
 +    int               lamnew  = 0;
 +    /* for FEP */
 +    int               nstfep;
 +    real              rate;
 +    double            cycles;
 +    real              saved_conserved_quantity = 0;
 +    real              last_ekin                = 0;
 +    int               iter_i;
 +    t_extmass         MassQ;
 +    int             **trotter_seq;
 +    char              sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 +    int               handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/
 +    gmx_iterate_t     iterate;
 +    gmx_large_int_t   multisim_nsteps = -1;                        /* number of steps to do before the first multisim
 +                                                                      simulation stops. If equal to zero, don't
 +                                                                      communicate any more between the multisims. */
 +    /* PME load balancing data for GPU kernels */
 +    pme_load_balancing_t pme_loadbal = NULL;
 +    double               cycles_pmes;
 +    gmx_bool             bPMETuneTry = FALSE, bPMETuneRunning = FALSE;
 +
 +#ifdef GMX_FAHCORE
 +    /* Temporary addition for FAHCORE checkpointing */
 +    int chkpt_ret;
 +#endif
 +
 +    /* Check for special mdrun options */
 +    bRerunMD = (Flags & MD_RERUN);
 +    bIonize  = (Flags & MD_IONIZE);
 +    bFFscan  = (Flags & MD_FFSCAN);
 +    bAppend  = (Flags & MD_APPENDFILES);
 +    if (Flags & MD_RESETCOUNTERSHALFWAY)
 +    {
 +        if (ir->nsteps > 0)
 +        {
 +            /* Signal to reset the counters half the simulation steps. */
 +            wcycle_set_reset_counters(wcycle, ir->nsteps/2);
 +        }
 +        /* Signal to reset the counters halfway through the simulation time. */
 +        bResetCountersHalfMaxH = (max_hours > 0);
 +    }
 +
 +    /* md-vv uses averaged full step velocities for T-control;
 +       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control);
 +       md uses averaged half step kinetic energies to determine temperature unless overridden by GMX_EKIN_AVE_VEL. */
 +    bVV = EI_VV(ir->eI);
 +    if (bVV) /* to store the initial velocities while computing virial */
 +    {
 +        snew(cbuf, top_global->natoms);
 +    }
 +    /* all the iterative cases - only if there are constraints */
 +    bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
 +    gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to
 +                                          FALSE in this call.  The correct value, true or false,
 +                                          is set at each step, as it depends on the frequency of temperature
 +                                          and pressure control. */
 +    bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
 +
 +    if (bRerunMD)
 +    {
 +        /* Since we don't know if the frames read are related in any way,
 +         * rebuild the neighborlist at every step.
 +         */
 +        ir->nstlist       = 1;
 +        ir->nstcalcenergy = 1;
 +        nstglobalcomm     = 1;
 +    }
 +
 +    check_ir_old_tpx_versions(cr, fplog, ir, top_global);
 +
 +    nstglobalcomm   = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir);
 +    bGStatEveryStep = (nstglobalcomm == 1);
 +
 +    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
 +    {
 +        fprintf(fplog,
 +                "To reduce the energy communication with nstlist = -1\n"
 +                "the neighbor list validity should not be checked at every step,\n"
 +                "this means that exact integration is not guaranteed.\n"
 +                "The neighbor list validity is checked after:\n"
 +                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
 +                "In most cases this will result in exact integration.\n"
 +                "This reduces the energy communication by a factor of 2 to 3.\n"
 +                "If you want less energy communication, set nstlist > 3.\n\n");
 +    }
 +
 +    if (bRerunMD || bFFscan)
 +    {
 +        ir->nstxtcout = 0;
 +    }
 +    groups = &top_global->groups;
 +
 +    /* Initial values */
 +    init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda,
 +            &(state_global->fep_state), lam0,
 +            nrnb, top_global, &upd,
 +            nfile, fnm, &outf, &mdebin,
 +            force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, state_global, Flags);
 +
 +    clear_mat(total_vir);
 +    clear_mat(pres);
 +    /* Energy terms and groups */
 +    snew(enerd, 1);
 +    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
 +                  enerd);
 +    if (DOMAINDECOMP(cr))
 +    {
 +        f = NULL;
 +    }
 +    else
 +    {
 +        snew(f, top_global->natoms);
 +    }
 +
 +    /* lambda Monte Carlo random number generator */
 +    if (ir->bExpanded)
 +    {
 +        mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
 +    }
 +    /* copy the state into df_history */
 +    copy_df_history(&df_history, &state_global->dfhist);
 +
 +    /* Kinetic energy data */
 +    snew(ekind, 1);
 +    init_ekindata(fplog, top_global, &(ir->opts), ekind);
 +    /* needed for iteration of constraints */
 +    snew(ekind_save, 1);
 +    init_ekindata(fplog, top_global, &(ir->opts), ekind_save);
 +    /* Copy the cos acceleration to the groups struct */
 +    ekind->cosacc.cos_accel = ir->cos_accel;
 +
 +    gstat = global_stat_init(ir);
 +    debug_gmx();
 +
 +    /* Check for polarizable models and flexible constraints */
 +    shellfc = init_shell_flexcon(fplog,
 +                                 top_global, n_flexible_constraints(constr),
 +                                 (ir->bContinuation ||
 +                                  (DOMAINDECOMP(cr) && !MASTER(cr))) ?
 +                                 NULL : state_global->x);
 +
 +    if (DEFORM(*ir))
 +    {
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        set_deform_reference_box(upd,
 +                                 deform_init_init_step_tpx,
 +                                 deform_init_box_tpx);
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    {
 +        double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
 +        if ((io > 2000) && MASTER(cr))
 +        {
 +            fprintf(stderr,
 +                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
 +                    io);
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        top = dd_init_local_top(top_global);
 +
 +        snew(state, 1);
 +        dd_init_local_state(cr->dd, state_global, state);
 +
 +        if (DDMASTER(cr->dd) && ir->nstfout)
 +        {
 +            snew(f_global, state_global->natoms);
 +        }
 +    }
 +    else
 +    {
 +        if (PAR(cr))
 +        {
 +            /* Initialize the particle decomposition and split the topology */
 +            top = split_system(fplog, top_global, ir, cr);
 +
 +            pd_cg_range(cr, &fr->cg0, &fr->hcg);
 +            pd_at_range(cr, &a0, &a1);
 +        }
 +        else
 +        {
 +            top = gmx_mtop_generate_local_top(top_global, ir);
 +
 +            a0 = 0;
 +            a1 = top_global->natoms;
 +        }
 +
 +        forcerec_set_excl_load(fr, top, cr);
 +
 +        state    = partdec_init_local_state(cr, state_global);
 +        f_global = f;
 +
 +        atoms2md(top_global, ir, 0, NULL, a0, a1-a0, mdatoms);
 +
 +        if (vsite)
 +        {
 +            set_vsite_top(vsite, top, mdatoms, cr);
 +        }
 +
 +        if (ir->ePBC != epbcNONE && !fr->bMolPBC)
 +        {
 +            graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE);
 +        }
 +
 +        if (shellfc)
 +        {
 +            make_local_shells(cr, mdatoms, shellfc);
 +        }
 +
 +        init_bonded_thread_force_reduction(fr, &top->idef);
 +
 +        if (ir->pull && PAR(cr))
 +        {
 +            dd_make_local_pull_groups(NULL, ir->pull, mdatoms);
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        /* Distribute the charge groups over the nodes from the master node */
 +        dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
 +                            state_global, top_global, ir,
 +                            state, &f, mdatoms, top, fr,
 +                            vsite, shellfc, constr,
 +                            nrnb, wcycle, FALSE);
 +
 +    }
 +
 +    update_mdatoms(mdatoms, state->lambda[efptMASS]);
 +
 +    if (opt2bSet("-cpi", nfile, fnm))
 +    {
 +        bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr);
 +    }
 +    else
 +    {
 +        bStateFromCP = FALSE;
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        if (bStateFromCP)
 +        {
 +            /* Update mdebin with energy history if appending to output files */
 +            if (Flags & MD_APPENDFILES)
 +            {
 +                restore_energyhistory_from_state(mdebin, &state_global->enerhist);
 +            }
 +            else
 +            {
 +                /* We might have read an energy history from checkpoint,
 +                 * free the allocated memory and reset the counts.
 +                 */
 +                done_energyhistory(&state_global->enerhist);
 +                init_energyhistory(&state_global->enerhist);
 +            }
 +        }
 +        /* Set the initial energy history in state by updating once */
 +        update_energyhistory(&state_global->enerhist, mdebin);
 +    }
 +
 +    if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG))
 +    {
 +        /* Set the random state if we read a checkpoint file */
 +        set_stochd_state(upd, state);
 +    }
 +
 +    if (state->flags & (1<<estMC_RNG))
 +    {
 +        set_mc_state(mcrng, state);
 +    }
 +
 +    /* Initialize constraints */
 +    if (constr)
 +    {
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            set_constraints(constr, top, ir, mdatoms, cr);
 +        }
 +    }
 +
 +    /* Check whether we have to do GCT stuff */
 +    bTCR = ftp2bSet(efGCT, nfile, fnm);
 +    if (bTCR)
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr, "Will do General Coupling Theory!\n");
 +        }
 +        gnx = top_global->mols.nr;
 +        snew(grpindex, gnx);
 +        for (i = 0; (i < gnx); i++)
 +        {
 +            grpindex[i] = i;
 +        }
 +    }
 +
 +    if (repl_ex_nst > 0)
 +    {
 +        /* We need to be sure replica exchange can only occur
 +         * when the energies are current */
 +        check_nst_param(fplog, cr, "nstcalcenergy", ir->nstcalcenergy,
 +                        "repl_ex_nst", &repl_ex_nst);
 +        /* This check needs to happen before inter-simulation
 +         * signals are initialized, too */
 +    }
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +    {
 +        repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir,
 +                                        repl_ex_nst, repl_ex_nex, repl_ex_seed);
 +    }
 +
 +    /* PME tuning is only supported with GPUs or PME nodes and not with rerun */
 +    if ((Flags & MD_TUNEPME) &&
 +        EEL_PME(fr->eeltype) &&
 +        ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
 +        !bRerunMD)
 +    {
 +        pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
 +        cycles_pmes = 0;
 +        if (cr->duty & DUTY_PME)
 +        {
 +            /* Start tuning right away, as we can't measure the load */
 +            bPMETuneRunning = TRUE;
 +        }
 +        else
 +        {
 +            /* Separate PME nodes, we can measure the PP/PME load balance */
 +            bPMETuneTry = TRUE;
 +        }
 +    }
 +
 +    if (!ir->bContinuation && !bRerunMD)
 +    {
 +        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
 +        {
 +            /* Set the velocities of frozen particles to zero */
 +            for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
 +            {
 +                for (m = 0; m < DIM; m++)
 +                {
 +                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 +                    {
 +                        state->v[i][m] = 0;
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (constr)
 +        {
 +            /* Constrain the initial coordinates and velocities */
 +            do_constrain_first(fplog, constr, ir, mdatoms, state, f,
 +                               graph, cr, nrnb, fr, top, shake_vir);
 +        }
 +        if (vsite)
 +        {
 +            /* Construct the virtual sites for the initial configuration */
 +            construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, NULL,
 +                             top->idef.iparams, top->idef.il,
 +                             fr->ePBC, fr->bMolPBC, graph, cr, state->box);
 +        }
 +    }
 +
 +    debug_gmx();
 +
 +    /* set free energy calculation frequency as the minimum of nstdhdl, nstexpanded, and repl_ex_nst */
 +    nstfep = ir->fepvals->nstdhdl;
 +    if (ir->bExpanded && (nstfep > ir->expandedvals->nstexpanded))
 +    {
 +        nstfep = ir->expandedvals->nstexpanded;
 +    }
 +    if (repl_ex_nst > 0 && nstfep > repl_ex_nst)
 +    {
 +        nstfep = repl_ex_nst;
 +    }
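 +    /* nstfep now holds the smallest of the applicable intervals; e.g.
 +     * nstdhdl = 100, nstexpanded = 50 and repl_ex_nst = 25 give
 +     * nstfep = 25, so the free energy differences are evaluated often
 +     * enough for both expanded ensemble moves and replica exchange.
 +     */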
 +
 +    /* I'm assuming we need global communication the first time! MRS */
 +    cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
 +                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0)
 +                  | (bVV ? CGLO_PRESSURE : 0)
 +                  | (bVV ? CGLO_CONSTRAINT : 0)
 +                  | (bRerunMD ? CGLO_RERUNMD : 0)
 +                  | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0));
 +
 +    bSumEkinhOld = FALSE;
 +    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                    NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
 +                    constr, NULL, FALSE, state->box,
 +                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld, cglo_flags);
 +    if (ir->eI == eiVVAK)
 +    {
 +        /* a second call to get the half step temperature initialized as well */
 +        /* we do the same call as above, but turn the pressure off -- internally to
 +           compute_globals, this is recognized as a velocity verlet half-step
 +           kinetic energy calculation.  This minimizes excess variables, but
 +           perhaps loses some logic? */
 +
 +        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                        NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
 +                        constr, NULL, FALSE, state->box,
 +                        top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
 +                        cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE));
 +    }
 +
 +    /* Calculate the initial half step temperature, and save the ekinh_old */
 +    if (!(Flags & MD_STARTFROMCPT))
 +    {
 +        for (i = 0; (i < ir->opts.ngtc); i++)
 +        {
 +            copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 +        }
 +    }
 +    if (ir->eI != eiVV)
 +    {
 +        enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
 +                                     and there is no previous step */
 +    }
 +
 +    /* if using an iterative algorithm, we need to create a working buffer copy of the state. */
 +    if (bIterativeCase)
 +    {
 +        bufstate = init_bufstate(state);
 +    }
 +    if (bFFscan)
 +    {
 +        snew(xcopy, state->natoms);
 +        snew(vcopy, state->natoms);
 +        copy_rvecn(state->x, xcopy, 0, state->natoms);
 +        copy_rvecn(state->v, vcopy, 0, state->natoms);
 +        copy_mat(state->box, boxcopy);
 +    }
 +
 +    /* need to make an initialization call to get the Trotter variables set, as well as other constants
 +       for non-Trotter temperature control */
 +    trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 +
 +    if (MASTER(cr))
 +    {
 +        if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
 +        {
 +            fprintf(fplog,
 +                    "RMS relative constraint deviation after constraining: %.2e\n",
 +                    constr_rmsd(constr, FALSE));
 +        }
 +        if (EI_STATE_VELOCITY(ir->eI))
 +        {
 +            fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]);
 +        }
 +        if (bRerunMD)
 +        {
 +            fprintf(stderr, "starting md rerun '%s', reading coordinates from"
 +                    " input trajectory '%s'\n\n",
 +                    *(top_global->name), opt2fn("-rerun", nfile, fnm));
 +            if (bVerbose)
 +            {
 +                fprintf(stderr, "Calculated time to finish depends on nsteps from "
 +                        "run input file,\nwhich may not correspond to the time "
 +                        "needed to process input trajectory.\n\n");
 +            }
 +        }
 +        else
 +        {
 +            char tbuf[20];
 +            fprintf(stderr, "starting mdrun '%s'\n",
 +                    *(top_global->name));
 +            if (ir->nsteps >= 0)
 +            {
 +                sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
 +            }
 +            else
 +            {
 +                sprintf(tbuf, "%s", "infinite");
 +            }
 +            if (ir->init_step > 0)
 +            {
 +                fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 +                        gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
 +                        gmx_step_str(ir->init_step, sbuf2),
 +                        ir->init_step*ir->delta_t);
 +            }
 +            else
 +            {
 +                fprintf(stderr, "%s steps, %s ps.\n",
 +                        gmx_step_str(ir->nsteps, sbuf), tbuf);
 +            }
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +
 +    /* Set and write start time */
 +    runtime_start(runtime);
 +    print_date_and_time(fplog, cr->nodeid, "Started mdrun", runtime);
 +    wallcycle_start(wcycle, ewcRUN);
 +    if (fplog)
 +    {
 +        fprintf(fplog, "\n");
 +    }
 +
 +    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 +#ifdef GMX_FAHCORE
 +    chkpt_ret = fcCheckPointParallel( cr->nodeid,
 +                                      NULL, 0);
 +    if (chkpt_ret == 0)
 +    {
 +        gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
 +    }
 +#endif
 +
 +    debug_gmx();
 +    /***********************************************************
 +     *
 +     *             Loop over MD steps
 +     *
 +     ************************************************************/
 +
 +    /* if rerunMD then read coordinates and velocities from input trajectory */
 +    if (bRerunMD)
 +    {
 +        if (getenv("GMX_FORCE_UPDATE"))
 +        {
 +            bForceUpdate = TRUE;
 +        }
 +
 +        rerun_fr.natoms = 0;
 +        if (MASTER(cr))
 +        {
 +            bNotLastFrame = read_first_frame(oenv, &status,
 +                                             opt2fn("-rerun", nfile, fnm),
 +                                             &rerun_fr, TRX_NEED_X | TRX_READ_V);
 +            if (rerun_fr.natoms != top_global->natoms)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Number of atoms in trajectory (%d) does not match the "
 +                          "run input file (%d)\n",
 +                          rerun_fr.natoms, top_global->natoms);
 +            }
 +            if (ir->ePBC != epbcNONE)
 +            {
 +                if (!rerun_fr.bBox)
 +                {
 +                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time);
 +                }
 +                if (max_cutoff2(ir->ePBC, rerun_fr.box) < sqr(fr->rlistlong))
 +                {
 +                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time);
 +                }
 +            }
 +        }
 +
 +        if (PAR(cr))
 +        {
 +            rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
 +        }
 +
 +        if (ir->ePBC != epbcNONE)
 +        {
 +            /* Set the shift vectors.
 +             * Necessary here when we have a static box different from the tpr box.
 +             */
 +            calc_shifts(rerun_fr.box, fr->shift_vec);
 +        }
 +    }
 +
 +    /* loop over MD steps or, if rerunMD, to the end of the input trajectory */
 +    bFirstStep = TRUE;
 +    /* Skip the first Nose-Hoover integration when we get the state from tpx */
 +    bStateFromTPX    = !bStateFromCP;
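 +    /* bInitStep marks the very first integration step, which needs special
 +     * handling below, e.g. for the velocity Verlet half-step bookkeeping. */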
 +    bInitStep        = bFirstStep && (bStateFromTPX || bVV);
 +    bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
 +    bLastStep        = FALSE;
 +    bSumEkinhOld     = FALSE;
 +    bExchanged       = FALSE;
 +
 +    init_global_signals(&gs, cr, ir, repl_ex_nst);
 +
 +    step     = ir->init_step;
 +    step_rel = 0;
 +
 +    if (ir->nstlist == -1)
 +    {
 +        init_nlistheuristics(&nlh, bGStatEveryStep, step);
 +    }
 +
 +    if (MULTISIM(cr) && (repl_ex_nst <= 0 ))
 +    {
 +        /* check how many steps are left in other sims */
 +        multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps);
 +    }
 +
 +
 +    /* and stop now if we should */
 +    bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
 +                 ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
 +    while (!bLastStep || (bRerunMD && bNotLastFrame))
 +    {
 +
 +        wallcycle_start(wcycle, ewcSTEP);
 +
 +        if (bRerunMD)
 +        {
 +            if (rerun_fr.bStep)
 +            {
 +                step     = rerun_fr.step;
 +                step_rel = step - ir->init_step;
 +            }
 +            if (rerun_fr.bTime)
 +            {
 +                t = rerun_fr.time;
 +            }
 +            else
 +            {
 +                t = step;
 +            }
 +        }
 +        else
 +        {
 +            bLastStep = (step_rel == ir->nsteps);
 +            t         = t0 + step*ir->delta_t;
 +        }
 +
 +        if (ir->efep != efepNO || ir->bSimTemp)
 +        {
 +            /* find and set the current lambdas.  If rerunning, we either read in a state, or a lambda value,
 +               requiring different logic. */
 +
 +            set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
 +            bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
 +            bDoFEP       = (do_per_step(step, nstfep) && (ir->efep != efepNO));
 +            bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
 +        }
 +
 +        if (bSimAnn)
 +        {
 +            update_annealing_target_temp(&(ir->opts), t);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
 +            {
 +                for (i = 0; i < state_global->natoms; i++)
 +                {
 +                    copy_rvec(rerun_fr.x[i], state_global->x[i]);
 +                }
 +                if (rerun_fr.bV)
 +                {
 +                    for (i = 0; i < state_global->natoms; i++)
 +                    {
 +                        copy_rvec(rerun_fr.v[i], state_global->v[i]);
 +                    }
 +                }
 +                else
 +                {
 +                    for (i = 0; i < state_global->natoms; i++)
 +                    {
 +                        clear_rvec(state_global->v[i]);
 +                    }
 +                    if (bRerunWarnNoV)
 +                    {
 +                        fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n"
 +                                "         Ekin, temperature and pressure are incorrect,\n"
 +                                "         the virial will be incorrect when constraints are present.\n"
 +                                "\n");
 +                        bRerunWarnNoV = FALSE;
 +                    }
 +                }
 +            }
 +            copy_mat(rerun_fr.box, state_global->box);
 +            copy_mat(state_global->box, state->box);
 +
 +            if (vsite && (Flags & MD_RERUN_VSITE))
 +            {
 +                if (DOMAINDECOMP(cr))
 +                {
 +                    gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
 +                }
 +                if (graph)
 +                {
 +                    /* Following is necessary because the graph may get out of sync
 +                     * with the coordinates if we only have every N'th coordinate set
 +                     */
 +                    mk_mshift(fplog, graph, fr->ePBC, state->box, state->x);
 +                    shift_self(graph, state->box, state->x);
 +                }
 +                construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
 +                                 top->idef.iparams, top->idef.il,
 +                                 fr->ePBC, fr->bMolPBC, graph, cr, state->box);
 +                if (graph)
 +                {
 +                    unshift_self(graph, state->box, state->x);
 +                }
 +            }
 +        }
 +
 +        /* Stop Center of Mass motion */
 +        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
 +
 +        /* Copy back starting coordinates in case we're doing a forcefield scan */
 +        if (bFFscan)
 +        {
 +            for (ii = 0; (ii < state->natoms); ii++)
 +            {
 +                copy_rvec(xcopy[ii], state->x[ii]);
 +                copy_rvec(vcopy[ii], state->v[ii]);
 +            }
 +            copy_mat(boxcopy, state->box);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            /* for rerun MD always do Neighbour Searching */
 +            bNS      = (bFirstStep || ir->nstlist != 0);
 +            bNStList = bNS;
 +        }
 +        else
 +        {
 +            /* Determine whether or not to do Neighbour Searching and LR */
 +            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
 +
 +            bNS = (bFirstStep || bExchanged || bNStList || bDoFEP ||
 +                   (ir->nstlist == -1 && nlh.nabnsb > 0));
 +
 +            if (bNS && ir->nstlist == -1)
 +            {
 +                set_nlistheuristics(&nlh, bFirstStep || bExchanged || bDoFEP, step);
 +            }
 +        }
 +
 +        /* check whether we should stop because another simulation has
 +           stopped. */
 +        if (MULTISIM(cr))
 +        {
 +            if ( (multisim_nsteps >= 0) &&  (step_rel >= multisim_nsteps)  &&
 +                 (multisim_nsteps != ir->nsteps) )
 +            {
 +                if (bNS)
 +                {
 +                    if (MASTER(cr))
 +                    {
 +                        fprintf(stderr,
 +                                "Stopping simulation %d because another one has finished\n",
 +                                cr->ms->sim);
 +                    }
 +                    bLastStep         = TRUE;
 +                    gs.sig[eglsCHKPT] = 1;
 +                }
 +            }
 +        }
 +
 +        /* < 0 means stop at next step, > 0 means stop at next NS step */
 +        if ( (gs.set[eglsSTOPCOND] < 0 ) ||
 +             ( (gs.set[eglsSTOPCOND] > 0 ) && ( bNS || ir->nstlist == 0)) )
 +        {
 +            bLastStep = TRUE;
 +        }
 +
 +        /* Determine whether or not to update the Born radii if doing GB */
 +        bBornRadii = bFirstStep;
 +        if (ir->implicit_solvent && (step % ir->nstgbradii == 0))
 +        {
 +            bBornRadii = TRUE;
 +        }
 +
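 +        /* Decide whether this step writes to the log file and whether to
 +         * print verbose progress to stderr. */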
 +        do_log     = do_per_step(step, ir->nstlog) || bFirstStep || bLastStep;
 +        do_verbose = bVerbose &&
 +            (step % stepout == 0 || bFirstStep || bLastStep);
 +
 +        if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
 +        {
 +            if (bRerunMD)
 +            {
 +                bMasterState = TRUE;
 +            }
 +            else
 +            {
 +                bMasterState = FALSE;
 +                /* Correct the new box if it is too skewed */
 +                if (DYNAMIC_BOX(*ir))
 +                {
 +                    if (correct_box(fplog, step, state->box, graph))
 +                    {
 +                        bMasterState = TRUE;
 +                    }
 +                }
 +                if (DOMAINDECOMP(cr) && bMasterState)
 +                {
 +                    dd_collect_state(cr->dd, state, state_global);
 +                }
 +            }
 +
 +            if (DOMAINDECOMP(cr))
 +            {
 +                /* Repartition the domain decomposition */
 +                wallcycle_start(wcycle, ewcDOMDEC);
 +                dd_partition_system(fplog, step, cr,
 +                                    bMasterState, nstglobalcomm,
 +                                    state_global, top_global, ir,
 +                                    state, &f, mdatoms, top, fr,
 +                                    vsite, shellfc, constr,
 +                                    nrnb, wcycle,
 +                                    do_verbose && !bPMETuneRunning);
 +                wallcycle_stop(wcycle, ewcDOMDEC);
 +                /* If using an iterative integrator, reallocate space to match the decomposition */
 +            }
 +        }
 +
 +        if (MASTER(cr) && do_log && !bFFscan)
 +        {
 +            print_ebin_header(fplog, step, t, state->lambda[efptFEP]); /* can we improve the information printed here? */
 +        }
 +
 +        if (ir->efep != efepNO)
 +        {
 +            update_mdatoms(mdatoms, state->lambda[efptMASS]);
 +        }
 +
 +        if ((bRerunMD && rerun_fr.bV) || bExchanged)
 +        {
 +
 +            /* We need the kinetic energy at minus the half step for determining
 +             * the full step kinetic energy and possibly for T-coupling.*/
 +            /* This may not be quite working correctly yet . . . . */
 +            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
 +                            constr, NULL, FALSE, state->box,
 +                            top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
 +                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
 +        }
 +        clear_mat(force_vir);
 +
 +        /* Ionize the atoms if necessary */
 +        if (bIonize)
 +        {
 +            ionize(fplog, oenv, mdatoms, top_global, t, ir, state->x, state->v,
 +                   mdatoms->start, mdatoms->start+mdatoms->homenr, state->box, cr);
 +        }
 +
 +        /* Update force field in ffscan program */
 +        if (bFFscan)
 +        {
 +            if (update_forcefield(fplog,
 +                                  nfile, fnm, fr,
 +                                  mdatoms->nr, state->x, state->box))
 +            {
 +                gmx_finalize_par();
 +
 +                exit(0);
 +            }
 +        }
 +
 +        /* We write a checkpoint at this MD step when:
 +         * either at an NS step when we signalled through gs,
 +         * or at the last step (but not when we do not want confout),
 +         * but never at the first step or with rerun.
 +         */
 +        bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
 +                 (bLastStep && (Flags & MD_CONFOUT))) &&
 +                step > ir->init_step && !bRerunMD);
 +        if (bCPT)
 +        {
 +            gs.set[eglsCHKPT] = 0;
 +        }
 +
 +        /* Determine the energy and pressure:
 +         * at nstcalcenergy steps and at energy output steps (set below).
 +         */
 +        if (EI_VV(ir->eI) && (!bInitStep))
 +        {
 +            /* for vv, the first half of the integration actually corresponds
 +               to the previous step. bCalcEner only needs to be evaluated on the 'next' step,
 +               but the virial needs to be calculated on both the current step and the 'next' step. Future
 +               reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */
 +
 +            bCalcEner = do_per_step(step-1, ir->nstcalcenergy);
 +            bCalcVir  = bCalcEner ||
 +                (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
 +        }
 +        else
 +        {
 +            bCalcEner = do_per_step(step, ir->nstcalcenergy);
 +            bCalcVir  = bCalcEner ||
 +                (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
 +        }
 +
 +        /* Do we need global communication ? */
 +        bGStat = (bCalcVir || bCalcEner || bStopCM ||
 +                  do_per_step(step, nstglobalcomm) ||
 +                  (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
 +
 +        do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
 +
 +        if (do_ene || do_log)
 +        {
 +            bCalcVir  = TRUE;
 +            bCalcEner = TRUE;
 +            bGStat    = TRUE;
 +        }
 +
 +        /* these CGLO_ options remain the same throughout the iteration */
 +        cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
 +                      (bGStat ? CGLO_GSTAT : 0)
 +                      );
 +
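 +        /* force_flags selects what do_force() has to compute this step, in
 +         * addition to the forces themselves: virial, energies and dH/dlambda. */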
 +        force_flags = (GMX_FORCE_STATECHANGED |
 +                       ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
 +                       GMX_FORCE_ALLFORCES |
 +                       GMX_FORCE_SEPLRF |
 +                       (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
 +                       (bCalcEner ? GMX_FORCE_ENERGY : 0) |
 +                       (bDoFEP ? GMX_FORCE_DHDL : 0)
 +                       );
 +
 +        if (fr->bTwinRange)
 +        {
 +            if (do_per_step(step, ir->nstcalclr))
 +            {
 +                force_flags |= GMX_FORCE_DO_LR;
 +            }
 +        }
 +
 +        if (shellfc)
 +        {
 +            /* Now is the time to relax the shells */
 +            count = relax_shell_flexcon(fplog, cr, bVerbose, bFFscan ? step+1 : step,
 +                                        ir, bNS, force_flags,
 +                                        bStopCM, top, top_global,
 +                                        constr, enerd, fcd,
 +                                        state, f, force_vir, mdatoms,
 +                                        nrnb, wcycle, graph, groups,
 +                                        shellfc, fr, bBornRadii, t, mu_tot,
 +                                        state->natoms, &bConverged, vsite,
 +                                        outf->fp_field);
 +            tcount += count;
 +
 +            if (bConverged)
 +            {
 +                nconverged++;
 +            }
 +        }
 +        else
 +        {
 +            /* The coordinates (x) are shifted (to get whole molecules)
 +             * in do_force.
 +             * This is parallelized as well, and does communication too.
 +             * Check comments in sim_util.c
 +             */
 +            do_force(fplog, cr, ir, step, nrnb, wcycle, top, top_global, groups,
 +                     state->box, state->x, &state->hist,
 +                     f, force_vir, mdatoms, enerd, fcd,
 +                     state->lambda, graph,
 +                     fr, vsite, mu_tot, t, outf->fp_field, ed, bBornRadii,
 +                     (bNS ? GMX_FORCE_NS : 0) | force_flags);
 +        }
 +
 +        if (bTCR)
 +        {
 +            mu_aver = calc_mu_aver(cr, state->x, mdatoms->chargeA,
 +                                   mu_tot, &top_global->mols, mdatoms, gnx, grpindex);
 +        }
 +
 +        if (bTCR && bFirstStep)
 +        {
 +            tcr = init_coupling(fplog, nfile, fnm, cr, fr, mdatoms, &(top->idef));
 +            fprintf(fplog, "Done init_coupling\n");
 +            fflush(fplog);
 +        }
 +
 +        if (bVV && !bStartingFromCpt && !bRerunMD)
 +        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 +        {
 +            if (ir->eI == eiVV && bInitStep)
 +            {
 +                /* if using velocity verlet with full time step Ekin,
 +                 * take the first half step only to compute the
 +                 * virial for the first step. From there,
 +                 * revert back to the initial coordinates
 +                 * so that the input is actually the initial step.
 +                 */
 +                copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */
 +            }
 +            else
 +            {
 +                /* this is for NHC in the Ekin(t+dt/2) version of vv */
 +                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1);
 +            }
 +
 +            /* If we are using twin-range interactions where the long-range component
 +             * is only evaluated every nstcalclr>1 steps, we should do a special update
 +             * step to combine the long-range forces on these steps.
 +             * For nstcalclr=1 this is not done, since the forces would have been added
 +             * directly to the short-range forces already.
 +             */
 +            bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 +
 +            update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC,
 +                          f, bUpdateDoLR, fr->f_twin, fcd,
 +                          ekind, M, wcycle, upd, bInitStep, etrtVELOCITY1,
 +                          cr, nrnb, constr, &top->idef);
 +
 +            if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep)
 +            {
 +                gmx_iterate_init(&iterate, TRUE);
 +            }
 +            /* for iterations, we save these vectors, as we will be self-consistently iterating
 +               the calculations */
 +
 +            /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
 +
 +            /* save the state */
 +            if (iterate.bIterationActive)
 +            {
 +                copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
 +            }
 +
 +            bFirstIterate = TRUE;
 +            while (bFirstIterate || iterate.bIterationActive)
 +            {
 +                if (iterate.bIterationActive)
 +                {
 +                    copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
 +                    if (bFirstIterate && bTrotter)
 +                    {
 +                        /* The first time through, we need a decent first estimate
 +                           of veta(t+dt) to compute the constraints.  Do
 +                           this by computing the box volume part of the
 +                           trotter integration at this time. Nothing else
 +                           should be changed by this routine here.  If
 +                           !(first time), we start with the previous value
 +                           of veta.  */
 +
 +                        veta_save = state->veta;
 +                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0);
 +                        vetanew     = state->veta;
 +                        state->veta = veta_save;
 +                    }
 +                }
 +
 +                bOK = TRUE;
 +                if (!bRerunMD || rerun_fr.bV || bForceUpdate)     /* Why is rerun_fr.bV here?  Unclear. */
 +                {
 +                    dvdl = 0;
 +
 +                    update_constraints(fplog, step, &dvdl, ir, ekind, mdatoms,
 +                                       state, fr->bMolPBC, graph, f,
 +                                       &top->idef, shake_vir, NULL,
 +                                       cr, nrnb, wcycle, upd, constr,
 +                                       bInitStep, TRUE, bCalcVir, vetanew);
 +
 +                    if (!bOK && !bFFscan)
 +                    {
 +                        gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
 +                    }
 +
 +                }
 +                else if (graph)
 +                {
 +                    /* Need to unshift here if a do_force has been
 +                       called in the previous step */
 +                    unshift_self(graph, state->box, state->x);
 +                }
 +
 +                /* if VV, compute the pressure and constraints */
 +                /* For VV2, we strictly only need this if using pressure
 +                 * control, but we really would like to have accurate pressures
 +                 * printed out.
 +                 * Think about ways around this in the future?
 +                 * For now, keep this choice in comments.
 +                 */
 +                /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
 +                /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
 +                bPres = TRUE;
 +                bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
 +                if (bCalcEner && ir->eI == eiVVAK)  /*MRS:  7/9/2010 -- this still doesn't fix it?*/
 +                {
 +                    bSumEkinhOld = TRUE;
 +                }
 +                /* for vv, the first half of the integration actually corresponds to the previous step.
 +                   So we need information from the last step in the first half of the integration */
 +                if (bGStat || do_per_step(step-1, nstglobalcomm))
 +                {
 +                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
 +                                    constr, NULL, FALSE, state->box,
 +                                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
 +                                    cglo_flags
 +                                    | CGLO_ENERGY
 +                                    | (bTemp ? CGLO_TEMPERATURE : 0)
 +                                    | (bPres ? CGLO_PRESSURE : 0)
 +                                    | (bPres ? CGLO_CONSTRAINT : 0)
 +                                    | ((iterate.bIterationActive) ? CGLO_ITERATE : 0)
 +                                    | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
 +                                    | CGLO_SCALEEKIN
 +                                    );
 +                    /* explanation of above:
 +                       a) We compute Ekin at the full time step
 +                       if 1) we are using the AveVel Ekin, and it's not the
 +                       initial step, or 2) if we are using AveEkin, but need the full
 +                       time step kinetic energy for the pressure (always true now, since we want accurate statistics).
 +                       b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
 +                       EkinAveVel because it's needed for the pressure */
 +                }
 +                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
 +                if (!bInitStep)
 +                {
 +                    if (bTrotter)
 +                    {
 +                        m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */
 +                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
 +                    }
 +                    else
 +                    {
 +                        if (bExchanged)
 +                        {
 +
 +                            /* We need the kinetic energy at minus the half step for determining
 +                             * the full step kinetic energy and possibly for T-coupling.*/
 +                            /* This may not be quite working correctly yet . . . . */
 +                            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
 +                                            constr, NULL, FALSE, state->box,
 +                                            top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
 +                                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
 +                        }
 +                    }
 +                }
 +
 +                if (iterate.bIterationActive &&
 +                    done_iterating(cr, fplog, step, &iterate, bFirstIterate,
 +                                   state->veta, &vetanew))
 +                {
 +                    break;
 +                }
 +                bFirstIterate = FALSE;
 +            }
 +
 +            if (bTrotter && !bInitStep)
 +            {
 +                enerd->term[F_DVDL_BONDED] += dvdl;        /* only add after iterations */
 +                copy_mat(shake_vir, state->svir_prev);
 +                copy_mat(force_vir, state->fvir_prev);
 +                if (IR_NVT_TROTTER(ir) && ir->eI == eiVV)
 +                {
 +                    /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
 +                    enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE, FALSE);
 +                    enerd->term[F_EKIN] = trace(ekind->ekin);
 +                }
 +            }
 +            /* if it's the initial step, we performed this first step just to get the constraint virial */
 +            if (bInitStep && ir->eI == eiVV)
 +            {
 +                copy_rvecn(cbuf, state->v, 0, state->natoms);
 +            }
 +
 +            if (fr->bSepDVDL && fplog && do_log)
 +            {
 +                fprintf(fplog, sepdvdlformat, "Constraint", 0.0, dvdl);
 +            }
 +            enerd->term[F_DVDL_BONDED] += dvdl;
 +        }
 +
 +        /* MRS -- now done iterating -- compute the conserved quantity */
 +        if (bVV)
 +        {
 +            saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ);
 +            if (ir->eI == eiVV)
 +            {
 +                last_ekin = enerd->term[F_EKIN];
 +            }
 +            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
 +            {
 +                saved_conserved_quantity -= enerd->term[F_DISPCORR];
 +            }
 +            /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
 +            if (!bRerunMD)
 +            {
 +                sum_dhdl(enerd, state->lambda, ir->fepvals);
 +            }
 +        }
 +
 +        /* ########  END FIRST UPDATE STEP  ############## */
 +        /* ########  If doing VV, we now have v(dt) ###### */
 +        if (bDoExpanded)
 +        {
 +            /* perform extended ensemble sampling in lambda - we don't
 +               actually move to the new state before outputting
 +               statistics, but if performing simulated tempering, we
 +               do update the velocities and the tau_t. */
 +
 +            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, &df_history, step, mcrng, state->v, mdatoms);
 +        }
 +        /* ################## START TRAJECTORY OUTPUT ################# */
 +
 +        /* Now we have the energies and forces corresponding to the
 +         * coordinates at time t. We must output all of this before
 +         * the update.
 +         * for RerunMD t is read from input trajectory
 +         */
 +        mdof_flags = 0;
 +        if (do_per_step(step, ir->nstxout))
 +        {
 +            mdof_flags |= MDOF_X;
 +        }
 +        if (do_per_step(step, ir->nstvout))
 +        {
 +            mdof_flags |= MDOF_V;
 +        }
 +        if (do_per_step(step, ir->nstfout))
 +        {
 +            mdof_flags |= MDOF_F;
 +        }
 +        if (do_per_step(step, ir->nstxtcout))
 +        {
 +            mdof_flags |= MDOF_XTC;
 +        }
 +        if (bCPT)
 +        {
 +            mdof_flags |= MDOF_CPT;
 +        }
 +
 +#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
 +        if (bLastStep)
 +        {
 +            /* Enforce writing positions and velocities at end of run */
 +            mdof_flags |= (MDOF_X | MDOF_V);
 +        }
 +#endif
 +#ifdef GMX_FAHCORE
 +        if (MASTER(cr))
 +        {
 +            fcReportProgress( ir->nsteps, step );
 +        }
 +
 +        /* sync bCPT and fc record-keeping */
 +        if (bCPT && MASTER(cr))
 +        {
 +            fcRequestCheckPoint();
 +        }
 +#endif
 +
 +        if (mdof_flags != 0)
 +        {
 +            wallcycle_start(wcycle, ewcTRAJ);
 +            if (bCPT)
 +            {
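 +                /* Gather the RNG states into the global state so that they
 +                 * are included in the checkpoint. */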
 +                if (state->flags & (1<<estLD_RNG))
 +                {
 +                    get_stochd_state(upd, state);
 +                }
 +                if (state->flags  & (1<<estMC_RNG))
 +                {
 +                    get_mc_state(mcrng, state);
 +                }
 +                if (MASTER(cr))
 +                {
 +                    if (bSumEkinhOld)
 +                    {
 +                        state_global->ekinstate.bUpToDate = FALSE;
 +                    }
 +                    else
 +                    {
 +                        update_ekinstate(&state_global->ekinstate, ekind);
 +                        state_global->ekinstate.bUpToDate = TRUE;
 +                    }
 +                    update_energyhistory(&state_global->enerhist, mdebin);
 +                    if (ir->efep != efepNO || ir->bSimTemp)
 +                    {
 +                        state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
 +                                                                       structured so this isn't necessary.
 +                                                                       Note this reassignment is only necessary
 +                                                                       for single threads.*/
 +                        copy_df_history(&state_global->dfhist, &df_history);
 +                    }
 +                }
 +            }
 +            write_traj(fplog, cr, outf, mdof_flags, top_global,
 +                       step, t, state, state_global, f, f_global, &n_xtc, &x_xtc);
 +            if (bCPT)
 +            {
 +                nchkpt++;
 +                bCPT = FALSE;
 +            }
 +            debug_gmx();
 +            if (bLastStep && step_rel == ir->nsteps &&
 +                (Flags & MD_CONFOUT) && MASTER(cr) &&
 +                !bRerunMD && !bFFscan)
 +            {
 +                /* x and v have been collected in write_traj,
 +                 * because a checkpoint file will always be written
 +                 * at the last step.
 +                 */
 +                fprintf(stderr, "\nWriting final coordinates.\n");
 +                if (fr->bMolPBC)
 +                {
 +                    /* Make molecules whole only for confout writing */
 +                    do_pbc_mtop(fplog, ir->ePBC, state->box, top_global, state_global->x);
 +                }
 +                write_sto_conf_mtop(ftp2fn(efSTO, nfile, fnm),
 +                                    *top_global->name, top_global,
 +                                    state_global->x, state_global->v,
 +                                    ir->ePBC, state->box);
 +                debug_gmx();
 +            }
 +            wallcycle_stop(wcycle, ewcTRAJ);
 +        }
 +
 +        /* kludge -- virial is lost with restart for NPT control. Must restart */
 +        if (bStartingFromCpt && bVV)
 +        {
 +            copy_mat(state->svir_prev, shake_vir);
 +            copy_mat(state->fvir_prev, force_vir);
 +        }
 +        /*  ################## END TRAJECTORY OUTPUT ################ */
 +
 +        /* Determine the wallclock run time up till now */
 +        run_time = gmx_gettime() - (double)runtime->real;
 +
 +        /* Check whether everything is still all right */
 +        if (((int)gmx_get_stop_condition() > handled_stop_condition)
 +#ifdef GMX_THREAD_MPI
 +            && MASTER(cr)
 +#endif
 +            )
 +        {
 +            /* this is just to make gs.sig compatible with the hack
 +               of sending signals around by MPI_Reduce together with
 +               other floats */
 +            if (gmx_get_stop_condition() == gmx_stop_cond_next_ns)
 +            {
 +                gs.sig[eglsSTOPCOND] = 1;
 +            }
 +            if (gmx_get_stop_condition() == gmx_stop_cond_next)
 +            {
 +                gs.sig[eglsSTOPCOND] = -1;
 +            }
 +            /* < 0 means stop at next step, > 0 means stop at next NS step */
 +            if (fplog)
 +            {
 +                fprintf(fplog,
 +                        "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                        gmx_get_signal_name(),
 +                        gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
 +                fflush(fplog);
 +            }
 +            fprintf(stderr,
 +                    "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                    gmx_get_signal_name(),
 +                    gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
 +            fflush(stderr);
 +            handled_stop_condition = (int)gmx_get_stop_condition();
 +        }
 +        else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
 +                 (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
 +                 gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
 +        {
 +            /* Signal to terminate the run */
 +            gs.sig[eglsSTOPCOND] = 1;
 +            if (fplog)
 +            {
 +                fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
 +            }
 +            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
 +        }
 +
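 +        /* When a counter reset at half the run time limit was requested,
 +         * signal it once roughly half of max_hours has elapsed. */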
 +        if (bResetCountersHalfMaxH && MASTER(cr) &&
 +            run_time > max_hours*60.0*60.0*0.495)
 +        {
 +            gs.sig[eglsRESETCOUNTERS] = 1;
 +        }
 +
 +        if (ir->nstlist == -1 && !bRerunMD)
 +        {
 +            /* When bGStatEveryStep=FALSE, global_stat is only called
 +             * when we check the atom displacements, not at NS steps.
 +             * This also means that the bonded interaction count check is not
 +             * performed immediately after NS. Therefore a few MD steps could
 +             * be performed with missing interactions.
 +             * But wrong energies are never written to file,
 +             * since energies are only written after global_stat
 +             * has been called.
 +             */
 +            if (step >= nlh.step_nscheck)
 +            {
 +                nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs,
 +                                                     nlh.scale_tot, state->x);
 +            }
 +            else
 +            {
 +                /* This is not necessarily true,
 +                 * but step_nscheck is determined quite conservatively.
 +                 */
 +                nlh.nabnsb = 0;
 +            }
 +        }
 +
 +        /* In parallel we only have to check for checkpointing in steps
 +         * where we do global communication,
 +         *  otherwise the other nodes don't know.
 +         */
 +        if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
 +                           cpt_period >= 0 &&
 +                           (cpt_period == 0 ||
 +                            run_time >= nchkpt*cpt_period*60.0)) &&
 +            gs.set[eglsCHKPT] == 0)
 +        {
 +            gs.sig[eglsCHKPT] = 1;
 +        }
 +
 +        /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */
 +        if (EI_VV(ir->eI))
 +        {
 +            if (!bInitStep)
 +            {
 +                update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
 +            }
 +            if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
 +            {
 +                gmx_bool bIfRandomize;
 +                bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr);
 +                /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
 +                if (constr && bIfRandomize)
 +                {
 +                    update_constraints(fplog, step, &dvdl, ir, ekind, mdatoms,
 +                                       state, fr->bMolPBC, graph, f,
 +                                       &top->idef, tmp_vir, NULL,
 +                                       cr, nrnb, wcycle, upd, constr,
 +                                       bInitStep, TRUE, bCalcVir, vetanew);
 +                }
 +            }
 +        }
 +
 +        if (bIterativeCase && do_per_step(step, ir->nstpcouple))
 +        {
 +            gmx_iterate_init(&iterate, TRUE);
 +            /* for iterations, we save these vectors, as we will be redoing the calculations */
 +            copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
 +        }
 +
 +        bFirstIterate = TRUE;
 +        while (bFirstIterate || iterate.bIterationActive)
 +        {
 +            /* We now restore these vectors to redo the calculation with improved extended variables */
 +            if (iterate.bIterationActive)
 +            {
 +                copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
 +            }
 +
 +            /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
 +               so scroll down for that logic */
 +
 +            /* #########   START SECOND UPDATE STEP ################# */
 +            /* Box is changed in update() when we do pressure coupling,
 +             * but we should still use the old box for energy corrections and when
 +             * writing it to the energy file, so it matches the trajectory files for
 +             * the same timestep above. Make a copy in a separate array.
 +             */
 +            copy_mat(state->box, lastbox);
 +
 +            bOK = TRUE;
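 +            /* Skip the update for rerun frames without velocities, unless a
 +             * forced update was requested (GMX_FORCE_UPDATE). */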
 +            if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
 +            {
 +                wallcycle_start(wcycle, ewcUPDATE);
 +                dvdl = 0;
 +                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
 +                if (bTrotter)
 +                {
 +                    if (iterate.bIterationActive)
 +                    {
 +                        if (bFirstIterate)
 +                        {
 +                            scalevir = 1;
 +                        }
 +                        else
 +                        {
 +                            /* we use a new value of scalevir to converge the iterations faster */
 +                            scalevir = tracevir/trace(shake_vir);
 +                        }
 +                        msmul(shake_vir, scalevir, shake_vir);
 +                        m_add(force_vir, shake_vir, total_vir);
 +                        clear_mat(shake_vir);
 +                    }
 +                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
 +                    /* We can only do Berendsen coupling after we have summed
 +                     * the kinetic energy or virial. Since this happens
 +                     * in global_state after update, we should only do it at
 +                     * step % nstlist = 1 with bGStatEveryStep=FALSE.
 +                     */
 +                }
 +                else
 +                {
 +                    update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
 +                    update_pcouple(fplog, step, ir, state, pcoupl_mu, M, wcycle,
 +                                   upd, bInitStep);
 +                }
 +
 +                if (bVV)
 +                {
 +                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 +
 +                    /* velocity half-step update */
 +                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
 +                                  bUpdateDoLR, fr->f_twin, fcd,
 +                                  ekind, M, wcycle, upd, FALSE, etrtVELOCITY2,
 +                                  cr, nrnb, constr, &top->idef);
 +                }
 +
 +                /* Above, initialization just copies ekinh into ekin;
 +                 * it doesn't copy positions (for VV), and is the entire
 +                 * integrator for MD.
 +                 */
 +
 +                if (ir->eI == eiVVAK)
 +                {
 +                    copy_rvecn(state->x, cbuf, 0, state->natoms);
 +                }
 +                bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 +
 +                update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
 +                              bUpdateDoLR, fr->f_twin, fcd,
 +                              ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
 +                wallcycle_stop(wcycle, ewcUPDATE);
 +
 +                update_constraints(fplog, step, &dvdl, ir, ekind, mdatoms, state,
 +                                   fr->bMolPBC, graph, f,
 +                                   &top->idef, shake_vir, force_vir,
 +                                   cr, nrnb, wcycle, upd, constr,
 +                                   bInitStep, FALSE, bCalcVir, state->veta);
 +
 +                if (ir->eI == eiVVAK)
 +                {
 +                    /* erase F_EKIN and F_TEMP here? */
 +                    /* just compute the kinetic energy at the half step to perform a trotter step */
 +                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
 +                                    constr, NULL, FALSE, lastbox,
 +                                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
 +                                    cglo_flags | CGLO_TEMPERATURE
 +                                    );
 +                    wallcycle_start(wcycle, ewcUPDATE);
 +                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
 +                    /* now that we know the scaling, we can compute the positions again */
 +                    copy_rvecn(cbuf, state->x, 0, state->natoms);
 +
 +                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 +
 +                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
 +                                  bUpdateDoLR, fr->f_twin, fcd,
 +                                  ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
 +                    wallcycle_stop(wcycle, ewcUPDATE);
 +
 +                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
 +                    /* are the small terms in the shake_vir here due
 +                     * to numerical errors, or are they important
 +                     * physically? I'm thinking they are just errors, but not completely sure.
 +                     * For now, will call without actually constraining, constr=NULL*/
 +                    update_constraints(fplog, step, &dvdl, ir, ekind, mdatoms,
 +                                       state, fr->bMolPBC, graph, f,
 +                                       &top->idef, tmp_vir, force_vir,
 +                                       cr, nrnb, wcycle, upd, NULL,
 +                                       bInitStep, FALSE, bCalcVir,
 +                                       state->veta);
 +                }
 +                if (!bOK && !bFFscan)
 +                {
 +                    gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
 +                }
 +
 +                if (fr->bSepDVDL && fplog && do_log)
 +                {
 +                    fprintf(fplog, sepdvdlformat, "Constraint dV/dl", 0.0, dvdl);
 +                }
 +                enerd->term[F_DVDL_BONDED] += dvdl;
 +            }
 +            else if (graph)
 +            {
 +                /* Need to unshift here */
 +                unshift_self(graph, state->box, state->x);
 +            }
 +
 +            if (vsite != NULL)
 +            {
 +                wallcycle_start(wcycle, ewcVSITECONSTR);
 +                if (graph != NULL)
 +                {
 +                    shift_self(graph, state->box, state->x);
 +                }
 +                construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
 +                                 top->idef.iparams, top->idef.il,
 +                                 fr->ePBC, fr->bMolPBC, graph, cr, state->box);
 +
 +                if (graph != NULL)
 +                {
 +                    unshift_self(graph, state->box, state->x);
 +                }
 +                wallcycle_stop(wcycle, ewcVSITECONSTR);
 +            }
 +
 +            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints  ############ */
 +            /* With Leap-Frog we can skip compute_globals at
 +             * non-communication steps, but we need to calculate
 +             * the kinetic energy one step before communication.
 +             */
 +            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
 +            {
 +                if (ir->nstlist == -1 && bFirstIterate)
 +                {
 +                    gs.sig[eglsNABNSB] = nlh.nabnsb;
 +                }
 +                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
 +                                constr,
 +                                bFirstIterate ? &gs : NULL,
 +                                (step_rel % gs.nstms == 0) &&
 +                                (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
 +                                lastbox,
 +                                top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
 +                                cglo_flags
 +                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
 +                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
 +                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
 +                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
 +                                | (iterate.bIterationActive ? CGLO_ITERATE : 0)
 +                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
 +                                | CGLO_CONSTRAINT
 +                                );
 +                if (ir->nstlist == -1 && bFirstIterate)
 +                {
 +                    nlh.nabnsb         = gs.set[eglsNABNSB];
 +                    gs.set[eglsNABNSB] = 0;
 +                }
 +            }
 +            /* bIterate is set to keep it from eliminating the old ekin (kinetic energy) terms */
 +            /* #############  END CALC EKIN AND PRESSURE ################# */
 +
 +            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
 +               the virial that should probably be addressed eventually. state->veta has better properties,
 +               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
 +               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
 +
 +            if (iterate.bIterationActive &&
 +                done_iterating(cr, fplog, step, &iterate, bFirstIterate,
 +                               trace(shake_vir), &tracevir))
 +            {
 +                break;
 +            }
 +            bFirstIterate = FALSE;
 +        }
 +
 +        /* only add constraint dvdl after constraints */
 +        enerd->term[F_DVDL_BONDED] += dvdl;
 +        if (!bVV || bRerunMD)
 +        {
 +            /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
 +            sum_dhdl(enerd, state->lambda, ir->fepvals);
 +        }
 +        update_box(fplog, step, ir, mdatoms, state, graph, f,
 +                   ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, wcycle, upd, bInitStep, FALSE);
 +
 +        /* ################# END UPDATE STEP 2 ################# */
 +        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
 +
 +        /* The coordinates (x) were unshifted in update */
 +        if (bFFscan && (shellfc == NULL || bConverged))
 +        {
 +            if (print_forcefield(fplog, enerd->term, mdatoms->homenr,
 +                                 f, NULL, xcopy,
 +                                 &(top_global->mols), mdatoms->massT, pres))
 +            {
 +                gmx_finalize_par();
 +
 +                fprintf(stderr, "\n");
 +                exit(0);
 +            }
 +        }
 +        if (!bGStat)
 +        {
 +            /* We will not sum ekinh_old,
 +             * so signal that we still have to do it.
 +             */
 +            bSumEkinhOld = TRUE;
 +        }
 +
 +        if (bTCR)
 +        {
 +            /* Only do GCT when the relaxation of shells (minimization) has converged,
 +             * otherwise we might be coupling to bogus energies.
 +             * In parallel we must always do this, because the other sims might
 +             * update the FF.
 +             */
 +
 +            /* Since this is called with the new coordinates state->x, I assume
 +             * we want the new box state->box too. / EL 20040121
 +             */
 +            do_coupling(fplog, oenv, nfile, fnm, tcr, t, step, enerd->term, fr,
 +                        ir, MASTER(cr),
 +                        mdatoms, &(top->idef), mu_aver,
 +                        top_global->mols.nr, cr,
 +                        state->box, total_vir, pres,
 +                        mu_tot, state->x, f, bConverged);
 +            debug_gmx();
 +        }
 +
 +        /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
 +
 +        /* use the directly determined last velocity, not actually the averaged half steps */
 +        if (bTrotter && ir->eI == eiVV)
 +        {
 +            enerd->term[F_EKIN] = last_ekin;
 +        }
 +        enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
 +
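 +        /* Form the conserved quantity: the total energy plus the contribution
 +         * of the thermostat/barostat auxiliary variables. */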
 +        if (bVV)
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
 +        }
 +        else
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ);
 +        }
 +        /* Check for excessively large energies */
 +        if (bIonize)
 +        {
 +#ifdef GMX_DOUBLE
 +            real etot_max = 1e200;
 +#else
 +            real etot_max = 1e30;
 +#endif
 +            if (fabs(enerd->term[F_ETOT]) > etot_max)
 +            {
 +                fprintf(stderr, "Energy too large (%g), giving up\n",
 +                        enerd->term[F_ETOT]);
 +            }
 +        }
 +        /* #########  END PREPARING EDR OUTPUT  ###########  */
 +
 +        /* Time for performance */
 +        if (((step % stepout) == 0) || bLastStep)
 +        {
 +            runtime_upd_proc(runtime);
 +        }
 +
 +        /* Output stuff */
 +        if (MASTER(cr))
 +        {
 +            gmx_bool do_dr, do_or;
 +
 +            if (fplog && do_log && bDoExpanded)
 +            {
 +                /* only needed if doing expanded ensemble */
 +                PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL,
 +                                          &df_history, state->fep_state, ir->nstlog, step);
 +            }
 +            if (!(bStartingFromCpt && (EI_VV(ir->eI))))
 +            {
 +                if (bCalcEner)
 +                {
 +                    upd_mdebin(mdebin, bDoDHDL, TRUE,
 +                               t, mdatoms->tmass, enerd, state,
 +                               ir->fepvals, ir->expandedvals, lastbox,
 +                               shake_vir, force_vir, total_vir, pres,
 +                               ekind, mu_tot, constr);
 +                }
 +                else
 +                {
 +                    upd_mdebin_step(mdebin);
 +                }
 +
 +                do_dr  = do_per_step(step, ir->nstdisreout);
 +                do_or  = do_per_step(step, ir->nstorireout);
 +
 +                print_ebin(outf->fp_ene, do_ene, do_dr, do_or, do_log ? fplog : NULL,
 +                           step, t,
 +                           eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts));
 +            }
 +            if (ir->ePull != epullNO)
 +            {
 +                pull_print_output(ir->pull, step, t);
 +            }
 +
 +            if (do_per_step(step, ir->nstlog))
 +            {
 +                if (fflush(fplog) != 0)
 +                {
 +                    gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
 +                }
 +            }
 +        }
 +        if (bDoExpanded)
 +        {
 +            /* Have to do this part after outputting the logfile and the edr file */
 +            state->fep_state = lamnew;
 +            for (i = 0; i < efptNR; i++)
 +            {
 +                state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
 +            }
 +        }
 +        /* Remaining runtime */
 +        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
 +        {
 +            if (shellfc)
 +            {
 +                fprintf(stderr, "\n");
 +            }
 +            print_time(stderr, runtime, step, ir, cr);
 +        }
 +
 +        /* Replica exchange */
 +        bExchanged = FALSE;
 +        if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
 +            do_per_step(step, repl_ex_nst))
 +        {
 +            bExchanged = replica_exchange(fplog, cr, repl_ex,
 +                                          state_global, enerd,
 +                                          state, step, t);
 +
 +            if (bExchanged && DOMAINDECOMP(cr))
 +            {
 +                dd_partition_system(fplog, step, cr, TRUE, 1,
 +                                    state_global, top_global, ir,
 +                                    state, &f, mdatoms, top, fr,
 +                                    vsite, shellfc, constr,
 +                                    nrnb, wcycle, FALSE);
 +            }
 +        }
 +
 +        bFirstStep       = FALSE;
 +        bInitStep        = FALSE;
 +        bStartingFromCpt = FALSE;
 +
 +        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
 +        /* With all integrators, except VV, we need to retain the pressure
 +         * at the current step for coupling at the next step.
 +         */
 +        if ((state->flags & (1<<estPRES_PREV)) &&
 +            (bGStatEveryStep ||
 +             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
 +        {
 +            /* Store the pressure in t_state for pressure coupling
 +             * at the next MD step.
 +             */
 +            copy_mat(pres, state->pres_prev);
 +        }
 +
 +        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
 +
 +        if ( (membed != NULL) && (!bLastStep) )
 +        {
 +            rescale_membed(step_rel, membed, state_global->x);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            if (MASTER(cr))
 +            {
 +                /* read next frame from input trajectory */
 +                bNotLastFrame = read_next_frame(oenv, status, &rerun_fr);
 +            }
 +
 +            if (PAR(cr))
 +            {
 +                rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
 +            }
 +        }
 +
 +        if (!bRerunMD || !rerun_fr.bStep)
 +        {
 +            /* increase the MD step number */
 +            step++;
 +            step_rel++;
 +        }
 +
 +        cycles = wallcycle_stop(wcycle, ewcSTEP);
 +        if (DOMAINDECOMP(cr) && wcycle)
 +        {
 +            dd_cycles_add(cr->dd, cycles, ddCyclStep);
 +        }
 +
 +        if (bPMETuneRunning || bPMETuneTry)
 +        {
 +            /* PME grid + cut-off optimization with GPUs or PME nodes */
 +
 +            /* Count the total cycles over the last steps */
 +            cycles_pmes += cycles;
 +
 +            /* We can only switch cut-off at NS steps */
 +            if (step % ir->nstlist == 0)
 +            {
 +                if (bPMETuneTry)
 +                {
 +                    if (DDMASTER(cr->dd))
 +                    {
 +                        /* Start tuning if the PME nodes are overloaded by 5% or more */
 +                        bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
 +                    }
 +                    dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
 +
 +                    if (bPMETuneRunning || step_rel > ir->nstlist*50)
 +                    {
 +                        bPMETuneTry     = FALSE;
 +                    }
 +                }
 +                if (bPMETuneRunning)
 +                {
 +                    /* init_step might not be a multiple of nstlist,
 +                     * but the first cycle is always skipped anyhow.
 +                     */
 +                    bPMETuneRunning =
 +                        pme_load_balance(pme_loadbal, cr,
 +                                         (bVerbose && MASTER(cr)) ? stderr : NULL,
 +                                         fplog,
 +                                         ir, state, cycles_pmes,
 +                                         fr->ic, fr->nbv, &fr->pmedata,
 +                                         step);
 +
 +                    /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
 +                    fr->ewaldcoeff = fr->ic->ewaldcoeff;
 +                    fr->rlist      = fr->ic->rlist;
 +                    fr->rlistlong  = fr->ic->rlistlong;
 +                    fr->rcoulomb   = fr->ic->rcoulomb;
 +                    fr->rvdw       = fr->ic->rvdw;
 +                }
 +                cycles_pmes = 0;
 +            }
 +        }
 +
 +        if (step_rel == wcycle_get_reset_counters(wcycle) ||
 +            gs.set[eglsRESETCOUNTERS] != 0)
 +        {
 +            /* Reset all the counters related to performance over the run */
 +            reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, runtime,
 +                               fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
 +            wcycle_set_reset_counters(wcycle, -1);
++            if (!(cr->duty & DUTY_PME))
++            {
++                /* Tell our PME node to reset its counters */
++                gmx_pme_send_resetcounters(cr, step);
++            }
 +            /* Correct max_hours for the elapsed time */
 +            max_hours                -= run_time/(60.0*60.0);
 +            bResetCountersHalfMaxH    = FALSE;
 +            gs.set[eglsRESETCOUNTERS] = 0;
 +        }
 +
 +    }
 +    /* End of main MD loop */
 +    debug_gmx();
 +
 +    /* Stop the time */
 +    runtime_end(runtime);
 +
 +    if (bRerunMD && MASTER(cr))
 +    {
 +        close_trj(status);
 +    }
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Tell the PME-only node to finish */
 +        gmx_pme_send_finish(cr);
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        if (ir->nstcalcenergy > 0 && !bRerunMD)
 +        {
 +            print_ebin(outf->fp_ene, FALSE, FALSE, FALSE, fplog, step, t,
 +                       eprAVER, FALSE, mdebin, fcd, groups, &(ir->opts));
 +        }
 +    }
 +
 +    done_mdoutf(outf);
 +
 +    debug_gmx();
 +
 +    if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
 +    {
 +        fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
 +        fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns);
 +    }
 +
 +    if (pme_loadbal != NULL)
 +    {
 +        pme_loadbal_done(pme_loadbal, fplog);
 +    }
 +
 +    if (shellfc && fplog)
 +    {
 +        fprintf(fplog, "Fraction of iterations that converged:           %.2f %%\n",
 +                (nconverged*100.0)/step_rel);
 +        fprintf(fplog, "Average number of force evaluations per MD step: %.2f\n\n",
 +                tcount/step_rel);
 +    }
 +
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +    {
 +        print_replica_exchange_statistics(fplog, repl_ex);
 +    }
 +
 +    runtime->nsteps_done = step_rel;
 +
 +    return 0;
 +}
index f7c1985b757342e82143acef119f56a296fb4854,0000000000000000000000000000000000000000..8a0cf21554c098405607cf961dd1ecf606f7f033
mode 100644,000000..100644
--- /dev/null
@@@ -1,763 -1,0 +1,763 @@@
-         gmx_pme_send_switch(cr, set->grid, set->ewaldcoeff);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 4.6.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2011, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "smalloc.h"
 +#include "network.h"
 +#include "calcgrid.h"
 +#include "pme.h"
 +#include "vec.h"
 +#include "domdec.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +#include "force.h"
 +#include "macros.h"
 +#include "pme_loadbal.h"
 +
 +/* Parameters and setting for one PP-PME setup */
 +typedef struct {
 +    real      rcut_coulomb;    /* Coulomb cut-off                              */
 +    real      rlist;           /* pair-list cut-off                            */
 +    real      rlistlong;       /* LR pair-list cut-off                         */
 +    int       nstcalclr;       /* frequency of evaluating long-range forces for the group scheme */
 +    real      spacing;         /* (largest) PME grid spacing                   */
 +    ivec      grid;            /* the PME grid dimensions                      */
 +    real      grid_efficiency; /* inefficiency factor for non-uniform grids (>= 1)           */
 +    real      ewaldcoeff;      /* the Ewald coefficient                        */
 +    gmx_pme_t pmedata;         /* the data structure used in the PME code      */
 +
 +    int       count;           /* number of times this setup has been timed    */
 +    double    cycles;          /* the fastest time for this setup in cycles    */
 +} pme_setup_t;
 +
 +/* In the initial scan, step by grids that are at least a factor 0.8 coarser */
 +#define PME_LB_GRID_SCALE_FAC  0.8
 +/* In the initial scan, try to skip grids with uneven x/y/z spacing,
 + * checking if the "efficiency" is more than 5% worse than the previous grid.
 + */
 +#define PME_LB_GRID_EFFICIENCY_REL_FAC  1.05
 +/* Rerun setups that are at most 12% slower than the fastest setup so far */
 +#define PME_LB_SLOW_FAC  1.12
 +/* If setups get more than 2% faster, do another round to avoid
 + * choosing a slower setup due to acceleration or fluctuations.
 + */
 +#define PME_LB_ACCEL_TOL 1.02
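 +/* Worked example of these thresholds: with the fastest setup at 100 M-cycles,
 + * later stages only re-time setups that took under 112 M-cycles (PME_LB_SLOW_FAC),
 + * and a re-timing more than 2% faster than the stored minimum (PME_LB_ACCEL_TOL)
 + * triggers one extra scanning stage.
 + */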
 +
 +enum {
 +    epmelblimNO, epmelblimBOX, epmelblimDD, epmelblimNR
 +};
 +
 +const char *pmelblim_str[epmelblimNR] =
 +{ "no", "box size", "domain decompostion" };
 +
 +struct pme_load_balancing {
 +    int          nstage;             /* the current maximum number of stages */
 +
 +    real         cut_spacing;        /* the minimum cutoff / PME grid spacing ratio */
 +    real         rcut_vdw;           /* Vdw cutoff (does not change) */
 +    real         rcut_coulomb_start; /* Initial electrostatics cutoff */
 +    int          nstcalclr_start;    /* Initial long-range force evaluation frequency */
 +    real         rbuf_coulomb;       /* the pairlist buffer size */
 +    real         rbuf_vdw;           /* the pairlist buffer size */
 +    matrix       box_start;          /* the initial simulation box */
 +    int          n;                  /* the number of setups, which equals the allocation size */
 +    pme_setup_t *setup;              /* the PME+cutoff setups */
 +    int          cur;                /* the current setup */
 +    int          fastest;            /* fastest setup so far */
 +    int          start;              /* start of setup range to consider in stage>0 */
 +    int          end;                /* end   of setup range to consider in stage>0 */
 +    int          elimited;           /* was the balancing limited, uses enum above */
 +    int          cutoff_scheme;      /* Verlet or group cut-offs */
 +
 +    int          stage;              /* the current stage */
 +};
 +
 +void pme_loadbal_init(pme_load_balancing_t *pme_lb_p,
 +                      const t_inputrec *ir, matrix box,
 +                      const interaction_const_t *ic,
 +                      gmx_pme_t pmedata)
 +{
 +    pme_load_balancing_t pme_lb;
 +    real                 spm, sp;
 +    int                  d;
 +
 +    snew(pme_lb, 1);
 +
 +    /* Any number of stages >= 2 is supported */
 +    pme_lb->nstage   = 2;
 +
 +    pme_lb->cutoff_scheme = ir->cutoff_scheme;
 +
 +    if (pme_lb->cutoff_scheme == ecutsVERLET)
 +    {
 +        pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb;
 +        pme_lb->rbuf_vdw     = pme_lb->rbuf_coulomb;
 +    }
 +    else
 +    {
 +        if (ic->rcoulomb > ic->rlist)
 +        {
 +            pme_lb->rbuf_coulomb = ic->rlistlong - ic->rcoulomb;
 +        }
 +        else
 +        {
 +            pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb;
 +        }
 +        if (ic->rvdw > ic->rlist)
 +        {
 +            pme_lb->rbuf_vdw = ic->rlistlong - ic->rvdw;
 +        }
 +        else
 +        {
 +            pme_lb->rbuf_vdw = ic->rlist - ic->rvdw;
 +        }
 +    }
 +
 +    copy_mat(box, pme_lb->box_start);
 +    if (ir->ePBC == epbcXY && ir->nwall == 2)
 +    {
 +        svmul(ir->wall_ewald_zfac, pme_lb->box_start[ZZ], pme_lb->box_start[ZZ]);
 +    }
 +
 +    pme_lb->n = 1;
 +    snew(pme_lb->setup, pme_lb->n);
 +
 +    pme_lb->rcut_vdw              = ic->rvdw;
 +    pme_lb->rcut_coulomb_start    = ir->rcoulomb;
 +    pme_lb->nstcalclr_start       = ir->nstcalclr;
 +
 +    pme_lb->cur                   = 0;
 +    pme_lb->setup[0].rcut_coulomb = ic->rcoulomb;
 +    pme_lb->setup[0].rlist        = ic->rlist;
 +    pme_lb->setup[0].rlistlong    = ic->rlistlong;
 +    pme_lb->setup[0].nstcalclr    = ir->nstcalclr;
 +    pme_lb->setup[0].grid[XX]     = ir->nkx;
 +    pme_lb->setup[0].grid[YY]     = ir->nky;
 +    pme_lb->setup[0].grid[ZZ]     = ir->nkz;
 +    pme_lb->setup[0].ewaldcoeff   = ic->ewaldcoeff;
 +
 +    pme_lb->setup[0].pmedata  = pmedata;
 +
 +    spm = 0;
 +    for (d = 0; d < DIM; d++)
 +    {
 +        sp = norm(pme_lb->box_start[d])/pme_lb->setup[0].grid[d];
 +        if (sp > spm)
 +        {
 +            spm = sp;
 +        }
 +    }
 +    pme_lb->setup[0].spacing = spm;
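 +    /* The setup-0 spacing is the largest box-length/grid-size ratio over
 +     * x, y and z; e.g. a 4 nm box vector with a 40-point grid gives 0.1 nm.
 +     */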
 +
 +    if (ir->fourier_spacing > 0)
 +    {
 +        pme_lb->cut_spacing = ir->rcoulomb/ir->fourier_spacing;
 +    }
 +    else
 +    {
 +        pme_lb->cut_spacing = ir->rcoulomb/pme_lb->setup[0].spacing;
 +    }
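 +    /* cut_spacing fixes the rcoulomb/spacing ratio: each new setup scales
 +     * its cut-off with its grid spacing (rcut_coulomb = cut_spacing*sp in
 +     * pme_loadbal_increase_cutoff), trading PME mesh work for PP pair work.
 +     */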
 +
 +    pme_lb->stage = 0;
 +
 +    pme_lb->fastest  = 0;
 +    pme_lb->start    = 0;
 +    pme_lb->end      = 0;
 +    pme_lb->elimited = epmelblimNO;
 +
 +    *pme_lb_p = pme_lb;
 +}
 +
 +static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t pme_lb,
 +                                            int                  pme_order)
 +{
 +    pme_setup_t *set;
 +    real         fac, sp;
 +    real         tmpr_coulomb, tmpr_vdw;
 +    int          d;
 +
 +    /* Try to add a new setup with next larger cut-off to the list */
 +    pme_lb->n++;
 +    srenew(pme_lb->setup, pme_lb->n);
 +    set          = &pme_lb->setup[pme_lb->n-1];
 +    set->pmedata = NULL;
 +
 +    fac = 1;
 +    do
 +    {
 +        fac *= 1.01;
 +        clear_ivec(set->grid);
 +        sp = calc_grid(NULL, pme_lb->box_start,
 +                       fac*pme_lb->setup[pme_lb->cur].spacing,
 +                       &set->grid[XX],
 +                       &set->grid[YY],
 +                       &set->grid[ZZ]);
 +
 +        /* In parallel we can't have grids smaller than 2*pme_order,
 +         * and we would anyhow not gain much speed at these grid sizes.
 +         */
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if (set->grid[d] <= 2*pme_order)
 +            {
 +                pme_lb->n--;
 +
 +                return FALSE;
 +            }
 +        }
 +    }
 +    while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing);
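 +    /* fac grows in 1% steps until calc_grid returns a clearly coarser grid:
 +     * the allowed FFT dimensions are quantized, so several target spacings
 +     * can map to the same grid and spacing.
 +     */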
 +
 +    set->rcut_coulomb = pme_lb->cut_spacing*sp;
 +
 +    if (pme_lb->cutoff_scheme == ecutsVERLET)
 +    {
 +        set->rlist        = set->rcut_coulomb + pme_lb->rbuf_coulomb;
 +        /* We don't use long-range lists with Verlet, but setting rlistlong avoids if-statements in further checks */
 +        set->rlistlong    = set->rlist;
 +    }
 +    else
 +    {
 +        tmpr_coulomb          = set->rcut_coulomb + pme_lb->rbuf_coulomb;
 +        tmpr_vdw              = pme_lb->rcut_vdw + pme_lb->rbuf_vdw;
 +        set->rlist            = min(tmpr_coulomb, tmpr_vdw);
 +        set->rlistlong        = max(tmpr_coulomb, tmpr_vdw);
 +
 +        /* Set the long-range update frequency */
 +        if (set->rlist == set->rlistlong)
 +        {
 +            /* No long-range interactions if the short-/long-range cutoffs are identical */
 +            set->nstcalclr = 0;
 +        }
 +        else if (pme_lb->nstcalclr_start == 0 || pme_lb->nstcalclr_start == 1)
 +        {
 +            /* We were not doing long-range before, but now we are since rlist!=rlistlong */
 +            set->nstcalclr = 1;
 +        }
 +        else
 +        {
 +            /* We were already doing long-range interactions from the start */
 +            if (pme_lb->rcut_vdw > pme_lb->rcut_coulomb_start)
 +            {
 +                /* We were originally doing long-range VdW-only interactions.
 +                 * If rvdw is still longer than rcoulomb we keep the original nstcalclr,
 +                 * but if the coulomb cutoff has become longer we should update the long-range
 +                 * part every step.
 +                 */
 +                set->nstcalclr = (tmpr_vdw > tmpr_coulomb) ? pme_lb->nstcalclr_start : 1;
 +            }
 +            else
 +            {
 +                /* We were not actually doing any long-range interactions from the start,
 +                 * since twin-range coulomb is not possible with PME.
 +                 */
 +                set->nstcalclr = 1;
 +            }
 +        }
 +    }
 +
 +    set->spacing      = sp;
 +    /* The grid efficiency is the grid size relative to one with uniform x/y/z spacing */
 +    set->grid_efficiency = 1;
 +    for (d = 0; d < DIM; d++)
 +    {
 +        set->grid_efficiency *= (set->grid[d]*sp)/norm(pme_lb->box_start[d]);
 +    }
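 +    /* Example: in a 4 x 4 x 4 nm box with a 40 x 40 x 42 grid the largest
 +     * spacing sp is 0.1 nm, so the factors are 1.0, 1.0 and 42*0.1/4 = 1.05,
 +     * i.e. grid_efficiency = 1.05: 5% more points than a uniform grid needs.
 +     */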
 +    /* The Ewald coefficient is inversely proportional to the cut-off */
 +    set->ewaldcoeff =
 +        pme_lb->setup[0].ewaldcoeff*pme_lb->setup[0].rcut_coulomb/set->rcut_coulomb;
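 +    /* Scaling the coefficient by rcut_old/rcut_new keeps beta*rcut, and with
 +     * it the real-space error estimate erfc(beta*rcut), constant; e.g. going
 +     * from rcut 0.9 nm at beta 3.12 nm^-1 to 1.0 nm gives beta 2.81 nm^-1.
 +     */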
 +
 +    set->count   = 0;
 +    set->cycles  = 0;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "PME loadbal: grid %d %d %d, coulomb cutoff %f\n",
 +                set->grid[XX], set->grid[YY], set->grid[ZZ], set->rcut_coulomb);
 +    }
 +    return TRUE;
 +}
 +
 +static void print_grid(FILE *fp_err, FILE *fp_log,
 +                       const char *pre,
 +                       const char *desc,
 +                       const pme_setup_t *set,
 +                       double cycles)
 +{
 +    char buf[STRLEN], buft[STRLEN];
 +
 +    if (cycles >= 0)
 +    {
 +        sprintf(buft, ": %.1f M-cycles", cycles*1e-6);
 +    }
 +    else
 +    {
 +        buft[0] = '\0';
 +    }
 +    sprintf(buf, "%-11s%10s pme grid %d %d %d, coulomb cutoff %.3f%s",
 +            pre,
 +            desc, set->grid[XX], set->grid[YY], set->grid[ZZ], set->rcut_coulomb,
 +            buft);
 +    if (fp_err != NULL)
 +    {
 +        fprintf(fp_err, "\r%s\n", buf);
 +    }
 +    if (fp_log != NULL)
 +    {
 +        fprintf(fp_log, "%s\n", buf);
 +    }
 +}
 +
 +static int pme_loadbal_end(pme_load_balancing_t pme_lb)
 +{
 +    /* In the initial stage only n is set; end is not set yet */
 +    if (pme_lb->end > 0)
 +    {
 +        return pme_lb->end;
 +    }
 +    else
 +    {
 +        return pme_lb->n;
 +    }
 +}
 +
 +static void print_loadbal_limited(FILE *fp_err, FILE *fp_log,
 +                                  gmx_large_int_t step,
 +                                  pme_load_balancing_t pme_lb)
 +{
 +    char buf[STRLEN], sbuf[22];
 +
 +    sprintf(buf, "step %4s: the %s limited the PME load balancing to a coulomb cut-off of %.3f",
 +            gmx_step_str(step, sbuf),
 +            pmelblim_str[pme_lb->elimited],
 +            pme_lb->setup[pme_loadbal_end(pme_lb)-1].rcut_coulomb);
 +    if (fp_err != NULL)
 +    {
 +        fprintf(fp_err, "\r%s\n", buf);
 +    }
 +    if (fp_log != NULL)
 +    {
 +        fprintf(fp_log, "%s\n", buf);
 +    }
 +}
 +
 +static void switch_to_stage1(pme_load_balancing_t pme_lb)
 +{
 +    pme_lb->start = 0;
 +    while (pme_lb->start+1 < pme_lb->n &&
 +           (pme_lb->setup[pme_lb->start].count == 0 ||
 +            pme_lb->setup[pme_lb->start].cycles >
 +            pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC))
 +    {
 +        pme_lb->start++;
 +    }
 +    while (pme_lb->start > 0 && pme_lb->setup[pme_lb->start-1].cycles == 0)
 +    {
 +        pme_lb->start--;
 +    }
 +
 +    pme_lb->end = pme_lb->n;
 +    if (pme_lb->setup[pme_lb->end-1].count > 0 &&
 +        pme_lb->setup[pme_lb->end-1].cycles >
 +        pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
 +    {
 +        pme_lb->end--;
 +    }
 +
 +    pme_lb->stage = 1;
 +
 +    /* Next we want to choose setup pme_lb->start, but as we will increase
 +     * pme_lb->cur by one right after returning, we subtract 1 here.
 +     */
 +    pme_lb->cur = pme_lb->start - 1;
 +}
 +
 +gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
 +                          t_commrec           *cr,
 +                          FILE                *fp_err,
 +                          FILE                *fp_log,
 +                          t_inputrec          *ir,
 +                          t_state             *state,
 +                          double               cycles,
 +                          interaction_const_t *ic,
 +                          nonbonded_verlet_t  *nbv,
 +                          gmx_pme_t           *pmedata,
 +                          gmx_large_int_t      step)
 +{
 +    gmx_bool     OK;
 +    pme_setup_t *set;
 +    double       cycles_fast;
 +    char         buf[STRLEN], sbuf[22];
 +    real         rtab;
 +    gmx_bool     bUsesSimpleTables = TRUE;
 +
 +    if (pme_lb->stage == pme_lb->nstage)
 +    {
 +        return FALSE;
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        gmx_sumd(1, &cycles, cr);
 +        cycles /= cr->nnodes;
 +    }
 +
 +    set = &pme_lb->setup[pme_lb->cur];
 +    set->count++;
 +
 +    rtab = ir->rlistlong + ir->tabext;
 +
 +    if (set->count % 2 == 1)
 +    {
 +        /* Skip the first cycle, because the first step after a switch
 +         * is much slower due to allocation and/or caching effects.
 +         */
 +        return TRUE;
 +    }
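 +    /* Since count was just incremented, odd counts are the cache-cold
 +     * intervals right after a switch and even counts are the timed ones.
 +     */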
 +
 +    sprintf(buf, "step %4s: ", gmx_step_str(step, sbuf));
 +    print_grid(fp_err, fp_log, buf, "timed with", set, cycles);
 +
 +    if (set->count <= 2)
 +    {
 +        set->cycles = cycles;
 +    }
 +    else
 +    {
 +        if (cycles*PME_LB_ACCEL_TOL < set->cycles &&
 +            pme_lb->stage == pme_lb->nstage - 1)
 +        {
 +            /* The performance went up a lot (due to e.g. DD load balancing).
 +             * Add a stage, keep the minima, but rescan all setups.
 +             */
 +            pme_lb->nstage++;
 +
 +            if (debug)
 +            {
 +                fprintf(debug, "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
 +                        "Increased the number stages to %d"
 +                        " and ignoring the previous performance\n",
 +                        set->grid[XX], set->grid[YY], set->grid[ZZ],
 +                        cycles*1e-6, set->cycles*1e-6, PME_LB_ACCEL_TOL,
 +                        pme_lb->nstage);
 +            }
 +        }
 +        set->cycles = min(set->cycles, cycles);
 +    }
 +
 +    if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles)
 +    {
 +        pme_lb->fastest = pme_lb->cur;
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            /* We found a new fastest setting, ensure that with subsequent
 +             * shorter cut-offs the dynamic load balancing does not make
 +             * the use of the current cut-off impossible. This solution is
 +             * a trade-off, as the PME load balancing and DD domain size
 +             * load balancing can interact in complex ways.
 +             * With the Verlet kernels, DD load imbalance will usually be
 +             * mainly due to bonded interaction imbalance, which will often
 +             * quickly push the domain boundaries beyond the limit for the
 +             * optimal, PME load balanced, cut-off. But it could be that
 +             * better overall performance can be obtained with a slightly
 +             * shorter cut-off and better DD load balancing.
 +             */
 +            change_dd_dlb_cutoff_limit(cr);
 +        }
 +    }
 +    cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;
 +
 +    /* Check in stage 0 if we should stop scanning grids.
 +     * Stop when the timing exceeds that of the fastest by more than a factor PME_LB_SLOW_FAC.
 +     */
 +    if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
 +        cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
 +    {
 +        pme_lb->n = pme_lb->cur + 1;
 +        /* Done with scanning, go to stage 1 */
 +        switch_to_stage1(pme_lb);
 +    }
 +
 +    if (pme_lb->stage == 0)
 +    {
 +        int gridsize_start;
 +
 +        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];
 +
 +        do
 +        {
 +            if (pme_lb->cur+1 < pme_lb->n)
 +            {
 +                /* We had already generated the next setup */
 +                OK = TRUE;
 +            }
 +            else
 +            {
 +                /* Find the next setup */
 +                OK = pme_loadbal_increase_cutoff(pme_lb, ir->pme_order);
 +            }
 +
 +            if (OK && ir->ePBC != epbcNONE)
 +            {
 +                OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlistlong)
 +                      <= max_cutoff2(ir->ePBC, state->box));
 +                if (!OK)
 +                {
 +                    pme_lb->elimited = epmelblimBOX;
 +                }
 +            }
 +
 +            if (OK)
 +            {
 +                pme_lb->cur++;
 +
 +                if (DOMAINDECOMP(cr))
 +                {
 +                    OK = change_dd_cutoff(cr, state, ir,
 +                                          pme_lb->setup[pme_lb->cur].rlistlong);
 +                    if (!OK)
 +                    {
 +                        /* Failed: do not use this setup */
 +                        pme_lb->cur--;
 +                        pme_lb->elimited = epmelblimDD;
 +                    }
 +                }
 +            }
 +            if (!OK)
 +            {
 +                /* We hit the upper limit for the cut-off,
 +                 * the setup should not go further than cur.
 +                 */
 +                pme_lb->n = pme_lb->cur + 1;
 +                print_loadbal_limited(fp_err, fp_log, step, pme_lb);
 +                /* Switch to the next stage */
 +                switch_to_stage1(pme_lb);
 +            }
 +        }
 +        while (OK &&
 +               !(pme_lb->setup[pme_lb->cur].grid[XX]*
 +                 pme_lb->setup[pme_lb->cur].grid[YY]*
 +                 pme_lb->setup[pme_lb->cur].grid[ZZ] <
 +                 gridsize_start*PME_LB_GRID_SCALE_FAC
 +                 &&
 +                 pme_lb->setup[pme_lb->cur].grid_efficiency <
 +                 pme_lb->setup[pme_lb->cur-1].grid_efficiency*PME_LB_GRID_EFFICIENCY_REL_FAC));
 +    }
 +
 +    if (pme_lb->stage > 0 && pme_lb->end == 1)
 +    {
 +        pme_lb->cur   = 0;
 +        pme_lb->stage = pme_lb->nstage;
 +    }
 +    else if (pme_lb->stage > 0 && pme_lb->end > 1)
 +    {
 +        /* If stage = nstage-1:
 +         *   scan over all setups, rerunning only those setups
 +         *   which are not much slower than the fastest
 +         * else:
 +         *   use the next setup
 +         */
 +        do
 +        {
 +            pme_lb->cur++;
 +            if (pme_lb->cur == pme_lb->end)
 +            {
 +                pme_lb->stage++;
 +                pme_lb->cur = pme_lb->start;
 +            }
 +        }
 +        while (pme_lb->stage == pme_lb->nstage - 1 &&
 +               pme_lb->setup[pme_lb->cur].count > 0 &&
 +               pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC);
 +
 +        if (pme_lb->stage == pme_lb->nstage)
 +        {
 +            /* We are done optimizing, use the fastest setup we found */
 +            pme_lb->cur = pme_lb->fastest;
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
 +    {
 +        OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistlong);
 +        if (!OK)
 +        {
 +            /* Failsafe solution */
 +            if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
 +            {
 +                pme_lb->stage--;
 +            }
 +            pme_lb->fastest  = 0;
 +            pme_lb->start    = 0;
 +            pme_lb->end      = pme_lb->cur;
 +            pme_lb->cur      = pme_lb->start;
 +            pme_lb->elimited = epmelblimDD;
 +            print_loadbal_limited(fp_err, fp_log, step, pme_lb);
 +        }
 +    }
 +
 +    /* Change the Coulomb cut-off and the PME grid */
 +
 +    set = &pme_lb->setup[pme_lb->cur];
 +
 +    ic->rcoulomb   = set->rcut_coulomb;
 +    ic->rlist      = set->rlist;
 +    ic->rlistlong  = set->rlistlong;
 +    ir->nstcalclr  = set->nstcalclr;
 +    ic->ewaldcoeff = set->ewaldcoeff;
 +
 +    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
 +    if (pme_lb->cutoff_scheme == ecutsVERLET &&
 +        nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
 +    {
 +        nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
 +    }
 +    else
 +    {
 +        init_interaction_const_tables(NULL, ic, bUsesSimpleTables,
 +                                      rtab);
 +    }
 +
 +    if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->ngrp > 1)
 +    {
 +        init_interaction_const_tables(NULL, ic, bUsesSimpleTables,
 +                                      rtab);
 +    }
 +
 +    if (cr->duty & DUTY_PME)
 +    {
 +        if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
 +        {
 +            /* Generate a new PME data structure,
 +             * copying part of the old pointers.
 +             */
 +            gmx_pme_reinit(&set->pmedata,
 +                           cr, pme_lb->setup[0].pmedata, ir,
 +                           set->grid);
 +        }
 +        *pmedata = set->pmedata;
 +    }
 +    else
 +    {
 +        /* Tell our PME-only node to switch grid */
++        gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff);
 +    }
 +
 +    if (debug)
 +    {
 +        print_grid(NULL, debug, "", "switched to", set, -1);
 +    }
 +
 +    if (pme_lb->stage == pme_lb->nstage)
 +    {
 +        print_grid(fp_err, fp_log, "", "optimal", set, -1);
 +    }
 +
 +    return TRUE;
 +}
 +
 +void restart_pme_loadbal(pme_load_balancing_t pme_lb, int n)
 +{
 +    pme_lb->nstage += n;
 +}
 +
 +static int pme_grid_points(const pme_setup_t *setup)
 +{
 +    return setup->grid[XX]*setup->grid[YY]*setup->grid[ZZ];
 +}
 +
 +static void print_pme_loadbal_setting(FILE              *fplog,
 +                                      char              *name,
 +                                      const pme_setup_t *setup)
 +{
 +    fprintf(fplog,
 +            "   %-7s %6.3f nm %6.3f nm     %3d %3d %3d   %5.3f nm  %5.3f nm\n",
 +            name,
 +            setup->rcut_coulomb, setup->rlist,
 +            setup->grid[XX], setup->grid[YY], setup->grid[ZZ],
 +            setup->spacing, 1/setup->ewaldcoeff);
 +}
 +
 +static void print_pme_loadbal_settings(pme_load_balancing_t pme_lb,
 +                                       FILE                *fplog)
 +{
 +    double pp_ratio, grid_ratio;
 +
 +    pp_ratio   = pow(pme_lb->setup[pme_lb->cur].rlist/pme_lb->setup[0].rlistlong, 3.0);
 +    grid_ratio = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
 +        (double)pme_grid_points(&pme_lb->setup[0]);
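 +    /* E.g. rlist growing from 1.0 to 1.12 nm gives pp_ratio = 1.12^3 ~= 1.40;
 +     * a good balance offsets this with a grid_ratio well below 1.
 +     */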
 +
 +    fprintf(fplog, "\n");
 +    fprintf(fplog, "       P P   -   P M E   L O A D   B A L A N C I N G\n");
 +    fprintf(fplog, "\n");
 +    /* Here we only warn when the optimal setting is the last one */
 +    if (pme_lb->elimited != epmelblimNO &&
 +        pme_lb->cur == pme_loadbal_end(pme_lb)-1)
 +    {
 +        fprintf(fplog, " NOTE: The PP/PME load balancing was limited by the %s,\n",
 +                pmelblim_str[pme_lb->elimited]);
 +        fprintf(fplog, "       you might not have reached a good load balance.\n");
 +        if (pme_lb->elimited == epmelblimDD)
 +        {
 +            fprintf(fplog, "       Try different mdrun -dd settings or lower the -dds value.\n");
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +    fprintf(fplog, " PP/PME load balancing changed the cut-off and PME settings:\n");
 +    fprintf(fplog, "           particle-particle                    PME\n");
 +    fprintf(fplog, "            rcoulomb  rlist            grid      spacing   1/beta\n");
 +    print_pme_loadbal_setting(fplog, "initial", &pme_lb->setup[0]);
 +    print_pme_loadbal_setting(fplog, "final", &pme_lb->setup[pme_lb->cur]);
 +    fprintf(fplog, " cost-ratio           %4.2f             %4.2f\n",
 +            pp_ratio, grid_ratio);
 +    fprintf(fplog, " (note that these numbers concern only part of the total PP and PME load)\n");
 +    fprintf(fplog, "\n");
 +}
 +
 +void pme_loadbal_done(pme_load_balancing_t pme_lb, FILE *fplog)
 +{
 +    if (fplog != NULL && (pme_lb->cur > 0 || pme_lb->elimited != epmelblimNO))
 +    {
 +        print_pme_loadbal_settings(pme_lb, fplog);
 +    }
 +
 +    /* TODO: Here we should free all pointers in pme_lb,
 +     * but as it contains pme data structures,
 +     * we need to first make pme.c free all data.
 +     */
 +}