Merge branch 'release-5-0' into release-5-1

author Mark Abraham <mark.j.abraham@gmail.com>

Tue, 8 Dec 2015 07:45:22 +0000 (18:45 +1100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Tue, 8 Dec 2015 07:45:22 +0000 (18:45 +1100)
author Mark Abraham <mark.j.abraham@gmail.com>
Tue, 8 Dec 2015 07:45:22 +0000 (18:45 +1100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Tue, 8 Dec 2015 07:45:22 +0000 (18:45 +1100)
diff --combined src/gromacs/mdlib/coupling.cpp

index d0169dcdc28d4af97d65db9e4a2e5d454520ac33,9d0ee0886caefbd3d7b388f26eb5faaa38e7909c..d5244e7134e431ec07ff984a7664fd6abc6eb087
--- 1/src/gromacs/mdlib/coupling.cpp
--- 2/src/gromacs/mdlib/coupling.c
+++ b/src/gromacs/mdlib/coupling.cpp
@@@ -34,26 -34,25 +34,26 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
+ +
   #include <assert.h>
   
- -#include "typedefs.h"
- -#include "types/commrec.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "update.h"
- -#include "vec.h"
- -#include "macros.h"
- -#include "physics.h"
- -#include "names.h"
- -#include "gmx_fatal.h"
- -#include "txtdump.h"
- -#include "nrnb.h"
+ +#include <algorithm>
+ +
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/macros.h"
+ +#include "gromacs/legacyheaders/mdrun.h"
+ +#include "gromacs/legacyheaders/names.h"
+ +#include "gromacs/legacyheaders/nrnb.h"
+ +#include "gromacs/legacyheaders/txtdump.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/legacyheaders/update.h"
+ +#include "gromacs/legacyheaders/types/commrec.h"
+ +#include "gromacs/math/units.h"
+ +#include "gromacs/math/vec.h"
   #include "gromacs/random/random.h"
- -#include "update.h"
- -#include "mdrun.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/smalloc.h"
   
   #define NTROTTERPARTS 3
   
@@@ -95,7 -94,7 +95,7 @@@ static void NHC_trotter(t_grpopts *opts
   {
       /* general routine for both barostat and thermostat nose hoover chains */
   
- -    int           i, j, mi, mj, jmax;
+ +    int           i, j, mi, mj;
       double        Ekin, Efac, reft, kT, nd;
       double        dt;
       t_grp_tcstat *tcstat;
@@@ -130,7 -129,7 +130,7 @@@
           {
               iQinv = &(MassQ->QPinv[i*nh]);
               nd    = 1.0; /* THIS WILL CHANGE IF NOT ISOTROPIC */
- -            reft  = max(0.0, opts->ref_t[0]);
+ +            reft  = std::max<real>(0, opts->ref_t[0]);
               Ekin  = sqr(*veta)/MassQ->Winv;
           }
           else
@@@ -138,7 -137,7 +138,7 @@@
               iQinv  = &(MassQ->Qinv[i*nh]);
               tcstat = &ekind->tcstat[i];
               nd     = opts->nrdf[i];
- -            reft   = max(0.0, opts->ref_t[i]);
+ +            reft   = std::max<real>(0, opts->ref_t[i]);
               if (bEkinAveVel)
               {
                   Ekin = 2*trace(tcstat->ekinf)*tcstat->ekinscalef_nhc;
@@@ -230,9 -229,9 +230,9 @@@ static void boxv_trotter(t_inputrec *ir
   
       real   pscal;
       double alpha;
- -    int    i, j, d, n, nwall;
- -    real   T, GW, vol;
- -    tensor Winvm, ekinmod, localpres;
+ +    int    nwall;
+ +    real   GW, vol;
+ +    tensor ekinmod, localpres;
   
       /* The heat bath is coupled to a separate barostat, the last temperature group.  In the
          2006 Tuckerman et al paper., the order is iL_{T_baro} iL {T_part}
@@@ -376,8 -375,8 +376,8 @@@ void parrinellorahman_pcoupl(FILE *fplo
            * The pressure and compressibility always occur as a product,
            * therefore the pressure unit drops out.
            */
- -        maxl = max(box[XX][XX], box[YY][YY]);
- -        maxl = max(maxl, box[ZZ][ZZ]);
+ +        maxl = std::max(box[XX][XX], box[YY][YY]);
+ +        maxl = std::max(maxl, box[ZZ][ZZ]);
           for (d = 0; d < DIM; d++)
           {
               for (n = 0; n < DIM; n++)
@@@ -534,7 -533,7 +534,7 @@@ void berendsen_pcoupl(FILE *fplog, gmx_
   {
       int     d, n;
       real    scalar_pressure, xy_pressure, p_corr_z;
- -    char   *ptr, buf[STRLEN];
+ +    char    buf[STRLEN];
   
       /*
        *  Calculate the scaling matrix mu
@@@ -647,25 -646,12 +647,25 @@@ void berendsen_pscale(t_inputrec *ir, m
                         t_nrnb *nrnb)
   {
       ivec   *nFreeze = ir->opts.nFreeze;
- -    int     n, d, g = 0;
+ +    int     n, d;
+ +    int     nthreads gmx_unused;
+ +
+ +#ifndef __clang_analyzer__
+ +    // cppcheck-suppress unreadVariable
+ +    nthreads = gmx_omp_nthreads_get(emntUpdate);
+ +#endif
   
       /* Scale the positions */
+ +#pragma omp parallel for num_threads(nthreads) schedule(static)
       for (n = start; n < start+nr_atoms; n++)
       {
- -        if (cFREEZE)
+ +        int g;
+ +
+ +        if (cFREEZE == NULL)
+ +        {
+ +            g = 0;
+ +        }
+ +        else
           {
               g = cFREEZE[n];
           }
@@@ -720,9 -706,9 +720,9 @@@ void berendsen_tcoupl(t_inputrec *ir, g
   
           if ((opts->tau_t[i] > 0) && (T > 0.0))
           {
- -            reft                    = max(0.0, opts->ref_t[i]);
+ +            reft                    = std::max<real>(0, opts->ref_t[i]);
               lll                     = sqrt(1.0 + (dt/opts->tau_t[i])*(reft/T-1.0));
- -            ekind->tcstat[i].lambda = max(min(lll, 1.25), 0.8);
+ +            ekind->tcstat[i].lambda = std::max<real>(std::min<real>(lll, 1.25), 0.8);
           }
           else
           {
@@@ -797,7 -783,7 +797,7 @@@ void nosehoover_tcoupl(t_grpopts *opts
   
       for (i = 0; (i < opts->ngtc); i++)
       {
- -        reft     = max(0.0, opts->ref_t[i]);
+ +        reft     = std::max<real>(0, opts->ref_t[i]);
           oldvxi   = vxi[i];
           vxi[i]  += dt*MassQ->Qinv[i]*(ekind->tcstat[i].Th - reft);
           xi[i]   += dt*(oldvxi + vxi[i])*0.5;
@@@ -836,14 -822,16 +836,14 @@@ void trotter_update(t_inputrec *ir, gmx
                       t_extmass *MassQ, int **trotter_seqlist, int trotter_seqno)
   {
   
-     int             n, i, d, ngtc, gc = 0;
- -    int             n, i, j, d, ntgrp, ngtc, gc = 0, t;
++    int             n, i, d, ngtc, gc = 0, t;
       t_grp_tcstat   *tcstat;
       t_grpopts      *opts;
       gmx_int64_t     step_eff;
- -    real            ecorr, pcorr, dvdlcorr;
- -    real            bmass, qmass, reft, kT, dt, nd;
- -    tensor          dumpres, dumvir;
+ +    real            dt;
       double         *scalefac, dtc;
       int            *trotter_seq;
- -    rvec            sumv = {0, 0, 0}, consk;
+ +    rvec            sumv = {0, 0, 0};
       gmx_bool        bCouple;
   
       if (trotter_seqno <= ettTSEQ2)
@@@ -910,12 -898,12 +910,12 @@@
                      scale the velocities later, but we need them scaled in order to
                      produce the correct outputs, so we'll scale them here. */
   
-                 for (i = 0; i < ngtc; i++)
+                 for (t = 0; t < ngtc; t++)
                   {
-                     tcstat                  = &ekind->tcstat[i];
-                     tcstat->vscale_nhc      = scalefac[i];
-                     tcstat->ekinscaleh_nhc *= (scalefac[i]*scalefac[i]);
-                     tcstat->ekinscalef_nhc *= (scalefac[i]*scalefac[i]);
+                     tcstat                  = &ekind->tcstat[t];
+                     tcstat->vscale_nhc      = scalefac[t];
+                     tcstat->ekinscaleh_nhc *= (scalefac[t]*scalefac[t]);
+                     tcstat->ekinscalef_nhc *= (scalefac[t]*scalefac[t]);
                   }
                   /* now that we've scaled the groupwise velocities, we can add them up to get the total */
                   /* but do we actually need the total? */
@@@ -965,12 -953,16 +965,12 @@@
   
   extern void init_npt_masses(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bInit)
   {
- -    int           n, i, j, d, ntgrp, ngtc, nnhpres, nh, gc = 0;
- -    t_grp_tcstat *tcstat;
+ +    int           n, i, j, d, ngtc, nh;
       t_grpopts    *opts;
- -    real          ecorr, pcorr, dvdlcorr;
- -    real          bmass, qmass, reft, kT, dt, ndj, nd;
- -    tensor        dumpres, dumvir;
+ +    real          reft, kT, ndj, nd;
   
       opts    = &(ir->opts); /* just for ease of referencing */
       ngtc    = ir->opts.ngtc;
- -    nnhpres = state->nnhpres;
       nh      = state->nhchainlength;
   
       if (ir->eI == eiMD)
@@@ -1031,7 -1023,7 +1031,7 @@@
           {
               if ((opts->tau_t[i] > 0) && (opts->ref_t[i] > 0))
               {
- -                reft = max(0.0, opts->ref_t[i]);
+ +                reft = std::max<real>(0, opts->ref_t[i]);
                   nd   = opts->nrdf[i];
                   kT   = BOLTZ*reft;
                   for (j = 0; j < nh; j++)
@@@ -1049,6 -1041,7 +1049,6 @@@
               }
               else
               {
- -                reft = 0.0;
                   for (j = 0; j < nh; j++)
                   {
                       MassQ->Qinv[i*nh+j] = 0.0;
@@@ -1060,12 -1053,16 +1060,12 @@@
   
   int **init_npt_vars(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bTrotter)
   {
- -    int           n, i, j, d, ntgrp, ngtc, nnhpres, nh, gc = 0;
- -    t_grp_tcstat *tcstat;
+ +    int           i, j, nnhpres, nh;
       t_grpopts    *opts;
- -    real          ecorr, pcorr, dvdlcorr;
- -    real          bmass, qmass, reft, kT, dt, ndj, nd;
- -    tensor        dumpres, dumvir;
+ +    real          bmass, qmass, reft, kT;
       int         **trotter_seq;
   
       opts    = &(ir->opts); /* just for ease of referencing */
- -    ngtc    = state->ngtc;
       nnhpres = state->nnhpres;
       nh      = state->nhchainlength;
   
@@@ -1210,7 -1207,7 +1210,7 @@@
       /* barostat temperature */
       if ((ir->tau_p > 0) && (opts->ref_t[0] > 0))
       {
- -        reft = max(0.0, opts->ref_t[0]);
+ +        reft = std::max<real>(0, opts->ref_t[0]);
           kT   = BOLTZ*reft;
           for (i = 0; i < nnhpres; i++)
           {
@@@ -1243,12 -1240,11 +1243,12 @@@
   
   real NPT_energy(t_inputrec *ir, t_state *state, t_extmass *MassQ)
   {
- -    int     i, j, nd, ndj, bmass, qmass, ngtcall;
- -    real    ener_npt, reft, eta, kT, tau;
+ +    int     i, j;
+ +    real    nd, ndj;
+ +    real    ener_npt, reft, kT;
       double *ivxi, *ixi;
       double *iQinv;
- -    real    vol, dbaro, W, Q;
+ +    real    vol;
       int     nh = state->nhchainlength;
   
       ener_npt = 0;
@@@ -1296,7 -1292,7 +1296,7 @@@
               ivxi  = &state->nhpres_vxi[i*nh];
               ixi   = &state->nhpres_xi[i*nh];
               iQinv = &(MassQ->QPinv[i*nh]);
- -            reft  = max(ir->opts.ref_t[0], 0); /* using 'System' temperature */
+ +            reft  = std::max<real>(ir->opts.ref_t[0], 0.0); /* using 'System' temperature */
               kT    = BOLTZ * reft;
   
               for (j = 0; j < nh; j++)
@@@ -1324,10 -1320,10 +1324,10 @@@
               iQinv = &(MassQ->Qinv[i*nh]);
   
               nd   = ir->opts.nrdf[i];
- -            reft = max(ir->opts.ref_t[i], 0);
+ +            reft = std::max<real>(ir->opts.ref_t[i], 0);
               kT   = BOLTZ * reft;
   
- -            if (nd > 0)
+ +            if (nd > 0.0)
               {
                   if (IR_NVT_TROTTER(ir))
                   {
@@@ -1344,7 -1340,7 +1344,7 @@@
                               }
                               else
                               {
- -                                ndj = 1;
+ +                                ndj = 1.0;
                               }
                               ener_npt += ndj*ixi[j]*kT;
                           }
@@@ -1641,7 -1637,7 +1641,7 @@@ void update_annealing_target_temp(t_grp
               case  eannPERIODIC:
                   /* calculate time modulo the period */
                   pert  = opts->anneal_time[i][npoints-1];
- -                n     = t / pert;
+ +                n     = static_cast<int>(t / pert);
                   thist = t - n*pert; /* modulo time */
                   /* Make sure rounding didn't get us outside the interval */
                   if (fabs(thist-pert) < GMX_REAL_EPS*100)
diff --combined src/gromacs/mdlib/forcerec.cpp

index 79884576ff5f6c4a50851359e4427fdc2ff67298,27ced84ab23f7d37b6ea98a08300b58d3eaf5df9..1f2c44ecebf2713b487ddf0cd49c8c96c84c5917
--- 1/src/gromacs/mdlib/forcerec.cpp
--- 2/src/gromacs/mdlib/forcerec.c
+++ b/src/gromacs/mdlib/forcerec.cpp
@@@ -34,56 -34,53 +34,56 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
+ +
+ +#include "config.h"
   
+ +#include <assert.h>
   #include <math.h>
+ +#include <stdlib.h>
   #include <string.h>
- -#include <assert.h>
- -#include "sysstuff.h"
- -#include "typedefs.h"
- -#include "types/commrec.h"
- -#include "vec.h"
+ +
+ +#include <algorithm>
+ +
+ +#include "gromacs/domdec/domdec.h"
+ +#include "gromacs/ewald/ewald.h"
+ +#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
+ +#include "gromacs/legacyheaders/copyrite.h"
+ +#include "gromacs/legacyheaders/force.h"
+ +#include "gromacs/legacyheaders/gmx_detect_hardware.h"
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/inputrec.h"
+ +#include "gromacs/legacyheaders/macros.h"
+ +#include "gromacs/legacyheaders/md_logging.h"
+ +#include "gromacs/legacyheaders/md_support.h"
+ +#include "gromacs/legacyheaders/names.h"
+ +#include "gromacs/legacyheaders/network.h"
+ +#include "gromacs/legacyheaders/nonbonded.h"
+ +#include "gromacs/legacyheaders/ns.h"
+ +#include "gromacs/legacyheaders/qmmm.h"
+ +#include "gromacs/legacyheaders/tables.h"
+ +#include "gromacs/legacyheaders/txtdump.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/legacyheaders/types/commrec.h"
+ +#include "gromacs/listed-forces/manage-threading.h"
+ +#include "gromacs/math/calculate-ewald-splitting-coefficient.h"
+ +#include "gromacs/math/units.h"
   #include "gromacs/math/utilities.h"
- -#include "macros.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "macros.h"
- -#include "gmx_fatal.h"
- -#include "physics.h"
- -#include "force.h"
- -#include "tables.h"
- -#include "nonbonded.h"
- -#include "invblock.h"
- -#include "names.h"
- -#include "network.h"
- -#include "pbc.h"
- -#include "ns.h"
- -#include "mshift.h"
- -#include "txtdump.h"
- -#include "coulomb.h"
- -#include "md_support.h"
- -#include "md_logging.h"
- -#include "domdec.h"
- -#include "qmmm.h"
- -#include "copyrite.h"
- -#include "mtop_util.h"
- -#include "nbnxn_simd.h"
- -#include "nbnxn_search.h"
- -#include "nbnxn_atomdata.h"
- -#include "nbnxn_consts.h"
- -#include "gmx_omp_nthreads.h"
- -#include "gmx_detect_hardware.h"
- -#include "inputrec.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/mdlib/forcerec-threading.h"
+ +#include "gromacs/mdlib/nb_verlet.h"
+ +#include "gromacs/mdlib/nbnxn_atomdata.h"
+ +#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+ +#include "gromacs/mdlib/nbnxn_search.h"
+ +#include "gromacs/mdlib/nbnxn_simd.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +#include "gromacs/pbcutil/pbc.h"
   #include "gromacs/simd/simd.h"
+ +#include "gromacs/topology/mtop_util.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/smalloc.h"
   
- -#include "types/nbnxn_cuda_types_ext.h"
- -#include "gpu_utils.h"
- -#include "nbnxn_cuda_data_mgmt.h"
- -#include "pmalloc_cuda.h"
+ +#include "nbnxn_gpu_jit_support.h"
   
   t_forcerec *mk_forcerec(void)
   {
@@@ -158,10 -155,9 +158,10 @@@ static real *mk_nbfp(const gmx_ffparams
   
   static real *make_ljpme_c6grid(const gmx_ffparams_t *idef, t_forcerec *fr)
   {
- -    int   i, j, k, atnr;
- -    real  c6, c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
- -    real *grid;
+ +    int        i, j, k, atnr;
+ +    real       c6, c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
+ +    real      *grid;
+ +    const real oneOverSix = 1.0 / 6.0;
   
       /* For LJ-PME simulations, we correct the energies with the reciprocal space
        * inside of the cut-off. To do this the non-bonded kernels needs to have
@@@ -182,8 -178,8 +182,8 @@@
               if (fr->ljpme_combination_rule == eljpmeLB
                   && !gmx_numzero(c6) && !gmx_numzero(c12i) && !gmx_numzero(c12j))
               {
- -                sigmai = pow(c12i / c6i, 1.0/6.0);
- -                sigmaj = pow(c12j / c6j, 1.0/6.0);
+ +                sigmai = pow(c12i / c6i, oneOverSix);
+ +                sigmaj = pow(c12j / c6j, oneOverSix);
                   epsi   = c6i * c6i / c12i;
                   epsj   = c6j * c6j / c12j;
                   c6     = sqrt(epsi * epsj) * pow(0.5*(sigmai+sigmaj), 6);
@@@ -199,11 -195,10 +199,11 @@@
   
   static real *mk_nbfp_combination_rule(const gmx_ffparams_t *idef, int comb_rule)
   {
- -    real *nbfp;
- -    int   i, j, k, atnr;
- -    real  c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
- -    real  c6, c12;
+ +    real      *nbfp;
+ +    int        i, j, atnr;
+ +    real       c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
+ +    real       c6, c12;
+ +    const real oneOverSix = 1.0 / 6.0;
   
       atnr = idef->atnr;
       snew(nbfp, 2*atnr*atnr);
@@@ -220,8 -215,8 +220,8 @@@
               if (comb_rule == eCOMB_ARITHMETIC
                   && !gmx_numzero(c6) && !gmx_numzero(c12))
               {
- -                sigmai = pow(c12i / c6i, 1.0/6.0);
- -                sigmaj = pow(c12j / c6j, 1.0/6.0);
+ +                sigmai = pow(c12i / c6i, oneOverSix);
+ +                sigmaj = pow(c12j / c6j, oneOverSix);
                   epsi   = c6i * c6i / c12i;
                   epsj   = c6j * c6j / c12j;
                   c6     = sqrt(epsi * epsj) * pow(0.5*(sigmai+sigmaj), 6);
@@@ -269,6 -264,7 +269,6 @@@ check_solvent_cg(const gmx_moltype_
                    int                     cginfo,
                    int                    *cg_sp)
   {
- -    const t_blocka       *excl;
       t_atom               *atom;
       int                   j, k;
       int                   j0, j1, nj;
@@@ -513,8 -509,9 +513,8 @@@ check_solvent(FILE  *                fp
                 cginfo_mb_t           *cginfo_mb)
   {
       const t_block     *   cgs;
- -    const t_block     *   mols;
       const gmx_moltype_t  *molt;
- -    int                   mb, mol, cg_mol, at_offset, cg_offset, am, cgm, i, nmol_ch, nmol;
+ +    int                   mb, mol, cg_mol, at_offset, am, cgm, i, nmol_ch, nmol;
       int                   n_solvent_parameters;
       solvent_parameters_t *solvent_parameters;
       int                 **cg_sp;
@@@ -525,11 -522,14 +525,11 @@@
           fprintf(debug, "Going to determine what solvent types we have.\n");
       }
   
- -    mols = &mtop->mols;
- -
       n_solvent_parameters = 0;
       solvent_parameters   = NULL;
       /* Allocate temporary array for solvent type */
       snew(cg_sp, mtop->nmolblock);
   
- -    cg_offset = 0;
       at_offset = 0;
       for (mb = 0; mb < mtop->nmolblock; mb++)
       {
@@@ -557,6 -557,7 +557,6 @@@
                                    &cg_sp[mb][cgm+cg_mol]);
               }
           }
- -        cg_offset += cgs->nr;
           at_offset += cgs->index[cgs->nr];
       }
   
@@@ -584,6 -585,10 +584,6 @@@
           bestsol = esolNO;
       }
   
- -#ifdef DISABLE_WATER_NLIST
- -    bestsol = esolNO;
- -#endif
- -
       fr->nWatMol = 0;
       for (mb = 0; mb < mtop->nmolblock; mb++)
       {
@@@ -632,13 -637,14 +632,13 @@@ static cginfo_mb_t *init_cginfo_mb(FIL
       cginfo_mb_t          *cginfo_mb;
       gmx_bool             *type_VDW;
       int                  *cginfo;
- -    int                   cg_offset, a_offset, cgm, am;
- -    int                   mb, m, ncg_tot, cg, a0, a1, gid, ai, j, aj, excl_nalloc;
+ +    int                   cg_offset, a_offset;
+ +    int                   mb, m, cg, a0, a1, gid, ai, j, aj, excl_nalloc;
       int                  *a_con;
       int                   ftype;
       int                   ia;
       gmx_bool              bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ, bHavePerturbedAtoms;
   
- -    ncg_tot = ncg_mtop(mtop);
       snew(cginfo_mb, mtop->nmolblock);
   
       snew(type_VDW, fr->ntype);
@@@ -673,9 -679,10 +673,9 @@@
            * Otherwise we make an array of #mol times #cgs per molecule.
            */
           bId = TRUE;
- -        am  = 0;
           for (m = 0; m < molb->nmol; m++)
           {
- -            am = m*cgs->index[cgs->nr];
+ +            int am = m*cgs->index[cgs->nr];
               for (cg = 0; cg < cgs->nr; cg++)
               {
                   a0 = cgs->index[cg];
@@@ -729,8 -736,8 +729,8 @@@
   
           for (m = 0; m < (bId ? 1 : molb->nmol); m++)
           {
- -            cgm = m*cgs->nr;
- -            am  = m*cgs->index[cgs->nr];
+ +            int cgm = m*cgs->nr;
+ +            int am  = m*cgs->index[cgs->nr];
               for (cg = 0; cg < cgs->nr; cg++)
               {
                   a0 = cgs->index[cg];
@@@ -984,7 -991,7 +984,7 @@@ void set_avcsixtwelve(FILE *fplog, t_fo
   {
       const t_atoms  *atoms, *atoms_tpi;
       const t_blocka *excl;
- -    int             mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, n, nexcl, q;
+ +    int             mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, nexcl, q;
       gmx_int64_t     npair, npair_ij, tmpi, tmpj;
       double          csix, ctwelve;
       int             ntp, *typecount;
@@@ -1540,47 -1547,42 +1540,47 @@@ gmx_bool can_use_allvsall(const t_input
   }
   
   
- -static void init_forcerec_f_threads(t_forcerec *fr, int nenergrp)
+ +gmx_bool nbnxn_gpu_acceleration_supported(FILE             *fplog,
+ +                                          const t_commrec  *cr,
+ +                                          const t_inputrec *ir,
+ +                                          gmx_bool          bRerunMD)
   {
- -    int t, i;
- -
- -    /* These thread local data structures are used for bondeds only */
- -    fr->nthreads = gmx_omp_nthreads_get(emntBonded);
+ +    if (bRerunMD && ir->opts.ngener > 1)
+ +    {
+ +        /* Rerun execution time is dominated by I/O and pair search,
+ +         * so GPUs are not very useful, plus they do not support more
+ +         * than one energy group. If the user requested GPUs
+ +         * explicitly, a fatal error is given later.  With non-reruns,
+ +         * we fall back to a single whole-of system energy group
+ +         * (which runs much faster than a multiple-energy-groups
+ +         * implementation would), and issue a note in the .log
+ +         * file. Users can re-run if they want the information. */
+ +        md_print_warn(cr, fplog, "Rerun with energy groups is not implemented for GPUs, falling back to the CPU\n");
+ +        return FALSE;
+ +    }
   
- -    if (fr->nthreads > 1)
+ +    if (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB)
       {
- -        snew(fr->f_t, fr->nthreads);
- -        /* Thread 0 uses the global force and energy arrays */
- -        for (t = 1; t < fr->nthreads; t++)
- -        {
- -            fr->f_t[t].f        = NULL;
- -            fr->f_t[t].f_nalloc = 0;
- -            snew(fr->f_t[t].fshift, SHIFTS);
- -            fr->f_t[t].grpp.nener = nenergrp*nenergrp;
- -            for (i = 0; i < egNR; i++)
- -            {
- -                snew(fr->f_t[t].grpp.ener[i], fr->f_t[t].grpp.nener);
- -            }
- -        }
+ +        /* LJ PME with LB combination rule does 7 mesh operations.
+ +         * This so slow that we don't compile GPU non-bonded kernels for that.
+ +         */
+ +        md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with GPUs, falling back to CPU only\n");
+ +        return FALSE;
       }
- -}
   
+ +    return TRUE;
+ +}
   
- -gmx_bool nbnxn_acceleration_supported(FILE             *fplog,
- -                                      const t_commrec  *cr,
- -                                      const t_inputrec *ir,
- -                                      gmx_bool          bGPU)
+ +gmx_bool nbnxn_simd_supported(FILE             *fplog,
+ +                              const t_commrec  *cr,
+ +                              const t_inputrec *ir)
   {
- -    if (!bGPU && (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB))
+ +    if (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB)
       {
- -        md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with %s, falling back to %s\n",
- -                      bGPU ? "GPUs" : "SIMD kernels",
- -                      bGPU ? "CPU only" : "plain-C kernels");
+ +        /* LJ PME with LB combination rule does 7 mesh operations.
+ +         * This so slow that we don't compile SIMD non-bonded kernels
+ +         * for that. */
+ +        md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with SIMD kernels, falling back to plain C kernels\n");
           return FALSE;
       }
   
@@@ -1641,7 -1643,7 +1641,7 @@@ static void pick_nbnxn_kernel_cpu(cons
   #ifdef GMX_NBNXN_SIMD_4XN
               *kernel_type = nbnxnk4xN_SIMD_4xN;
   #else
- -            gmx_fatal(FARGS, "SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
+ +            gmx_fatal(FARGS, "SIMD 4xN kernels requested, but GROMACS has been compiled without support for these kernels");
   #endif
           }
           if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
@@@ -1649,7 -1651,7 +1649,7 @@@
   #ifdef GMX_NBNXN_SIMD_2XNN
               *kernel_type = nbnxnk4xN_SIMD_2xNN;
   #else
- -            gmx_fatal(FARGS, "SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
+ +            gmx_fatal(FARGS, "SIMD 2x(N+N) kernels requested, but GROMACS has been compiled without support for these kernels");
   #endif
           }
   
@@@ -1711,7 -1713,7 +1711,7 @@@ const char *lookup_nbnxn_kernel_name(in
               returnvalue = "not available";
   #endif /* GMX_NBNXN_SIMD */
               break;
- -        case nbnxnk8x8x8_CUDA: returnvalue   = "CUDA"; break;
+ +        case nbnxnk8x8x8_GPU: returnvalue    = "GPU"; break;
           case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
   
           case nbnxnkNR:
@@@ -1749,13 -1751,16 +1749,13 @@@ static void pick_nbnxn_kernel(FIL
       }
       else if (bUseGPU)
       {
- -        *kernel_type = nbnxnk8x8x8_CUDA;
+ +        *kernel_type = nbnxnk8x8x8_GPU;
       }
   
       if (*kernel_type == nbnxnkNotSet)
       {
- -        /* LJ PME with LB combination rule does 7 mesh operations.
- -         * This so slow that we don't compile SIMD non-bonded kernels for that.
- -         */
           if (use_simd_kernels &&
- -            nbnxn_acceleration_supported(fp, cr, ir, FALSE))
+ +            nbnxn_simd_supported(fp, cr, ir))
           {
               pick_nbnxn_kernel_cpu(ir, kernel_type, ewald_excl);
           }
@@@ -1769,7 -1774,7 +1769,7 @@@
       {
           fprintf(fp, "\nUsing %s %dx%d non-bonded kernels\n\n",
                   lookup_nbnxn_kernel_name(*kernel_type),
- -                nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
+ +                nbnxn_kernel_to_ci_size(*kernel_type),
                   nbnxn_kernel_to_cj_size(*kernel_type));
   
           if (nbnxnk4x4_PlainC == *kernel_type ||
@@@ -1783,8 -1788,7 +1783,8 @@@
       }
   }
   
- -static void pick_nbnxn_resources(const t_commrec     *cr,
+ +static void pick_nbnxn_resources(FILE                *fp,
+ +                                 const t_commrec     *cr,
                                    const gmx_hw_info_t *hwinfo,
                                    gmx_bool             bDoNonbonded,
                                    gmx_bool            *bUseGPU,
@@@ -1810,22 -1814,20 +1810,22 @@@
        * Note that you should freezing the system as otherwise it will explode.
        */
       *bEmulateGPU = (bEmulateGPUEnvVarSet ||
- -                    (!bDoNonbonded &&
- -                     gpu_opt->ncuda_dev_use > 0));
+ +                    (!bDoNonbonded && gpu_opt->n_dev_use > 0));
   
       /* Enable GPU mode when GPUs are available or no GPU emulation is requested.
        */
- -    if (gpu_opt->ncuda_dev_use > 0 && !(*bEmulateGPU))
+ +    if (gpu_opt->n_dev_use > 0 && !(*bEmulateGPU))
       {
           /* Each PP node will use the intra-node id-th device from the
            * list of detected/selected GPUs. */
- -        if (!init_gpu(cr->rank_pp_intranode, gpu_err_str,
+ +        if (!init_gpu(fp, cr->rank_pp_intranode, gpu_err_str,
                         &hwinfo->gpu_info, gpu_opt))
           {
               /* At this point the init should never fail as we made sure that
                * we have all the GPUs we need. If it still does, we'll bail. */
+ +            /* TODO the decorating of gpu_err_str is nicer if it
+ +               happens inside init_gpu. Out here, the decorating with
+ +               the MPI rank makes sense. */
               gmx_fatal(FARGS, "On rank %d failed to initialize GPU #%d: %s",
                         cr->nodeid,
                         get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
@@@ -1862,24 -1864,27 +1862,24 @@@ gmx_bool uses_simple_tables(in
   }
   
   static void init_ewald_f_table(interaction_const_t *ic,
- -                               gmx_bool             bUsesSimpleTables,
                                  real                 rtab)
   {
       real maxr;
   
- -    if (bUsesSimpleTables)
- -    {
- -        /* Get the Ewald table spacing based on Coulomb and/or LJ
- -         * Ewald coefficients and rtol.
- -         */
- -        ic->tabq_scale = ewald_spline3_table_scale(ic);
+ +    /* Get the Ewald table spacing based on Coulomb and/or LJ
+ +     * Ewald coefficients and rtol.
+ +     */
+ +    ic->tabq_scale = ewald_spline3_table_scale(ic);
   
- -        maxr           = (rtab > ic->rcoulomb) ? rtab : ic->rcoulomb;
- -        ic->tabq_size  = (int)(maxr*ic->tabq_scale) + 2;
+ +    if (ic->cutoff_scheme == ecutsVERLET)
+ +    {
+ +        maxr = ic->rcoulomb;
       }
       else
       {
- -        ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
- -        /* Subtract 2 iso 1 to avoid access out of range due to rounding */
- -        ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
+ +        maxr = std::max(ic->rcoulomb, rtab);
       }
+ +    ic->tabq_size  = static_cast<int>(maxr*ic->tabq_scale) + 2;
   
       sfree_aligned(ic->tabq_coul_FDV0);
       sfree_aligned(ic->tabq_coul_F);
@@@ -1911,11 -1916,14 +1911,11 @@@
   
   void init_interaction_const_tables(FILE                *fp,
                                      interaction_const_t *ic,
- -                                   gmx_bool             bUsesSimpleTables,
                                      real                 rtab)
   {
- -    real spacing;
- -
       if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype) || EVDW_PME(ic->vdwtype))
       {
- -        init_ewald_f_table(ic, bUsesSimpleTables, rtab);
+ +        init_ewald_f_table(ic, rtab);
   
           if (fp != NULL)
           {
@@@ -1965,25 -1973,18 +1965,25 @@@ static void potential_switch_constants(
       sc->c5 =  -6*pow(rc - rsw, -5);
   }
   
+ +/*! \brief Construct interaction constants
+ + *
+ + * This data is used (particularly) by search and force code for
+ + * short-range interactions. Many of these are constant for the whole
+ + * simulation; some are constant only after PME tuning completes.
+ + */
   static void
   init_interaction_const(FILE                       *fp,
- -                       const t_commrec gmx_unused *cr,
                          interaction_const_t       **interaction_const,
- -                       const t_forcerec           *fr,
- -                       real                        rtab)
+ +                       const t_forcerec           *fr)
   {
       interaction_const_t *ic;
- -    gmx_bool             bUsesSimpleTables = TRUE;
+ +    const real           minusSix          = -6.0;
+ +    const real           minusTwelve       = -12.0;
   
       snew(ic, 1);
   
+ +    ic->cutoff_scheme   = fr->cutoff_scheme;
+ +
       /* Just allocate something so we can free it */
       snew_aligned(ic->tabq_coul_FDV0, 16, 32);
       snew_aligned(ic->tabq_coul_F, 16, 32);
@@@ -2007,14 -2008,14 +2007,14 @@@
       {
           case eintmodPOTSHIFT:
               /* Only shift the potential, don't touch the force */
- -            ic->dispersion_shift.cpot = -pow(ic->rvdw, -6.0);
- -            ic->repulsion_shift.cpot  = -pow(ic->rvdw, -12.0);
+ +            ic->dispersion_shift.cpot = -pow(ic->rvdw, minusSix);
+ +            ic->repulsion_shift.cpot  = -pow(ic->rvdw, minusTwelve);
               if (EVDW_PME(ic->vdwtype))
               {
                   real crc2;
   
                   crc2            = sqr(ic->ewaldcoeff_lj*ic->rvdw);
- -                ic->sh_lj_ewald = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0);
+ +                ic->sh_lj_ewald = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, minusSix);
               }
               break;
           case eintmodFORCESWITCH:
@@@ -2102,6 -2103,33 +2102,6 @@@
       }
   
       *interaction_const = ic;
- -
- -    if (fr->nbv != NULL && fr->nbv->bUseGPU)
- -    {
- -        nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
- -
- -        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
- -         * also sharing texture references. To keep the code simple, we don't
- -         * treat texture references as shared resources, but this means that
- -         * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
- -         * Hence, to ensure that the non-bonded kernels don't start before all
- -         * texture binding operations are finished, we need to wait for all ranks
- -         * to arrive here before continuing.
- -         *
- -         * Note that we could omit this barrier if GPUs are not shared (or
- -         * texture objects are used), but as this is initialization code, there
- -         * is not point in complicating things.
- -         */
- -#ifdef GMX_THREAD_MPI
- -        if (PAR(cr))
- -        {
- -            gmx_barrier(cr);
- -        }
- -#endif  /* GMX_THREAD_MPI */
- -    }
- -
- -    bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
- -    init_interaction_const_tables(fp, ic, bUsesSimpleTables, rtab);
   }
   
   static void init_nb_verlet(FILE                *fp,
@@@ -2122,14 -2150,13 +2122,14 @@@
   
       snew(nbv, 1);
   
- -    pick_nbnxn_resources(cr, fr->hwinfo,
+ +    pick_nbnxn_resources(fp, cr, fr->hwinfo,
                            fr->bNonbonded,
                            &nbv->bUseGPU,
                            &bEmulateGPU,
                            fr->gpu_opt);
   
- -    nbv->nbs = NULL;
+ +    nbv->nbs             = NULL;
+ +    nbv->min_ci_balanced = 0;
   
       nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
       for (i = 0; i < nbv->ngrp; i++)
@@@ -2168,6 -2195,48 +2168,6 @@@
           }
       }
   
- -    if (nbv->bUseGPU)
- -    {
- -        /* init the NxN GPU data; the last argument tells whether we'll have
- -         * both local and non-local NB calculation on GPU */
- -        nbnxn_cuda_init(fp, &nbv->cu_nbv,
- -                        &fr->hwinfo->gpu_info, fr->gpu_opt,
- -                        cr->rank_pp_intranode,
- -                        (nbv->ngrp > 1) && !bHybridGPURun);
- -
- -        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
- -        {
- -            char *end;
- -
- -            nbv->min_ci_balanced = strtol(env, &end, 10);
- -            if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
- -            {
- -                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
- -            }
- -
- -            if (debug)
- -            {
- -                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
- -                        nbv->min_ci_balanced);
- -            }
- -        }
- -        else
- -        {
- -            nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
- -            if (debug)
- -            {
- -                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
- -                        nbv->min_ci_balanced);
- -            }
- -        }
- -    }
- -    else
- -    {
- -        nbv->min_ci_balanced = 0;
- -    }
- -
- -    *nb_verlet = nbv;
- -
       nbnxn_init_search(&nbv->nbs,
                         DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
                         DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
@@@ -2176,8 -2245,16 +2176,8 @@@
   
       for (i = 0; i < nbv->ngrp; i++)
       {
- -        if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
- -        {
- -            nb_alloc = &pmalloc;
- -            nb_free  = &pfree;
- -        }
- -        else
- -        {
- -            nb_alloc = NULL;
- -            nb_free  = NULL;
- -        }
+ +        gpu_set_host_malloc_and_free(nbv->grp[0].kernel_type == nbnxnk8x8x8_GPU,
+ +                                     &nb_alloc, &nb_free);
   
           nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
                                   nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
@@@ -2232,73 -2309,6 +2232,73 @@@
               nbv->grp[i].nbat = nbv->grp[0].nbat;
           }
       }
+ +
+ +    if (nbv->bUseGPU)
+ +    {
+ +        /* init the NxN GPU data; the last argument tells whether we'll have
+ +         * both local and non-local NB calculation on GPU */
+ +        nbnxn_gpu_init(fp, &nbv->gpu_nbv,
+ +                       &fr->hwinfo->gpu_info,
+ +                       fr->gpu_opt,
+ +                       fr->ic,
+ +                       nbv->grp,
+ +                       cr->rank_pp_intranode,
+ +                       cr->nodeid,
+ +                       (nbv->ngrp > 1) && !bHybridGPURun);
+ +
+ +        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+ +         * also sharing texture references. To keep the code simple, we don't
+ +         * treat texture references as shared resources, but this means that
+ +         * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
+ +         * Hence, to ensure that the non-bonded kernels don't start before all
+ +         * texture binding operations are finished, we need to wait for all ranks
+ +         * to arrive here before continuing.
+ +         *
+ +         * Note that we could omit this barrier if GPUs are not shared (or
+ +         * texture objects are used), but as this is initialization code, there
+ +         * is no point in complicating things.
+ +         */
+ +#ifdef GMX_THREAD_MPI
+ +        if (PAR(cr))
+ +        {
+ +            gmx_barrier(cr);
+ +        }
+ +#endif  /* GMX_THREAD_MPI */
+ +
+ +        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
+ +        {
+ +            char *end;
+ +
+ +            nbv->min_ci_balanced = strtol(env, &end, 10);
+ +            if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
+ +            {
+ +                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
+ +            }
+ +
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
+ +                        nbv->min_ci_balanced);
+ +            }
+ +        }
+ +        else
+ +        {
+ +            nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv);
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
+ +                        nbv->min_ci_balanced);
+ +            }
+ +        }
+ +
+ +    }
+ +
+ +    *nb_verlet = nbv;
+ +}
+ +
+ +gmx_bool usingGpu(nonbonded_verlet_t *nbv)
+ +{
+ +    return nbv != NULL && nbv->bUseGPU;
   }
   
   void init_forcerec(FILE              *fp,
@@@ -2317,7 -2327,7 +2317,7 @@@
                      gmx_bool           bNoSolvOpt,
                      real               print_force)
   {
- -    int            i, j, m, natoms, ngrp, negp_pp, negptable, egi, egj;
+ +    int            i, m, negp_pp, negptable, egi, egj;
       real           rtab;
       char          *env;
       double         dbl;
@@@ -2325,6 -2335,7 +2325,6 @@@
       gmx_bool       bGenericKernelOnly;
       gmx_bool       bMakeTables, bMakeSeparate14Table, bSomeNormalNbListsAreInUse;
       gmx_bool       bFEP_NonBonded;
- -    t_nblists     *nbl;
       int           *nm_ind, egp_flags;
   
       if (fr->hwinfo == NULL)
@@@ -2341,6 -2352,8 +2341,6 @@@
   
       fr->bDomDec = DOMAINDECOMP(cr);
   
- -    natoms = mtop->natoms;
- -
       if (check_box(ir->ePBC, box))
       {
           gmx_fatal(FARGS, check_box(ir->ePBC, box));
@@@ -2432,7 -2445,7 +2432,7 @@@
       if (env != NULL)
       {
           dbl = 0;
- -        sscanf(env, "%lf", &dbl);
+ +        sscanf(env, "%20lf", &dbl);
           fr->sc_sigma6_min = pow(dbl, 6);
           if (fp)
           {
@@@ -2480,8 -2493,7 +2480,8 @@@
           {
               fprintf(fp,
                       "\nFound environment variable GMX_DISABLE_SIMD_KERNELS.\n"
- -                    "Disabling the usage of any SIMD-specific kernel routines (e.g. SSE2/SSE4.1/AVX).\n\n");
+ +                    "Disabling the usage of any SIMD-specific non-bonded & bonded kernel routines\n"
+ +                    "(e.g. SSE2/SSE4.1/AVX).\n\n");
           }
       }
   
@@@ -2492,18 -2504,23 +2492,18 @@@
       fr->AllvsAll_work   = NULL;
       fr->AllvsAll_workgb = NULL;
   
- -    /* All-vs-all kernels have not been implemented in 4.6, and
- -     * the SIMD group kernels are also buggy in this case. Non-SIMD
- -     * group kernels are OK. See Redmine #1249. */
+ +    /* All-vs-all kernels have not been implemented in 4.6 and later.
+ +     * See Redmine #1249. */
       if (fr->bAllvsAll)
       {
           fr->bAllvsAll            = FALSE;
- -        fr->use_simd_kernels     = FALSE;
           if (fp != NULL)
           {
               fprintf(fp,
                       "\nYour simulation settings would have triggered the efficient all-vs-all\n"
                       "kernels in GROMACS 4.5, but these have not been implemented in GROMACS\n"
- -                    "4.6. Also, we can't use the accelerated SIMD kernels here because\n"
- -                    "of an unfixed bug. The reference C kernels are correct, though, so\n"
- -                    "we are proceeding by disabling all CPU architecture-specific\n"
- -                    "(e.g. SSE2/SSE4/AVX) routines. If performance is important, please\n"
- -                    "use GROMACS 4.5.7 or try cutoff-scheme = Verlet.\n\n");
+ +                    "4.6 and 5.x. If performance is important, please use GROMACS 4.5.7\n"
+ +                    "or try cutoff-scheme = Verlet.\n\n");
           }
       }
   
@@@ -2536,49 -2553,27 +2536,49 @@@
       {
           if (!DOMAINDECOMP(cr))
           {
+ +            gmx_bool bSHAKE;
+ +
+ +            bSHAKE = (ir->eConstrAlg == econtSHAKE &&
+ +                      (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0 ||
+ +                       gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0));
+ +
               /* The group cut-off scheme and SHAKE assume charge groups
                * are whole, but not using molpbc is faster in most cases.
+ +             * With intermolecular interactions we need PBC for calculating
+ +             * distances between atoms in different molecules.
                */
- -            if (fr->cutoff_scheme == ecutsGROUP ||
- -                (ir->eConstrAlg == econtSHAKE &&
- -                 (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0 ||
- -                  gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0)))
+ +            if ((fr->cutoff_scheme == ecutsGROUP || bSHAKE) &&
+ +                !mtop->bIntermolecularInteractions)
               {
                   fr->bMolPBC = ir->bPeriodicMols;
+ +
+ +                if (bSHAKE && fr->bMolPBC)
+ +                {
+ +                    gmx_fatal(FARGS, "SHAKE is not supported with periodic molecules");
+ +                }
               }
               else
               {
                   fr->bMolPBC = TRUE;
+ +
                   if (getenv("GMX_USE_GRAPH") != NULL)
                   {
                       fr->bMolPBC = FALSE;
                       if (fp)
                       {
- -                        fprintf(fp, "\nGMX_MOLPBC is set, using the graph for bonded interactions\n\n");
+ +                        md_print_warn(cr, fp, "GMX_USE_GRAPH is set, using the graph for bonded interactions\n");
+ +                    }
+ +
+ +                    if (mtop->bIntermolecularInteractions)
+ +                    {
+ +                        md_print_warn(cr, fp, "WARNING: Molecules linked by intermolecular interactions have to reside in the same periodic image, otherwise artifacts will occur!\n");
                       }
                   }
+ +
+ +                if (bSHAKE && fr->bMolPBC)
+ +                {
+ +                    gmx_fatal(FARGS, "SHAKE is not properly supported with intermolecular interactions. For short simulations where linked molecules remain in the same periodic image, the environment variable GMX_USE_GRAPH can be used to override this check.\n");
+ +                }
               }
           }
           else
@@@ -2629,6 -2624,7 +2629,7 @@@
               break;
   
           case eelPME:
+         case eelP3M_AD:
           case eelEWALD:
               fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_EWALD;
               break;
@@@ -3098,6 -3094,7 +3099,6 @@@
                       egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
                       if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL))
                       {
- -                        nbl = &(fr->nblists[m]);
                           if (fr->nnblists > 1)
                           {
                               fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = m;
@@@ -3239,14 -3236,10 +3240,14 @@@
       }
   
       /* Initialize the thread working data for bonded interactions */
- -    init_forcerec_f_threads(fr, mtop->groups.grps[egcENER].nr);
+ +    init_bonded_threading(fp, fr, mtop->groups.grps[egcENER].nr);
   
       snew(fr->excl_load, fr->nthreads+1);
   
+ +    /* fr->ic is used both by verlet and group kernels (to some extent) now */
+ +    init_interaction_const(fp, &fr->ic, fr);
+ +    init_interaction_const_tables(fp, fr->ic, rtab);
+ +
       if (fr->cutoff_scheme == ecutsVERLET)
       {
           if (ir->rcoulomb != ir->rvdw)
@@@ -3257,6 -3250,9 +3258,6 @@@
           init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
       }
   
- -    /* fr->ic is used both by verlet and group kernels (to some extent) now */
- -    init_interaction_const(fp, cr, &fr->ic, fr, rtab);
- -
       if (ir->eDispCorr != edispcNO)
       {
           calc_enervirdiff(fp, ir->eDispCorr, fr);
@@@ -3329,46 -3325,3 +3330,46 @@@ void forcerec_set_excl_load(t_forcere
           fr->excl_load[t] = i;
       }
   }
+ +
+ +/* Frees GPU memory and destroys the GPU context.
+ + *
+ + * Note that this function needs to be called even if GPUs are not used
+ + * in this run because the PME ranks have no knowledge of whether GPUs
+ + * are used or not, but all ranks need to enter the barrier below.
+ + */
+ +void free_gpu_resources(const t_forcerec     *fr,
+ +                        const t_commrec      *cr,
+ +                        const gmx_gpu_info_t *gpu_info,
+ +                        const gmx_gpu_opt_t  *gpu_opt)
+ +{
+ +    gmx_bool bIsPPrankUsingGPU;
+ +    char     gpu_err_str[STRLEN];
+ +
+ +    bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr && fr->nbv && fr->nbv->bUseGPU;
+ +
+ +    if (bIsPPrankUsingGPU)
+ +    {
+ +        /* free nbnxn data in GPU memory */
+ +        nbnxn_gpu_free(fr->nbv->gpu_nbv);
+ +
+ +        /* With tMPI we need to wait for all ranks to finish deallocation before
+ +         * destroying the context in free_gpu() as some ranks may be sharing
+ +         * GPU and context.
+ +         * Note: as only PP ranks need to free GPU resources, so it is safe to
+ +         * not call the barrier on PME ranks.
+ +         */
+ +#ifdef GMX_THREAD_MPI
+ +        if (PAR(cr))
+ +        {
+ +            gmx_barrier(cr);
+ +        }
+ +#endif  /* GMX_THREAD_MPI */
+ +
+ +        /* uninitialize GPU (by destroying the context) */
+ +        if (!free_cuda_gpu(cr->rank_pp_intranode, gpu_err_str, gpu_info, gpu_opt))
+ +        {
+ +            gmx_warning("On rank %d failed to free GPU #%d: %s",
+ +                        cr->nodeid, get_current_cuda_gpu_device_id(), gpu_err_str);
+ +        }
+ +    }
+ +}
author	Mark Abraham <mark.j.abraham@gmail.com>
	Tue, 8 Dec 2015 07:45:22 +0000 (18:45 +1100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Tue, 8 Dec 2015 07:45:22 +0000 (18:45 +1100)
		1	2
src/gromacs/mdlib/coupling.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/forcerec.cpp	patch \|	diff1 \|	diff2 \|	blob \| history