* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
+
#include <assert.h>
-#include "typedefs.h"
-#include "types/commrec.h"
-#include "gromacs/utility/smalloc.h"
-#include "update.h"
-#include "vec.h"
-#include "macros.h"
-#include "physics.h"
-#include "names.h"
-#include "gmx_fatal.h"
-#include "txtdump.h"
-#include "nrnb.h"
+#include <algorithm>
+
+#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/mdrun.h"
+#include "gromacs/legacyheaders/names.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/update.h"
+#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
#include "gromacs/random/random.h"
-#include "update.h"
-#include "mdrun.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
#define NTROTTERPARTS 3
{
/* general routine for both barostat and thermostat nose hoover chains */
- int i, j, mi, mj, jmax;
+ int i, j, mi, mj;
double Ekin, Efac, reft, kT, nd;
double dt;
t_grp_tcstat *tcstat;
{
iQinv = &(MassQ->QPinv[i*nh]);
nd = 1.0; /* THIS WILL CHANGE IF NOT ISOTROPIC */
- reft = max(0.0, opts->ref_t[0]);
+ reft = std::max<real>(0, opts->ref_t[0]);
Ekin = sqr(*veta)/MassQ->Winv;
}
else
iQinv = &(MassQ->Qinv[i*nh]);
tcstat = &ekind->tcstat[i];
nd = opts->nrdf[i];
- reft = max(0.0, opts->ref_t[i]);
+ reft = std::max<real>(0, opts->ref_t[i]);
if (bEkinAveVel)
{
Ekin = 2*trace(tcstat->ekinf)*tcstat->ekinscalef_nhc;
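
A minimal sketch (hypothetical helper, assuming the conventions above: Ekin2
is twice the group kinetic energy, Qinv the inverse chain mass) of the update
each Nose-Hoover link receives:

    /* explicit-Euler step of one thermostat link */
    static void nhc_link_step(double dt, double Qinv, double Ekin2,
                              double nd, double kT,
                              double *xi, double *vxi)
    {
        double G = Qinv*(Ekin2 - nd*kT); /* driving "force" on the link */
        *vxi    += dt*G;                 /* integrate link velocity     */
        *xi     += dt*(*vxi);            /* integrate link position     */
    }
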
real pscal;
double alpha;
- int i, j, d, n, nwall;
- real T, GW, vol;
- tensor Winvm, ekinmod, localpres;
+ int nwall;
+ real GW, vol;
+ tensor ekinmod, localpres;
/* The heat bath is coupled to a separate barostat, the last temperature group. In the
2006 Tuckerman et al. paper, the order is iL_{T_baro} iL_{T_part}
* The pressure and compressibility always occur as a product,
* therefore the pressure unit drops out.
*/
- maxl = max(box[XX][XX], box[YY][YY]);
- maxl = max(maxl, box[ZZ][ZZ]);
+ maxl = std::max(box[XX][XX], box[YY][YY]);
+ maxl = std::max(maxl, box[ZZ][ZZ]);
for (d = 0; d < DIM; d++)
{
for (n = 0; n < DIM; n++)
{
int d, n;
real scalar_pressure, xy_pressure, p_corr_z;
- char *ptr, buf[STRLEN];
+ char buf[STRLEN];
/*
* Calculate the scaling matrix mu
t_nrnb *nrnb)
{
ivec *nFreeze = ir->opts.nFreeze;
- int n, d, g = 0;
+ int n, d;
+ int nthreads gmx_unused;
+
+#ifndef __clang_analyzer__
+ // cppcheck-suppress unreadVariable
+ nthreads = gmx_omp_nthreads_get(emntUpdate);
+#endif
/* Scale the positions */
+#pragma omp parallel for num_threads(nthreads) schedule(static)
for (n = start; n < start+nr_atoms; n++)
{
- if (cFREEZE)
+ int g;
+
+ if (cFREEZE == NULL)
+ {
+ g = 0;
+ }
+ else
{
g = cFREEZE[n];
}
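
For reference, a self-contained sketch of how the parallel loop above consults
the freeze groups (hypothetical names, diagonal scaling only; the real routine
applies the full mu matrix to each position):

    #pragma omp parallel for num_threads(nthreads) schedule(static)
    for (int n = start; n < start + nr_atoms; n++)
    {
        int g = (cFREEZE == NULL) ? 0 : cFREEZE[n];

        for (int d = 0; d < DIM; d++)
        {
            if (!nFreeze[g][d])
            {
                x[n][d] *= mu[d][d]; /* frozen dimensions stay untouched */
            }
        }
    }
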
if ((opts->tau_t[i] > 0) && (T > 0.0))
{
- reft = max(0.0, opts->ref_t[i]);
+ reft = std::max<real>(0, opts->ref_t[i]);
lll = sqrt(1.0 + (dt/opts->tau_t[i])*(reft/T-1.0));
- ekind->tcstat[i].lambda = max(min(lll, 1.25), 0.8);
+ ekind->tcstat[i].lambda = std::max<real>(std::min<real>(lll, 1.25), 0.8);
}
else
{
for (i = 0; (i < opts->ngtc); i++)
{
- reft = max(0.0, opts->ref_t[i]);
+ reft = std::max<real>(0, opts->ref_t[i]);
oldvxi = vxi[i];
vxi[i] += dt*MassQ->Qinv[i]*(ekind->tcstat[i].Th - reft);
xi[i] += dt*(oldvxi + vxi[i])*0.5;
t_extmass *MassQ, int **trotter_seqlist, int trotter_seqno)
{
- int n, i, j, d, ntgrp, ngtc, gc = 0;
+ int n, i, d, ngtc, gc = 0, t;
t_grp_tcstat *tcstat;
t_grpopts *opts;
gmx_int64_t step_eff;
- real ecorr, pcorr, dvdlcorr;
- real bmass, qmass, reft, kT, dt, nd;
- tensor dumpres, dumvir;
+ real dt;
double *scalefac, dtc;
int *trotter_seq;
- rvec sumv = {0, 0, 0}, consk;
+ rvec sumv = {0, 0, 0};
gmx_bool bCouple;
if (trotter_seqno <= ettTSEQ2)
scale the velocities later, but we need them scaled in order to
produce the correct outputs, so we'll scale them here. */
- for (i = 0; i < ngtc; i++)
+ for (t = 0; t < ngtc; t++)
{
- tcstat = &ekind->tcstat[i];
- tcstat->vscale_nhc = scalefac[i];
- tcstat->ekinscaleh_nhc *= (scalefac[i]*scalefac[i]);
- tcstat->ekinscalef_nhc *= (scalefac[i]*scalefac[i]);
+ tcstat = &ekind->tcstat[t];
+ tcstat->vscale_nhc = scalefac[t];
+ tcstat->ekinscaleh_nhc *= (scalefac[t]*scalefac[t]);
+ tcstat->ekinscalef_nhc *= (scalefac[t]*scalefac[t]);
}
/* now that we've scaled the groupwise velocities, we can add them up to get the total */
/* but do we actually need the total? */
extern void init_npt_masses(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bInit)
{
- int n, i, j, d, ntgrp, ngtc, nnhpres, nh, gc = 0;
- t_grp_tcstat *tcstat;
+ int n, i, j, d, ngtc, nh;
t_grpopts *opts;
- real ecorr, pcorr, dvdlcorr;
- real bmass, qmass, reft, kT, dt, ndj, nd;
- tensor dumpres, dumvir;
+ real reft, kT, ndj, nd;
opts = &(ir->opts); /* just for ease of referencing */
ngtc = ir->opts.ngtc;
- nnhpres = state->nnhpres;
nh = state->nhchainlength;
if (ir->eI == eiMD)
{
if ((opts->tau_t[i] > 0) && (opts->ref_t[i] > 0))
{
- reft = max(0.0, opts->ref_t[i]);
+ reft = std::max<real>(0, opts->ref_t[i]);
nd = opts->nrdf[i];
kT = BOLTZ*reft;
for (j = 0; j < nh; j++)
}
else
{
- reft = 0.0;
for (j = 0; j < nh; j++)
{
MassQ->Qinv[i*nh+j] = 0.0;
int **init_npt_vars(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bTrotter)
{
- int n, i, j, d, ntgrp, ngtc, nnhpres, nh, gc = 0;
- t_grp_tcstat *tcstat;
+ int i, j, nnhpres, nh;
t_grpopts *opts;
- real ecorr, pcorr, dvdlcorr;
- real bmass, qmass, reft, kT, dt, ndj, nd;
- tensor dumpres, dumvir;
+ real bmass, qmass, reft, kT;
int **trotter_seq;
opts = &(ir->opts); /* just for ease of referencing */
- ngtc = state->ngtc;
nnhpres = state->nnhpres;
nh = state->nhchainlength;
/* barostat temperature */
if ((ir->tau_p > 0) && (opts->ref_t[0] > 0))
{
- reft = max(0.0, opts->ref_t[0]);
+ reft = std::max<real>(0, opts->ref_t[0]);
kT = BOLTZ*reft;
for (i = 0; i < nnhpres; i++)
{
real NPT_energy(t_inputrec *ir, t_state *state, t_extmass *MassQ)
{
- int i, j, nd, ndj, bmass, qmass, ngtcall;
- real ener_npt, reft, eta, kT, tau;
+ int i, j;
+ real nd, ndj;
+ real ener_npt, reft, kT;
double *ivxi, *ixi;
double *iQinv;
- real vol, dbaro, W, Q;
+ real vol;
int nh = state->nhchainlength;
ener_npt = 0;
ivxi = &state->nhpres_vxi[i*nh];
ixi = &state->nhpres_xi[i*nh];
iQinv = &(MassQ->QPinv[i*nh]);
- reft = max(ir->opts.ref_t[0], 0); /* using 'System' temperature */
+ reft = std::max<real>(ir->opts.ref_t[0], 0.0); /* using 'System' temperature */
kT = BOLTZ * reft;
for (j = 0; j < nh; j++)
iQinv = &(MassQ->Qinv[i*nh]);
nd = ir->opts.nrdf[i];
- reft = max(ir->opts.ref_t[i], 0);
+ reft = std::max<real>(ir->opts.ref_t[i], 0);
kT = BOLTZ * reft;
- if (nd > 0)
+ if (nd > 0.0)
{
if (IR_NVT_TROTTER(ir))
{
}
else
{
- ndj = 1;
+ ndj = 1.0;
}
ener_npt += ndj*ixi[j]*kT;
}
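
A compact sketch (hypothetical helper) of the per-chain term the loop above
accumulates; under Trotter schemes the first link carries the group's nd
degrees of freedom, every later link carries one:

    static real nh_chain_energy(int nh, const double *ivxi,
                                const double *ixi, const double *iQinv,
                                real nd, real kT)
    {
        real ener = 0;

        for (int j = 0; j < nh; j++)
        {
            if (iQinv[j] > 0)
            {
                ener += 0.5*sqr(ivxi[j])/iQinv[j];       /* kinetic   */
                ener += ((j == 0) ? nd : 1.0)*ixi[j]*kT; /* potential */
            }
        }
        return ener;
    }
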
case eannPERIODIC:
/* calculate time modulo the period */
pert = opts->anneal_time[i][npoints-1];
- n = t / pert;
+ n = static_cast<int>(t / pert);
thist = t - n*pert; /* modulo time */
/* Make sure rounding didn't get us outside the interval */
if (fabs(thist-pert) < GMX_REAL_EPS*100)
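
The same fold-into-one-period logic as a standalone sketch (hypothetical
helper; assumes pert > 0 and t >= 0):

    static real periodic_anneal_time(real t, real pert)
    {
        int  n     = static_cast<int>(t / pert);
        real thist = t - n*pert;               /* t modulo the period    */

        if (fabs(thist - pert) < GMX_REAL_EPS*100)
        {
            thist = 0;                         /* guard against rounding */
        }
        return thist;
    }
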
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
+
+#include "config.h"
+#include <assert.h>
#include <math.h>
+#include <stdlib.h>
#include <string.h>
-#include <assert.h>
-#include "sysstuff.h"
-#include "typedefs.h"
-#include "types/commrec.h"
-#include "vec.h"
+
+#include <algorithm>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/ewald/ewald.h"
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
+#include "gromacs/legacyheaders/copyrite.h"
+#include "gromacs/legacyheaders/force.h"
+#include "gromacs/legacyheaders/gmx_detect_hardware.h"
+#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+#include "gromacs/legacyheaders/inputrec.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/md_logging.h"
+#include "gromacs/legacyheaders/md_support.h"
+#include "gromacs/legacyheaders/names.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/nonbonded.h"
+#include "gromacs/legacyheaders/ns.h"
+#include "gromacs/legacyheaders/qmmm.h"
+#include "gromacs/legacyheaders/tables.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/listed-forces/manage-threading.h"
+#include "gromacs/math/calculate-ewald-splitting-coefficient.h"
+#include "gromacs/math/units.h"
#include "gromacs/math/utilities.h"
-#include "macros.h"
-#include "gromacs/utility/smalloc.h"
-#include "macros.h"
-#include "gmx_fatal.h"
-#include "physics.h"
-#include "force.h"
-#include "tables.h"
-#include "nonbonded.h"
-#include "invblock.h"
-#include "names.h"
-#include "network.h"
-#include "pbc.h"
-#include "ns.h"
-#include "mshift.h"
-#include "txtdump.h"
-#include "coulomb.h"
-#include "md_support.h"
-#include "md_logging.h"
-#include "domdec.h"
-#include "qmmm.h"
-#include "copyrite.h"
-#include "mtop_util.h"
-#include "nbnxn_simd.h"
-#include "nbnxn_search.h"
-#include "nbnxn_atomdata.h"
-#include "nbnxn_consts.h"
-#include "gmx_omp_nthreads.h"
-#include "gmx_detect_hardware.h"
-#include "inputrec.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/forcerec-threading.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_atomdata.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdlib/nbnxn_simd.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/pbcutil/pbc.h"
#include "gromacs/simd/simd.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
-#include "types/nbnxn_cuda_types_ext.h"
-#include "gpu_utils.h"
-#include "nbnxn_cuda_data_mgmt.h"
-#include "pmalloc_cuda.h"
+#include "nbnxn_gpu_jit_support.h"
t_forcerec *mk_forcerec(void)
{
static real *make_ljpme_c6grid(const gmx_ffparams_t *idef, t_forcerec *fr)
{
- int i, j, k, atnr;
- real c6, c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
- real *grid;
+ int i, j, k, atnr;
+ real c6, c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
+ real *grid;
+ const real oneOverSix = 1.0 / 6.0;
/* For LJ-PME simulations, we correct the energies with the reciprocal space
* inside of the cut-off. To do this the non-bonded kernels needs to have
if (fr->ljpme_combination_rule == eljpmeLB
&& !gmx_numzero(c6) && !gmx_numzero(c12i) && !gmx_numzero(c12j))
{
- sigmai = pow(c12i / c6i, 1.0/6.0);
- sigmaj = pow(c12j / c6j, 1.0/6.0);
+ sigmai = pow(c12i / c6i, oneOverSix);
+ sigmaj = pow(c12j / c6j, oneOverSix);
epsi = c6i * c6i / c12i;
epsj = c6j * c6j / c12j;
c6 = sqrt(epsi * epsj) * pow(0.5*(sigmai+sigmaj), 6);
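
What the branch above computes, as a standalone sketch (hypothetical helper):
sigma and a 4*epsilon-scaled well depth are recovered from each C6/C12 pair,
combined Lorentz-Berthelot style (arithmetic sigma, geometric epsilon), and
folded back into a combined C6; the factor of 4 cancels between the two
representations:

    static real combined_c6_lb(real c6i, real c12i, real c6j, real c12j)
    {
        const real oneOverSix = 1.0/6.0;
        real       sigmai     = pow(c12i/c6i, oneOverSix); /* (C12/C6)^(1/6) */
        real       sigmaj     = pow(c12j/c6j, oneOverSix);
        real       epsi       = c6i*c6i/c12i;              /* 4*eps          */
        real       epsj       = c6j*c6j/c12j;

        return sqrt(epsi*epsj)*pow(0.5*(sigmai + sigmaj), 6);
    }
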
static real *mk_nbfp_combination_rule(const gmx_ffparams_t *idef, int comb_rule)
{
- real *nbfp;
- int i, j, k, atnr;
- real c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
- real c6, c12;
+ real *nbfp;
+ int i, j, atnr;
+ real c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
+ real c6, c12;
+ const real oneOverSix = 1.0 / 6.0;
atnr = idef->atnr;
snew(nbfp, 2*atnr*atnr);
if (comb_rule == eCOMB_ARITHMETIC
&& !gmx_numzero(c6) && !gmx_numzero(c12))
{
- sigmai = pow(c12i / c6i, 1.0/6.0);
- sigmaj = pow(c12j / c6j, 1.0/6.0);
+ sigmai = pow(c12i / c6i, oneOverSix);
+ sigmaj = pow(c12j / c6j, oneOverSix);
epsi = c6i * c6i / c12i;
epsj = c6j * c6j / c12j;
c6 = sqrt(epsi * epsj) * pow(0.5*(sigmai+sigmaj), 6);
int cginfo,
int *cg_sp)
{
- const t_blocka *excl;
t_atom *atom;
int j, k;
int j0, j1, nj;
cginfo_mb_t *cginfo_mb)
{
const t_block * cgs;
- const t_block * mols;
const gmx_moltype_t *molt;
- int mb, mol, cg_mol, at_offset, cg_offset, am, cgm, i, nmol_ch, nmol;
+ int mb, mol, cg_mol, at_offset, am, cgm, i, nmol_ch, nmol;
int n_solvent_parameters;
solvent_parameters_t *solvent_parameters;
int **cg_sp;
fprintf(debug, "Going to determine what solvent types we have.\n");
}
- mols = &mtop->mols;
-
n_solvent_parameters = 0;
solvent_parameters = NULL;
/* Allocate temporary array for solvent type */
snew(cg_sp, mtop->nmolblock);
- cg_offset = 0;
at_offset = 0;
for (mb = 0; mb < mtop->nmolblock; mb++)
{
&cg_sp[mb][cgm+cg_mol]);
}
}
- cg_offset += cgs->nr;
at_offset += cgs->index[cgs->nr];
}
bestsol = esolNO;
}
-#ifdef DISABLE_WATER_NLIST
- bestsol = esolNO;
-#endif
-
fr->nWatMol = 0;
for (mb = 0; mb < mtop->nmolblock; mb++)
{
cginfo_mb_t *cginfo_mb;
gmx_bool *type_VDW;
int *cginfo;
- int cg_offset, a_offset, cgm, am;
- int mb, m, ncg_tot, cg, a0, a1, gid, ai, j, aj, excl_nalloc;
+ int cg_offset, a_offset;
+ int mb, m, cg, a0, a1, gid, ai, j, aj, excl_nalloc;
int *a_con;
int ftype;
int ia;
gmx_bool bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ, bHavePerturbedAtoms;
- ncg_tot = ncg_mtop(mtop);
snew(cginfo_mb, mtop->nmolblock);
snew(type_VDW, fr->ntype);
* Otherwise we make an array of #mol times #cgs per molecule.
*/
bId = TRUE;
- am = 0;
for (m = 0; m < molb->nmol; m++)
{
- am = m*cgs->index[cgs->nr];
+ int am = m*cgs->index[cgs->nr];
for (cg = 0; cg < cgs->nr; cg++)
{
a0 = cgs->index[cg];
for (m = 0; m < (bId ? 1 : molb->nmol); m++)
{
- cgm = m*cgs->nr;
- am = m*cgs->index[cgs->nr];
+ int cgm = m*cgs->nr;
+ int am = m*cgs->index[cgs->nr];
for (cg = 0; cg < cgs->nr; cg++)
{
a0 = cgs->index[cg];
{
const t_atoms *atoms, *atoms_tpi;
const t_blocka *excl;
- int mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, n, nexcl, q;
+ int mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, nexcl, q;
gmx_int64_t npair, npair_ij, tmpi, tmpj;
double csix, ctwelve;
int ntp, *typecount;
}
-static void init_forcerec_f_threads(t_forcerec *fr, int nenergrp)
+gmx_bool nbnxn_gpu_acceleration_supported(FILE *fplog,
+ const t_commrec *cr,
+ const t_inputrec *ir,
+ gmx_bool bRerunMD)
{
- int t, i;
-
- /* These thread local data structures are used for bondeds only */
- fr->nthreads = gmx_omp_nthreads_get(emntBonded);
+ if (bRerunMD && ir->opts.ngener > 1)
+ {
+ /* Rerun execution time is dominated by I/O and pair search,
+ * so GPUs are not very useful, plus they do not support more
+ * than one energy group. If the user requested GPUs
+ * explicitly, a fatal error is given later. With non-reruns,
+ * we fall back to a single whole-of-system energy group
+ * (which runs much faster than a multiple-energy-groups
+ * implementation would), and issue a note in the .log
+ * file. Users can re-run if they want the information. */
+ md_print_warn(cr, fplog, "Rerun with energy groups is not implemented for GPUs, falling back to the CPU\n");
+ return FALSE;
+ }
- if (fr->nthreads > 1)
+ if (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB)
{
- snew(fr->f_t, fr->nthreads);
- /* Thread 0 uses the global force and energy arrays */
- for (t = 1; t < fr->nthreads; t++)
- {
- fr->f_t[t].f = NULL;
- fr->f_t[t].f_nalloc = 0;
- snew(fr->f_t[t].fshift, SHIFTS);
- fr->f_t[t].grpp.nener = nenergrp*nenergrp;
- for (i = 0; i < egNR; i++)
- {
- snew(fr->f_t[t].grpp.ener[i], fr->f_t[t].grpp.nener);
- }
- }
+ /* LJ PME with LB combination rule does 7 mesh operations.
+ * This is so slow that we don't compile GPU non-bonded kernels for that.
+ */
+ md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with GPUs, falling back to CPU only\n");
+ return FALSE;
}
-}
+ return TRUE;
+}
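
A hypothetical call site for the check above, gating the GPU path before any
GPU resources are claimed (names assumed, not part of this patch):

    bUseGPU = bUseGPU &&
              nbnxn_gpu_acceleration_supported(fplog, cr, ir, bRerunMD);
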
-gmx_bool nbnxn_acceleration_supported(FILE *fplog,
- const t_commrec *cr,
- const t_inputrec *ir,
- gmx_bool bGPU)
+gmx_bool nbnxn_simd_supported(FILE *fplog,
+ const t_commrec *cr,
+ const t_inputrec *ir)
{
- if (!bGPU && (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB))
+ if (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB)
{
- md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with %s, falling back to %s\n",
- bGPU ? "GPUs" : "SIMD kernels",
- bGPU ? "CPU only" : "plain-C kernels");
+ /* LJ PME with LB combination rule does 7 mesh operations.
+ * This is so slow that we don't compile SIMD non-bonded kernels
+ * for that. */
+ md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with SIMD kernels, falling back to plain C kernels\n");
return FALSE;
}
#ifdef GMX_NBNXN_SIMD_4XN
*kernel_type = nbnxnk4xN_SIMD_4xN;
#else
- gmx_fatal(FARGS, "SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
+ gmx_fatal(FARGS, "SIMD 4xN kernels requested, but GROMACS has been compiled without support for these kernels");
#endif
}
if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
#ifdef GMX_NBNXN_SIMD_2XNN
*kernel_type = nbnxnk4xN_SIMD_2xNN;
#else
- gmx_fatal(FARGS, "SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
+ gmx_fatal(FARGS, "SIMD 2x(N+N) kernels requested, but GROMACS has been compiled without support for these kernels");
#endif
}
returnvalue = "not available";
#endif /* GMX_NBNXN_SIMD */
break;
- case nbnxnk8x8x8_CUDA: returnvalue = "CUDA"; break;
+ case nbnxnk8x8x8_GPU: returnvalue = "GPU"; break;
case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
case nbnxnkNR:
}
else if (bUseGPU)
{
- *kernel_type = nbnxnk8x8x8_CUDA;
+ *kernel_type = nbnxnk8x8x8_GPU;
}
if (*kernel_type == nbnxnkNotSet)
{
- /* LJ PME with LB combination rule does 7 mesh operations.
- * This so slow that we don't compile SIMD non-bonded kernels for that.
- */
if (use_simd_kernels &&
- nbnxn_acceleration_supported(fp, cr, ir, FALSE))
+ nbnxn_simd_supported(fp, cr, ir))
{
pick_nbnxn_kernel_cpu(ir, kernel_type, ewald_excl);
}
{
fprintf(fp, "\nUsing %s %dx%d non-bonded kernels\n\n",
lookup_nbnxn_kernel_name(*kernel_type),
- nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
+ nbnxn_kernel_to_ci_size(*kernel_type),
nbnxn_kernel_to_cj_size(*kernel_type));
if (nbnxnk4x4_PlainC == *kernel_type ||
}
}
-static void pick_nbnxn_resources(const t_commrec *cr,
+static void pick_nbnxn_resources(FILE *fp,
+ const t_commrec *cr,
const gmx_hw_info_t *hwinfo,
gmx_bool bDoNonbonded,
gmx_bool *bUseGPU,
* Note that you should freeze the system as otherwise it will explode.
*/
*bEmulateGPU = (bEmulateGPUEnvVarSet ||
- (!bDoNonbonded &&
- gpu_opt->ncuda_dev_use > 0));
+ (!bDoNonbonded && gpu_opt->n_dev_use > 0));
/* Enable GPU mode when GPUs are available or no GPU emulation is requested.
*/
- if (gpu_opt->ncuda_dev_use > 0 && !(*bEmulateGPU))
+ if (gpu_opt->n_dev_use > 0 && !(*bEmulateGPU))
{
/* Each PP node will use the intra-node id-th device from the
* list of detected/selected GPUs. */
- if (!init_gpu(cr->rank_pp_intranode, gpu_err_str,
+ if (!init_gpu(fp, cr->rank_pp_intranode, gpu_err_str,
&hwinfo->gpu_info, gpu_opt))
{
/* At this point the init should never fail as we made sure that
* we have all the GPUs we need. If it still does, we'll bail. */
+ /* TODO the decorating of gpu_err_str is nicer if it
+ happens inside init_gpu. Out here, the decorating with
+ the MPI rank makes sense. */
gmx_fatal(FARGS, "On rank %d failed to initialize GPU #%d: %s",
cr->nodeid,
get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
}
static void init_ewald_f_table(interaction_const_t *ic,
- gmx_bool bUsesSimpleTables,
real rtab)
{
real maxr;
- if (bUsesSimpleTables)
- {
- /* Get the Ewald table spacing based on Coulomb and/or LJ
- * Ewald coefficients and rtol.
- */
- ic->tabq_scale = ewald_spline3_table_scale(ic);
+ /* Get the Ewald table spacing based on Coulomb and/or LJ
+ * Ewald coefficients and rtol.
+ */
+ ic->tabq_scale = ewald_spline3_table_scale(ic);
- maxr = (rtab > ic->rcoulomb) ? rtab : ic->rcoulomb;
- ic->tabq_size = (int)(maxr*ic->tabq_scale) + 2;
+ if (ic->cutoff_scheme == ecutsVERLET)
+ {
+ maxr = ic->rcoulomb;
}
else
{
- ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
- /* Subtract 2 iso 1 to avoid access out of range due to rounding */
- ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
+ maxr = std::max(ic->rcoulomb, rtab);
}
+ ic->tabq_size = static_cast<int>(maxr*ic->tabq_scale) + 2;
sfree_aligned(ic->tabq_coul_FDV0);
sfree_aligned(ic->tabq_coul_F);
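
As a worked example (numbers assumed for illustration): with ic->rcoulomb =
1.0 nm and a spline scale of 2000 points/nm returned by
ewald_spline3_table_scale(), the Verlet branch gives
tabq_size = static_cast<int>(1.0*2000) + 2 = 2002, so the two extra points
keep lookups rounded up from r close to rcoulomb within range.
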
void init_interaction_const_tables(FILE *fp,
interaction_const_t *ic,
- gmx_bool bUsesSimpleTables,
real rtab)
{
- real spacing;
-
if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype) || EVDW_PME(ic->vdwtype))
{
- init_ewald_f_table(ic, bUsesSimpleTables, rtab);
+ init_ewald_f_table(ic, rtab);
if (fp != NULL)
{
sc->c5 = -6*pow(rc - rsw, -5);
}
+/*! \brief Construct interaction constants
+ *
+ * This data is used (particularly) by search and force code for
+ * short-range interactions. Many of these are constant for the whole
+ * simulation; some are constant only after PME tuning completes.
+ */
static void
init_interaction_const(FILE *fp,
- const t_commrec gmx_unused *cr,
interaction_const_t **interaction_const,
- const t_forcerec *fr,
- real rtab)
+ const t_forcerec *fr)
{
interaction_const_t *ic;
- gmx_bool bUsesSimpleTables = TRUE;
+ const real minusSix = -6.0;
+ const real minusTwelve = -12.0;
snew(ic, 1);
+ ic->cutoff_scheme = fr->cutoff_scheme;
+
/* Just allocate something so we can free it */
snew_aligned(ic->tabq_coul_FDV0, 16, 32);
snew_aligned(ic->tabq_coul_F, 16, 32);
{
case eintmodPOTSHIFT:
/* Only shift the potential, don't touch the force */
- ic->dispersion_shift.cpot = -pow(ic->rvdw, -6.0);
- ic->repulsion_shift.cpot = -pow(ic->rvdw, -12.0);
+ ic->dispersion_shift.cpot = -pow(ic->rvdw, minusSix);
+ ic->repulsion_shift.cpot = -pow(ic->rvdw, minusTwelve);
if (EVDW_PME(ic->vdwtype))
{
real crc2;
crc2 = sqr(ic->ewaldcoeff_lj*ic->rvdw);
- ic->sh_lj_ewald = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0);
+ ic->sh_lj_ewald = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, minusSix);
}
break;
case eintmodFORCESWITCH:
}
*interaction_const = ic;
-
- if (fr->nbv != NULL && fr->nbv->bUseGPU)
- {
- nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
-
- /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
- * also sharing texture references. To keep the code simple, we don't
- * treat texture references as shared resources, but this means that
- * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
- * Hence, to ensure that the non-bonded kernels don't start before all
- * texture binding operations are finished, we need to wait for all ranks
- * to arrive here before continuing.
- *
- * Note that we could omit this barrier if GPUs are not shared (or
- * texture objects are used), but as this is initialization code, there
- * is not point in complicating things.
- */
-#ifdef GMX_THREAD_MPI
- if (PAR(cr))
- {
- gmx_barrier(cr);
- }
-#endif /* GMX_THREAD_MPI */
- }
-
- bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
- init_interaction_const_tables(fp, ic, bUsesSimpleTables, rtab);
}
static void init_nb_verlet(FILE *fp,
snew(nbv, 1);
- pick_nbnxn_resources(cr, fr->hwinfo,
+ pick_nbnxn_resources(fp, cr, fr->hwinfo,
fr->bNonbonded,
&nbv->bUseGPU,
&bEmulateGPU,
fr->gpu_opt);
- nbv->nbs = NULL;
+ nbv->nbs = NULL;
+ nbv->min_ci_balanced = 0;
nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
for (i = 0; i < nbv->ngrp; i++)
}
}
- if (nbv->bUseGPU)
- {
- /* init the NxN GPU data; the last argument tells whether we'll have
- * both local and non-local NB calculation on GPU */
- nbnxn_cuda_init(fp, &nbv->cu_nbv,
- &fr->hwinfo->gpu_info, fr->gpu_opt,
- cr->rank_pp_intranode,
- (nbv->ngrp > 1) && !bHybridGPURun);
-
- if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
- {
- char *end;
-
- nbv->min_ci_balanced = strtol(env, &end, 10);
- if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
- {
- gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
- }
-
- if (debug)
- {
- fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
- nbv->min_ci_balanced);
- }
- }
- else
- {
- nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
- if (debug)
- {
- fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
- nbv->min_ci_balanced);
- }
- }
- }
- else
- {
- nbv->min_ci_balanced = 0;
- }
-
- *nb_verlet = nbv;
-
nbnxn_init_search(&nbv->nbs,
DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
for (i = 0; i < nbv->ngrp; i++)
{
- if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
- {
- nb_alloc = &pmalloc;
- nb_free = &pfree;
- }
- else
- {
- nb_alloc = NULL;
- nb_free = NULL;
- }
+ gpu_set_host_malloc_and_free(nbv->grp[0].kernel_type == nbnxnk8x8x8_GPU,
+ &nb_alloc, &nb_free);
nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
nbv->grp[i].nbat = nbv->grp[0].nbat;
}
}
+
+ if (nbv->bUseGPU)
+ {
+ /* init the NxN GPU data; the last argument tells whether we'll have
+ * both local and non-local NB calculation on GPU */
+ nbnxn_gpu_init(fp, &nbv->gpu_nbv,
+ &fr->hwinfo->gpu_info,
+ fr->gpu_opt,
+ fr->ic,
+ nbv->grp,
+ cr->rank_pp_intranode,
+ cr->nodeid,
+ (nbv->ngrp > 1) && !bHybridGPURun);
+
+ /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+ * also sharing texture references. To keep the code simple, we don't
+ * treat texture references as shared resources, but this means that
+ * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
+ * Hence, to ensure that the non-bonded kernels don't start before all
+ * texture binding operations are finished, we need to wait for all ranks
+ * to arrive here before continuing.
+ *
+ * Note that we could omit this barrier if GPUs are not shared (or
+ * texture objects are used), but as this is initialization code, there
+ * is no point in complicating things.
+ */
+#ifdef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_barrier(cr);
+ }
+#endif /* GMX_THREAD_MPI */
+
+ if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
+ {
+ char *end;
+
+ nbv->min_ci_balanced = strtol(env, &end, 10);
+ if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
+ {
+ gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
+ nbv->min_ci_balanced);
+ }
+ }
+ else
+ {
+ nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv);
+ if (debug)
+ {
+ fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
+ nbv->min_ci_balanced);
+ }
+ }
+
+ }
+
+ *nb_verlet = nbv;
+}
+
+gmx_bool usingGpu(nonbonded_verlet_t *nbv)
+{
+ return nbv != NULL && nbv->bUseGPU;
}
void init_forcerec(FILE *fp,
gmx_bool bNoSolvOpt,
real print_force)
{
- int i, j, m, natoms, ngrp, negp_pp, negptable, egi, egj;
+ int i, m, negp_pp, negptable, egi, egj;
real rtab;
char *env;
double dbl;
gmx_bool bGenericKernelOnly;
gmx_bool bMakeTables, bMakeSeparate14Table, bSomeNormalNbListsAreInUse;
gmx_bool bFEP_NonBonded;
- t_nblists *nbl;
int *nm_ind, egp_flags;
if (fr->hwinfo == NULL)
fr->bDomDec = DOMAINDECOMP(cr);
- natoms = mtop->natoms;
-
if (check_box(ir->ePBC, box))
{
gmx_fatal(FARGS, check_box(ir->ePBC, box));
if (env != NULL)
{
dbl = 0;
- sscanf(env, "%lf", &dbl);
+ sscanf(env, "%20lf", &dbl);
fr->sc_sigma6_min = pow(dbl, 6);
if (fp)
{
{
fprintf(fp,
"\nFound environment variable GMX_DISABLE_SIMD_KERNELS.\n"
- "Disabling the usage of any SIMD-specific kernel routines (e.g. SSE2/SSE4.1/AVX).\n\n");
+ "Disabling the usage of any SIMD-specific non-bonded & bonded kernel routines\n"
+ "(e.g. SSE2/SSE4.1/AVX).\n\n");
}
}
fr->AllvsAll_work = NULL;
fr->AllvsAll_workgb = NULL;
- /* All-vs-all kernels have not been implemented in 4.6, and
- * the SIMD group kernels are also buggy in this case. Non-SIMD
- * group kernels are OK. See Redmine #1249. */
+ /* All-vs-all kernels have not been implemented in 4.6 and later.
+ * See Redmine #1249. */
if (fr->bAllvsAll)
{
fr->bAllvsAll = FALSE;
- fr->use_simd_kernels = FALSE;
if (fp != NULL)
{
fprintf(fp,
"\nYour simulation settings would have triggered the efficient all-vs-all\n"
"kernels in GROMACS 4.5, but these have not been implemented in GROMACS\n"
- "4.6. Also, we can't use the accelerated SIMD kernels here because\n"
- "of an unfixed bug. The reference C kernels are correct, though, so\n"
- "we are proceeding by disabling all CPU architecture-specific\n"
- "(e.g. SSE2/SSE4/AVX) routines. If performance is important, please\n"
- "use GROMACS 4.5.7 or try cutoff-scheme = Verlet.\n\n");
+ "4.6 and 5.x. If performance is important, please use GROMACS 4.5.7\n"
+ "or try cutoff-scheme = Verlet.\n\n");
}
}
{
if (!DOMAINDECOMP(cr))
{
+ gmx_bool bSHAKE;
+
+ bSHAKE = (ir->eConstrAlg == econtSHAKE &&
+ (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0 ||
+ gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0));
+
/* The group cut-off scheme and SHAKE assume charge groups
* are whole, but not using molpbc is faster in most cases.
+ * With intermolecular interactions we need PBC for calculating
+ * distances between atoms in different molecules.
*/
- if (fr->cutoff_scheme == ecutsGROUP ||
- (ir->eConstrAlg == econtSHAKE &&
- (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0 ||
- gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0)))
+ if ((fr->cutoff_scheme == ecutsGROUP || bSHAKE) &&
+ !mtop->bIntermolecularInteractions)
{
fr->bMolPBC = ir->bPeriodicMols;
+
+ if (bSHAKE && fr->bMolPBC)
+ {
+ gmx_fatal(FARGS, "SHAKE is not supported with periodic molecules");
+ }
}
else
{
fr->bMolPBC = TRUE;
+
if (getenv("GMX_USE_GRAPH") != NULL)
{
fr->bMolPBC = FALSE;
if (fp)
{
- fprintf(fp, "\nGMX_MOLPBC is set, using the graph for bonded interactions\n\n");
+ md_print_warn(cr, fp, "GMX_USE_GRAPH is set, using the graph for bonded interactions\n");
+ }
+
+ if (mtop->bIntermolecularInteractions)
+ {
+ md_print_warn(cr, fp, "WARNING: Molecules linked by intermolecular interactions have to reside in the same periodic image, otherwise artifacts will occur!\n");
}
}
+
+ if (bSHAKE && fr->bMolPBC)
+ {
+ gmx_fatal(FARGS, "SHAKE is not properly supported with intermolecular interactions. For short simulations where linked molecules remain in the same periodic image, the environment variable GMX_USE_GRAPH can be used to override this check.\n");
+ }
}
}
else
break;
case eelPME:
+ case eelP3M_AD:
case eelEWALD:
fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_EWALD;
break;
egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL))
{
- nbl = &(fr->nblists[m]);
if (fr->nnblists > 1)
{
fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = m;
}
/* Initialize the thread working data for bonded interactions */
- init_forcerec_f_threads(fr, mtop->groups.grps[egcENER].nr);
+ init_bonded_threading(fp, fr, mtop->groups.grps[egcENER].nr);
snew(fr->excl_load, fr->nthreads+1);
+ /* fr->ic is used both by verlet and group kernels (to some extent) now */
+ init_interaction_const(fp, &fr->ic, fr);
+ init_interaction_const_tables(fp, fr->ic, rtab);
+
if (fr->cutoff_scheme == ecutsVERLET)
{
if (ir->rcoulomb != ir->rvdw)
init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
}
- /* fr->ic is used both by verlet and group kernels (to some extent) now */
- init_interaction_const(fp, cr, &fr->ic, fr, rtab);
-
if (ir->eDispCorr != edispcNO)
{
calc_enervirdiff(fp, ir->eDispCorr, fr);
fr->excl_load[t] = i;
}
}
+
+/* Frees GPU memory and destroys the GPU context.
+ *
+ * Note that this function needs to be called even if GPUs are not used
+ * in this run because the PME ranks have no knowledge of whether GPUs
+ * are used or not, but all ranks need to enter the barrier below.
+ */
+void free_gpu_resources(const t_forcerec *fr,
+ const t_commrec *cr,
+ const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt)
+{
+ gmx_bool bIsPPrankUsingGPU;
+ char gpu_err_str[STRLEN];
+
+ bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr && fr->nbv && fr->nbv->bUseGPU;
+
+ if (bIsPPrankUsingGPU)
+ {
+ /* free nbnxn data in GPU memory */
+ nbnxn_gpu_free(fr->nbv->gpu_nbv);
+
+ /* With tMPI we need to wait for all ranks to finish deallocation before
+ * destroying the context in free_gpu() as some ranks may be sharing
+ * GPU and context.
+ * Note: as only PP ranks need to free GPU resources, it is safe to
+ * not call the barrier on PME ranks.
+ */
+#ifdef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_barrier(cr);
+ }
+#endif /* GMX_THREAD_MPI */
+
+ /* uninitialize GPU (by destroying the context) */
+ if (!free_cuda_gpu(cr->rank_pp_intranode, gpu_err_str, gpu_info, gpu_opt))
+ {
+ gmx_warning("On rank %d failed to free GPU #%d: %s",
+ cr->nodeid, get_current_cuda_gpu_device_id(), gpu_err_str);
+ }
+ }
+}
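
A hypothetical call site at the end of the run (names assumed; ranks that
never touched a GPU simply fall through the initial check):

    /* every rank calls this during teardown */
    free_gpu_resources(fr, cr, &hwinfo->gpu_info, &hw_opt->gpu_opt);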