--- /dev/null
- real mass; /* mass */
- int type; /* type (used for LJ parameters) */
- real q; /* charge */
- int con; /* constrained: 0, else 1, if 1, use #DOF=2 iso 3 */
- int n; /* total #atoms of this type in the system */
- } verletbuf_atomtype_t;
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.03
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <assert.h>
+
+#include <sys/types.h>
+#include <math.h>
+#include "typedefs.h"
+#include "physics.h"
+#include "smalloc.h"
+#include "gmx_fatal.h"
+#include "macros.h"
+#include "vec.h"
+#include "coulomb.h"
+#include "calc_verletbuf.h"
+#include "../mdlib/nbnxn_consts.h"
+
+#ifdef GMX_NBNXN_SIMD
+/* The include below sets the SIMD instruction type (precision+width)
+ * for all nbnxn SIMD search and non-bonded kernel code.
+ */
+#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
+#define GMX_USE_HALF_WIDTH_SIMD_HERE
+#endif
+#include "gmx_simd_macros.h"
+#endif
+
++
++/* The code in this file estimates a pairlist buffer length
++ * given a target energy drift per atom per picosecond.
++ * This is done by estimating the drift given a buffer length.
++ * Ideally we would like to have a tight overestimate of the drift,
++ * but that can be difficult to achieve.
++ *
++ * Significant approximations used:
++ *
++ * Uniform particle density. UNDERESTIMATES the drift by rho_global/rho_local.
++ *
++ * Interactions don't affect particle motion. OVERESTIMATES the drift on longer
++ * time scales. This approximation probably introduces the largest errors.
++ *
++ * Only take one constraint per particle into account: OVERESTIMATES the drift.
++ *
++ * For rotating constraints, assume that the functional form that is exact
++ * at short time scales also holds at time scales where the constraints
++ * rotate significantly. OVERESTIMATES the drift on long time scales.
++ *
++ * For non-linear virtual sites use the mass of the lightest constructing atom
++ * to determine the displacement. OVER/UNDERESTIMATES the drift, depending on
++ * the geometry and masses of constructing atoms.
++ *
++ * Note that the formulas for normal atoms and linear virtual sites are exact,
++ * apart from the first two approximations.
++ *
++ * Note that apart from the effect of the above approximations, the actual
++ * drift of the total energy of a system can be an order of magnitude smaller
++ * due to cancellation of positive and negative drift for different pairs.
++ */
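
To get a feel for the scale of these displacements, here is a minimal standalone sketch using hypothetical water-like numbers (oxygen mass, T = 300 K, nstlist = 10, dt = 2 fs); it evaluates the free-atom variance kT_fac/mass that the code below estimates per atom type:

#include <math.h>
#include <stdio.h>

#define BOLTZ_KJ_MOL_K 0.0083144621 /* Boltzmann constant, GROMACS units */

int main(void)
{
    double T = 300.0, dt = 0.002, m = 15.9994; /* K, ps, amu */
    int    nstlist = 10;
    /* kT_fac = kT*((nstlist-1)*dt)^2; sigma^2 = kT_fac/m per dimension */
    double kT_fac = BOLTZ_KJ_MOL_K*T*pow((nstlist - 1)*dt, 2);
    double sigma  = sqrt(kT_fac/m);

    /* prints sigma ~ 0.0071 nm: the 1D displacement of a free oxygen
     * over the nstlist-1 steps for which the pair list is reused */
    printf("sigma = %.4f nm\n", sigma);
    return 0;
}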
++
++
+/* Struct for the non-bonded and kinetic properties of a single atom,
+ * used for calculating the energy drift.
+ * The atom displacement depends on mass and constraints.
+ * The energy jump for a given distance depends on the LJ type and q.
+ */
+typedef struct
+{
- real mass, int type, real q, int con, int nmol)
++ real mass; /* mass */
++ int type; /* type (used for LJ parameters) */
++ real q; /* charge */
++    gmx_bool bConstr; /* constrained; if TRUE, use #DOF=2 instead of 3 */
++ real con_mass; /* mass of heaviest atom connected by constraints */
++ real con_len; /* constraint length to the heaviest atom */
++} atom_nonbonded_kinetic_prop_t;
+
++/* Struct for a unique atom type, used for calculating the energy drift.
++ * The atom displacement depends on mass and constraints.
++ * The energy jump for a given distance depends on the LJ type and q.
++ */
++typedef struct
++{
++ atom_nonbonded_kinetic_prop_t prop; /* non-bonded and kinetic atom prop. */
++ int n; /* #atoms of this type in the system */
++} verletbuf_atomtype_t;
+
+void verletbuf_get_list_setup(gmx_bool bGPU,
+ verletbuf_list_setup_t *list_setup)
+{
+ list_setup->cluster_size_i = NBNXN_CPU_CLUSTER_I_SIZE;
+
+ if (bGPU)
+ {
+ list_setup->cluster_size_j = NBNXN_GPU_CLUSTER_SIZE;
+ }
+ else
+ {
+#ifndef GMX_NBNXN_SIMD
+ list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE;
+#else
+ list_setup->cluster_size_j = GMX_SIMD_WIDTH_HERE;
+#ifdef GMX_NBNXN_SIMD_2XNN
+ /* We assume the smallest cluster size to be on the safe side */
+ list_setup->cluster_size_j /= 2;
+#endif
+#endif
+ }
+}
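
A brief usage sketch; the cluster sizes in the comments assume NBNXN_CPU_CLUSTER_I_SIZE = 4 and NBNXN_GPU_CLUSTER_SIZE = 8 from nbnxn_consts.h, which is what those constants were at the time of writing:

static void list_setup_example(void)
{
    verletbuf_list_setup_t cpu_ls, gpu_ls;

    verletbuf_get_list_setup(FALSE, &cpu_ls); /* i = 4, j = 4 or SIMD width */
    verletbuf_get_list_setup(TRUE,  &gpu_ls); /* i = 4, j = 8 */
}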
+
++static gmx_bool
++atom_nonbonded_kinetic_prop_equal(const atom_nonbonded_kinetic_prop_t *prop1,
++ const atom_nonbonded_kinetic_prop_t *prop2)
++{
++ return (prop1->mass == prop2->mass &&
++ prop1->type == prop2->type &&
++ prop1->q == prop2->q &&
++ prop1->bConstr == prop2->bConstr &&
++ prop1->con_mass == prop2->con_mass &&
++ prop1->con_len == prop2->con_len);
++}
++
+static void add_at(verletbuf_atomtype_t **att_p, int *natt_p,
- int natt, i;
++ const atom_nonbonded_kinetic_prop_t *prop,
++ int nmol)
+{
+ verletbuf_atomtype_t *att;
- if (mass == 0)
++ int natt, i;
+
- while (i < natt &&
- !(mass == att[i].mass &&
- type == att[i].type &&
- q == att[i].q &&
- con == att[i].con))
++ if (prop->mass == 0)
+ {
+ /* Ignore massless particles */
+ return;
+ }
+
+ att = *att_p;
+ natt = *natt_p;
+
+ i = 0;
- (*att_p)[i].mass = mass;
- (*att_p)[i].type = type;
- (*att_p)[i].q = q;
- (*att_p)[i].con = con;
++ while (i < natt && !atom_nonbonded_kinetic_prop_equal(prop, &att[i].prop))
+ {
+ i++;
+ }
+
+ if (i < natt)
+ {
+ att[i].n += nmol;
+ }
+ else
+ {
+ (*natt_p)++;
+ srenew(*att_p, *natt_p);
- verletbuf_atomtype_t *att;
- int natt;
- int mb, nmol, ft, i, j, a1, a2, a3, a;
- const t_atoms *atoms;
- const t_ilist *il;
- const t_atom *at;
- const t_iparams *ip;
- real *con_m, *vsite_m, cam[5];
++ (*att_p)[i].prop = *prop;
+ (*att_p)[i].n = nmol;
+ }
+}
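
A hypothetical usage sketch of add_at(): atoms with equal non-bonded and kinetic properties collapse into a single type entry with a summed count, so the drift estimate below scales with the number of distinct types rather than with the number of atoms:

static void add_at_example(void)
{
    verletbuf_atomtype_t          *att  = NULL;
    int                            natt = 0;
    atom_nonbonded_kinetic_prop_t  prop = { 0 };

    prop.mass = 15.9994;  /* must be non-zero: massless particles are skipped */
    prop.type = 0;
    prop.q    = -0.8476;

    add_at(&att, &natt, &prop, 1000); /* e.g. 1000 identical molecules */
    add_at(&att, &natt, &prop, 500);  /* equal properties: no new entry */
    /* here natt == 1 and att[0].n == 1500 */
    sfree(att);
}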
+
++static void get_vsite_masses(const gmx_moltype_t *moltype,
++ const gmx_ffparams_t *ffparams,
++ real *vsite_m,
++ int *n_nonlin_vsite)
++{
++ int ft, i;
++ const t_ilist *il;
++
++ *n_nonlin_vsite = 0;
++
++ /* Check for virtual sites, determine mass from constructing atoms */
++ for (ft = 0; ft < F_NRE; ft++)
++ {
++ if (IS_VSITE(ft))
++ {
++ il = &moltype->ilist[ft];
++
++ for (i = 0; i < il->nr; i += 1+NRAL(ft))
++ {
++ const t_iparams *ip;
++                real cam[5], inv_mass, m_aj, coeff;
++                int a1, j, aj;
++
++ ip = &ffparams->iparams[il->iatoms[i]];
++
++ a1 = il->iatoms[i+1];
++
++ if (ft != F_VSITEN)
++ {
++ for (j = 1; j < NRAL(ft); j++)
++ {
++ cam[j] = moltype->atoms.atom[il->iatoms[i+1+j]].m;
++ if (cam[j] == 0)
++ {
++ cam[j] = vsite_m[il->iatoms[i+1+j]];
++ }
++ if (cam[j] == 0)
++ {
++                            gmx_fatal(FARGS, "In molecule type '%s' %s construction involves atom %d, which is a virtual site of equal or higher complexity. This is not supported.",
++ *moltype->name,
++ interaction_function[ft].longname,
++ il->iatoms[i+1+j]+1);
++ }
++ }
++ }
++
++ switch (ft)
++ {
++ case F_VSITE2:
++ /* Exact */
++ vsite_m[a1] = (cam[1]*cam[2])/(cam[2]*sqr(1-ip->vsite.a) + cam[1]*sqr(ip->vsite.a));
++ break;
++ case F_VSITE3:
++ /* Exact */
++ vsite_m[a1] = (cam[1]*cam[2]*cam[3])/(cam[2]*cam[3]*sqr(1-ip->vsite.a-ip->vsite.b) + cam[1]*cam[3]*sqr(ip->vsite.a) + cam[1]*cam[2]*sqr(ip->vsite.b));
++ break;
++ case F_VSITEN:
++ /* Exact */
++ inv_mass = 0;
++ for (j = 0; j < 3*ip->vsiten.n; j += 3)
++ {
++ aj = il->iatoms[i+j+2];
++                            coeff = ffparams->iparams[il->iatoms[i+j]].vsiten.a;
++ if (moltype->atoms.atom[aj].ptype == eptVSite)
++ {
++ m_aj = vsite_m[aj];
++ }
++ else
++ {
++ m_aj = moltype->atoms.atom[aj].m;
++ }
++ if (m_aj <= 0)
++ {
++ gmx_incons("The mass of a vsiten constructing atom is <= 0");
++ }
++ inv_mass += coeff*coeff/m_aj;
++ }
++ vsite_m[a1] = 1/inv_mass;
++ break;
++ default:
++ /* Use the mass of the lightest constructing atom.
++ * This is an approximation.
++ * If the distance of the virtual site to the
++ * constructing atom is less than all distances
++ * between constructing atoms, this is a safe
++ * over-estimate of the displacement of the vsite.
++ * This condition holds for all H mass replacement
++ * vsite constructions, except for SP2/3 groups.
++                         * In SP3 groups one H will have an F_VSITE3
++ * construction, so even there the total drift
++ * estimate shouldn't be far off.
++ */
++ assert(j >= 1);
++ vsite_m[a1] = cam[1];
++ for (j = 2; j < NRAL(ft); j++)
++ {
++ vsite_m[a1] = min(vsite_m[a1], cam[j]);
++ }
++ (*n_nonlin_vsite)++;
++ break;
++ }
++ if (gmx_debug_at)
++ {
++ fprintf(debug, "atom %4d %-20s mass %6.3f\n",
++ a1, interaction_function[ft].longname, vsite_m[a1]);
++ }
++ }
++ }
++ }
++}
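
To see what the exact F_VSITE2 expression does, a small helper with hypothetical numbers; the midpoint of two equal 1 amu atoms behaves kinetically like a 2 amu particle, so its displacement is smaller than that of either constructing atom:

/* Effective kinetic mass of a two-atom (F_VSITE2) construction site */
static double vsite2_mass(double m1, double m2, double a)
{
    return (m1*m2)/(m2*(1 - a)*(1 - a) + m1*a*a);
}
/* vsite2_mass(1.0, 1.0, 0.5) == 2.0; with a = 0 the site coincides
 * with atom 1 and the expression correctly reduces to m1 */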
++
+static void get_verlet_buffer_atomtypes(const gmx_mtop_t *mtop,
+ verletbuf_atomtype_t **att_p,
+ int *natt_p,
+ int *n_nonlin_vsite)
+{
- /* Check for constraints, as they affect the kinetic energy */
- snew(con_m, atoms->nr);
++ verletbuf_atomtype_t *att;
++ int natt;
++ int mb, nmol, ft, i, a1, a2, a3, a;
++ const t_atoms *atoms;
++ const t_ilist *il;
++ const t_iparams *ip;
++ atom_nonbonded_kinetic_prop_t *prop;
++ real *vsite_m;
++ int n_nonlin_vsite_mol;
+
+ att = NULL;
+ natt = 0;
+
+ if (n_nonlin_vsite != NULL)
+ {
+ *n_nonlin_vsite = 0;
+ }
+
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ nmol = mtop->molblock[mb].nmol;
+
+ atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+
- con_m[a1] += atoms->atom[a2].m;
- con_m[a2] += atoms->atom[a1].m;
++ /* Check for constraints, as they affect the kinetic energy.
++ * For virtual sites we need the masses and geometry of
++ * the constructing atoms to determine their velocity distribution.
++ */
++ snew(prop, atoms->nr);
+ snew(vsite_m, atoms->nr);
+
+ for (ft = F_CONSTR; ft <= F_CONSTRNC; ft++)
+ {
+ il = &mtop->moltype[mtop->molblock[mb].type].ilist[ft];
+
+ for (i = 0; i < il->nr; i += 1+NRAL(ft))
+ {
++ ip = &mtop->ffparams.iparams[il->iatoms[i]];
+ a1 = il->iatoms[i+1];
+ a2 = il->iatoms[i+2];
- con_m[a1] += atoms->atom[a2].m + atoms->atom[a3].m;
- con_m[a2] += atoms->atom[a1].m + atoms->atom[a3].m;
- con_m[a3] += atoms->atom[a1].m + atoms->atom[a2].m;
- }
-
- /* Check for virtual sites, determine mass from constructing atoms */
- for (ft = 0; ft < F_NRE; ft++)
- {
- if (IS_VSITE(ft))
- {
- il = &mtop->moltype[mtop->molblock[mb].type].ilist[ft];
-
- for (i = 0; i < il->nr; i += 1+NRAL(ft))
- {
- ip = &mtop->ffparams.iparams[il->iatoms[i]];
++ if (atoms->atom[a2].m > prop[a1].con_mass)
++ {
++ prop[a1].con_mass = atoms->atom[a2].m;
++ prop[a1].con_len = ip->constr.dA;
++ }
++ if (atoms->atom[a1].m > prop[a2].con_mass)
++ {
++ prop[a2].con_mass = atoms->atom[a1].m;
++ prop[a2].con_len = ip->constr.dA;
++ }
+ }
+ }
+
+ il = &mtop->moltype[mtop->molblock[mb].type].ilist[F_SETTLE];
+
+ for (i = 0; i < il->nr; i += 1+NRAL(F_SETTLE))
+ {
++ ip = &mtop->ffparams.iparams[il->iatoms[i]];
+ a1 = il->iatoms[i+1];
+ a2 = il->iatoms[i+2];
+ a3 = il->iatoms[i+3];
- a1 = il->iatoms[i+1];
++            /* The mass of a1 (usually oxygen) is typically larger than
++             * that of a2/a3 (hydrogens). If this is not the case, we
++             * overestimate the displacement, which leads to a larger
++             * buffer (ok since this is an exotic case).
++             */
++ prop[a1].con_mass = atoms->atom[a2].m;
++ prop[a1].con_len = ip->settle.doh;
+
- for (j = 1; j < NRAL(ft); j++)
- {
- cam[j] = atoms->atom[il->iatoms[i+1+j]].m;
- if (cam[j] == 0)
- {
- cam[j] = vsite_m[il->iatoms[i+1+j]];
- }
- if (cam[j] == 0)
- {
- gmx_fatal(FARGS, "In molecule type '%s' %s construction involves atom %d, which is a virtual site of equal or high complexity. This is not supported.",
- *mtop->moltype[mtop->molblock[mb].type].name,
- interaction_function[ft].longname,
- il->iatoms[i+1+j]+1);
- }
- }
++ prop[a2].con_mass = atoms->atom[a1].m;
++ prop[a2].con_len = ip->settle.doh;
+
- switch (ft)
- {
- case F_VSITE2:
- /* Exact except for ignoring constraints */
- vsite_m[a1] = (cam[2]*sqr(1-ip->vsite.a) + cam[1]*sqr(ip->vsite.a))/(cam[1]*cam[2]);
- break;
- case F_VSITE3:
- /* Exact except for ignoring constraints */
- vsite_m[a1] = (cam[2]*cam[3]*sqr(1-ip->vsite.a-ip->vsite.b) + cam[1]*cam[3]*sqr(ip->vsite.a) + cam[1]*cam[2]*sqr(ip->vsite.b))/(cam[1]*cam[2]*cam[3]);
- break;
- default:
- /* Use the mass of the lightest constructing atom.
- * This is an approximation.
- * If the distance of the virtual site to the
- * constructing atom is less than all distances
- * between constructing atoms, this is a safe
- * over-estimate of the displacement of the vsite.
- * This condition holds for all H mass replacement
- * replacement vsite constructions, except for SP2/3
- * groups. In SP3 groups one H will have a F_VSITE3
- * construction, so even there the total drift
- * estimation shouldn't be far off.
- */
- assert(j >= 1);
- vsite_m[a1] = cam[1];
- for (j = 2; j < NRAL(ft); j++)
- {
- vsite_m[a1] = min(vsite_m[a1], cam[j]);
- }
- if (n_nonlin_vsite != NULL)
- {
- *n_nonlin_vsite += nmol;
- }
- break;
- }
- }
- }
++ prop[a3].con_mass = atoms->atom[a1].m;
++ prop[a3].con_len = ip->settle.doh;
++ }
+
- at = &atoms->atom[a];
- /* We consider an atom constrained, #DOF=2, when it is
- * connected with constraints to one or more atoms with
- * total mass larger than 1.5 that of the atom itself.
++ get_vsite_masses(&mtop->moltype[mtop->molblock[mb].type],
++ &mtop->ffparams,
++ vsite_m,
++ &n_nonlin_vsite_mol);
++ if (n_nonlin_vsite != NULL)
++ {
++ *n_nonlin_vsite += nmol*n_nonlin_vsite_mol;
+ }
+
+ for (a = 0; a < atoms->nr; a++)
+ {
- add_at(&att, &natt,
- at->m, at->type, at->q, con_m[a] > 1.5*at->m, nmol);
++ if (atoms->atom[a].ptype == eptVSite)
++ {
++ prop[a].mass = vsite_m[a];
++ }
++ else
++ {
++ prop[a].mass = atoms->atom[a].m;
++ }
++ prop[a].type = atoms->atom[a].type;
++ prop[a].q = atoms->atom[a].q;
++ /* We consider an atom constrained, #DOF=2, when it is
++ * connected with constraints to (at least one) atom with
++ * a mass of more than 0.4x its own mass. This is not a critical
++ * parameter, since with roughly equal masses the unconstrained
++ * and constrained displacement will not differ much (and both
++ * overestimate the displacement).
+ */
- sfree(con_m);
++ prop[a].bConstr = (prop[a].con_mass > 0.4*prop[a].mass);
++
++ add_at(&att, &natt, &prop[a], nmol);
+ }
+
+ sfree(vsite_m);
- fprintf(debug, "type %d: m %5.2f t %d q %6.3f con %d n %d\n",
- a, att[a].mass, att[a].type, att[a].q, att[a].con, att[a].n);
++ sfree(prop);
+ }
+
+ if (gmx_debug_at)
+ {
+ for (a = 0; a < natt; a++)
+ {
- static void approx_2dof(real s2, real x,
- real *shift, real *scale)
++ fprintf(debug, "type %d: m %5.2f t %d q %6.3f con %d con_m %5.3f con_l %5.3f n %d\n",
++ a, att[a].prop.mass, att[a].prop.type, att[a].prop.q,
++ att[a].prop.bConstr, att[a].prop.con_mass, att[a].prop.con_len,
++ att[a].n);
+ }
+ }
+
+ *att_p = att;
+ *natt_p = natt;
+}
+
- real s2i, s2j, s2, s;
++/* This function computes two components of the estimate of the variance
++ * in the displacement of one atom in a system of two constrained atoms.
++ * Returns in sigma2_2d the variance due to rotation of the constrained
++ * atom around the atom to which it is constrained.
++ * Returns in sigma2_3d the variance due to displacement of the COM
++ * of the whole system of the two constrained atoms.
++ *
++ * Note that we only take a single constraint (the one to the heaviest atom)
++ * into account. If an atom has multiple constraints, this will result in
++ * an overestimate of the displacement, which gives a larger drift and buffer.
++ */
++static void constrained_atom_sigma2(real kT_fac,
++ const atom_nonbonded_kinetic_prop_t *prop,
++ real *sigma2_2d,
++ real *sigma2_3d)
++{
++ real sigma2_rot;
++ real com_dist;
++ real sigma2_rel;
++ real scale;
++
++ /* Here we decompose the motion of a constrained atom into two
++ * components: rotation around the COM and translation of the COM.
++ */
++
++ /* Determine the variance for the displacement of the rotational mode */
++ sigma2_rot = kT_fac/(prop->mass*(prop->mass + prop->con_mass)/prop->con_mass);
++
++ /* The distance from the atom to the COM, i.e. the rotational arm */
++ com_dist = prop->con_len*prop->con_mass/(prop->mass + prop->con_mass);
++
++ /* The variance relative to the arm */
++ sigma2_rel = sigma2_rot/(com_dist*com_dist);
++ /* At 6 the scaling formula has slope 0,
++ * so we keep sigma2_2d constant after that.
++ */
++ if (sigma2_rel < 6)
++ {
++ /* A constrained atom rotates around the atom it is constrained to.
++ * This results in a smaller linear displacement than for a free atom.
++ * For a perfectly circular displacement, this lowers the displacement
++         * by a factor x/arcsin(x), with x the displacement relative
++         * to the arm, and arcsin(x)/x = 1 + x^2/6 + ...
++ * For sigma2_rel<<1 the displacement distribution is erfc
++ * (exact formula is provided below). For larger sigma, it is clear
++ * that the displacement can't be larger than 2*com_dist.
++ * It turns out that the distribution becomes nearly uniform.
++ * For intermediate sigma2_rel, scaling down sigma with the third
++ * order expansion of arcsin with argument sigma_rel turns out
++ * to give a very good approximation of the distribution and variance.
++ * Even for larger values, the variance is only slightly overestimated.
++ * Note that the most relevant displacements are in the long tail.
++ * This rotation approximation always overestimates the tail (which
++ * runs to infinity, whereas it should be <= 2*com_dist).
++ * Thus we always overestimate the drift and the buffer size.
++ */
++ scale = 1/(1 + sigma2_rel/6);
++ *sigma2_2d = sigma2_rot*scale*scale;
++ }
++ else
++ {
++ /* sigma_2d is set to the maximum given by the scaling above.
++ * For large sigma2 the real displacement distribution is close
++     * to uniform over -2*com_dist to 2*com_dist.
++     * Our erfc with sigma_2d=sqrt(1.5)*com_dist (which means the sigma
++     * of the erfc output distribution is com_dist) overestimates
++ * the variance and additionally has a long tail. This means
++ * we have a (safe) overestimation of the drift.
++ */
++ *sigma2_2d = 1.5*com_dist*com_dist;
++ }
++
++ /* The constrained atom also moves (in 3D) with the COM of both atoms */
++ *sigma2_3d = kT_fac/(prop->mass + prop->con_mass);
++}
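
A hedged numeric sketch of what this function produces for a hydrogen constrained to an oxygen; the masses, the O-H constraint length and the kT_fac value (roughly matching the free-atom example earlier) are all hypothetical inputs:

static void constrained_sigma2_example(void)
{
    atom_nonbonded_kinetic_prop_t prop = { 0 };
    real s2_2d, s2_3d;

    prop.mass     = 1.008;    /* H */
    prop.bConstr  = TRUE;
    prop.con_mass = 15.9994;  /* O */
    prop.con_len  = 0.09572;  /* nm, O-H constraint length */

    constrained_atom_sigma2(8.1e-4, &prop, &s2_2d, &s2_3d);
    /* s2_2d ~= 7.3e-4 nm^2 (rotation), s2_3d ~= 4.8e-5 nm^2 (COM motion);
     * the sum is slightly below the unconstrained kT_fac/mass ~= 8.0e-4,
     * and the 2D part additionally gets the 2-DOF treatment below. */
}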
++
++static void get_atom_sigma2(real kT_fac,
++ const atom_nonbonded_kinetic_prop_t *prop,
++ real *sigma2_2d,
++ real *sigma2_3d)
++{
++ if (prop->bConstr)
++ {
++ /* Complicated constraint calculation in a separate function */
++ constrained_atom_sigma2(kT_fac, prop, sigma2_2d, sigma2_3d);
++ }
++ else
++ {
++ /* Unconstrained atom: trivial */
++ *sigma2_2d = 0;
++ *sigma2_3d = kT_fac/prop->mass;
++ }
++}
++
++static void approx_2dof(real s2, real x, real *shift, real *scale)
+{
+ /* A particle with 1 DOF constrained has 2 DOFs instead of 3.
+ * This code is also used for particles with multiple constraints,
+ * in which case we overestimate the displacement.
+ * The 2DOF distribution is sqrt(pi/2)*erfc(r/(sqrt(2)*s))/(2*s).
+ * We approximate this with scale*Gaussian(s,r+shift),
+ * by matching the distribution value and derivative at x.
+ * This is a tight overestimate for all r>=0 at any s and x.
+ */
+ real ex, er;
+
+ ex = exp(-x*x/(2*s2));
+ er = gmx_erfc(x/sqrt(2*s2));
+
+ *shift = -x + sqrt(2*s2/M_PI)*ex/er;
+ *scale = 0.5*M_PI*exp(ex*ex/(M_PI*er*er))*er;
+}
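
The matching can be verified numerically; this hedged check relies only on functions already used in this file and confirms that the scaled, shifted Gaussian reproduces the 2-DOF distribution value at the matching point x (for r > x the Gaussian then lies above the 2-DOF curve):

static void approx_2dof_check(real s2, real x)
{
    real shift, scale, s, f2dof, fgauss;

    approx_2dof(s2, x, &shift, &scale);
    s      = sqrt(s2);
    /* 2-DOF distribution at x: sqrt(pi/2)*erfc(x/(sqrt(2)*s))/(2*s) */
    f2dof  = sqrt(0.5*M_PI)*gmx_erfc(x/sqrt(2*s2))/(2*s);
    /* the matched, normalized Gaussian at x */
    fgauss = scale*exp(-(x + shift)*(x + shift)/(2*s2))/(s*sqrt(2*M_PI));

    assert(fabs(f2dof - fgauss) < 1e-6*f2dof);
}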
+
+static real ener_drift(const verletbuf_atomtype_t *att, int natt,
+ const gmx_ffparams_t *ffp,
+ real kT_fac,
+ real md_ljd, real md_ljr, real md_el, real dd_el,
+ real r_buffer,
+ real rlist, real boxvol)
+{
+ double drift_tot, pot1, pot2, pot;
+ int i, j;
- s2i = kT_fac/att[i].mass;
- ti = att[i].type;
++ real s2i_2d, s2i_3d, s2j_2d, s2j_3d, s2, s;
+ int ti, tj;
+ real md, dd;
+ real sc_fac, rsh;
+ double c_exp, c_erfc;
+
+ drift_tot = 0;
+
+ /* Loop over the different atom type pairs */
+ for (i = 0; i < natt; i++)
+ {
- s2j = kT_fac/att[j].mass;
- tj = att[j].type;
++ get_atom_sigma2(kT_fac, &att[i].prop, &s2i_2d, &s2i_3d);
++ ti = att[i].prop.type;
+
+ for (j = i; j < natt; j++)
+ {
- md_el*att[i].q*att[j].q;
++ get_atom_sigma2(kT_fac, &att[j].prop, &s2j_2d, &s2j_3d);
++ tj = att[j].prop.type;
++
++ /* Add up the up to four independent variances */
++ s2 = s2i_2d + s2i_3d + s2j_2d + s2j_3d;
+
+ /* Note that attractive and repulsive potentials for individual
+ * pairs will partially cancel.
+ */
+ /* -dV/dr at the cut-off for LJ + Coulomb */
+ md =
+ md_ljd*ffp->iparams[ti*ffp->atnr+tj].lj.c6 +
+ md_ljr*ffp->iparams[ti*ffp->atnr+tj].lj.c12 +
- dd = dd_el*att[i].q*att[j].q;
-
- s2 = s2i + s2j;
++ md_el*att[i].prop.q*att[j].prop.q;
+
+ /* d2V/dr2 at the cut-off for Coulomb, we neglect LJ */
- if (att[i].con)
++ dd = dd_el*att[i].prop.q*att[j].prop.q;
+
+ rsh = r_buffer;
+ sc_fac = 1.0;
+ /* For constraints: adapt r and scaling for the Gaussian */
- approx_2dof(s2i, r_buffer*s2i/s2, &sh, &sc);
++ if (att[i].prop.bConstr)
+ {
+ real sh, sc;
- if (att[j].con)
++
++ approx_2dof(s2i_2d, r_buffer*s2i_2d/s2, &sh, &sc);
+ rsh += sh;
+ sc_fac *= sc;
+ }
- approx_2dof(s2j, r_buffer*s2j/s2, &sh, &sc);
++ if (att[j].prop.bConstr)
+ {
+ real sh, sc;
- fprintf(debug, "n %d %d d s %.3f %.3f con %d md %8.1e dd %8.1e pot1 %8.1e pot2 %8.1e pot %8.1e\n",
- att[i].n, att[j].n, sqrt(s2i), sqrt(s2j),
- att[i].con+att[j].con,
++
++ approx_2dof(s2j_2d, r_buffer*s2j_2d/s2, &sh, &sc);
+ rsh += sh;
+ sc_fac *= sc;
+ }
+
+ /* Exact contribution of an atom pair with Gaussian displacement
+ * with sigma s to the energy drift for a potential with
+ * derivative -md and second derivative dd at the cut-off.
+ * The only catch is that for potentials that change sign
+ * near the cut-off there could be an unlucky compensation
+ * of positive and negative energy drift.
+ * Such potentials are extremely rare though.
+ *
+ * Note that pot has unit energy*length, as the linear
+ * atom density still needs to be put in.
+ */
+ c_exp = exp(-rsh*rsh/(2*s2))/sqrt(2*M_PI);
+ c_erfc = 0.5*gmx_erfc(rsh/(sqrt(2*s2)));
+ s = sqrt(s2);
+
+ pot1 = sc_fac*
+ md/2*((rsh*rsh + s2)*c_erfc - rsh*s*c_exp);
+ pot2 = sc_fac*
+ dd/6*(s*(rsh*rsh + 2*s2)*c_exp - rsh*(rsh*rsh + 3*s2)*c_erfc);
+ pot = pot1 + pot2;
+
+ if (gmx_debug_at)
+ {
- att[i].mass = 1;
++ fprintf(debug, "n %d %d d s %.3f %.3f %.3f %.3f con %d md %8.1e dd %8.1e pot1 %8.1e pot2 %8.1e pot %8.1e\n",
++ att[i].n, att[j].n,
++ sqrt(s2i_2d), sqrt(s2i_3d),
++ sqrt(s2j_2d), sqrt(s2j_3d),
++ att[i].prop.bConstr+att[j].prop.bConstr,
+ md, dd, pot1, pot2, pot);
+ }
+
+ /* Multiply by the number of atom pairs */
+ if (j == i)
+ {
+ pot *= (double)att[i].n*(att[i].n - 1)/2;
+ }
+ else
+ {
+ pot *= (double)att[i].n*att[j].n;
+ }
+            /* We need the line density to get the energy drift of the system:
+             * in a homogeneous system an i-atom sees on average
+             * n_j*4*pi*r^2/boxvol j-atoms per unit of radial distance r.
+             * The effective average r^2 is close to (rlist+sigma)^2.
+             */
+ pot *= 4*M_PI*sqr(rlist + s)/boxvol;
+
+ /* Add the unsigned drift to avoid cancellation of errors */
+ drift_tot += fabs(pot);
+ }
+ }
+
+ return drift_tot;
+}
+
+static real surface_frac(int cluster_size, real particle_distance, real rlist)
+{
+ real d, area_rel;
+
+ if (rlist < 0.5*particle_distance)
+ {
+ /* We have non overlapping spheres */
+ return 1.0;
+ }
+
+ /* Half the inter-particle distance relative to rlist */
+ d = 0.5*particle_distance/rlist;
+
+ /* Determine the area of the surface at distance rlist to the closest
+ * particle, relative to surface of a sphere of radius rlist.
+ * The formulas below assume close to cubic cells for the pair search grid,
+ * which the pair search code tries to achieve.
+ * Note that in practice particle distances will not be delta distributed,
+ * but have some spread, often involving shorter distances,
+ * as e.g. O-H bonds in a water molecule. Thus the estimates below will
+ * usually be slightly too high and thus conservative.
+ */
+ switch (cluster_size)
+ {
+ case 1:
+ /* One particle: trivial */
+ area_rel = 1.0;
+ break;
+ case 2:
+ /* Two particles: two spheres at fractional distance 2*a */
+ area_rel = 1.0 + d;
+ break;
+ case 4:
+ /* We assume a perfect, symmetric tetrahedron geometry.
+ * The surface around a tetrahedron is too complex for a full
+ * analytical solution, so we use a Taylor expansion.
+ */
+ area_rel = (1.0 + 1/M_PI*(6*acos(1/sqrt(3))*d +
+ sqrt(3)*d*d*(1.0 +
+ 5.0/18.0*d*d +
+ 7.0/45.0*d*d*d*d +
+ 83.0/756.0*d*d*d*d*d*d)));
+ break;
+ default:
+ gmx_incons("surface_frac called with unsupported cluster_size");
+ area_rel = 1.0;
+ }
+
+ return area_rel/cluster_size;
+}
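
A hedged sanity check connecting this function to the 10% figure quoted in calc_verlet_buffer_size() below, with hypothetical water-like inputs:

static void surface_frac_example(void)
{
    /* particle_distance ~ 0.3 nm (HCP estimate at water-like density),
     * rlist ~ 1.0 nm */
    real fi = surface_frac(4, 0.3, 1.0); /* ~0.32 for a 4-atom cluster */
    real fj = surface_frac(4, 0.3, 1.0);

    /* fi*fj ~ 0.10: with 4x4 cluster pairs only ~10% of the atom pairs
     * just beyond the cut-off are missing from the pair list */
    assert(fi*fj > 0.08 && fi*fj < 0.13);
}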
+
+void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
+ const t_inputrec *ir, real drift_target,
+ const verletbuf_list_setup_t *list_setup,
+ int *n_nonlin_vsite,
+ real *rlist)
+{
+ double resolution;
+ char *env;
+
+ real particle_distance;
+ real nb_clust_frac_pairs_not_in_list_at_cutoff;
+
+ verletbuf_atomtype_t *att = NULL;
+ int natt = -1, i;
+ double reppow;
+ real md_ljd, md_ljr, md_el, dd_el;
+ real elfac;
+ real kT_fac, mass_min;
+ int ib0, ib1, ib;
+ real rb, rl;
+ real drift;
+
+ /* Resolution of the buffer size */
+ resolution = 0.001;
+
+ env = getenv("GMX_VERLET_BUFFER_RES");
+ if (env != NULL)
+ {
+ sscanf(env, "%lf", &resolution);
+ }
+
+    /* In an atom-wise pair-list there would be no pairs in the list
+     * beyond the pair-list cut-off.
+     * However, we use a pair-list of groups vs groups of atoms.
+     * For groups of 4 atoms, matching the parallelism of SSE instructions,
+     * only 10% of the atom pairs just beyond the cut-off are not in the list.
+ * As this percentage increases slowly compared to the decrease of the
+ * Gaussian displacement distribution over this range, we can simply
+ * reduce the drift by this fraction.
+ * For larger groups, e.g. of 8 atoms, this fraction will be lower,
+     * so then the buffer size will be on the conservative (large) side.
+ *
+ * Note that the formulas used here do not take into account
+ * cancellation of errors which could occur by missing both
+ * attractive and repulsive interactions.
+ *
+ * The only major assumption is homogeneous particle distribution.
+ * For an inhomogeneous system, such as a liquid-vapor system,
+ * the buffer will be underestimated. The actual energy drift
+ * will be higher by the factor: local/homogeneous particle density.
+ *
+     * The results of this estimate have been checked against simulations.
+ * In most cases the real drift differs by less than a factor 2.
+ */
+
+ /* Worst case assumption: HCP packing of particles gives largest distance */
+ particle_distance = pow(boxvol*sqrt(2)/mtop->natoms, 1.0/3.0);
+
+ get_verlet_buffer_atomtypes(mtop, &att, &natt, n_nonlin_vsite);
+ assert(att != NULL && natt >= 0);
+
+ if (debug)
+ {
+ fprintf(debug, "particle distance assuming HCP packing: %f nm\n",
+ particle_distance);
+ fprintf(debug, "energy drift atom types: %d\n", natt);
+ }
+
+ reppow = mtop->ffparams.reppow;
+ md_ljd = 0;
+ md_ljr = 0;
+ if (ir->vdwtype == evdwCUT)
+ {
+        /* -dV/dr of -r^-6 and r^-reppow */
+ md_ljd = -6*pow(ir->rvdw, -7.0);
+ md_ljr = reppow*pow(ir->rvdw, -(reppow+1));
+ /* The contribution of the second derivative is negligible */
+ }
+ else
+ {
+ gmx_fatal(FARGS, "Energy drift calculation is only implemented for plain cut-off Lennard-Jones interactions");
+ }
+
+ elfac = ONE_4PI_EPS0/ir->epsilon_r;
+
+ /* Determine md=-dV/dr and dd=d^2V/dr^2 */
+ md_el = 0;
+ dd_el = 0;
+ if (ir->coulombtype == eelCUT || EEL_RF(ir->coulombtype))
+ {
+ real eps_rf, k_rf;
+
+ if (ir->coulombtype == eelCUT)
+ {
+ eps_rf = 1;
+ k_rf = 0;
+ }
+ else
+ {
+ eps_rf = ir->epsilon_rf/ir->epsilon_r;
+ if (eps_rf != 0)
+ {
+ k_rf = pow(ir->rcoulomb, -3.0)*(eps_rf - ir->epsilon_r)/(2*eps_rf + ir->epsilon_r);
+ }
+ else
+ {
+ /* epsilon_rf = infinity */
+ k_rf = 0.5*pow(ir->rcoulomb, -3.0);
+ }
+ }
+
+ if (eps_rf > 0)
+ {
+ md_el = elfac*(pow(ir->rcoulomb, -2.0) - 2*k_rf*ir->rcoulomb);
+ }
+ dd_el = elfac*(2*pow(ir->rcoulomb, -3.0) + 2*k_rf);
+ }
+ else if (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD)
+ {
+ real b, rc, br;
+
+ b = calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
+ rc = ir->rcoulomb;
+ br = b*rc;
+ md_el = elfac*(b*exp(-br*br)*M_2_SQRTPI/rc + gmx_erfc(br)/(rc*rc));
+ dd_el = elfac/(rc*rc)*(2*b*(1 + br*br)*exp(-br*br)*M_2_SQRTPI + 2*gmx_erfc(br)/rc);
+ }
+ else
+ {
+ gmx_fatal(FARGS, "Energy drift calculation is only implemented for Reaction-Field and Ewald electrostatics");
+ }
+
+ /* Determine the variance of the atomic displacement
+ * over nstlist-1 steps: kT_fac
+ * For inertial dynamics (not Brownian dynamics) the mass factor
+ * is not included in kT_fac, it is added later.
+ */
+ if (ir->eI == eiBD)
+ {
+ /* Get the displacement distribution from the random component only.
+ * With accurate integration the systematic (force) displacement
+ * should be negligible (unless nstlist is extremely large, which
+ * you wouldn't do anyhow).
+ */
+ kT_fac = 2*BOLTZ*ir->opts.ref_t[0]*(ir->nstlist-1)*ir->delta_t;
+ if (ir->bd_fric > 0)
+ {
+ /* This is directly sigma^2 of the displacement */
+ kT_fac /= ir->bd_fric;
+
+ /* Set the masses to 1 as kT_fac is the full sigma^2,
+ * but we divide by m in ener_drift().
+ */
+ for (i = 0; i < natt; i++)
+ {
- mass_min = att[0].mass;
++ att[i].prop.mass = 1;
+ }
+ }
+ else
+ {
+ real tau_t;
+
+ /* Per group tau_t is not implemented yet, use the maximum */
+ tau_t = ir->opts.tau_t[0];
+ for (i = 1; i < ir->opts.ngtc; i++)
+ {
+ tau_t = max(tau_t, ir->opts.tau_t[i]);
+ }
+
+ kT_fac *= tau_t;
+ /* This kT_fac needs to be divided by the mass to get sigma^2 */
+ }
+ }
+ else
+ {
+ kT_fac = BOLTZ*ir->opts.ref_t[0]*sqr((ir->nstlist-1)*ir->delta_t);
+ }
+
- mass_min = min(mass_min, att[i].mass);
++ mass_min = att[0].prop.mass;
+ for (i = 1; i < natt; i++)
+ {
++ mass_min = min(mass_min, att[i].prop.mass);
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "md_ljd %e md_ljr %e\n", md_ljd, md_ljr);
+ fprintf(debug, "md_el %e dd_el %e\n", md_el, dd_el);
+ fprintf(debug, "sqrt(kT_fac) %f\n", sqrt(kT_fac));
+ fprintf(debug, "mass_min %f\n", mass_min);
+ }
+
+ /* Search using bisection */
+ ib0 = -1;
+    /* The drift will be negligible at 5 times the max sigma */
+ ib1 = (int)(5*2*sqrt(kT_fac/mass_min)/resolution) + 1;
+ while (ib1 - ib0 > 1)
+ {
+ ib = (ib0 + ib1)/2;
+ rb = ib*resolution;
+ rl = max(ir->rvdw, ir->rcoulomb) + rb;
+
+ /* Calculate the average energy drift at the last step
+ * of the nstlist steps at which the pair-list is used.
+ */
+ drift = ener_drift(att, natt, &mtop->ffparams,
+ kT_fac,
+ md_ljd, md_ljr, md_el, dd_el, rb,
+ rl, boxvol);
+
+ /* Correct for the fact that we are using a Ni x Nj particle pair list
+ * and not a 1 x 1 particle pair list. This reduces the drift.
+ */
+ /* We don't have a formula for 8 (yet), use 4 which is conservative */
+ nb_clust_frac_pairs_not_in_list_at_cutoff =
+ surface_frac(min(list_setup->cluster_size_i, 4),
+ particle_distance, rl)*
+ surface_frac(min(list_setup->cluster_size_j, 4),
+ particle_distance, rl);
+ drift *= nb_clust_frac_pairs_not_in_list_at_cutoff;
+
+ /* Convert the drift to drift per unit time per atom */
+ drift /= ir->nstlist*ir->delta_t*mtop->natoms;
+
+ if (debug)
+ {
+ fprintf(debug, "ib %3d %3d %3d rb %.3f %dx%d fac %.3f drift %f\n",
+ ib0, ib, ib1, rb,
+ list_setup->cluster_size_i, list_setup->cluster_size_j,
+ nb_clust_frac_pairs_not_in_list_at_cutoff,
+ drift);
+ }
+
+ if (fabs(drift) > drift_target)
+ {
+ ib0 = ib;
+ }
+ else
+ {
+ ib1 = ib;
+ }
+ }
+
+ sfree(att);
+
+ *rlist = max(ir->rvdw, ir->rcoulomb) + ib1*resolution;
+}
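
A hedged sketch of how a caller might combine the two entry points of this file; the drift target is a hypothetical value, and mtop/ir/boxvol are assumed to be set up by the caller:

static real example_estimate_rlist(const gmx_mtop_t *mtop,
                                   const t_inputrec *ir,
                                   real              boxvol)
{
    verletbuf_list_setup_t ls;
    int                    n_nonlin_vsite;
    real                   rlist;

    verletbuf_get_list_setup(FALSE, &ls); /* FALSE: CPU cluster sizes */
    calc_verlet_buffer_size(mtop, boxvol, ir,
                            1e-4, /* hypothetical drift target per atom */
                            &ls, &n_nonlin_vsite, &rlist);

    return rlist; /* max(rvdw, rcoulomb) plus the estimated buffer */
}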
--- /dev/null
- #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ > 300
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#include <stdlib.h>
+#include <assert.h>
+
+#if defined(_MSVC)
+#include <limits>
+#endif
+
+#include <cuda.h>
+
+#include "types/simple.h"
+#include "types/nbnxn_pairlist.h"
+#include "types/nb_verlet.h"
+#include "types/ishift.h"
+#include "types/force_flags.h"
+#include "../nbnxn_consts.h"
+
+#ifdef TMPI_ATOMICS
+#include "thread_mpi/atomic.h"
+#endif
+
+#include "nbnxn_cuda_types.h"
+#include "../../gmxlib/cuda_tools/cudautils.cuh"
+#include "nbnxn_cuda.h"
+#include "nbnxn_cuda_data_mgmt.h"
+
++#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
+#define USE_TEXOBJ
+#endif
+
+/*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
+texture<float, 1, cudaReadModeElementType> nbfp_texref;
+
+/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
+texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
+
+/* Convenience defines */
+#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
+#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
+
+/***** The kernels come here *****/
+#include "nbnxn_cuda_kernel_utils.cuh"
+
+/* Top-level kernel generation: will generate through multiple inclusion the
+ * following flavors for all kernels:
+ * - force-only output;
+ * - force and energy output;
+ * - force-only with pair list pruning;
+ * - force and energy output with pair list pruning.
+ */
+/** Force only **/
+#include "nbnxn_cuda_kernels.cuh"
+/** Force & energy **/
+#define CALC_ENERGIES
+#include "nbnxn_cuda_kernels.cuh"
+#undef CALC_ENERGIES
+
+/*** Pair-list pruning kernels ***/
+/** Force only **/
+#define PRUNE_NBL
+#include "nbnxn_cuda_kernels.cuh"
+/** Force & energy **/
+#define CALC_ENERGIES
+#include "nbnxn_cuda_kernels.cuh"
+#undef CALC_ENERGIES
+#undef PRUNE_NBL
+
+/*! Nonbonded kernel function pointer type */
+typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
+ const cu_nbparam_t,
+ const cu_plist_t,
+ bool);
+
+/*********************************/
+
+/* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
+static bool always_ener = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
+static bool never_ener = (getenv("GMX_GPU_NEVER_ENER") != NULL);
+static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
+
+
+/* Bit-pattern used for polling-based GPU synchronization. It is used as a float
+ * and corresponds to having the exponent set to the maximum (127 -- single
+ * precision) and the mantissa to 0.
+ */
+static unsigned int poll_wait_pattern = (0x7FU << 23);
+
+/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
+static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
+{
+ int max_grid_x_size;
+
+ assert(dinfo);
+
+ max_grid_x_size = dinfo->prop.maxGridSize[0];
+
+ /* do we exceed the grid x dimension limit? */
+ if (nwork_units > max_grid_x_size)
+ {
+ gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
+ "The number of nonbonded work units (=number of super-clusters) exceeds the"
+ "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
+ }
+
+ return nwork_units;
+}
+
+
+/* Constant arrays listing all kernel function pointers and enabling selection
+ of a kernel in an elegant manner. */
+
+static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
+static const int nPruneKernelTypes = 2; /* 0 - no prune, 1 - prune */
+
+/*! Pointers to the default kernels organized in a 3 dim array by:
+ * electrostatics type, energy calculation on/off, and pruning on/off.
+ *
+ * Note that the order of electrostatics (1st dimension) has to match the
+ * order of corresponding enumerated types defined in nbnxn_cuda_types.h.
+ */
+static const nbnxn_cu_kfunc_ptr_t
+nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
+{
+ { { k_nbnxn_cutoff, k_nbnxn_cutoff_prune },
+ { k_nbnxn_cutoff_ener, k_nbnxn_cutoff_ener_prune } },
+ { { k_nbnxn_rf, k_nbnxn_rf_prune },
+ { k_nbnxn_rf_ener, k_nbnxn_rf_ener_prune } },
+ { { k_nbnxn_ewald_tab, k_nbnxn_ewald_tab_prune },
+ { k_nbnxn_ewald_tab_ener, k_nbnxn_ewald_tab_ener_prune } },
+ { { k_nbnxn_ewald_tab_twin, k_nbnxn_ewald_tab_twin_prune },
+      { k_nbnxn_ewald_tab_twin_ener,        k_nbnxn_ewald_tab_twin_ener_prune } },
+ { { k_nbnxn_ewald, k_nbnxn_ewald_prune },
+ { k_nbnxn_ewald_ener, k_nbnxn_ewald_ener_prune } },
+ { { k_nbnxn_ewald_twin, k_nbnxn_ewald_twin_prune },
+ { k_nbnxn_ewald_twin_ener, k_nbnxn_ewald_twin_ener_prune } },
+};
+
+/*! Pointers to the legacy kernels organized in a 3 dim array by:
+ * electrostatics type, energy calculation on/off, and pruning on/off.
+ *
+ * Note that the order of electrostatics (1st dimension) has to match the
+ * order of corresponding enumerated types defined in nbnxn_cuda_types.h.
+ */
+static const nbnxn_cu_kfunc_ptr_t
+nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
+{
+ { { k_nbnxn_cutoff_legacy, k_nbnxn_cutoff_prune_legacy },
+ { k_nbnxn_cutoff_ener_legacy, k_nbnxn_cutoff_ener_prune_legacy } },
+ { { k_nbnxn_rf_legacy, k_nbnxn_rf_prune_legacy },
+ { k_nbnxn_rf_ener_legacy, k_nbnxn_rf_ener_prune_legacy } },
+ { { k_nbnxn_ewald_tab_legacy, k_nbnxn_ewald_tab_prune_legacy },
+ { k_nbnxn_ewald_tab_ener_legacy, k_nbnxn_ewald_tab_ener_prune_legacy } },
+ { { k_nbnxn_ewald_tab_twin_legacy, k_nbnxn_ewald_tab_twin_prune_legacy },
+ { k_nbnxn_ewald_tab_twin_ener_legacy, k_nbnxn_ewald_tab_twin_ener_prune_legacy } },
+};
+
+/*! Return a pointer to the kernel version to be executed at the current step. */
+static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
+ bool bDoEne, bool bDoPrune)
+{
+ assert(kver < eNbnxnCuKNR);
+ assert(eeltype < eelCuNR);
+
+ if (NBNXN_KVER_LEGACY(kver))
+ {
+ /* no analytical Ewald with legacy kernels */
+ assert(eeltype <= eelCuEWALD_TAB_TWIN);
+
+ return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
+ }
+ else
+ {
+ return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
+ }
+}
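
A hypothetical selection call; eNbnxnCuKDefault and eelCuRF are assumed to be the kernel-version and electrostatics enum values from nbnxn_cuda_types.h:

static nbnxn_cu_kfunc_ptr_t example_select_rf_ener(void)
{
    /* default reaction-field kernel, energies on, pruning off */
    return select_nbnxn_kernel(eNbnxnCuKDefault, eelCuRF, true, false);
    /* resolves to k_nbnxn_rf_ener in nb_default_kfunc_ptr above */
}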
+
+/*! Calculates the amount of shared memory required for kernel version in use. */
+static inline int calc_shmem_required(int kver)
+{
+ int shmem;
+
+ /* size of shmem (force-buffers/xq/atom type preloading) */
+ if (NBNXN_KVER_LEGACY(kver))
+ {
+ /* i-atom x+q in shared memory */
+ shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
+ /* force reduction buffers in shared memory */
+ shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
+ }
+ else
+ {
+ /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
+ /* i-atom x+q in shared memory */
+ shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
+ /* cj in shared memory, for both warps separately */
+ shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
+#ifdef IATYPE_SHMEM
+ /* i-atom types in shared memory */
+ shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
+#endif
+#if __CUDA_ARCH__ < 300
+ /* force reduction buffers in shared memory */
+ shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
+#endif
+ }
+
+ return shmem;
+}
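
Worked numbers for the expression above, assuming NCL_PER_SUPERCL = 8, CL_SIZE = 8 and NBNXN_GPU_JGROUP_SIZE = 4 (the nbnxn_consts.h values at the time of writing; treat them as illustrative):

/* legacy kernel  : 8*8*sizeof(float4) + 8*8*3*sizeof(float)
 *                = 1024 + 768 = 1792 bytes per thread block
 * default, sm_2.x: 1024 (xq) + 2*4*sizeof(int) (cj) + 768 (reduction)
 *                = 1824 bytes, plus 256 for i-atom types with IATYPE_SHMEM
 * default, sm_3.x: 1024 + 32 = 1056 bytes, since the force reduction
 *                  no longer needs shared-memory buffers
 */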
+
+/*! As we execute nonbonded workload in separate streams, before launching
+   the kernel we need to make sure that the following operations have completed:
+ - atomdata allocation and related H2D transfers (every nstlist step);
+ - pair list H2D transfer (every nstlist step);
+ - shift vector H2D transfer (every nstlist step);
+ - force (+shift force and energy) output clearing (every step).
+
+ These operations are issued in the local stream at the beginning of the step
+ and therefore always complete before the local kernel launch. The non-local
+ kernel is launched after the local on the same device/context, so this is
+ inherently scheduled after the operations in the local stream (including the
+ above "misc_ops").
+ However, for the sake of having a future-proof implementation, we use the
+ misc_ops_done event to record the point in time when the above operations
+ are finished and synchronize with this event in the non-local stream.
+*/
+void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags,
+ int iloc)
+{
+ cudaError_t stat;
+ int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+ /* CUDA kernel launch-related stuff */
+ int shmem, nblock;
+ dim3 dim_block, dim_grid;
+ nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
+
+ cu_atomdata_t *adat = cu_nb->atdat;
+ cu_nbparam_t *nbp = cu_nb->nbparam;
+ cu_plist_t *plist = cu_nb->plist[iloc];
+ cu_timers_t *t = cu_nb->timers;
+ cudaStream_t stream = cu_nb->stream[iloc];
+
+ bool bCalcEner = flags & GMX_FORCE_VIRIAL;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bDoTime = cu_nb->bDoTime;
+
+ /* turn energy calculation always on/off (for debugging/testing only) */
+ bCalcEner = (bCalcEner || always_ener) && !never_ener;
+
+ /* don't launch the kernel if there is no work to do */
+ if (plist->nsci == 0)
+ {
+ return;
+ }
+
+ /* calculate the atom data index range based on locality */
+ if (LOCAL_I(iloc))
+ {
+ adat_begin = 0;
+ adat_len = adat->natoms_local;
+ }
+ else
+ {
+ adat_begin = adat->natoms_local;
+ adat_len = adat->natoms - adat->natoms_local;
+ }
+
+    /* When we get here all misc operations issued in the local stream are done,
+ so we record that in the local stream and wait for it in the nonlocal one. */
+ if (cu_nb->bUseTwoStreams)
+ {
+ if (iloc == eintLocal)
+ {
+ stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
+ CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
+ }
+ else
+ {
+ stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
+ CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
+ }
+ }
+
+ /* beginning of timed HtoD section */
+ if (bDoTime)
+ {
+ stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
+ CU_RET_ERR(stat, "cudaEventRecord failed");
+ }
+
+ /* HtoD x, q */
+ cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
+ adat_len * sizeof(*adat->xq), stream);
+
+ if (bDoTime)
+ {
+ stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
+ CU_RET_ERR(stat, "cudaEventRecord failed");
+ }
+
+ /* beginning of timed nonbonded calculation section */
+ if (bDoTime)
+ {
+ stat = cudaEventRecord(t->start_nb_k[iloc], stream);
+ CU_RET_ERR(stat, "cudaEventRecord failed");
+ }
+
+ /* get the pointer to the kernel flavor we need to use */
+ nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
+ plist->bDoPrune || always_prune);
+
+ /* kernel launch config */
+ nblock = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
+ dim_block = dim3(CL_SIZE, CL_SIZE, 1);
+ dim_grid = dim3(nblock, 1, 1);
+ shmem = calc_shmem_required(cu_nb->kernel_ver);
+
+ if (debug)
+ {
+ fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
+ "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+ dim_block.x, dim_block.y, dim_block.z,
+ dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
+ NCL_PER_SUPERCL, plist->na_c);
+ }
+
+ nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
+ CU_LAUNCH_ERR("k_calc_nb");
+
+ if (bDoTime)
+ {
+ stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
+ CU_RET_ERR(stat, "cudaEventRecord failed");
+ }
+}
+
+void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags,
+ int aloc)
+{
+ cudaError_t stat;
+ int adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
+ int iloc = -1;
+
+ /* determine interaction locality from atom locality */
+ if (LOCAL_A(aloc))
+ {
+ iloc = eintLocal;
+ }
+ else if (NONLOCAL_A(aloc))
+ {
+ iloc = eintNonlocal;
+ }
+ else
+ {
+ char stmp[STRLEN];
+ sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
+ "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
+ gmx_incons(stmp);
+ }
+
+ cu_atomdata_t *adat = cu_nb->atdat;
+ cu_timers_t *t = cu_nb->timers;
+ bool bDoTime = cu_nb->bDoTime;
+ cudaStream_t stream = cu_nb->stream[iloc];
+
+ bool bCalcEner = flags & GMX_FORCE_VIRIAL;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+
+ /* don't launch copy-back if there was no work to do */
+ if (cu_nb->plist[iloc]->nsci == 0)
+ {
+ return;
+ }
+
+ /* calculate the atom data index range based on locality */
+ if (LOCAL_A(aloc))
+ {
+ adat_begin = 0;
+ adat_len = adat->natoms_local;
+ adat_end = cu_nb->atdat->natoms_local;
+ }
+ else
+ {
+ adat_begin = adat->natoms_local;
+ adat_len = adat->natoms - adat->natoms_local;
+ adat_end = cu_nb->atdat->natoms;
+ }
+
+ /* beginning of timed D2H section */
+ if (bDoTime)
+ {
+ stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
+ CU_RET_ERR(stat, "cudaEventRecord failed");
+ }
+
+ if (!cu_nb->bUseStreamSync)
+ {
+ /* For safety reasons set a few (5%) forces to NaN. This way even if the
+ polling "hack" fails with some future NVIDIA driver we'll get a crash. */
+ for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
+ {
+#ifdef NAN
+ nbatom->out[0].f[i] = NAN;
+#else
+# ifdef _MSVC
+ if (numeric_limits<float>::has_quiet_NaN)
+ {
+ nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
+ }
+ else
+# endif
+ {
+ nbatom->out[0].f[i] = GMX_REAL_MAX;
+ }
+#endif
+ }
+
+ /* Set the last four bytes of the force array to a bit pattern
+ which can't be the result of the force calculation:
+ max exponent (127) and zero mantissa. */
+ *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
+ }
+
+ /* With DD the local D2H transfer can only start after the non-local
+ has been launched. */
+ if (iloc == eintLocal && cu_nb->bUseTwoStreams)
+ {
+ stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
+ CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
+ }
+
+ /* DtoH f */
+ cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
+ (adat_len)*sizeof(*adat->f), stream);
+
+ /* After the non-local D2H is launched the nonlocal_done event can be
+ recorded which signals that the local D2H can proceed. This event is not
+ placed after the non-local kernel because we first need the non-local
+       data back. */
+ if (iloc == eintNonlocal)
+ {
+ stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
+ CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
+ }
+
+ /* only transfer energies in the local stream */
+ if (LOCAL_I(iloc))
+ {
+ /* DtoH fshift */
+ if (bCalcFshift)
+ {
+ cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
+ SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
+ }
+
+ /* DtoH energies */
+ if (bCalcEner)
+ {
+ cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
+ sizeof(*cu_nb->nbst.e_lj), stream);
+ cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
+ sizeof(*cu_nb->nbst.e_el), stream);
+ }
+ }
+
+ if (bDoTime)
+ {
+ stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
+ CU_RET_ERR(stat, "cudaEventRecord failed");
+ }
+}
+
+/* Atomic compare-exchange operation on unsigned values. It is used in
+ * polling wait for the GPU.
+ */
+static inline bool atomic_cas(volatile unsigned int *ptr,
+ unsigned int oldval,
+ unsigned int newval)
+{
+ assert(ptr);
+
+#ifdef TMPI_ATOMICS
+ return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
+#else
+ gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
+ return true;
+#endif
+}
+
+void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags, int aloc,
+ real *e_lj, real *e_el, rvec *fshift)
+{
+ /* NOTE: only implemented for single-precision at this time */
+ cudaError_t stat;
+ int i, adat_end, iloc = -1;
+ volatile unsigned int *poll_word;
+
+ /* determine interaction locality from atom locality */
+ if (LOCAL_A(aloc))
+ {
+ iloc = eintLocal;
+ }
+ else if (NONLOCAL_A(aloc))
+ {
+ iloc = eintNonlocal;
+ }
+ else
+ {
+ char stmp[STRLEN];
+ sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
+ "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
+ gmx_incons(stmp);
+ }
+
+ cu_plist_t *plist = cu_nb->plist[iloc];
+ cu_timers_t *timers = cu_nb->timers;
+ wallclock_gpu_t *timings = cu_nb->timings;
+ nb_staging nbst = cu_nb->nbst;
+
+ bool bCalcEner = flags & GMX_FORCE_VIRIAL;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+
+ /* turn energy calculation always on/off (for debugging/testing only) */
+ bCalcEner = (bCalcEner || always_ener) && !never_ener;
+
+ /* don't launch wait/update timers & counters if there was no work to do
+
+ NOTE: if timing with multiple GPUs (streams) becomes possible, the
+ counters could end up being inconsistent due to not being incremented
+ on some of the nodes! */
+ if (cu_nb->plist[iloc]->nsci == 0)
+ {
+ return;
+ }
+
+ /* calculate the atom data index range based on locality */
+ if (LOCAL_A(aloc))
+ {
+ adat_end = cu_nb->atdat->natoms_local;
+ }
+ else
+ {
+ adat_end = cu_nb->atdat->natoms;
+ }
+
+ if (cu_nb->bUseStreamSync)
+ {
+ stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
+ CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
+ }
+ else
+ {
+        /* Busy-wait until we get the signal pattern set in the last 4 bytes
+ * of the l/nl float vector. This pattern corresponds to a floating
+ * point number which can't be the result of the force calculation
+ * (maximum, 127 exponent and 0 mantissa).
+ * The polling uses atomic compare-exchange.
+ */
+ poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
+ while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
+ }
+
+ /* timing data accumulation */
+ if (cu_nb->bDoTime)
+ {
+ /* only increase counter once (at local F wait) */
+ if (LOCAL_I(iloc))
+ {
+ timings->nb_c++;
+ timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
+ }
+
+ /* kernel timings */
+ timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
+ cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
+
+ /* X/q H2D and F D2H timings */
+ timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
+ timers->stop_nb_h2d[iloc]);
+ timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
+ timers->stop_nb_d2h[iloc]);
+
+ /* only count atdat and pair-list H2D at pair-search step */
+ if (plist->bDoPrune)
+ {
+ /* atdat transfer timing (add only once, at local F wait) */
+ if (LOCAL_A(aloc))
+ {
+ timings->pl_h2d_c++;
+ timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
+ timers->stop_atdat);
+ }
+
+ timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
+ timers->stop_pl_h2d[iloc]);
+ }
+ }
+
+ /* add up energies and shift forces (only once at local F wait) */
+ if (LOCAL_I(iloc))
+ {
+ if (bCalcEner)
+ {
+ *e_lj += *nbst.e_lj;
+ *e_el += *nbst.e_el;
+ }
+
+ if (bCalcFshift)
+ {
+ for (i = 0; i < SHIFTS; i++)
+ {
+ fshift[i][0] += nbst.fshift[i].x;
+ fshift[i][1] += nbst.fshift[i].y;
+ fshift[i][2] += nbst.fshift[i].z;
+ }
+ }
+ }
+
+ /* turn off pruning (doesn't matter if this is pair-search step or not) */
+ plist->bDoPrune = false;
+}
+
+/*! Return the reference to the nbfp texture. */
+const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
+{
+ return nbfp_texref;
+}
+
+/*! Return the reference to the coulomb_tab. */
+const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
+{
+ return coulomb_tab_texref;
+}
+
+/*! Set up the cache configuration for the non-bonded kernels.
+ */
+void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
+{
+ cudaError_t stat;
+
+ for (int i = 0; i < eelCuNR; i++)
+ {
+ for (int j = 0; j < nEnergyKernelTypes; j++)
+ {
+ for (int k = 0; k < nPruneKernelTypes; k++)
+ {
+ /* Legacy kernel 16/48 kB Shared/L1
+ * No analytical Ewald!
+ */
+ if (i != eelCuEWALD_ANA && i != eelCuEWALD_ANA_TWIN)
+ {
+ stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
+ CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+ }
+
+ if (devinfo->prop.major >= 3)
+ {
+ /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
+ stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
+ }
+ else
+ {
+                    /* On Fermi, preferring L1 gives 2% higher performance */
+ /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
+ stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
+ }
+ CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+ }
+ }
+ }
+}
--- /dev/null
- /* Set the stride for the lookup of the two LJ parameters from their
- * (padded) array.
- * Note that currently only arrays with stride 2 and 4 are available.
- * Since the reference code does not require alignment, we can always use 2.
- */
- static const int nbfp_stride = 2;
-
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef _nbnxn_kernel_simd_utils_h_
+#define _nbnxn_kernel_simd_utils_h_
+
+/*! \brief Provides hardware-specific utility routines for the SIMD kernels.
+ *
+ * Defines all functions, typedefs, constants and macros that have
+ * explicit dependencies on the j-cluster size, precision, or SIMD
+ * width. This includes handling diagonal, Newton and topology
+ * exclusions.
+ *
+ * The functionality which depends on the j-cluster size is:
+ * LJ-parameter lookup
+ * force table lookup
+ * energy group pair energy storage
+ */
+
+#if !defined GMX_NBNXN_SIMD_2XNN && !defined GMX_NBNXN_SIMD_4XN
+#error "Must define an NBNxN kernel flavour before including NBNxN kernel utility functions"
+#endif
+
+#ifdef GMX_SIMD_REFERENCE_PLAIN_C
+
+/* Align a stack-based thread-local working array. */
+static gmx_inline int *
+prepare_table_load_buffer(const int *array)
+{
+ return NULL;
+}
+
+#include "nbnxn_kernel_simd_utils_ref.h"
+
+#else /* GMX_SIMD_REFERENCE_PLAIN_C */
+
+#ifdef GMX_X86_SSE2
+/* Include x86 SSE2 compatible SIMD functions */
+
+/* Set the stride for the lookup of the two LJ parameters from their
+ * (padded) array. We use the minimum supported SIMD memory alignment.
+ */
+#if defined GMX_DOUBLE
+static const int nbfp_stride = 2;
+#else
+static const int nbfp_stride = 4;
+#endif
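+
+/* Illustrative sketch (not part of the original source): with this stride,
+ * the two LJ parameters of atom type ti would be looked up as
+ *   c6  = nbfp[ti*nbfp_stride];
+ *   c12 = nbfp[ti*nbfp_stride + 1];
+ * with any remaining elements per type serving as alignment padding.
+ */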
+
+/* Align a stack-based thread-local working array. Table loads on
+ * full-width AVX_256 use the array, but other implementations do
+ * not. */
+static gmx_inline int *
+prepare_table_load_buffer(const int *array)
+{
+#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+ return gmx_simd_align_int(array);
+#else
+ return NULL;
+#endif
+}
+
+#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+
+/* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
+#if GMX_SIMD_WIDTH_HERE == 8
+#define TAB_FDV0
+#endif
+
+/*
+Berk, 2xnn.c had the following code, but I think it is safe to remove now, given the code immediately above.
+
+#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
+/ * AVX-256 single precision 2x(4+4) kernel,
+ * we can do half SIMD-width aligned FDV0 table loads.
+ * /
+#define TAB_FDV0
+#endif
+*/
+
+#ifdef GMX_DOUBLE
+#include "nbnxn_kernel_simd_utils_x86_256d.h"
+#else /* GMX_DOUBLE */
+#include "nbnxn_kernel_simd_utils_x86_256s.h"
+#endif /* GMX_DOUBLE */
+
+#else /* defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
+
+/* We use the FDV0 table layout when we can use aligned table loads */
+#if GMX_SIMD_WIDTH_HERE == 4
+#define TAB_FDV0
+#endif
+
+#ifdef GMX_DOUBLE
+#include "nbnxn_kernel_simd_utils_x86_128d.h"
+#else /* GMX_DOUBLE */
+#include "nbnxn_kernel_simd_utils_x86_128s.h"
+#endif /* GMX_DOUBLE */
+
+#endif /* defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
+
+#else /* GMX_X86_SSE2 */
+
+#if GMX_SIMD_WIDTH_HERE > 4
+static const int nbfp_stride = 4;
+#else
+static const int nbfp_stride = GMX_SIMD_WIDTH_HERE;
+#endif
+
+/* We use the FDV0 table layout when we can use aligned table loads */
+#if GMX_SIMD_WIDTH_HERE == 4
+#define TAB_FDV0
+#endif
+
+#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+#include "nbnxn_kernel_simd_utils_ibm_qpx.h"
+#endif /* GMX_CPU_ACCELERATION_IBM_QPX */
+
+#endif /* GMX_X86_SSE2 */
+#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
+
+
+#ifdef UNROLLJ
+/* Add energy register to possibly multiple terms in the energy array */
+static inline void add_ener_grp(gmx_mm_pr e_S, real *v, const int *offset_jj)
+{
+ int jj;
+
+ /* We need to balance the number of store operations with
+     * the rapidly increasing number of combinations of energy groups.
+ * We add to a temporary buffer for 1 i-group vs 2 j-groups.
+ */
+ for (jj = 0; jj < (UNROLLJ/2); jj++)
+ {
+ gmx_mm_pr v_S;
+
+ v_S = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
+ gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE, gmx_add_pr(v_S, e_S));
+ }
+}
+#endif
+
+#if defined GMX_NBNXN_SIMD_2XNN && defined UNROLLJ
+/* As add_ener_grp, but for two groups of UNROLLJ/2 stored in
+ * a single SIMD register.
+ */
+static inline void
+add_ener_grp_halves(gmx_mm_pr e_S, real *v0, real *v1, const int *offset_jj)
+{
+ gmx_mm_hpr e_S0, e_S1;
+ int jj;
+
+ gmx_pr_to_2hpr(e_S, &e_S0, &e_S1);
+
+ for (jj = 0; jj < (UNROLLJ/2); jj++)
+ {
+ gmx_mm_hpr v_S;
+
+ gmx_load_hpr(&v_S, v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S0));
+ }
+ for (jj = 0; jj < (UNROLLJ/2); jj++)
+ {
+ gmx_mm_hpr v_S;
+
+ gmx_load_hpr(&v_S, v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S1));
+ }
+}
+#endif
+
+#endif /* _nbnxn_kernel_simd_utils_h_ */
--- /dev/null
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include "repl_ex.h"
+#include "network.h"
+#include "random.h"
+#include "smalloc.h"
+#include "physics.h"
+#include "copyrite.h"
+#include "macros.h"
+#include "vec.h"
+#include "names.h"
+#include "mvdata.h"
+#include "domdec.h"
+#include "partdec.h"
+
+#define PROBABILITYCUTOFF 100
+/* we don't bother evaluating if events are rarer than exp(-100) = 3.7x10^-44 */
+
+enum {
+ ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR
+};
+const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"};
+/* end_single_marker merely notes the end of single variable replica exchange. All types higher than
+ it are multiple replica exchange methods */
+/* Eventually, should add 'pressure', 'temperature and pressure', 'lambda_and_pressure', 'temperature_lambda_pressure'?;
+ Let's wait until we feel better about the pressure control methods giving exact ensembles. Right now, we assume constant pressure */
+
+typedef struct gmx_repl_ex
+{
+    int      repl;        /* replica ID */
+    int      nrepl;       /* total number of replicas */
+    real     temp;        /* temperature */
+    int      type;        /* replica exchange type from ere enum */
+    real   **q;           /* quantity, e.g. temperature or lambda; first index is ere, second is replica ID */
+    gmx_bool bNPT;        /* use constant pressure and temperature */
+    real    *pres;        /* replica pressures */
+    int     *ind;         /* replica indices, sorted in increasing quantity order */
+    int     *allswaps;    /* used for keeping track of all the replica swaps */
+    int      nst;         /* replica exchange interval (number of steps) */
+    int      nex;         /* number of exchanges per exchange interval */
+    int      seed;        /* random seed */
+    int      nattempt[2]; /* number of even and odd replica exchange attempts */
+    real    *prob_sum;    /* sum of exchange probabilities */
+    int    **nmoves;      /* number of moves between replicas i and j */
+    int     *nexchange;   /* i-th element is the number of exchanges of the neighbor pair i-1,i */
+
+ /* these are helper arrays for replica exchange; allocated here so they
+ don't have to be allocated each time */
+ int *destinations;
+ int **cyclic;
+ int **order;
+ int *tmpswap;
+ gmx_bool *incycle;
+ gmx_bool *bEx;
+
+ /* helper arrays to hold the quantities that are exchanged */
+ real *prob;
+ real *Epot;
+ real *beta;
+ real *Vol;
+ real **de;
+
+} t_gmx_repl_ex;
+
+static gmx_bool repl_quantity(const gmx_multisim_t *ms,
+ struct gmx_repl_ex *re, int ere, real q)
+{
+ real *qall;
+ gmx_bool bDiff;
+ int i, s;
+
+ snew(qall, ms->nsim);
+ qall[re->repl] = q;
+ gmx_sum_sim(ms->nsim, qall, ms);
+
+ bDiff = FALSE;
+ for (s = 1; s < ms->nsim; s++)
+ {
+ if (qall[s] != qall[0])
+ {
+ bDiff = TRUE;
+ }
+ }
+
+ if (bDiff)
+ {
+ /* Set the replica exchange type and quantities */
+ re->type = ere;
+
+ snew(re->q[ere], re->nrepl);
+ for (s = 0; s < ms->nsim; s++)
+ {
+ re->q[ere][s] = qall[s];
+ }
+ }
+ sfree(qall);
+ return bDiff;
+}
+
+gmx_repl_ex_t init_replica_exchange(FILE *fplog,
+ const gmx_multisim_t *ms,
+ const t_state *state,
+ const t_inputrec *ir,
+ int nst, int nex, int init_seed)
+{
+ real temp, pres;
+ int i, j, k;
+ struct gmx_repl_ex *re;
+ gmx_bool bTemp;
+ gmx_bool bLambda = FALSE;
+
+ fprintf(fplog, "\nInitializing Replica Exchange\n");
+
+ if (ms == NULL || ms->nsim == 1)
+ {
+ gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multi option of mdrun?");
+ }
+
+ snew(re, 1);
+
+ re->repl = ms->sim;
+ re->nrepl = ms->nsim;
+ snew(re->q, ereENDSINGLE);
+
+ fprintf(fplog, "Repl There are %d replicas:\n", re->nrepl);
+
+ check_multi_int(fplog, ms, state->natoms, "the number of atoms", FALSE);
+ check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE);
+ check_multi_large_int(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE);
+ check_multi_large_int(fplog, ms, (ir->init_step+nst-1)/nst,
+ "first exchange step: init_step/-replex", FALSE);
+ check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE);
+ check_multi_int(fplog, ms, ir->opts.ngtc,
+ "the number of temperature coupling groups", FALSE);
+ check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE);
+ check_multi_int(fplog, ms, ir->efep, "free energy", FALSE);
+ check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE);
+
+ re->temp = ir->opts.ref_t[0];
+ for (i = 1; (i < ir->opts.ngtc); i++)
+ {
+ if (ir->opts.ref_t[i] != re->temp)
+ {
+ fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+ fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+ }
+ }
+
+ re->type = -1;
+ bTemp = repl_quantity(ms, re, ereTEMP, re->temp);
+ if (ir->efep != efepNO)
+ {
+ bLambda = repl_quantity(ms, re, ereLAMBDA, (real)ir->fepvals->init_fep_state);
+ }
+ if (re->type == -1) /* nothing was assigned */
+ {
+ gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl);
+ }
+ if (bLambda && bTemp)
+ {
+ re->type = ereTL;
+ }
+
+ if (bTemp)
+ {
+ please_cite(fplog, "Sugita1999a");
+ if (ir->epc != epcNO)
+ {
+ re->bNPT = TRUE;
+ fprintf(fplog, "Repl Using Constant Pressure REMD.\n");
+ please_cite(fplog, "Okabe2001a");
+ }
+ if (ir->etc == etcBERENDSEN)
+ {
+ gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead",
+ ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
+ }
+ }
+ if (bLambda)
+ {
+ if (ir->fepvals->delta_lambda != 0) /* check this? */
+ {
+ gmx_fatal(FARGS, "delta_lambda is not zero");
+ }
+ }
+ if (re->bNPT)
+ {
+ snew(re->pres, re->nrepl);
+ if (ir->epct == epctSURFACETENSION)
+ {
+ pres = ir->ref_p[ZZ][ZZ];
+ }
+ else
+ {
+ pres = 0;
+ j = 0;
+ for (i = 0; i < DIM; i++)
+ {
+ if (ir->compress[i][i] != 0)
+ {
+ pres += ir->ref_p[i][i];
+ j++;
+ }
+ }
+ pres /= j;
+ }
+ re->pres[re->repl] = pres;
+ gmx_sum_sim(re->nrepl, re->pres, ms);
+ }
+
+ /* Make an index for increasing replica order */
+ /* only makes sense if one or the other is varying, not both!
+ if both are varying, we trust the order the person gave. */
+ snew(re->ind, re->nrepl);
+ for (i = 0; i < re->nrepl; i++)
+ {
+ re->ind[i] = i;
+ }
+
+ if (re->type < ereENDSINGLE)
+ {
+
+ for (i = 0; i < re->nrepl; i++)
+ {
+ for (j = i+1; j < re->nrepl; j++)
+ {
+ if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
+ {
+ k = re->ind[i];
+ re->ind[i] = re->ind[j];
+ re->ind[j] = k;
+ }
+ else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]])
+ {
+ gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]);
+ }
+ }
+ }
+ }
+
+ /* keep track of all the swaps, starting with the initial placement. */
+ snew(re->allswaps, re->nrepl);
+ for (i = 0; i < re->nrepl; i++)
+ {
+ re->allswaps[i] = re->ind[i];
+ }
+
+ switch (re->type)
+ {
+ case ereTEMP:
+ fprintf(fplog, "\nReplica exchange in temperature\n");
+ for (i = 0; i < re->nrepl; i++)
+ {
+ fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]);
+ }
+ fprintf(fplog, "\n");
+ break;
+ case ereLAMBDA:
+ fprintf(fplog, "\nReplica exchange in lambda\n");
+ for (i = 0; i < re->nrepl; i++)
+ {
+ fprintf(fplog, " %3d", (int)re->q[re->type][re->ind[i]]);
+ }
+ fprintf(fplog, "\n");
+ break;
+ case ereTL:
+ fprintf(fplog, "\nReplica exchange in temperature and lambda state\n");
+ for (i = 0; i < re->nrepl; i++)
+ {
+ fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]);
+ }
+ fprintf(fplog, "\n");
+ for (i = 0; i < re->nrepl; i++)
+ {
+ fprintf(fplog, " %5d", (int)re->q[ereLAMBDA][re->ind[i]]);
+ }
+ fprintf(fplog, "\n");
+ break;
+ default:
+ gmx_incons("Unknown replica exchange quantity");
+ }
+ if (re->bNPT)
+ {
+ fprintf(fplog, "\nRepl p");
+ for (i = 0; i < re->nrepl; i++)
+ {
+ fprintf(fplog, " %5.2f", re->pres[re->ind[i]]);
+ }
+
+ for (i = 0; i < re->nrepl; i++)
+ {
+ if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]]))
+ {
+ fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+ fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+ }
+ }
+ }
+ re->nst = nst;
+ if (init_seed == -1)
+ {
+ if (MASTERSIM(ms))
+ {
+ re->seed = make_seed();
+ }
+ else
+ {
+ re->seed = 0;
+ }
+ gmx_sumi_sim(1, &(re->seed), ms);
+ }
+ else
+ {
+ re->seed = init_seed;
+ }
+ fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst);
+ fprintf(fplog, "\nReplica random seed: %d\n", re->seed);
+
+ re->nattempt[0] = 0;
+ re->nattempt[1] = 0;
+
+ snew(re->prob_sum, re->nrepl);
+ snew(re->nexchange, re->nrepl);
+ snew(re->nmoves, re->nrepl);
+ for (i = 0; i < re->nrepl; i++)
+ {
+ snew(re->nmoves[i], re->nrepl);
+ }
+ fprintf(fplog, "Replica exchange information below: x=exchange, pr=probability\n");
+
+    /* generate space for the helper arrays so we don't have to snew each time */
+
+ snew(re->destinations, re->nrepl);
+ snew(re->incycle, re->nrepl);
+ snew(re->tmpswap, re->nrepl);
+ snew(re->cyclic, re->nrepl);
+ snew(re->order, re->nrepl);
+ for (i = 0; i < re->nrepl; i++)
+ {
+ snew(re->cyclic[i], re->nrepl);
+ snew(re->order[i], re->nrepl);
+ }
+    /* allocate space for the arrays storing the data for the replicas */
+    /* not all of these arrays are needed in all cases, but they don't take
+       up much space, since the max size is nrepl**2 */
+ snew(re->prob, re->nrepl);
+ snew(re->bEx, re->nrepl);
+ snew(re->beta, re->nrepl);
+ snew(re->Vol, re->nrepl);
+ snew(re->Epot, re->nrepl);
+ snew(re->de, re->nrepl);
+ for (i = 0; i < re->nrepl; i++)
+ {
+ snew(re->de[i], re->nrepl);
+ }
+ re->nex = nex;
+ return re;
+}
+
+static void exchange_reals(const gmx_multisim_t *ms, int b, real *v, int n)
+{
+ real *buf;
+ int i;
+
+ if (v)
+ {
+ snew(buf, n);
+#ifdef GMX_MPI
+ /*
+ MPI_Sendrecv(v, n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
+ buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
+ ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+ */
+ {
+ MPI_Request mpi_req;
+
+ MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+ ms->mpi_comm_masters, &mpi_req);
+ MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+ ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+ }
+#endif
+ for (i = 0; i < n; i++)
+ {
+ v[i] = buf[i];
+ }
+ sfree(buf);
+ }
+}
+
+
+static void exchange_ints(const gmx_multisim_t *ms, int b, int *v, int n)
+{
+ int *buf;
+ int i;
+
+ if (v)
+ {
+ snew(buf, n);
+#ifdef GMX_MPI
+ /*
+ MPI_Sendrecv(v, n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0,
+ buf,n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0,
+ ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+ */
+ {
+ MPI_Request mpi_req;
+
+ MPI_Isend(v, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0,
+ ms->mpi_comm_masters, &mpi_req);
+ MPI_Recv(buf, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0,
+ ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+ }
+#endif
+ for (i = 0; i < n; i++)
+ {
+ v[i] = buf[i];
+ }
+ sfree(buf);
+ }
+}
+
+static void exchange_doubles(const gmx_multisim_t *ms, int b, double *v, int n)
+{
+ double *buf;
+ int i;
+
+ if (v)
+ {
+ snew(buf, n);
+#ifdef GMX_MPI
+ /*
+ MPI_Sendrecv(v, n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
+ buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
+ ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+ */
+ {
+ MPI_Request mpi_req;
+
+ MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+ ms->mpi_comm_masters, &mpi_req);
+ MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+ ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+ }
+#endif
+ for (i = 0; i < n; i++)
+ {
+ v[i] = buf[i];
+ }
+ sfree(buf);
+ }
+}
+
+static void exchange_rvecs(const gmx_multisim_t *ms, int b, rvec *v, int n)
+{
+ rvec *buf;
+ int i;
+
+ if (v)
+ {
+ snew(buf, n);
+#ifdef GMX_MPI
+ /*
+ MPI_Sendrecv(v[0], n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
+ buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
+ ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+ */
+ {
+ MPI_Request mpi_req;
+
+ MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+ ms->mpi_comm_masters, &mpi_req);
+ MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+ ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+ }
+#endif
+ for (i = 0; i < n; i++)
+ {
+ copy_rvec(buf[i], v[i]);
+ }
+ sfree(buf);
+ }
+}
+
+static void exchange_state(const gmx_multisim_t *ms, int b, t_state *state)
+{
+ /* When t_state changes, this code should be updated. */
+ int ngtc, nnhpres;
+ ngtc = state->ngtc * state->nhchainlength;
+ nnhpres = state->nnhpres* state->nhchainlength;
+ exchange_rvecs(ms, b, state->box, DIM);
+ exchange_rvecs(ms, b, state->box_rel, DIM);
+ exchange_rvecs(ms, b, state->boxv, DIM);
+ exchange_reals(ms, b, &(state->veta), 1);
+ exchange_reals(ms, b, &(state->vol0), 1);
+ exchange_rvecs(ms, b, state->svir_prev, DIM);
+ exchange_rvecs(ms, b, state->fvir_prev, DIM);
+ exchange_rvecs(ms, b, state->pres_prev, DIM);
+ exchange_doubles(ms, b, state->nosehoover_xi, ngtc);
+ exchange_doubles(ms, b, state->nosehoover_vxi, ngtc);
+ exchange_doubles(ms, b, state->nhpres_xi, nnhpres);
+ exchange_doubles(ms, b, state->nhpres_vxi, nnhpres);
+ exchange_doubles(ms, b, state->therm_integral, state->ngtc);
+ exchange_rvecs(ms, b, state->x, state->natoms);
+ exchange_rvecs(ms, b, state->v, state->natoms);
+ exchange_rvecs(ms, b, state->sd_X, state->natoms);
+}
+
+static void copy_rvecs(rvec *s, rvec *d, int n)
+{
+ int i;
+
+ if (d != NULL)
+ {
+ for (i = 0; i < n; i++)
+ {
+ copy_rvec(s[i], d[i]);
+ }
+ }
+}
+
+static void copy_doubles(const double *s, double *d, int n)
+{
+ int i;
+
+ if (d != NULL)
+ {
+ for (i = 0; i < n; i++)
+ {
+ d[i] = s[i];
+ }
+ }
+}
+
+static void copy_reals(const real *s, real *d, int n)
+{
+ int i;
+
+ if (d != NULL)
+ {
+ for (i = 0; i < n; i++)
+ {
+ d[i] = s[i];
+ }
+ }
+}
+
+static void copy_ints(const int *s, int *d, int n)
+{
+ int i;
+
+ if (d != NULL)
+ {
+ for (i = 0; i < n; i++)
+ {
+ d[i] = s[i];
+ }
+ }
+}
+
+#define scopy_rvecs(v, n) copy_rvecs(state->v, state_local->v, n);
+#define scopy_doubles(v, n) copy_doubles(state->v, state_local->v, n);
+#define scopy_reals(v, n) copy_reals(state->v, state_local->v, n);
+#define scopy_ints(v, n) copy_ints(state->v, state_local->v, n);
+
+static void copy_state_nonatomdata(t_state *state, t_state *state_local)
+{
+ /* When t_state changes, this code should be updated. */
+ int ngtc, nnhpres;
+ ngtc = state->ngtc * state->nhchainlength;
+ nnhpres = state->nnhpres* state->nhchainlength;
+ scopy_rvecs(box, DIM);
+ scopy_rvecs(box_rel, DIM);
+ scopy_rvecs(boxv, DIM);
+ state_local->veta = state->veta;
+ state_local->vol0 = state->vol0;
+ scopy_rvecs(svir_prev, DIM);
+ scopy_rvecs(fvir_prev, DIM);
+ scopy_rvecs(pres_prev, DIM);
+ scopy_doubles(nosehoover_xi, ngtc);
+ scopy_doubles(nosehoover_vxi, ngtc);
+ scopy_doubles(nhpres_xi, nnhpres);
+ scopy_doubles(nhpres_vxi, nnhpres);
+ scopy_doubles(therm_integral, state->ngtc);
+ scopy_rvecs(x, state->natoms);
+ scopy_rvecs(v, state->natoms);
+ scopy_rvecs(sd_X, state->natoms);
+ copy_ints(&(state->fep_state), &(state_local->fep_state), 1);
+ scopy_reals(lambda, efptNR);
+}
+
+static void scale_velocities(t_state *state, real fac)
+{
+ int i;
+
+ if (state->v)
+ {
+ for (i = 0; i < state->natoms; i++)
+ {
+ svmul(fac, state->v[i], state->v[i]);
+ }
+ }
+}
+
+static void pd_collect_state(const t_commrec *cr, t_state *state)
+{
+ int shift;
+
+ if (debug)
+ {
+ fprintf(debug, "Collecting state before exchange\n");
+ }
+ shift = cr->nnodes - cr->npmenodes - 1;
+ move_rvecs(cr, FALSE, FALSE, state->x, NULL, shift, NULL);
+ if (state->v)
+ {
+ move_rvecs(cr, FALSE, FALSE, state->v, NULL, shift, NULL);
+ }
+ if (state->sd_X)
+ {
+ move_rvecs(cr, FALSE, FALSE, state->sd_X, NULL, shift, NULL);
+ }
+}
+
+static void print_transition_matrix(FILE *fplog, int n, int **nmoves, int *nattempt)
+{
+ int i, j, ntot;
+ float Tprint;
+
+ ntot = nattempt[0] + nattempt[1];
+ fprintf(fplog, "\n");
+ fprintf(fplog, "Repl");
+ for (i = 0; i < n; i++)
+ {
+ fprintf(fplog, " "); /* put the title closer to the center */
+ }
+ fprintf(fplog, "Empirical Transition Matrix\n");
+
+ fprintf(fplog, "Repl");
+ for (i = 0; i < n; i++)
+ {
+ fprintf(fplog, "%8d", (i+1));
+ }
+ fprintf(fplog, "\n");
+
+ for (i = 0; i < n; i++)
+ {
+ fprintf(fplog, "Repl");
+ for (j = 0; j < n; j++)
+ {
+ Tprint = 0.0;
+ if (nmoves[i][j] > 0)
+ {
+ Tprint = nmoves[i][j]/(2.0*ntot);
+ }
+ fprintf(fplog, "%8.4f", Tprint);
+ }
+ fprintf(fplog, "%3d\n", i);
+ }
+}
+
+static void print_ind(FILE *fplog, const char *leg, int n, int *ind, gmx_bool *bEx)
+{
+ int i;
+
+ fprintf(fplog, "Repl %2s %2d", leg, ind[0]);
+ for (i = 1; i < n; i++)
+ {
+ fprintf(fplog, " %c %2d", (bEx != 0 && bEx[i]) ? 'x' : ' ', ind[i]);
+ }
+ fprintf(fplog, "\n");
+}
+
+static void print_allswitchind(FILE *fplog, int n, int *pind, int *allswaps, int *tmpswap)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ {
+ tmpswap[i] = allswaps[i];
+ }
+ for (i = 0; i < n; i++)
+ {
+ allswaps[i] = tmpswap[pind[i]];
+ }
+
+ fprintf(fplog, "\nAccepted Exchanges: ");
+ for (i = 0; i < n; i++)
+ {
+ fprintf(fplog, "%d ", pind[i]);
+ }
+ fprintf(fplog, "\n");
+
+ /* the "Order After Exchange" is the state label corresponding to the configuration that
+ started in state listed in order, i.e.
+
+ 3 0 1 2
+
+ means that the:
+ configuration starting in simulation 3 is now in simulation 0,
+ configuration starting in simulation 0 is now in simulation 1,
+ configuration starting in simulation 1 is now in simulation 2,
+ configuration starting in simulation 2 is now in simulation 3
+ */
+ fprintf(fplog, "Order After Exchange: ");
+ for (i = 0; i < n; i++)
+ {
+ fprintf(fplog, "%d ", allswaps[i]);
+ }
+ fprintf(fplog, "\n\n");
+}
+
+static void print_prob(FILE *fplog, const char *leg, int n, real *prob)
+{
+ int i;
+ char buf[8];
+
+ fprintf(fplog, "Repl %2s ", leg);
+ for (i = 1; i < n; i++)
+ {
+ if (prob[i] >= 0)
+ {
+ sprintf(buf, "%4.2f", prob[i]);
+ fprintf(fplog, " %3s", buf[0] == '1' ? "1.0" : buf+1);
+ }
+ else
+ {
+ fprintf(fplog, " ");
+ }
+ }
+ fprintf(fplog, "\n");
+}
+
+static void print_count(FILE *fplog, const char *leg, int n, int *count)
+{
+ int i;
+
+ fprintf(fplog, "Repl %2s ", leg);
+ for (i = 1; i < n; i++)
+ {
+ fprintf(fplog, " %4d", count[i]);
+ }
+ fprintf(fplog, "\n");
+}
+
+static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp)
+{
+
+ real ediff, dpV, delta = 0;
+ real *Epot = re->Epot;
+ real *Vol = re->Vol;
+ real **de = re->de;
+ real *beta = re->beta;
+
+    /* Two cases: permuted and non-permuted. In all cases, setting ap = a and
+       bp = b reduces to the non-permuted case */
+
+ switch (re->type)
+ {
+ case ereTEMP:
+ /*
+             * Okabe et al., Chem. Phys. Lett. 335 (2001) 435-439
+ */
+ ediff = Epot[b] - Epot[a];
+ delta = -(beta[bp] - beta[ap])*ediff;
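+            /* Worked example (illustrative, not part of the original code):
+             * with T_a = 300 K and T_b = 310 K,
+             * beta_a = 1/(BOLTZ*300) ~= 0.401 and beta_b ~= 0.388 (mol/kJ),
+             * so ediff = Epot[b] - Epot[a] = -100 kJ/mol gives
+             * delta = -(0.388 - 0.401)*(-100) ~= -1.29 < 0,
+             * i.e. this swap would be accepted unconditionally.
+             */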
+ break;
+ case ereLAMBDA:
+ /* two cases: when we are permuted, and not. */
+ /* non-permuted:
+ ediff = E_new - E_old
+ = [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
+ = [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
+ = de[b][a] + de[a][b] */
+
+ /* permuted:
+ ediff = E_new - E_old
+ = [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
+ = [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)]
+ = [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
+           = [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - [H_bp(x_b) - H_b(x_b)]
+           = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]) */
+        /* but, in the current code implementation, we flip configurations, not indices...
+           So let's examine that.
+           = [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - [H_b(x_bp) - H_b(x_b)]
+           = [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_bp)]
+           = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp])
+           So, if we exchange b <=> bp and a <=> ap, we return to the same result.
+           So the simple solution is to flip the
+           position of perturbed and original indices in the tests.
+        */
+
+ ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
+ delta = ediff*beta[a]; /* assume all same temperature in this case */
+ break;
+ case ereTL:
+ /* not permuted: */
+ /* delta = reduced E_new - reduced E_old
+ = [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)]
+ = [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)]
+ = [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] +
+ [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)]
+           = beta_b dH_b(x_a) + beta_a dH_a(x_b) +
+             beta_b (H_a(x_a) - H_b(x_b)) - beta_a (H_a(x_a) - H_b(x_b))
+           = beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a)) */
+        /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]); */
+ /* permuted (big breath!) */
+ /* delta = reduced E_new - reduced E_old
+ = [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)]
+ = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+ = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+           - beta_bp H_a(x_a) + beta_ap H_a(x_a) + beta_bp H_a(x_a) - beta_ap H_a(x_a)
+           - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+           = [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] +
+             [(beta_ap H_ap(x_b) - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))]
+           + beta_bp H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+           = [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] +
+             [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))]
+           + beta_bp (H_a(x_a) - H_b(x_b)) - beta_ap (H_a(x_a) - H_b(x_b))
+           = (beta_bp de[bp][a] - beta_ap de[ap][a]) + (beta_ap de[ap][b] - beta_bp de[bp][b])
+           + (beta_bp - beta_ap)(H_a(x_a) - H_b(x_b)) */
+ delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]);
+ break;
+ default:
+ gmx_incons("Unknown replica exchange quantity");
+ }
+ if (bPrint)
+ {
+ fprintf(fplog, "Repl %d <-> %d dE_term = %10.3e (kT)\n", a, b, delta);
+ }
+ if (re->bNPT)
+ {
+        /* revisit the calculation for 5.0. Might be some improvements. */
+ dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC;
+ if (bPrint)
+ {
+ fprintf(fplog, " dpV = %10.3e d = %10.3e\nb", dpV, delta + dpV);
+ }
+ delta += dpV;
+ }
+ return delta;
+}
+
+static void
+test_for_replica_exchange(FILE *fplog,
+ const gmx_multisim_t *ms,
+ struct gmx_repl_ex *re,
+ gmx_enerdata_t *enerd,
+ real vol,
+ gmx_large_int_t step,
+ real time)
+{
+ int m, i, j, a, b, ap, bp, i0, i1, tmp;
+ real ediff = 0, delta = 0, dpV = 0;
+ gmx_bool bPrint, bMultiEx;
+ gmx_bool *bEx = re->bEx;
+ real *prob = re->prob;
+ int *pind = re->destinations; /* permuted index */
+ gmx_bool bEpot = FALSE;
+ gmx_bool bDLambda = FALSE;
+ gmx_bool bVol = FALSE;
+
+ bMultiEx = (re->nex > 1); /* multiple exchanges at each state */
+ fprintf(fplog, "Replica exchange at step " gmx_large_int_pfmt " time %g\n", step, time);
+
+ if (re->bNPT)
+ {
+ for (i = 0; i < re->nrepl; i++)
+ {
+ re->Vol[i] = 0;
+ }
+ bVol = TRUE;
+ re->Vol[re->repl] = vol;
+ }
+ if ((re->type == ereTEMP || re->type == ereTL))
+ {
+ for (i = 0; i < re->nrepl; i++)
+ {
+ re->Epot[i] = 0;
+ }
+ bEpot = TRUE;
+ re->Epot[re->repl] = enerd->term[F_EPOT];
+ /* temperatures of different states*/
+ for (i = 0; i < re->nrepl; i++)
+ {
+ re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ);
+ }
+ }
+ else
+ {
+ for (i = 0; i < re->nrepl; i++)
+ {
+ re->beta[i] = 1.0/(re->temp*BOLTZ); /* we have a single temperature */
+ }
+ }
+ if (re->type == ereLAMBDA || re->type == ereTL)
+ {
+ bDLambda = TRUE;
+ /* lambda differences. */
+ /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian
+ minus the energy of the jth simulation in the jth Hamiltonian */
+ for (i = 0; i < re->nrepl; i++)
+ {
+ for (j = 0; j < re->nrepl; j++)
+ {
+ re->de[i][j] = 0;
+ }
+ }
+ for (i = 0; i < re->nrepl; i++)
+ {
+ re->de[i][re->repl] = (enerd->enerpart_lambda[(int)re->q[ereLAMBDA][i]+1]-enerd->enerpart_lambda[0]);
+ }
+ }
+
+ /* now actually do the communication */
+ if (bVol)
+ {
+ gmx_sum_sim(re->nrepl, re->Vol, ms);
+ }
+ if (bEpot)
+ {
+ gmx_sum_sim(re->nrepl, re->Epot, ms);
+ }
+ if (bDLambda)
+ {
+ for (i = 0; i < re->nrepl; i++)
+ {
+ gmx_sum_sim(re->nrepl, re->de[i], ms);
+ }
+ }
+
+ /* make a duplicate set of indices for shuffling */
+ for (i = 0; i < re->nrepl; i++)
+ {
+ pind[i] = re->ind[i];
+ }
+
+ if (bMultiEx)
+ {
+ /* multiple random switch exchange */
+ for (i = 0; i < re->nex; i++)
+ {
+ /* randomly select a pair */
+            /* in theory, we could reduce this by identifying only those switches with a
+               non-negligible probability of occurring (log p > -100) and only operating on those */
+            /* find out which state it is from, and what label that state currently has. Likely
+               more work than it is worth. */
+ i0 = (int)(re->nrepl*rando(&(re->seed)));
+ i1 = (int)(re->nrepl*rando(&(re->seed)));
+ if (i0 == i1)
+ {
+ i--;
+ continue; /* self-exchange, back up and do it again */
+ }
+
+ a = re->ind[i0]; /* what are the indices of these states? */
+ b = re->ind[i1];
+ ap = pind[i0];
+ bp = pind[i1];
+
+ bPrint = FALSE; /* too noisy */
+ /* calculate the energy difference */
+ /* if the code changes to flip the STATES, rather than the configurations,
+ use the commented version of the code */
+ /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
+ delta = calc_delta(fplog, bPrint, re, ap, bp, a, b);
+
+ /* we actually only use the first space in the prob and bEx array,
+ since there are actually many switches between pairs. */
+
+ if (delta <= 0)
+ {
+ /* accepted */
+ prob[0] = 1;
+ bEx[0] = TRUE;
+ }
+ else
+ {
+ if (delta > PROBABILITYCUTOFF)
+ {
+ prob[0] = 0;
+ }
+ else
+ {
+ prob[0] = exp(-delta);
+ }
+ /* roll a number to determine if accepted */
+ bEx[0] = (rando(&(re->seed)) < prob[0]);
+ }
+ re->prob_sum[0] += prob[0];
+
+ if (bEx[0])
+ {
+ /* swap the states */
+ tmp = pind[i0];
+ pind[i0] = pind[i1];
+ pind[i1] = tmp;
+ }
+ }
+ re->nattempt[0]++; /* keep track of total permutation trials here */
+ print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap);
+ }
+ else
+ {
+ /* standard nearest neighbor replica exchange */
+ m = (step / re->nst) % 2;
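+        /* Illustrative example (not part of the original code): with 4
+         * replicas, phase m == 1 attempts the neighbor pairs (ind[0],ind[1])
+         * and (ind[2],ind[3]), while phase m == 0 attempts (ind[1],ind[2]);
+         * alternating the two phases covers all neighboring pairs over time.
+         */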
+ for (i = 1; i < re->nrepl; i++)
+ {
+ a = re->ind[i-1];
+ b = re->ind[i];
+
+ bPrint = (re->repl == a || re->repl == b);
+ if (i % 2 == m)
+ {
+ delta = calc_delta(fplog, bPrint, re, a, b, a, b);
+ if (delta <= 0)
+ {
+ /* accepted */
+ prob[i] = 1;
+ bEx[i] = TRUE;
+ }
+ else
+ {
+ if (delta > PROBABILITYCUTOFF)
+ {
+ prob[i] = 0;
+ }
+ else
+ {
+ prob[i] = exp(-delta);
+ }
+ /* roll a number to determine if accepted */
+ bEx[i] = (rando(&(re->seed)) < prob[i]);
+ }
+ re->prob_sum[i] += prob[i];
+
+ if (bEx[i])
+ {
+ /* swap these two */
+ tmp = pind[i-1];
+ pind[i-1] = pind[i];
+ pind[i] = tmp;
+ re->nexchange[i]++; /* statistics for back compatibility */
+ }
+ }
+ else
+ {
+ prob[i] = -1;
+ bEx[i] = FALSE;
+ }
+ }
+ /* print some statistics */
+ print_ind(fplog, "ex", re->nrepl, re->ind, bEx);
+ print_prob(fplog, "pr", re->nrepl, prob);
+ fprintf(fplog, "\n");
+ re->nattempt[m]++;
+ }
+
+ /* record which moves were made and accepted */
+ for (i = 0; i < re->nrepl; i++)
+ {
+ re->nmoves[re->ind[i]][pind[i]] += 1;
+ re->nmoves[pind[i]][re->ind[i]] += 1;
+ }
+ fflush(fplog); /* make sure we can see what the last exchange was */
+}
+
+static void write_debug_x(t_state *state)
+{
+ int i;
+
+ if (debug)
+ {
+ for (i = 0; i < state->natoms; i += 10)
+ {
+ fprintf(debug, "dx %5d %10.5f %10.5f %10.5f\n", i, state->x[i][XX], state->x[i][YY], state->x[i][ZZ]);
+ }
+ }
+}
+
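+/* Illustrative example (not part of the original code) for the cyclic
+ * decomposition below: destinations = {1, 2, 0, 3} decomposes into one
+ * 3-cycle and one fixed point:
+ *   cyclic[0] = {0, 1, 2, -1}  (replicas 0 -> 1 -> 2 -> 0)
+ *   cyclic[3] = {3, -1}        (replica 3 stays put)
+ * giving maxlen = 3, so *nswap = 2 pairwise swap rounds are needed.
+ */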
+static void
+cyclic_decomposition(const int *destinations,
+ int **cyclic,
+ gmx_bool *incycle,
+ const int nrepl,
+ int *nswap)
+{
+
+ int i, j, c, p;
+ int maxlen = 1;
+ for (i = 0; i < nrepl; i++)
+ {
+ incycle[i] = FALSE;
+ }
+ for (i = 0; i < nrepl; i++) /* one cycle for each replica */
+ {
+ if (incycle[i])
+ {
+ cyclic[i][0] = -1;
+ continue;
+ }
+ cyclic[i][0] = i;
+ incycle[i] = TRUE;
+ c = 1;
+ p = i;
+ for (j = 0; j < nrepl; j++) /* potentially all cycles are part, but we will break first */
+ {
+ p = destinations[p]; /* start permuting */
+ if (p == i)
+ {
+ cyclic[i][c] = -1;
+ if (c > maxlen)
+ {
+ maxlen = c;
+ }
+ break; /* we've reached the original element, the cycle is complete, and we marked the end. */
+ }
+ else
+ {
+ cyclic[i][c] = p; /* each permutation gives a new member of the cycle */
+ incycle[p] = TRUE;
+ c++;
+ }
+ }
+ }
+ *nswap = maxlen - 1;
+
+ if (debug)
+ {
+ for (i = 0; i < nrepl; i++)
+ {
+ fprintf(debug, "Cycle %d:", i);
+ for (j = 0; j < nrepl; j++)
+ {
+ if (cyclic[i][j] < 0)
+ {
+ break;
+ }
+ fprintf(debug, "%2d", cyclic[i][j]);
+ }
+ fprintf(debug, "\n");
+ }
+ fflush(debug);
+ }
+}
+
+static void
+compute_exchange_order(FILE *fplog,
+ int **cyclic,
+ int **order,
+ const int nrepl,
+ const int maxswap)
+{
+ int i, j;
+
+ for (j = 0; j < maxswap; j++)
+ {
+ for (i = 0; i < nrepl; i++)
+ {
+ if (cyclic[i][j+1] >= 0)
+ {
+ order[cyclic[i][j+1]][j] = cyclic[i][j];
+ order[cyclic[i][j]][j] = cyclic[i][j+1];
+ }
+ }
+ for (i = 0; i < nrepl; i++)
+ {
+ if (order[i][j] < 0)
+ {
+ order[i][j] = i; /* if it's not exchanging, it should stay this round*/
+ }
+ }
+ }
+
+    if (debug)
+    {
+        fprintf(debug, "Replica Exchange Order\n");
+        for (i = 0; i < nrepl; i++)
+        {
+            fprintf(debug, "Replica %d:", i);
+            for (j = 0; j < maxswap; j++)
+            {
+                if (order[i][j] < 0)
+                {
+                    break;
+                }
+                fprintf(debug, "%2d", order[i][j]);
+            }
+            fprintf(debug, "\n");
+        }
+        fflush(debug);
+    }
+}
+
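+/* Continuing the illustrative example above (not part of the original code):
+ * for destinations = {1, 2, 0, 3} and maxswap = 2, compute_exchange_order()
+ * produces the swap rounds
+ *   round 0: order[.][0] = {1, 0, 2, 3}  (replicas 0 and 1 swap)
+ *   round 1: order[.][1] = {0, 2, 1, 3}  (replicas 1 and 2 swap)
+ * which together realize the full permutation.
+ */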
+static void
+prepare_to_do_exchange(FILE *fplog,
+ const int *destinations,
+ const int replica_id,
+ const int nrepl,
+ int *maxswap,
+ int **order,
+ int **cyclic,
+ int *incycle,
+ gmx_bool *bThisReplicaExchanged)
+{
+ int i, j;
+ /* Hold the cyclic decomposition of the (multiple) replica
+ * exchange. */
+ gmx_bool bAnyReplicaExchanged = FALSE;
+ *bThisReplicaExchanged = FALSE;
+
+ for (i = 0; i < nrepl; i++)
+ {
+ if (destinations[i] != i)
+ {
+ /* only mark as exchanged if the index has been shuffled */
+ bAnyReplicaExchanged = TRUE;
+ break;
+ }
+ }
+ if (bAnyReplicaExchanged)
+ {
+ /* reinitialize the placeholder arrays */
+ for (i = 0; i < nrepl; i++)
+ {
+ for (j = 0; j < nrepl; j++)
+ {
+ cyclic[i][j] = -1;
+ order[i][j] = -1;
+ }
+ }
+
+ /* Identify the cyclic decomposition of the permutation (very
+ * fast if neighbor replica exchange). */
+ cyclic_decomposition(destinations, cyclic, incycle, nrepl, maxswap);
+
+ /* Now translate the decomposition into a replica exchange
+ * order at each step. */
+ compute_exchange_order(fplog, cyclic, order, nrepl, *maxswap);
+
+ /* Did this replica do any exchange at any point? */
+ for (j = 0; j < *maxswap; j++)
+ {
+ if (replica_id != order[replica_id][j])
+ {
+ *bThisReplicaExchanged = TRUE;
+ break;
+ }
+ }
+ }
+}
+
+gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *re,
+ t_state *state, gmx_enerdata_t *enerd,
+ t_state *state_local, gmx_large_int_t step, real time)
+{
+ int i, j;
+ int replica_id = 0;
+ int exchange_partner;
+    int maxswap = 0;
+    /* Number of rounds of exchanges needed to deal with any multiple
+     * exchanges. re->destinations records where each replica ends up after
+     * the exchange attempt(s), and re->order the order in which multiple
+     * exchanges will occur. */
+ gmx_bool bThisReplicaExchanged = FALSE;
+
+ if (MASTER(cr))
+ {
+ replica_id = re->repl;
+ test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time);
+ prepare_to_do_exchange(fplog, re->destinations, replica_id, re->nrepl, &maxswap,
+ re->order, re->cyclic, re->incycle, &bThisReplicaExchanged);
+ }
+ /* Do intra-simulation broadcast so all processors belonging to
+ * each simulation know whether they need to participate in
+ * collecting the state. Otherwise, they might as well get on with
+ * the next thing to do. */
+ if (PAR(cr))
+ {
+#ifdef GMX_MPI
+ MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr),
+ cr->mpi_comm_mygroup);
+#endif
+ }
+
+ if (bThisReplicaExchanged)
+ {
+ /* Exchange the states */
+
+ if (PAR(cr))
+ {
+ /* Collect the global state on the master node */
+ if (DOMAINDECOMP(cr))
+ {
+ dd_collect_state(cr->dd, state_local, state);
+ }
+ else
+ {
+ pd_collect_state(cr, state);
+ }
+ }
++ else
++ {
++ copy_state_nonatomdata(state_local, state);
++ }
+
+ if (MASTER(cr))
+ {
+ /* There will be only one swap cycle with standard replica
+ * exchange, but there may be multiple swap cycles if we
+ * allow multiple swaps. */
+
+ for (j = 0; j < maxswap; j++)
+ {
+ exchange_partner = re->order[replica_id][j];
+
+ if (exchange_partner != replica_id)
+ {
+ /* Exchange the global states between the master nodes */
+ if (debug)
+ {
+ fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner);
+ }
+ exchange_state(cr->ms, exchange_partner, state);
+ }
+ }
+ /* For temperature-type replica exchange, we need to scale
+ * the velocities. */
+ if (re->type == ereTEMP || re->type == ereTL)
+ {
+ scale_velocities(state, sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]]));
+ }
+
+ }
+
+ /* With domain decomposition the global state is distributed later */
+ if (!DOMAINDECOMP(cr))
+ {
+ /* Copy the global state to the local state data structure */
+ copy_state_nonatomdata(state, state_local);
+
+ if (PAR(cr))
+ {
+ bcast_state(cr, state, FALSE);
+ }
+ }
+ }
+
+ return bThisReplicaExchanged;
+}
+
+void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re)
+{
+ int i;
+
+ fprintf(fplog, "\nReplica exchange statistics\n");
+
+ if (re->nex == 0)
+ {
+ fprintf(fplog, "Repl %d attempts, %d odd, %d even\n",
+ re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]);
+
+ fprintf(fplog, "Repl average probabilities:\n");
+ for (i = 1; i < re->nrepl; i++)
+ {
+ if (re->nattempt[i%2] == 0)
+ {
+ re->prob[i] = 0;
+ }
+ else
+ {
+ re->prob[i] = re->prob_sum[i]/re->nattempt[i%2];
+ }
+ }
+ print_ind(fplog, "", re->nrepl, re->ind, NULL);
+ print_prob(fplog, "", re->nrepl, re->prob);
+
+ fprintf(fplog, "Repl number of exchanges:\n");
+ print_ind(fplog, "", re->nrepl, re->ind, NULL);
+ print_count(fplog, "", re->nrepl, re->nexchange);
+
+ fprintf(fplog, "Repl average number of exchanges:\n");
+ for (i = 1; i < re->nrepl; i++)
+ {
+ if (re->nattempt[i%2] == 0)
+ {
+ re->prob[i] = 0;
+ }
+ else
+ {
+ re->prob[i] = ((real)re->nexchange[i])/re->nattempt[i%2];
+ }
+ }
+ print_ind(fplog, "", re->nrepl, re->ind, NULL);
+ print_prob(fplog, "", re->nrepl, re->prob);
+
+ fprintf(fplog, "\n");
+ }
+ /* print the transition matrix */
+ print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt);
+}
--- /dev/null
- static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
- /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
- static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <signal.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <string.h>
+#include <assert.h>
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "sysstuff.h"
+#include "statutil.h"
+#include "force.h"
+#include "mdrun.h"
+#include "md_logging.h"
+#include "md_support.h"
+#include "network.h"
+#include "pull.h"
+#include "pull_rotation.h"
+#include "names.h"
+#include "disre.h"
+#include "orires.h"
+#include "pme.h"
+#include "mdatoms.h"
+#include "repl_ex.h"
+#include "qmmm.h"
+#include "domdec.h"
+#include "partdec.h"
+#include "coulomb.h"
+#include "constr.h"
+#include "mvdata.h"
+#include "checkpoint.h"
+#include "mtop_util.h"
+#include "sighandler.h"
+#include "gromacs/fileio/tpxio.h"
+#include "txtdump.h"
+#include "gmx_detect_hardware.h"
+#include "gmx_omp_nthreads.h"
+#include "pull_rotation.h"
+#include "calc_verletbuf.h"
+#include "../mdlib/nbnxn_search.h"
+#include "../mdlib/nbnxn_consts.h"
+#include "gmx_fatal_collective.h"
+#include "membed.h"
+#include "macros.h"
+#include "gmx_omp.h"
+#include "gmx_thread_affinity.h"
+
+#include "gromacs/utility/gmxmpi.h"
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+#include "gpu_utils.h"
+#include "nbnxn_cuda_data_mgmt.h"
+
+typedef struct {
+ gmx_integrator_t *func;
+} gmx_intp_t;
+
+/* The array should match the eI array in include/types/enums.h */
+const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}};
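+
+/* Usage sketch (illustrative, not part of the original code): the integrator
+ * for a run is dispatched through this table as
+ *   integrator[inputrec->eI].func(...);
+ * so the entries must stay in the same order as the eI enum.
+ */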
+
+gmx_large_int_t deform_init_init_step_tpx;
+matrix deform_init_box_tpx;
+#ifdef GMX_THREAD_MPI
+tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+#endif
+
+
+#ifdef GMX_THREAD_MPI
+struct mdrunner_arglist
+{
+ gmx_hw_opt_t *hw_opt;
+ FILE *fplog;
+ t_commrec *cr;
+ int nfile;
+ const t_filenm *fnm;
+ output_env_t oenv;
+ gmx_bool bVerbose;
+ gmx_bool bCompact;
+ int nstglobalcomm;
+ ivec ddxyz;
+ int dd_node_order;
+ real rdd;
+ real rconstr;
+ const char *dddlb_opt;
+ real dlb_scale;
+ const char *ddcsx;
+ const char *ddcsy;
+ const char *ddcsz;
+ const char *nbpu_opt;
+ gmx_large_int_t nsteps_cmdline;
+ int nstepout;
+ int resetstep;
+ int nmultisim;
+ int repl_ex_nst;
+ int repl_ex_nex;
+ int repl_ex_seed;
+ real pforce;
+ real cpt_period;
+ real max_hours;
+ const char *deviceOptions;
+ unsigned long Flags;
+ int ret; /* return value */
+};
+
+
+/* The function used for spawning threads. Extracts the mdrunner()
+ arguments from its one argument and calls mdrunner(), after making
+ a commrec. */
+static void mdrunner_start_fn(void *arg)
+{
+ struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg;
+ struct mdrunner_arglist mc = *mda; /* copy the arg list to make sure
+ that it's thread-local. This doesn't
+ copy pointed-to items, of course,
+ but those are all const. */
+ t_commrec *cr; /* we need a local version of this */
+ FILE *fplog = NULL;
+ t_filenm *fnm;
+
+ fnm = dup_tfn(mc.nfile, mc.fnm);
+
+ cr = reinitialize_commrec_for_this_thread(mc.cr);
+
+ if (MASTER(cr))
+ {
+ fplog = mc.fplog;
+ }
+
+ mda->ret = mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
+ mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
+ mc.ddxyz, mc.dd_node_order, mc.rdd,
+ mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
+ mc.ddcsx, mc.ddcsy, mc.ddcsz,
+ mc.nbpu_opt,
+ mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
+ mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
+ mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
+}
+
+/* called by mdrunner() to start a specific number of threads (including
+ the main thread) for thread-parallel runs. This in turn calls mdrunner()
+ for each thread.
+ All options besides nthreads are the same as for mdrunner(). */
+static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
+ FILE *fplog, t_commrec *cr, int nfile,
+ const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+ gmx_bool bCompact, int nstglobalcomm,
+ ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+ const char *dddlb_opt, real dlb_scale,
+ const char *ddcsx, const char *ddcsy, const char *ddcsz,
+ const char *nbpu_opt,
+ gmx_large_int_t nsteps_cmdline,
+ int nstepout, int resetstep,
+ int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+ real pforce, real cpt_period, real max_hours,
+ const char *deviceOptions, unsigned long Flags)
+{
+ int ret;
+ struct mdrunner_arglist *mda;
+ t_commrec *crn; /* the new commrec */
+ t_filenm *fnmn;
+
+ /* first check whether we even need to start tMPI */
+ if (hw_opt->nthreads_tmpi < 2)
+ {
+ return cr;
+ }
+
+ /* a few small, one-time, almost unavoidable memory leaks: */
+ snew(mda, 1);
+ fnmn = dup_tfn(nfile, fnm);
+
+ /* fill the data structure to pass as void pointer to thread start fn */
+ mda->hw_opt = hw_opt;
+ mda->fplog = fplog;
+ mda->cr = cr;
+ mda->nfile = nfile;
+ mda->fnm = fnmn;
+ mda->oenv = oenv;
+ mda->bVerbose = bVerbose;
+ mda->bCompact = bCompact;
+ mda->nstglobalcomm = nstglobalcomm;
+ mda->ddxyz[XX] = ddxyz[XX];
+ mda->ddxyz[YY] = ddxyz[YY];
+ mda->ddxyz[ZZ] = ddxyz[ZZ];
+ mda->dd_node_order = dd_node_order;
+ mda->rdd = rdd;
+ mda->rconstr = rconstr;
+ mda->dddlb_opt = dddlb_opt;
+ mda->dlb_scale = dlb_scale;
+ mda->ddcsx = ddcsx;
+ mda->ddcsy = ddcsy;
+ mda->ddcsz = ddcsz;
+ mda->nbpu_opt = nbpu_opt;
+ mda->nsteps_cmdline = nsteps_cmdline;
+ mda->nstepout = nstepout;
+ mda->resetstep = resetstep;
+ mda->nmultisim = nmultisim;
+ mda->repl_ex_nst = repl_ex_nst;
+ mda->repl_ex_nex = repl_ex_nex;
+ mda->repl_ex_seed = repl_ex_seed;
+ mda->pforce = pforce;
+ mda->cpt_period = cpt_period;
+ mda->max_hours = max_hours;
+ mda->deviceOptions = deviceOptions;
+ mda->Flags = Flags;
+
+    /* now spawn new threads that start mdrunner_start_fn(), while
+       the main thread returns; we set thread affinity later */
+ ret = tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
+ mdrunner_start_fn, (void*)(mda) );
+ if (ret != TMPI_SUCCESS)
+ {
+ return NULL;
+ }
+
+ crn = reinitialize_commrec_for_this_thread(cr);
+ return crn;
+}
+
+
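+/* Illustrative examples (not part of the original code) for the thread
+ * division below:
+ *   ngpu = 2                                  -> nthreads_tmpi = 2 (one rank per GPU)
+ *   nthreads_tot = 16, nthreads_omp = 4       -> nthreads_tmpi = 16/4 = 4
+ *   no GPUs, no OpenMP setting, 4 threads tot -> nthreads_tmpi = 1 (pure OpenMP)
+ */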
+static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+ const gmx_hw_opt_t *hw_opt,
+ int nthreads_tot,
+ int ngpu)
+{
+ int nthreads_tmpi;
+
+    /* There are no separate PME nodes here: check_and_update_hw_opt
+     * ensured that nthreads_tmpi > 0 whenever separate PME nodes are
+     * requested, and a conditional at the call site ensures we do not
+     * end up here in that case.
+     * Note that separate PME nodes might be switched on later.
+     */
+ if (ngpu > 0)
+ {
+ nthreads_tmpi = ngpu;
+ if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
+ {
+ nthreads_tmpi = nthreads_tot;
+ }
+ }
+ else if (hw_opt->nthreads_omp > 0)
+ {
+        /* Here we could oversubscribe; when we do, we issue a warning later */
+ nthreads_tmpi = max(1, nthreads_tot/hw_opt->nthreads_omp);
+ }
+ else
+ {
+ /* TODO choose nthreads_omp based on hardware topology
+ when we have a hardware topology detection library */
+        /* In general, when running up to 4 threads, OpenMP should be faster.
+         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
+         * On Intel >= Nehalem, running OpenMP on a single CPU is always faster,
+         * and even on two CPUs it's usually faster (but with many OpenMP threads
+         * it could be faster not to use HT; currently we always use HT).
+         * On Nehalem/Westmere we want to avoid running 16 threads over
+         * two CPUs with HT, so we need a limit < 16; thus we use 12.
+         * A reasonable limit for Intel Sandy and Ivy Bridge,
+         * not knowing the topology, is 16 threads.
+         */
+ */
+ const int nthreads_omp_always_faster = 4;
+ const int nthreads_omp_always_faster_Nehalem = 12;
+ const int nthreads_omp_always_faster_SandyBridge = 16;
+ const int first_model_Nehalem = 0x1A;
+ const int first_model_SandyBridge = 0x2A;
+ gmx_bool bIntel_Family6;
+
+ bIntel_Family6 =
+ (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
+ gmx_cpuid_family(hwinfo->cpuid_info) == 6);
+
+ if (nthreads_tot <= nthreads_omp_always_faster ||
+ (bIntel_Family6 &&
+ ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
+ (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
+ {
+ /* Use pure OpenMP parallelization */
+ nthreads_tmpi = 1;
+ }
+ else
+ {
+ /* Don't use OpenMP parallelization */
+ nthreads_tmpi = nthreads_tot;
+ }
+ }
+
+ return nthreads_tmpi;
+}
+
+
+/* Get the number of threads to use for thread-MPI based on how many
+ * were requested, which algorithms we're using,
+ * and how many particles there are.
+ * At this point we have already called check_and_update_hw_opt.
+ * Thus all options should be internally consistent and consistent
+ * with the hardware, except that ntmpi could be larger than #GPU.
+ */
+static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
+ gmx_hw_opt_t *hw_opt,
+ t_inputrec *inputrec, gmx_mtop_t *mtop,
+ const t_commrec *cr,
+ FILE *fplog)
+{
+ int nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu;
+ int min_atoms_per_mpi_thread;
+ char *env;
+ char sbuf[STRLEN];
+ gmx_bool bCanUseGPU;
+
+ if (hw_opt->nthreads_tmpi > 0)
+ {
+ /* Trivial, return right away */
+ return hw_opt->nthreads_tmpi;
+ }
+
+ nthreads_hw = hwinfo->nthreads_hw_avail;
+
+ /* How many total (#tMPI*#OpenMP) threads can we start? */
+ if (hw_opt->nthreads_tot > 0)
+ {
+ nthreads_tot_max = hw_opt->nthreads_tot;
+ }
+ else
+ {
+ nthreads_tot_max = nthreads_hw;
+ }
+
+ bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
+ if (bCanUseGPU)
+ {
+ ngpu = hwinfo->gpu_info.ncuda_dev_use;
+ }
+ else
+ {
+ ngpu = 0;
+ }
+
+ nthreads_tmpi =
+ get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);
+
+ if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
+ {
+        /* Dims/steps are divided over the nodes instead of splitting the atoms */
+ min_atoms_per_mpi_thread = 0;
+ }
+ else
+ {
+ if (bCanUseGPU)
+ {
+ min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
+ }
+ else
+ {
+ min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
+ }
+ }
+
+ /* Check if an algorithm does not support parallel simulation. */
+ if (nthreads_tmpi != 1 &&
+ ( inputrec->eI == eiLBFGS ||
+ inputrec->coulombtype == eelEWALD ) )
+ {
+ nthreads_tmpi = 1;
+
+ md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+ if (hw_opt->nthreads_tmpi > nthreads_tmpi)
+ {
+ gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
+ }
+ }
+ else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
+ {
+ /* the thread number was chosen automatically, but there are too many
+ threads (too few atoms per thread) */
+ nthreads_new = max(1, mtop->natoms/min_atoms_per_mpi_thread);
+
+ /* Avoid partial use of Hyper-Threading */
+ if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+ nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
+ {
+ nthreads_new = nthreads_hw/2;
+ }
+
+ /* Avoid large prime numbers in the thread count */
+ if (nthreads_new >= 6)
+ {
+ /* Use only 6,8,10 with additional factors of 2 */
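+            /* Illustrative examples (not part of the original code):
+             *   nthreads_new = 7  -> fac = 2 -> (7/2)*2  = 6
+             *   nthreads_new = 13 -> fac = 4 -> (13/4)*4 = 12
+             * so the resulting counts are 6, 8, 10, 12, 16, 20, ...
+             */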
+ int fac;
+
+ fac = 2;
+ while (3*fac*2 <= nthreads_new)
+ {
+ fac *= 2;
+ }
+
+ nthreads_new = (nthreads_new/fac)*fac;
+ }
+ else
+ {
+ /* Avoid 5 */
+ if (nthreads_new == 5)
+ {
+ nthreads_new = 4;
+ }
+ }
+
+ nthreads_tmpi = nthreads_new;
+
+ fprintf(stderr, "\n");
+ fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
+ fprintf(stderr, " only starting %d thread-MPI threads.\n", nthreads_tmpi);
+ fprintf(stderr, " You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
+ }
+
+ return nthreads_tmpi;
+}
+#endif /* GMX_THREAD_MPI */
+
+
+/* Environment variable for setting nstlist */
+static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
+/* Try to increase nstlist when using a GPU with nstlist less than this */
+static const int NSTLIST_GPU_ENOUGH = 20;
+/* Increase nstlist until the non-bonded cost increases more than this factor */
- real rlist_inc, rlist_ok, rlist_max, rlist_new, rlist_prev;
++static const float NBNXN_GPU_LIST_OK_FAC = 1.20;
++/* Don't increase nstlist beyond a non-bonded cost increase of this factor.
++ * A standard (protein+)water system at 300K with PME ewald_rtol=1e-5
++ * needs 1.28 at rcoulomb=0.9 and 1.24 at rcoulomb=1.0 to get to nstlist=40.
++ */
++static const float NBNXN_GPU_LIST_MAX_FAC = 1.30;
+
+/* Try to increase nstlist when running on a GPU */
+static void increase_nstlist(FILE *fp, t_commrec *cr,
+ t_inputrec *ir, const gmx_mtop_t *mtop, matrix box)
+{
+ char *env;
+ int nstlist_orig, nstlist_prev;
+ verletbuf_list_setup_t ls;
- const int nstl[] = { 20, 25, 40, 50 };
++ real rlist_nstlist10, rlist_inc, rlist_ok, rlist_max;
++ real rlist_new, rlist_prev;
+ int i;
+ t_state state_tmp;
+ gmx_bool bBox, bDD, bCont;
+ const char *nstl_fmt = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
+    const char *vbd_err  = "Cannot increase nstlist for GPU run because verlet-buffer-drift is not set or used";
+    const char *box_err  = "Cannot increase nstlist for GPU run because the box is too small";
+    const char *dd_err   = "Cannot increase nstlist for GPU run because of domain decomposition limitations";
+ char buf[STRLEN];
+
+    /* Number of nstlist alternative values to try when switching */
- /* Allow rlist to make the list double the size of the cut-off sphere */
++ const int nstl[] = { 20, 25, 40 };
+#define NNSTL sizeof(nstl)/sizeof(nstl[0])
+
+ env = getenv(NSTLIST_ENVVAR);
+ if (env == NULL)
+ {
+ if (fp != NULL)
+ {
+ fprintf(fp, nstl_fmt, ir->nstlist);
+ }
+ }
+
+ if (ir->verletbuf_drift == 0)
+ {
+ gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
+ }
+
+ if (ir->verletbuf_drift < 0)
+ {
+ if (MASTER(cr))
+ {
+ fprintf(stderr, "%s\n", vbd_err);
+ }
+ if (fp != NULL)
+ {
+ fprintf(fp, "%s\n", vbd_err);
+ }
+
+ return;
+ }
+
+ nstlist_orig = ir->nstlist;
+ if (env != NULL)
+ {
+ sprintf(buf, "Getting nstlist from environment variable GMX_NSTLIST=%s", env);
+ if (MASTER(cr))
+ {
+ fprintf(stderr, "%s\n", buf);
+ }
+ if (fp != NULL)
+ {
+ fprintf(fp, "%s\n", buf);
+ }
+ sscanf(env, "%d", &ir->nstlist);
+ }
+
+ verletbuf_get_list_setup(TRUE, &ls);
+
- rlist_ok = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC, 1.0/3.0) - rlist_inc;
- rlist_max = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC, 1.0/3.0) - rlist_inc;
++ /* Allow rlist to make the list a given factor larger than the list
++ * would be with nstlist=10.
++ */
++ nstlist_prev = ir->nstlist;
++ ir->nstlist = 10;
++ calc_verlet_buffer_size(mtop, det(box), ir, ir->verletbuf_drift, &ls,
++ NULL, &rlist_nstlist10);
++ ir->nstlist = nstlist_prev;
++
++ /* Determine the pair list size increase due to zero interactions */
+ rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE, mtop->natoms/det(box));
++ rlist_ok = (rlist_nstlist10 + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC, 1.0/3.0) - rlist_inc;
++ rlist_max = (rlist_nstlist10 + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC, 1.0/3.0) - rlist_inc;
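++    /* Rationale sketch (not part of the original code): the pair-list cost
++     * scales roughly with the list volume, i.e. with (rlist + rlist_inc)^3,
++     * so a tolerated cost factor F corresponds to a tolerated radius of
++     * (rlist_nstlist10 + rlist_inc)*F^(1/3) - rlist_inc, which is exactly
++     * what rlist_ok and rlist_max compute above.
++     */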
+ if (debug)
+ {
+ fprintf(debug, "GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
+ rlist_inc, rlist_max);
+ }
+
+ i = 0;
+ nstlist_prev = nstlist_orig;
+ rlist_prev = ir->rlist;
+ do
+ {
+ if (env == NULL)
+ {
+ ir->nstlist = nstl[i];
+ }
+
+ /* Set the pair-list buffer size in ir */
+ calc_verlet_buffer_size(mtop, det(box), ir, ir->verletbuf_drift, &ls,
+ NULL, &rlist_new);
+
+ /* Does rlist fit in the box? */
+ bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
+ bDD = TRUE;
+ if (bBox && DOMAINDECOMP(cr))
+ {
+ /* Check if rlist fits in the domain decomposition */
+ if (inputrec2nboundeddim(ir) < DIM)
+ {
+ gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
+ }
+ copy_mat(box, state_tmp.box);
+ bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
+ }
+
+ bCont = FALSE;
+
+ if (env == NULL)
+ {
+ if (bBox && bDD && rlist_new <= rlist_max)
+ {
+ /* Increase nstlist */
+ nstlist_prev = ir->nstlist;
+ rlist_prev = rlist_new;
+ bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
+ }
+ else
+ {
+ /* Stick with the previous nstlist */
+ ir->nstlist = nstlist_prev;
+ rlist_new = rlist_prev;
+ bBox = TRUE;
+ bDD = TRUE;
+ }
+ }
+
+ i++;
+ }
+ while (bCont);
+
+ if (!bBox || !bDD)
+ {
+ gmx_warning(!bBox ? box_err : dd_err);
+ if (fp != NULL)
+ {
+ fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
+ }
+ ir->nstlist = nstlist_orig;
+ }
+ else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
+ {
+ sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
+ nstlist_orig, ir->nstlist,
+ ir->rlist, rlist_new);
+ if (MASTER(cr))
+ {
+ fprintf(stderr, "%s\n\n", buf);
+ }
+ if (fp != NULL)
+ {
+ fprintf(fp, "%s\n\n", buf);
+ }
+ ir->rlist = rlist_new;
+ ir->rlistlong = rlist_new;
+ }
+}
+
+static void prepare_verlet_scheme(FILE *fplog,
+ const gmx_hw_info_t *hwinfo,
+ t_commrec *cr,
+ t_inputrec *ir,
+ const gmx_mtop_t *mtop,
+ matrix box,
+ gmx_bool *bUseGPU)
+{
+ /* Here we only check for GPU usage on the MPI master process,
+ * since at this point we don't yet know how many GPUs will be used.
+ * We check for a GPU on all processes later.
+ */
+ *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
+
+ if (ir->verletbuf_drift > 0)
+ {
+ /* Update the Verlet buffer size for the current run setup */
+ verletbuf_list_setup_t ls;
+ real rlist_new;
+
+ /* Here we assume CPU acceleration is on. This is safe, because
+ * calc_verlet_buffer_size currently gives the same results for 4x8
+ * and 4x4, while 4x2 gives a larger buffer than 4x4.
+ */
+ verletbuf_get_list_setup(*bUseGPU, &ls);
+
+ calc_verlet_buffer_size(mtop, det(box), ir,
+ ir->verletbuf_drift, &ls,
+ NULL, &rlist_new);
+ if (rlist_new != ir->rlist)
+ {
+ if (fplog != NULL)
+ {
+ fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+ ir->rlist, rlist_new,
+ ls.cluster_size_i, ls.cluster_size_j);
+ }
+ ir->rlist = rlist_new;
+ ir->rlistlong = rlist_new;
+ }
+ }
+
+ /* With GPU or emulation we should check nstlist for performance */
+ if ((EI_DYNAMICS(ir->eI) &&
+ *bUseGPU &&
+ ir->nstlist < NSTLIST_GPU_ENOUGH) ||
+ getenv(NSTLIST_ENVVAR) != NULL)
+ {
+ /* Choose a better nstlist */
+ increase_nstlist(fplog, cr, ir, mtop, box);
+ }
+}
+
+static void convert_to_verlet_scheme(FILE *fplog,
+ t_inputrec *ir,
+ gmx_mtop_t *mtop, real box_vol)
+{
+ const char *conv_mesg = "Converting input file with group cut-off scheme to the Verlet cut-off scheme";
+
+ md_print_warn(NULL, fplog, "%s\n", conv_mesg);
+
+ ir->cutoff_scheme = ecutsVERLET;
+ ir->verletbuf_drift = 0.005;
+
+ if (ir->rcoulomb != ir->rvdw)
+ {
+ gmx_fatal(FARGS, "The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
+ }
+
+ if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
+ {
+ gmx_fatal(FARGS, "User non-bonded potentials are not (yet) supported with the Verlet scheme");
+ }
+ else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
+ {
+ md_print_warn(NULL, fplog, "Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
+
+ if (EVDW_SWITCHED(ir->vdwtype))
+ {
+ ir->vdwtype = evdwCUT;
+ }
+ if (EEL_SWITCHED(ir->coulombtype))
+ {
+ if (EEL_FULL(ir->coulombtype))
+ {
+ /* With full electrostatic only PME can be switched */
+ ir->coulombtype = eelPME;
+ }
+ else
+ {
+ md_print_warn(NULL, fplog, "NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n", eel_names[ir->coulombtype]);
+ ir->coulombtype = eelRF;
+ ir->epsilon_rf = 0.0;
+ }
+ }
+
+ /* We set the target energy drift to a small number.
+ * Note that this is only for testing. For production the user
+ * should think about this and set the mdp options.
+ */
+ ir->verletbuf_drift = 1e-4;
+ }
+
+ if (inputrec2nboundeddim(ir) != 3)
+ {
+ gmx_fatal(FARGS, "Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
+ }
+
+ if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
+ {
+ gmx_fatal(FARGS, "Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
+ }
+
+ if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
+ {
+ verletbuf_list_setup_t ls;
+
+ verletbuf_get_list_setup(FALSE, &ls);
+ calc_verlet_buffer_size(mtop, box_vol, ir, ir->verletbuf_drift, &ls,
+ NULL, &ir->rlist);
+ }
+ else
+ {
+ ir->verletbuf_drift = -1;
+ ir->rlist = 1.05*max(ir->rvdw, ir->rcoulomb);
+ }
+
+ gmx_mtop_remove_chargegroups(mtop);
+}
+
+static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
+ int cutoff_scheme,
+ gmx_bool bIsSimMaster)
+{
+ gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
+
+#ifndef GMX_THREAD_MPI
+ if (hw_opt->nthreads_tot > 0)
+ {
+ gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+ }
+ if (hw_opt->nthreads_tmpi > 0)
+ {
+ gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+ }
+#endif
+
+#ifndef GMX_OPENMP
+ if (hw_opt->nthreads_omp > 1)
+ {
+ gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but Gromacs was compiled without OpenMP support");
+ }
+ hw_opt->nthreads_omp = 1;
+#endif
+
+ if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
+ {
+ /* We have the same number of OpenMP threads for PP and PME processes,
+ * thus we can perform several consistency checks.
+ */
+ if (hw_opt->nthreads_tmpi > 0 &&
+ hw_opt->nthreads_omp > 0 &&
+ hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
+ {
+ gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
+ hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
+ }
+
+ if (hw_opt->nthreads_tmpi > 0 &&
+ hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
+ {
+ gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
+ hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
+ }
+
+ if (hw_opt->nthreads_omp > 0 &&
+ hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
+ {
+ gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
+ hw_opt->nthreads_tot, hw_opt->nthreads_omp);
+ }
+
+ if (hw_opt->nthreads_tmpi > 0 &&
+ hw_opt->nthreads_omp <= 0)
+ {
+ hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+ }
+ }
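+ /* Example with hypothetical values: -nt 12 -ntmpi 3 passes the checks
+ * above, and with -ntomp unset the last branch derives
+ * nthreads_omp = 12/3 = 4.
+ */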
+
+#ifndef GMX_OPENMP
+ if (hw_opt->nthreads_omp > 1)
+ {
+ gmx_fatal(FARGS, "OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
+ }
+#endif
+
+ if (cutoff_scheme == ecutsGROUP)
+ {
+ /* With the group cut-off scheme, OpenMP is only supported on PME-only nodes */
+ if (hw_opt->nthreads_omp > 1)
+ {
+ gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
+ ecutscheme_names[cutoff_scheme],
+ ecutscheme_names[ecutsVERLET]);
+ }
+ hw_opt->nthreads_omp = 1;
+ }
+
+ if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
+ {
+ gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
+ }
+
+ if (hw_opt->nthreads_tot == 1)
+ {
+ hw_opt->nthreads_tmpi = 1;
+
+ if (hw_opt->nthreads_omp > 1)
+ {
+ gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
+ hw_opt->nthreads_omp, hw_opt->nthreads_tot);
+ }
+ hw_opt->nthreads_omp = 1;
+ }
+
+ if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
+ {
+ hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
+ hw_opt->nthreads_tot,
+ hw_opt->nthreads_tmpi,
+ hw_opt->nthreads_omp,
+ hw_opt->nthreads_omp_pme,
+ hw_opt->gpu_id != NULL ? hw_opt->gpu_id : "");
+
+ }
+}
+
+
+/* Override the value in inputrec with value passed on the command line (if any) */
+static void override_nsteps_cmdline(FILE *fplog,
+ gmx_large_int_t nsteps_cmdline,
+ t_inputrec *ir,
+ const t_commrec *cr)
+{
+ char sbuf[STEPSTRSIZE];
+
+ assert(ir);
+ assert(cr);
+
+ /* Override with any value other than the default -2 */
+ if (nsteps_cmdline > -2)
+ {
+ char stmp[STRLEN];
+
+ ir->nsteps = nsteps_cmdline;
+ if (EI_DYNAMICS(ir->eI))
+ {
+ sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps, %.3f ps",
+ gmx_step_str(nsteps_cmdline, sbuf),
+ nsteps_cmdline*ir->delta_t);
+ }
+ else
+ {
+ sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps",
+ gmx_step_str(nsteps_cmdline, sbuf));
+ }
+
+ md_print_warn(cr, fplog, "%s\n", stmp);
+ }
+}
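+/* Illustration with hypothetical values: "mdrun -nsteps 500000" with
+ * delta_t = 0.002 would print:
+ * Overriding nsteps with value passed on the command line: 500000 steps, 1000.000 ps
+ */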
+
+/* Data structure set by SIMMASTER which needs to be passed to all nodes
+ * before the other nodes have read the tpx file and called gmx_detect_hardware.
+ */
+typedef struct {
+ int cutoff_scheme; /* The cutoff scheme from t_inputrec */
+ gmx_bool bUseGPU; /* Use GPU or GPU emulation */
+} master_inf_t;
+
+int mdrunner(gmx_hw_opt_t *hw_opt,
+ FILE *fplog, t_commrec *cr, int nfile,
+ const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+ gmx_bool bCompact, int nstglobalcomm,
+ ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+ const char *dddlb_opt, real dlb_scale,
+ const char *ddcsx, const char *ddcsy, const char *ddcsz,
+ const char *nbpu_opt,
+ gmx_large_int_t nsteps_cmdline, int nstepout, int resetstep,
+ int nmultisim, int repl_ex_nst, int repl_ex_nex,
+ int repl_ex_seed, real pforce, real cpt_period, real max_hours,
+ const char *deviceOptions, unsigned long Flags)
+{
+ gmx_bool bForceUseGPU, bTryUseGPU;
+ double nodetime = 0, realtime;
+ t_inputrec *inputrec;
+ t_state *state = NULL;
+ matrix box;
+ gmx_ddbox_t ddbox = {0};
+ int npme_major, npme_minor;
+ real tmpr1, tmpr2;
+ t_nrnb *nrnb;
+ gmx_mtop_t *mtop = NULL;
+ t_mdatoms *mdatoms = NULL;
+ t_forcerec *fr = NULL;
+ t_fcdata *fcd = NULL;
+ real ewaldcoeff = 0;
+ gmx_pme_t *pmedata = NULL;
+ gmx_vsite_t *vsite = NULL;
+ gmx_constr_t constr;
+ int i, m, nChargePerturbed = -1, status, nalloc;
+ char *gro;
+ gmx_wallcycle_t wcycle;
+ gmx_bool bReadRNG, bReadEkin;
+ int list;
+ gmx_walltime_accounting_t walltime_accounting = NULL;
+ int rc;
+ gmx_large_int_t reset_counters;
+ gmx_edsam_t ed = NULL;
+ t_commrec *cr_old = cr;
+ int nthreads_pme = 1;
+ int nthreads_pp = 1;
+ gmx_membed_t membed = NULL;
+ gmx_hw_info_t *hwinfo = NULL;
+ master_inf_t minf = {-1, FALSE};
+
+ /* CAUTION: threads may be started later on in this function, so
+ cr doesn't reflect the final parallel state right now */
+ snew(inputrec, 1);
+ snew(mtop, 1);
+
+ if (Flags & MD_APPENDFILES)
+ {
+ fplog = NULL;
+ }
+
+ bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
+ bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
+
+ /* Detect hardware, gather information. This is an operation that is
+ * global for this process (MPI rank). */
+ hwinfo = gmx_detect_hardware(fplog, cr,
+ bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
+
+
+ snew(state, 1);
+ if (SIMMASTER(cr))
+ {
+ /* Read (nearly) all data required for the simulation */
+ read_tpx_state(ftp2fn(efTPX, nfile, fnm), inputrec, state, NULL, mtop);
+
+ if (inputrec->cutoff_scheme != ecutsVERLET &&
+ ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
+ {
+ convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box));
+ }
+
+
+ minf.cutoff_scheme = inputrec->cutoff_scheme;
+ minf.bUseGPU = FALSE;
+
+ if (inputrec->cutoff_scheme == ecutsVERLET)
+ {
+ prepare_verlet_scheme(fplog, hwinfo, cr,
+ inputrec, mtop, state->box,
+ &minf.bUseGPU);
+ }
+ else if (hwinfo->bCanUseGPU)
+ {
+ md_print_warn(cr, fplog,
+ "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
+ " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
+ " (for quick performance testing you can use the -testverlet option)\n");
+
+ if (bForceUseGPU)
+ {
+ gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
+ }
+ }
+#ifdef GMX_IS_BGQ
+ else
+ {
+ md_print_warn(cr, fplog,
+ "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
+ " BlueGene/Q. You will observe better performance from using the\n"
+ " Verlet cut-off scheme.\n");
+ }
+#endif
+ }
+#ifndef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_bcast_sim(sizeof(minf), &minf, cr);
+ }
+#endif
+ if (minf.bUseGPU && cr->npmenodes == -1)
+ {
+ /* Don't automatically use PME-only nodes with GPUs */
+ cr->npmenodes = 0;
+ }
+
+ /* Check for externally set OpenMP affinity and turn off internal
+ * pinning if any is found. We need to do this check early to tell
+ * thread-MPI whether it should do pinning when spawning threads.
+ * TODO: the above no longer holds, we should move these checks down
+ */
+ gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
+
+#ifdef GMX_THREAD_MPI
+ /* With thread-MPI inputrec is only set here on the master thread */
+ if (SIMMASTER(cr))
+#endif
+ {
+ check_and_update_hw_opt(hw_opt, minf.cutoff_scheme, SIMMASTER(cr));
+
+#ifdef GMX_THREAD_MPI
+ /* Early check for externally set process affinity. Can't do over all
+ * MPI processes because hwinfo is not available everywhere, but with
+ * thread-MPI it's needed as pinning might get turned off which needs
+ * to be known before starting thread-MPI. */
+ gmx_check_thread_affinity_set(fplog,
+ NULL,
+ hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+#endif
+
+#ifdef GMX_THREAD_MPI
+ if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
+ {
+ gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
+ }
+#endif
+
+ if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
+ cr->npmenodes <= 0)
+ {
+ gmx_fatal(FARGS, "You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
+ }
+ }
+
+#ifdef GMX_THREAD_MPI
+ if (SIMMASTER(cr))
+ {
+ /* NOW the threads will be started: */
+ hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
+ hw_opt,
+ inputrec, mtop,
+ cr, fplog);
+ if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
+ {
+ hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+ }
+
+ if (hw_opt->nthreads_tmpi > 1)
+ {
+ /* now start the threads. */
+ cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
+ oenv, bVerbose, bCompact, nstglobalcomm,
+ ddxyz, dd_node_order, rdd, rconstr,
+ dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
+ nbpu_opt,
+ nsteps_cmdline, nstepout, resetstep, nmultisim,
+ repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
+ cpt_period, max_hours, deviceOptions,
+ Flags);
+ /* the main thread continues here with a new cr. We don't deallocate
+ the old cr because other threads may still be reading it. */
+ if (cr == NULL)
+ {
+ gmx_comm("Failed to spawn threads");
+ }
+ }
+ }
+#endif
+ /* END OF CAUTION: cr is now reliable */
+
+ /* g_membed initialisation.
+ * Because we change the mtop, init_membed is called before init_parallel
+ * (in case we ever want to make it run in parallel). */
+ if (opt2bSet("-membed", nfile, fnm))
+ {
+ if (MASTER(cr))
+ {
+ fprintf(stderr, "Initializing membed");
+ }
+ membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
+ }
+
+ if (PAR(cr))
+ {
+ /* now broadcast everything to the non-master nodes/threads: */
+ init_parallel(cr, inputrec, mtop);
+
+ /* This check needs to happen after get_nthreads_mpi() */
+ if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
+ {
+ gmx_fatal_collective(FARGS, cr, NULL,
+ "The Verlet cut-off scheme is not supported with particle decomposition.\n"
+ "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
+ }
+ }
+ if (fplog != NULL)
+ {
+ pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+ }
+
+ /* now make sure the state is initialized and propagated */
+ set_state_entries(state, inputrec, cr->nnodes);
+
+ /* A parallel command line option consistency check that we can
+ only do after any threads have started. */
+ if (!PAR(cr) &&
+ (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
+ {
+ gmx_fatal(FARGS,
+ "The -dd or -npme option request a parallel simulation, "
+#ifndef GMX_MPI
+ "but %s was compiled without threads or MPI enabled"
+#else
+#ifdef GMX_THREAD_MPI
+ "but the number of threads (option -nt) is 1"
+#else
+ "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
+#endif
+#endif
+ , ShortProgram()
+ );
+ }
+
+ if ((Flags & MD_RERUN) &&
+ (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+ {
+ gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+ }
+
+ if (can_use_allvsall(inputrec, TRUE, cr, fplog) && PAR(cr))
+ {
+ /* Simple neighbour searching and (also?) all-vs-all loops
+ * do not work with domain decomposition. */
+ Flags |= MD_PARTDEC;
+ }
+
+ if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
+ {
+ if (cr->npmenodes > 0)
+ {
+ if (!EEL_PME(inputrec->coulombtype))
+ {
+ gmx_fatal_collective(FARGS, cr, NULL,
+ "PME nodes are requested, but the system does not use PME electrostatics");
+ }
+ if (Flags & MD_PARTDEC)
+ {
+ gmx_fatal_collective(FARGS, cr, NULL,
+ "PME nodes are requested, but particle decomposition does not support separate PME nodes");
+ }
+ }
+
+ cr->npmenodes = 0;
+ }
+
+#ifdef GMX_FAHCORE
+ fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
+#endif
+
+ /* NMR restraints must be initialized before load_checkpoint,
+ * since with time averaging the history is added to t_state.
+ * For proper consistency check we therefore need to extend
+ * t_state here.
+ * So the PME-only nodes (if present) will also initialize
+ * the distance restraints.
+ */
+ snew(fcd, 1);
+
+ /* This needs to be called before read_checkpoint to extend the state */
+ init_disres(fplog, mtop, inputrec, cr, Flags & MD_PARTDEC, fcd, state, repl_ex_nst > 0);
+
+ if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0)
+ {
+ if (PAR(cr) && !(Flags & MD_PARTDEC))
+ {
+ gmx_fatal(FARGS, "Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
+ }
+ /* Orientation restraints */
+ if (MASTER(cr))
+ {
+ init_orires(fplog, mtop, state->x, inputrec, cr->ms, &(fcd->orires),
+ state);
+ }
+ }
+
+ if (DEFORM(*inputrec))
+ {
+ /* Store the deform reference box before reading the checkpoint */
+ if (SIMMASTER(cr))
+ {
+ copy_mat(state->box, box);
+ }
+ if (PAR(cr))
+ {
+ gmx_bcast(sizeof(box), box, cr);
+ }
+ /* Because we do not have the update struct available yet
+ * in which the reference values should be stored,
+ * we store them temporarily in static variables.
+ * This should be thread safe, since they are only written once
+ * and with identical values.
+ */
+#ifdef GMX_THREAD_MPI
+ tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+#endif
+ deform_init_init_step_tpx = inputrec->init_step;
+ copy_mat(box, deform_init_box_tpx);
+#ifdef GMX_THREAD_MPI
+ tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+#endif
+ }
+
+ if (opt2bSet("-cpi", nfile, fnm))
+ {
+ /* Check if checkpoint file exists before doing continuation.
+ * This way we can use identical input options for the first and subsequent runs...
+ */
+ if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
+ {
+ load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
+ cr, Flags & MD_PARTDEC, ddxyz,
+ inputrec, state, &bReadRNG, &bReadEkin,
+ (Flags & MD_APPENDFILES),
+ (Flags & MD_APPENDFILESSET));
+
+ if (bReadRNG)
+ {
+ Flags |= MD_READ_RNG;
+ }
+ if (bReadEkin)
+ {
+ Flags |= MD_READ_EKIN;
+ }
+ }
+ }
+
+ if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
+#ifdef GMX_THREAD_MPI
+ /* With thread MPI only the master node/thread exists in mdrun.c,
+ * therefore non-master nodes need to open the "seppot" log file here.
+ */
+ || (!MASTER(cr) && (Flags & MD_SEPPOT))
+#endif
+ )
+ {
+ gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, !(Flags & MD_SEPPOT),
+ Flags, &fplog);
+ }
+
+ /* override nsteps with value from cmdline */
+ override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
+
+ if (SIMMASTER(cr))
+ {
+ copy_mat(state->box, box);
+ }
+
+ if (PAR(cr))
+ {
+ gmx_bcast(sizeof(box), box, cr);
+ }
+
+ /* Essential dynamics */
+ if (opt2bSet("-ei", nfile, fnm))
+ {
+ /* Open input and output files, allocate space for ED data structure */
+ ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
+ }
+
+ if (PAR(cr) && !((Flags & MD_PARTDEC) ||
+ EI_TPI(inputrec->eI) ||
+ inputrec->eI == eiNM))
+ {
+ cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
+ dddlb_opt, dlb_scale,
+ ddcsx, ddcsy, ddcsz,
+ mtop, inputrec,
+ box, state->x,
+ &ddbox, &npme_major, &npme_minor);
+
+ make_dd_communicators(fplog, cr, dd_node_order);
+
+ /* Set overallocation to avoid frequent reallocation of arrays */
+ set_over_alloc_dd(TRUE);
+ }
+ else
+ {
+ /* PME, if used, is done on all nodes with 1D decomposition */
+ cr->npmenodes = 0;
+ cr->duty = (DUTY_PP | DUTY_PME);
+ npme_major = 1;
+ npme_minor = 1;
+ /* NM and TPI perform single node energy calculations in parallel */
+ if (!(inputrec->eI == eiNM || EI_TPI(inputrec->eI)))
+ {
+ npme_major = cr->nnodes;
+ }
+
+ if (inputrec->ePBC == epbcSCREW)
+ {
+ gmx_fatal(FARGS,
+ "pbc=%s is only implemented with domain decomposition",
+ epbc_names[inputrec->ePBC]);
+ }
+ }
+
+ if (PAR(cr))
+ {
+ /* After possible communicator splitting in make_dd_communicators,
+ * we can set up the intra/inter node communication.
+ */
+ gmx_setup_nodecomm(fplog, cr);
+ }
+
+ /* Initialize per-physical-node MPI process/thread ID and counters. */
+ gmx_init_intranode_counters(cr);
+
+#ifdef GMX_MPI
+ md_print_info(cr, fplog, "Using %d MPI %s\n",
+ cr->nnodes,
+#ifdef GMX_THREAD_MPI
+ cr->nnodes == 1 ? "thread" : "threads"
+#else
+ cr->nnodes == 1 ? "process" : "processes"
+#endif
+ );
+ fflush(stderr);
+#endif
+
+ gmx_omp_nthreads_init(fplog, cr,
+ hwinfo->nthreads_hw_avail,
+ hw_opt->nthreads_omp,
+ hw_opt->nthreads_omp_pme,
+ (cr->duty & DUTY_PP) == 0,
+ inputrec->cutoff_scheme == ecutsVERLET);
+
+ /* check consistency and decide on the number of gpus to use. */
+ gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi,
+ minf.bUseGPU);
+
+ /* Get the number of PP and PME threads.
+ * The PME environment variable should be read on only one node to
+ * make sure it is identical everywhere.
+ */
+ /* TODO nthreads_pp is only used for pinning threads.
+ * This is a temporary solution until we have a hw topology library.
+ */
+ nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
+ nthreads_pme = gmx_omp_nthreads_get(emntPME);
+
+ wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);
+
+ if (PAR(cr))
+ {
+ /* Master synchronizes its value of reset_counters with all nodes
+ * including PME only nodes */
+ reset_counters = wcycle_get_reset_counters(wcycle);
+ gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+ wcycle_set_reset_counters(wcycle, reset_counters);
+ }
+
+ snew(nrnb, 1);
+ if (cr->duty & DUTY_PP)
+ {
+ /* For domain decomposition we allocate dynamically
+ * in dd_partition_system.
+ */
+ if (DOMAINDECOMP(cr))
+ {
+ bcast_state_setup(cr, state);
+ }
+ else
+ {
+ if (PAR(cr))
+ {
+ bcast_state(cr, state, TRUE);
+ }
+ }
+
+ /* Initiate forcerecord */
+ fr = mk_forcerec();
+ fr->hwinfo = hwinfo;
+ init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box,
+ opt2fn("-table", nfile, fnm),
+ opt2fn("-tabletf", nfile, fnm),
+ opt2fn("-tablep", nfile, fnm),
+ opt2fn("-tableb", nfile, fnm),
+ nbpu_opt,
+ FALSE, pforce);
+
+ /* version for PCA_NOT_READ_NODE (see md.c) */
+ /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
+ "nofile","nofile","nofile","nofile",FALSE,pforce);
+ */
+ fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
+
+ /* Initialize QM-MM */
+ if (fr->bQMMM)
+ {
+ init_QMMMrec(cr, mtop, inputrec, fr);
+ }
+
+ /* Initialize the mdatoms structure.
+ * mdatoms is not filled with atom data,
+ * as this cannot be done yet with domain decomposition.
+ */
+ mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);
+
+ if (mdatoms->nPerturbed > 0 && inputrec->cutoff_scheme == ecutsVERLET)
+ {
+ gmx_fatal(FARGS, "The Verlet cut-off scheme does not (yet) support free-energy calculations with perturbed atoms, only perturbed interactions. This will be implemented soon. Use the group scheme for now.");
+ }
+
+ /* Initialize the virtual site communication */
+ vsite = init_vsite(mtop, cr, FALSE);
+
+ calc_shifts(box, fr->shift_vec);
+
+ /* With periodic molecules the charge groups should be whole at start up
+ * and the virtual sites should not be far from their proper positions.
+ */
+ if (!inputrec->bContinuation && MASTER(cr) &&
+ !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+ {
+ /* Make molecules whole at start of run */
+ if (fr->ePBC != epbcNONE)
+ {
+ do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
+ }
+ if (vsite)
+ {
+ /* Correct initial vsite positions are required
+ * for the initial distribution in the domain decomposition
+ * and for the initial shell prediction.
+ */
+ construct_vsites_mtop(vsite, mtop, state->x);
+ }
+ }
+
+ if (EEL_PME(fr->eeltype))
+ {
+ ewaldcoeff = fr->ewaldcoeff;
+ pmedata = &fr->pmedata;
+ }
+ else
+ {
+ pmedata = NULL;
+ }
+ }
+ else
+ {
+ /* This is a PME only node */
+
+ /* We don't need the state */
+ done_state(state);
+
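+ /* calc_ewaldcoeff chooses the Ewald splitting parameter such that
+ * erfc(beta*rcoulomb) roughly equals ewald_rtol, the relative strength
+ * of the direct-space interaction at the cut-off.
+ */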
+ ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
+ snew(pmedata, 1);
+ }
+
+ if (hw_opt->thread_affinity != threadaffOFF)
+ {
+ /* Before setting affinity, check whether the affinity has changed,
+ * which would indicate that the OpenMP library has probably changed
+ * it since we first checked.
+ */
+ gmx_check_thread_affinity_set(fplog, cr,
+ hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
+ /* Set the CPU affinity */
+ gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
+ }
+
+ /* Initiate PME if necessary,
+ * either on all nodes or on dedicated PME nodes only. */
+ if (EEL_PME(inputrec->coulombtype))
+ {
+ if (mdatoms)
+ {
+ nChargePerturbed = mdatoms->nChargePerturbed;
+ }
+ if (cr->npmenodes > 0)
+ {
+ /* The PME only nodes need to know nChargePerturbed */
+ gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+ }
+
+ if (cr->duty & DUTY_PME)
+ {
+ status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
+ mtop ? mtop->natoms : 0, nChargePerturbed,
+ (Flags & MD_REPRODUCIBLE), nthreads_pme);
+ if (status != 0)
+ {
+ gmx_fatal(FARGS, "Error %d initializing PME", status);
+ }
+ }
+ }
+
+
+ if (integrator[inputrec->eI].func == do_md)
+ {
+ /* Turn on signal handling on all nodes */
+ /* A user signal from the PME nodes (if any)
+ * is communicated to the PP nodes.
+ */
+ signal_handler_install();
+ }
+
+ if (cr->duty & DUTY_PP)
+ {
+ /* Assumes uniform use of the number of OpenMP threads */
+ walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+
+ if (inputrec->ePull != epullNO)
+ {
+ /* Initialize pull code */
+ init_pull(fplog, inputrec, nfile, fnm, mtop, cr, oenv, inputrec->fepvals->init_lambda,
+ EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
+ }
+
+ if (inputrec->bRot)
+ {
+ /* Initialize enforced rotation code */
+ init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
+ bVerbose, Flags);
+ }
+
+ constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);
+
+ if (DOMAINDECOMP(cr))
+ {
+ dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
+ Flags & MD_DDBONDCHECK, fr->cginfo_mb);
+
+ set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox);
+
+ setup_dd_grid(fplog, cr->dd);
+ }
+
+ /* Now do whatever the user wants us to do (how flexible...) */
+ integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
+ oenv, bVerbose, bCompact,
+ nstglobalcomm,
+ vsite, constr,
+ nstepout, inputrec, mtop,
+ fcd, state,
+ mdatoms, nrnb, wcycle, ed, fr,
+ repl_ex_nst, repl_ex_nex, repl_ex_seed,
+ membed,
+ cpt_period, max_hours,
+ deviceOptions,
+ Flags,
+ walltime_accounting);
+
+ if (inputrec->ePull != epullNO)
+ {
+ finish_pull(inputrec->pull);
+ }
+
+ if (inputrec->bRot)
+ {
+ finish_rot(inputrec->rot);
+ }
+
+ }
+ else
+ {
+ /* do PME only */
+ walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
+ gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, ewaldcoeff, inputrec);
+ }
+
+ wallcycle_stop(wcycle, ewcRUN);
+
+ /* Finish up and write some final output.
+ * If rerunMD, don't write the last frame again.
+ */
+ finish_run(fplog, cr,
+ inputrec, nrnb, wcycle, walltime_accounting,
+ fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
+ nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
+ EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
+
+ if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
+ {
+ char gpu_err_str[STRLEN];
+
+ /* free GPU memory and uninitialize GPU (by destroying the context) */
+ nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
+
+ if (!free_gpu(gpu_err_str))
+ {
+ gmx_warning("On node %d failed to free GPU #%d: %s",
+ cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+ }
+ }
+
+ if (opt2bSet("-membed", nfile, fnm))
+ {
+ sfree(membed);
+ }
+
+ gmx_hardware_info_free(hwinfo);
+
+ /* Does what it says */
+ print_date_and_time(fplog, cr->nodeid, "Finished mdrun", walltime_accounting);
+ walltime_accounting_destroy(walltime_accounting);
+
+ /* Close logfile already here if we were appending to it */
+ if (MASTER(cr) && (Flags & MD_APPENDFILES))
+ {
+ gmx_log_close(fplog);
+ }
+
+ rc = (int)gmx_get_stop_condition();
+
+#ifdef GMX_THREAD_MPI
+ /* we need to join all threads. The sub-threads join when they
+ exit this function, but the master thread needs to be told to
+ wait for that. */
+ if (PAR(cr) && MASTER(cr))
+ {
+ tMPI_Finalize();
+ }
+#endif
+
+ return rc;
+}