Merge release-4-6 into master

author Roland Schulz <roland@utk.edu>

Wed, 30 Oct 2013 03:03:45 +0000 (23:03 -0400)

committer Roland Schulz <roland@utk.edu>

Wed, 30 Oct 2013 03:04:44 +0000 (23:04 -0400)
author Roland Schulz <roland@utk.edu>
Wed, 30 Oct 2013 03:03:45 +0000 (23:03 -0400)
committer Roland Schulz <roland@utk.edu>
Wed, 30 Oct 2013 03:04:44 +0000 (23:04 -0400)
diff --cc src/gromacs/gmxpreprocess/calc_verletbuf.c

index 385ba89b1cc059ac55a0cd04b90e0f9157b30e93,0000000000000000000000000000000000000000..398e8d3f6b2de021a4aa887b8261c8f827caa385

mode 100644,000000..100644
--- 1/src/gromacs/gmxpreprocess/calc_verletbuf.c
--- /dev/null
+++ b/src/gromacs/gmxpreprocess/calc_verletbuf.c
@@@ -1,723 -1,0 +1,960 @@@
-     real     mass; /* mass */
-     int      type; /* type (used for LJ parameters) */
-     real     q;    /* charge */
-     int      con;  /* constrained: 0, else 1, if 1, use #DOF=2 iso 3 */
-     int      n;    /* total #atoms of this type in the system */
- } verletbuf_atomtype_t;
+ +/*  -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.03
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <assert.h>
+ +
+ +#include <sys/types.h>
+ +#include <math.h>
+ +#include "typedefs.h"
+ +#include "physics.h"
+ +#include "smalloc.h"
+ +#include "gmx_fatal.h"
+ +#include "macros.h"
+ +#include "vec.h"
+ +#include "coulomb.h"
+ +#include "calc_verletbuf.h"
+ +#include "../mdlib/nbnxn_consts.h"
+ +
+ +#ifdef GMX_NBNXN_SIMD
+ +/* The include below sets the SIMD instruction type (precision+width)
+ + * for all nbnxn SIMD search and non-bonded kernel code.
+ + */
+ +#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
+ +#define GMX_USE_HALF_WIDTH_SIMD_HERE
+ +#endif
+ +#include "gmx_simd_macros.h"
+ +#endif
+ +
++
++/* The code in this file estimates a pairlist buffer length
++ * given a target energy drift per atom per picosecond.
++ * This is done by estimating the drift given a buffer length.
++ * Ideally we would like to have a tight overestimate of the drift,
++ * but that can be difficult to achieve.
++ *
++ * Significant approximations used:
++ *
++ * Uniform particle density. UNDERESTIMATES the drift by rho_global/rho_local.
++ *
++ * Interactions don't affect particle motion. OVERESTIMATES the drift on longer
++ * time scales. This approximation probably introduces the largest errors.
++ *
++ * Only take one constraint per particle into account: OVERESTIMATES the drift.
++ *
++ * For rotating constraints assume the same functional shape for time scales
++ * where the constraints rotate significantly as the exact expression for
++ * short time scales. OVERESTIMATES the drift on long time scales.
++ *
++ * For non-linear virtual sites use the mass of the lightest constructing atom
++ * to determine the displacement. OVER/UNDERESTIMATES the drift, depending on
++ * the geometry and masses of constructing atoms.
++ *
++ * Note that the formulas for normal atoms and linear virtual sites are exact,
++ * apart from the first two approximations.
++ *
++ * Note that apart from the effect of the above approximations, the actual
++ * drift of the total energy of a system can be order of magnitude smaller
++ * due to cancellation of positive and negative drift for different pairs.
++ */
++
++
+ +/* Struct for unique atom type for calculating the energy drift.
+ + * The atom displacement depends on mass and constraints.
+ + * The energy jump for given distance depend on LJ type and q.
+ + */
+ +typedef struct
+ +{
-                    real mass, int type, real q, int con, int nmol)
++    real     mass;     /* mass */
++    int      type;     /* type (used for LJ parameters) */
++    real     q;        /* charge */
++    gmx_bool bConstr;  /* constrained, if TRUE, use #DOF=2 iso 3 */
++    real     con_mass; /* mass of heaviest atom connected by constraints */
++    real     con_len;  /* constraint length to the heaviest atom */
++} atom_nonbonded_kinetic_prop_t;
+ +
++/* Struct for unique atom type for calculating the energy drift.
++ * The atom displacement depends on mass and constraints.
++ * The energy jump for given distance depend on LJ type and q.
++ */
++typedef struct
++{
++    atom_nonbonded_kinetic_prop_t prop; /* non-bonded and kinetic atom prop. */
++    int                           n;    /* #atoms of this type in the system */
++} verletbuf_atomtype_t;
+ +
+ +void verletbuf_get_list_setup(gmx_bool                bGPU,
+ +                              verletbuf_list_setup_t *list_setup)
+ +{
+ +    list_setup->cluster_size_i     = NBNXN_CPU_CLUSTER_I_SIZE;
+ +
+ +    if (bGPU)
+ +    {
+ +        list_setup->cluster_size_j = NBNXN_GPU_CLUSTER_SIZE;
+ +    }
+ +    else
+ +    {
+ +#ifndef GMX_NBNXN_SIMD
+ +        list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE;
+ +#else
+ +        list_setup->cluster_size_j = GMX_SIMD_WIDTH_HERE;
+ +#ifdef GMX_NBNXN_SIMD_2XNN
+ +        /* We assume the smallest cluster size to be on the safe side */
+ +        list_setup->cluster_size_j /= 2;
+ +#endif
+ +#endif
+ +    }
+ +}
+ +
++static gmx_bool
++atom_nonbonded_kinetic_prop_equal(const atom_nonbonded_kinetic_prop_t *prop1,
++                                  const atom_nonbonded_kinetic_prop_t *prop2)
++{
++    return (prop1->mass     == prop2->mass &&
++            prop1->type     == prop2->type &&
++            prop1->q        == prop2->q &&
++            prop1->bConstr  == prop2->bConstr &&
++            prop1->con_mass == prop2->con_mass &&
++            prop1->con_len  == prop2->con_len);
++}
++
+ +static void add_at(verletbuf_atomtype_t **att_p, int *natt_p,
-     int                   natt, i;
++                   const atom_nonbonded_kinetic_prop_t *prop,
++                   int nmol)
+ +{
+ +    verletbuf_atomtype_t *att;
-     if (mass == 0)
++    int                     natt, i;
+ +
-     while (i < natt &&
-            !(mass == att[i].mass &&
-              type == att[i].type &&
-              q    == att[i].q &&
-              con  == att[i].con))
++    if (prop->mass == 0)
+ +    {
+ +        /* Ignore massless particles */
+ +        return;
+ +    }
+ +
+ +    att  = *att_p;
+ +    natt = *natt_p;
+ +
+ +    i = 0;
-         (*att_p)[i].mass = mass;
-         (*att_p)[i].type = type;
-         (*att_p)[i].q    = q;
-         (*att_p)[i].con  = con;
++    while (i < natt && !atom_nonbonded_kinetic_prop_equal(prop, &att[i].prop))
+ +    {
+ +        i++;
+ +    }
+ +
+ +    if (i < natt)
+ +    {
+ +        att[i].n += nmol;
+ +    }
+ +    else
+ +    {
+ +        (*natt_p)++;
+ +        srenew(*att_p, *natt_p);
-     verletbuf_atomtype_t *att;
-     int                   natt;
-     int                   mb, nmol, ft, i, j, a1, a2, a3, a;
-     const t_atoms        *atoms;
-     const t_ilist        *il;
-     const t_atom         *at;
-     const t_iparams      *ip;
-     real                 *con_m, *vsite_m, cam[5];
++        (*att_p)[i].prop = *prop;
+ +        (*att_p)[i].n    = nmol;
+ +    }
+ +}
+ +
++static void get_vsite_masses(const gmx_moltype_t *moltype,
++                             const gmx_ffparams_t *ffparams,
++                             real *vsite_m,
++                             int *n_nonlin_vsite)
++{
++    int            ft, i;
++    const t_ilist *il;
++
++    *n_nonlin_vsite = 0;
++
++    /* Check for virtual sites, determine mass from constructing atoms */
++    for (ft = 0; ft < F_NRE; ft++)
++    {
++        if (IS_VSITE(ft))
++        {
++            il = &moltype->ilist[ft];
++
++            for (i = 0; i < il->nr; i += 1+NRAL(ft))
++            {
++                const t_iparams *ip;
++                real             cam[5], inv_mass, m_aj;
++                int              a1, j, aj, coeff;
++
++                ip = &ffparams->iparams[il->iatoms[i]];
++
++                a1 = il->iatoms[i+1];
++
++                if (ft != F_VSITEN)
++                {
++                    for (j = 1; j < NRAL(ft); j++)
++                    {
++                        cam[j] = moltype->atoms.atom[il->iatoms[i+1+j]].m;
++                        if (cam[j] == 0)
++                        {
++                            cam[j] = vsite_m[il->iatoms[i+1+j]];
++                        }
++                        if (cam[j] == 0)
++                        {
++                            gmx_fatal(FARGS, "In molecule type '%s' %s construction involves atom %d, which is a virtual site of equal or high complexity. This is not supported.",
++                                      *moltype->name,
++                                      interaction_function[ft].longname,
++                                      il->iatoms[i+1+j]+1);
++                        }
++                    }
++                }
++
++                switch (ft)
++                {
++                    case F_VSITE2:
++                        /* Exact */
++                        vsite_m[a1] = (cam[1]*cam[2])/(cam[2]*sqr(1-ip->vsite.a) + cam[1]*sqr(ip->vsite.a));
++                        break;
++                    case F_VSITE3:
++                        /* Exact */
++                        vsite_m[a1] = (cam[1]*cam[2]*cam[3])/(cam[2]*cam[3]*sqr(1-ip->vsite.a-ip->vsite.b) + cam[1]*cam[3]*sqr(ip->vsite.a) + cam[1]*cam[2]*sqr(ip->vsite.b));
++                        break;
++                    case F_VSITEN:
++                        /* Exact */
++                        inv_mass = 0;
++                        for (j = 0; j < 3*ip->vsiten.n; j += 3)
++                        {
++                            aj    = il->iatoms[i+j+2];
++                            coeff = ip[il->iatoms[i+j]].vsiten.a;
++                            if (moltype->atoms.atom[aj].ptype == eptVSite)
++                            {
++                                m_aj = vsite_m[aj];
++                            }
++                            else
++                            {
++                                m_aj = moltype->atoms.atom[aj].m;
++                            }
++                            if (m_aj <= 0)
++                            {
++                                gmx_incons("The mass of a vsiten constructing atom is <= 0");
++                            }
++                            inv_mass += coeff*coeff/m_aj;
++                        }
++                        vsite_m[a1] = 1/inv_mass;
++                        break;
++                    default:
++                        /* Use the mass of the lightest constructing atom.
++                         * This is an approximation.
++                         * If the distance of the virtual site to the
++                         * constructing atom is less than all distances
++                         * between constructing atoms, this is a safe
++                         * over-estimate of the displacement of the vsite.
++                         * This condition holds for all H mass replacement
++                         * vsite constructions, except for SP2/3 groups.
++                         * In SP3 groups one H will have a F_VSITE3
++                         * construction, so even there the total drift
++                         * estimate shouldn't be far off.
++                         */
++                        assert(j >= 1);
++                        vsite_m[a1] = cam[1];
++                        for (j = 2; j < NRAL(ft); j++)
++                        {
++                            vsite_m[a1] = min(vsite_m[a1], cam[j]);
++                        }
++                        (*n_nonlin_vsite)++;
++                        break;
++                }
++                if (gmx_debug_at)
++                {
++                    fprintf(debug, "atom %4d %-20s mass %6.3f\n",
++                            a1, interaction_function[ft].longname, vsite_m[a1]);
++                }
++            }
++        }
++    }
++}
++
+ +static void get_verlet_buffer_atomtypes(const gmx_mtop_t      *mtop,
+ +                                        verletbuf_atomtype_t **att_p,
+ +                                        int                   *natt_p,
+ +                                        int                   *n_nonlin_vsite)
+ +{
-         /* Check for constraints, as they affect the kinetic energy */
-         snew(con_m, atoms->nr);
++    verletbuf_atomtype_t          *att;
++    int                            natt;
++    int                            mb, nmol, ft, i, a1, a2, a3, a;
++    const t_atoms                 *atoms;
++    const t_ilist                 *il;
++    const t_iparams               *ip;
++    atom_nonbonded_kinetic_prop_t *prop;
++    real                          *vsite_m;
++    int                            n_nonlin_vsite_mol;
+ +
+ +    att  = NULL;
+ +    natt = 0;
+ +
+ +    if (n_nonlin_vsite != NULL)
+ +    {
+ +        *n_nonlin_vsite = 0;
+ +    }
+ +
+ +    for (mb = 0; mb < mtop->nmolblock; mb++)
+ +    {
+ +        nmol = mtop->molblock[mb].nmol;
+ +
+ +        atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ +
-                 con_m[a1] += atoms->atom[a2].m;
-                 con_m[a2] += atoms->atom[a1].m;
++        /* Check for constraints, as they affect the kinetic energy.
++         * For virtual sites we need the masses and geometry of
++         * the constructing atoms to determine their velocity distribution.
++         */
++        snew(prop, atoms->nr);
+ +        snew(vsite_m, atoms->nr);
+ +
+ +        for (ft = F_CONSTR; ft <= F_CONSTRNC; ft++)
+ +        {
+ +            il = &mtop->moltype[mtop->molblock[mb].type].ilist[ft];
+ +
+ +            for (i = 0; i < il->nr; i += 1+NRAL(ft))
+ +            {
++                ip         = &mtop->ffparams.iparams[il->iatoms[i]];
+ +                a1         = il->iatoms[i+1];
+ +                a2         = il->iatoms[i+2];
-             con_m[a1] += atoms->atom[a2].m + atoms->atom[a3].m;
-             con_m[a2] += atoms->atom[a1].m + atoms->atom[a3].m;
-             con_m[a3] += atoms->atom[a1].m + atoms->atom[a2].m;
-         }
- 
-         /* Check for virtual sites, determine mass from constructing atoms */
-         for (ft = 0; ft < F_NRE; ft++)
-         {
-             if (IS_VSITE(ft))
-             {
-                 il = &mtop->moltype[mtop->molblock[mb].type].ilist[ft];
- 
-                 for (i = 0; i < il->nr; i += 1+NRAL(ft))
-                 {
-                     ip = &mtop->ffparams.iparams[il->iatoms[i]];
++                if (atoms->atom[a2].m > prop[a1].con_mass)
++                {
++                    prop[a1].con_mass = atoms->atom[a2].m;
++                    prop[a1].con_len  = ip->constr.dA;
++                }
++                if (atoms->atom[a1].m > prop[a2].con_mass)
++                {
++                    prop[a2].con_mass = atoms->atom[a1].m;
++                    prop[a2].con_len  = ip->constr.dA;
++                }
+ +            }
+ +        }
+ +
+ +        il = &mtop->moltype[mtop->molblock[mb].type].ilist[F_SETTLE];
+ +
+ +        for (i = 0; i < il->nr; i += 1+NRAL(F_SETTLE))
+ +        {
++            ip         = &mtop->ffparams.iparams[il->iatoms[i]];
+ +            a1         = il->iatoms[i+1];
+ +            a2         = il->iatoms[i+2];
+ +            a3         = il->iatoms[i+3];
-                     a1 = il->iatoms[i+1];
++            /* Usually the mass of a1 (usually oxygen) is larger than a2/a3.
++             * If this is not the case, we overestimate the displacement,
++             * which leads to a larger buffer (ok since this is an exotic case).
++             */
++            prop[a1].con_mass = atoms->atom[a2].m;
++            prop[a1].con_len  = ip->settle.doh;
+ +
-                     for (j = 1; j < NRAL(ft); j++)
-                     {
-                         cam[j] = atoms->atom[il->iatoms[i+1+j]].m;
-                         if (cam[j] == 0)
-                         {
-                             cam[j] = vsite_m[il->iatoms[i+1+j]];
-                         }
-                         if (cam[j] == 0)
-                         {
-                             gmx_fatal(FARGS, "In molecule type '%s' %s construction involves atom %d, which is a virtual site of equal or high complexity. This is not supported.",
-                                       *mtop->moltype[mtop->molblock[mb].type].name,
-                                       interaction_function[ft].longname,
-                                       il->iatoms[i+1+j]+1);
-                         }
-                     }
++            prop[a2].con_mass = atoms->atom[a1].m;
++            prop[a2].con_len  = ip->settle.doh;
+ +
-                     switch (ft)
-                     {
-                         case F_VSITE2:
-                             /* Exact except for ignoring constraints */
-                             vsite_m[a1] = (cam[2]*sqr(1-ip->vsite.a) + cam[1]*sqr(ip->vsite.a))/(cam[1]*cam[2]);
-                             break;
-                         case F_VSITE3:
-                             /* Exact except for ignoring constraints */
-                             vsite_m[a1] = (cam[2]*cam[3]*sqr(1-ip->vsite.a-ip->vsite.b) + cam[1]*cam[3]*sqr(ip->vsite.a) + cam[1]*cam[2]*sqr(ip->vsite.b))/(cam[1]*cam[2]*cam[3]);
-                             break;
-                         default:
-                             /* Use the mass of the lightest constructing atom.
-                              * This is an approximation.
-                              * If the distance of the virtual site to the
-                              * constructing atom is less than all distances
-                              * between constructing atoms, this is a safe
-                              * over-estimate of the displacement of the vsite.
-                              * This condition holds for all H mass replacement
-                              * replacement vsite constructions, except for SP2/3
-                              * groups. In SP3 groups one H will have a F_VSITE3
-                              * construction, so even there the total drift
-                              * estimation shouldn't be far off.
-                              */
-                             assert(j >= 1);
-                             vsite_m[a1] = cam[1];
-                             for (j = 2; j < NRAL(ft); j++)
-                             {
-                                 vsite_m[a1] = min(vsite_m[a1], cam[j]);
-                             }
-                             if (n_nonlin_vsite != NULL)
-                             {
-                                 *n_nonlin_vsite += nmol;
-                             }
-                             break;
-                     }
-                 }
-             }
++            prop[a3].con_mass = atoms->atom[a1].m;
++            prop[a3].con_len  = ip->settle.doh;
++        }
+ +
-             at = &atoms->atom[a];
-             /* We consider an atom constrained, #DOF=2, when it is
-              * connected with constraints to one or more atoms with
-              * total mass larger than 1.5 that of the atom itself.
++        get_vsite_masses(&mtop->moltype[mtop->molblock[mb].type],
++                         &mtop->ffparams,
++                         vsite_m,
++                         &n_nonlin_vsite_mol);
++        if (n_nonlin_vsite != NULL)
++        {
++            *n_nonlin_vsite += nmol*n_nonlin_vsite_mol;
+ +        }
+ +
+ +        for (a = 0; a < atoms->nr; a++)
+ +        {
-             add_at(&att, &natt,
-                    at->m, at->type, at->q, con_m[a] > 1.5*at->m, nmol);
++            if (atoms->atom[a].ptype == eptVSite)
++            {
++                prop[a].mass = vsite_m[a];
++            }
++            else
++            {
++                prop[a].mass = atoms->atom[a].m;
++            }
++            prop[a].type     = atoms->atom[a].type;
++            prop[a].q        = atoms->atom[a].q;
++             /* We consider an atom constrained, #DOF=2, when it is
++             * connected with constraints to (at least one) atom with
++             * a mass of more than 0.4x its own mass. This is not a critical
++             * parameter, since with roughly equal masses the unconstrained
++             * and constrained displacement will not differ much (and both
++             * overestimate the displacement).
+ +             */
-         sfree(con_m);
++            prop[a].bConstr = (prop[a].con_mass > 0.4*prop[a].mass);
++
++            add_at(&att, &natt, &prop[a], nmol);
+ +        }
+ +
+ +        sfree(vsite_m);
-             fprintf(debug, "type %d: m %5.2f t %d q %6.3f con %d n %d\n",
-                     a, att[a].mass, att[a].type, att[a].q, att[a].con, att[a].n);
++        sfree(prop);
+ +    }
+ +
+ +    if (gmx_debug_at)
+ +    {
+ +        for (a = 0; a < natt; a++)
+ +        {
- static void approx_2dof(real s2, real x,
-                         real *shift, real *scale)
++            fprintf(debug, "type %d: m %5.2f t %d q %6.3f con %d con_m %5.3f con_l %5.3f n %d\n",
++                    a, att[a].prop.mass, att[a].prop.type, att[a].prop.q,
++                    att[a].prop.bConstr, att[a].prop.con_mass, att[a].prop.con_len, 
++                    att[a].n);
+ +        }
+ +    }
+ +
+ +    *att_p  = att;
+ +    *natt_p = natt;
+ +}
+ +
-     real   s2i, s2j, s2, s;
++/* This function computes two components of the estimate of the variance
++ * in the displacement of one atom in a system of two constrained atoms.
++ * Returns in sigma2_2d the variance due to rotation of the constrained
++ * atom around the atom to which it constrained.
++ * Returns in sigma2_3d the variance due to displacement of the COM
++ * of the whole system of the two constrained atoms.
++ *
++ * Note that we only take a single constraint (the one to the heaviest atom)
++ * into account. If an atom has multiple constraints, this will result in
++ * an overestimate of the displacement, which gives a larger drift and buffer.
++ */
++static void constrained_atom_sigma2(real kT_fac,
++                                    const atom_nonbonded_kinetic_prop_t *prop,
++                                    real *sigma2_2d,
++                                    real *sigma2_3d)
++{
++    real sigma2_rot;
++    real com_dist;
++    real sigma2_rel;
++    real scale;
++
++    /* Here we decompose the motion of a constrained atom into two
++     * components: rotation around the COM and translation of the COM.
++     */
++
++    /* Determine the variance for the displacement of the rotational mode */
++    sigma2_rot = kT_fac/(prop->mass*(prop->mass + prop->con_mass)/prop->con_mass);
++
++    /* The distance from the atom to the COM, i.e. the rotational arm */
++    com_dist = prop->con_len*prop->con_mass/(prop->mass + prop->con_mass);
++
++    /* The variance relative to the arm */
++    sigma2_rel = sigma2_rot/(com_dist*com_dist);
++    /* At 6 the scaling formula has slope 0,
++     * so we keep sigma2_2d constant after that.
++     */
++    if (sigma2_rel < 6)
++    {
++        /* A constrained atom rotates around the atom it is constrained to.
++         * This results in a smaller linear displacement than for a free atom.
++         * For a perfectly circular displacement, this lowers the displacement
++         * by: 1/arcsin(arc_length)
++         * and arcsin(x) = 1 + x^2/6 + ...
++         * For sigma2_rel<<1 the displacement distribution is erfc
++         * (exact formula is provided below). For larger sigma, it is clear
++         * that the displacement can't be larger than 2*com_dist.
++         * It turns out that the distribution becomes nearly uniform.
++         * For intermediate sigma2_rel, scaling down sigma with the third
++         * order expansion of arcsin with argument sigma_rel turns out
++         * to give a very good approximation of the distribution and variance.
++         * Even for larger values, the variance is only slightly overestimated.
++         * Note that the most relevant displacements are in the long tail.
++         * This rotation approximation always overestimates the tail (which
++         * runs to infinity, whereas it should be <= 2*com_dist).
++         * Thus we always overestimate the drift and the buffer size.
++         */
++        scale      = 1/(1 + sigma2_rel/6);
++        *sigma2_2d = sigma2_rot*scale*scale;
++    }
++    else
++    {
++        /* sigma_2d is set to the maximum given by the scaling above.
++         * For large sigma2 the real displacement distribution is close
++         * to uniform over -2*con_len to 2*com_dist.
++         * Our erfc with sigma_2d=sqrt(1.5)*com_dist (which means the sigma
++         * of the erfc output distribution is con_dist) overestimates
++         * the variance and additionally has a long tail. This means
++         * we have a (safe) overestimation of the drift.
++         */
++        *sigma2_2d = 1.5*com_dist*com_dist;
++    }
++
++    /* The constrained atom also moves (in 3D) with the COM of both atoms */
++    *sigma2_3d = kT_fac/(prop->mass + prop->con_mass);
++}
++
++static void get_atom_sigma2(real kT_fac,
++                            const atom_nonbonded_kinetic_prop_t *prop,
++                            real *sigma2_2d,
++                            real *sigma2_3d)
++{
++    if (prop->bConstr)
++    {
++        /* Complicated constraint calculation in a separate function */
++        constrained_atom_sigma2(kT_fac, prop, sigma2_2d, sigma2_3d);
++    }
++    else
++    {
++        /* Unconstrained atom: trivial */
++        *sigma2_2d = 0;
++        *sigma2_3d = kT_fac/prop->mass;
++    }
++}
++
++static void approx_2dof(real s2, real x, real *shift, real *scale)
+ +{
+ +    /* A particle with 1 DOF constrained has 2 DOFs instead of 3.
+ +     * This code is also used for particles with multiple constraints,
+ +     * in which case we overestimate the displacement.
+ +     * The 2DOF distribution is sqrt(pi/2)*erfc(r/(sqrt(2)*s))/(2*s).
+ +     * We approximate this with scale*Gaussian(s,r+shift),
+ +     * by matching the distribution value and derivative at x.
+ +     * This is a tight overestimate for all r>=0 at any s and x.
+ +     */
+ +    real ex, er;
+ +
+ +    ex = exp(-x*x/(2*s2));
+ +    er = gmx_erfc(x/sqrt(2*s2));
+ +
+ +    *shift = -x + sqrt(2*s2/M_PI)*ex/er;
+ +    *scale = 0.5*M_PI*exp(ex*ex/(M_PI*er*er))*er;
+ +}
+ +
+ +static real ener_drift(const verletbuf_atomtype_t *att, int natt,
+ +                       const gmx_ffparams_t *ffp,
+ +                       real kT_fac,
+ +                       real md_ljd, real md_ljr, real md_el, real dd_el,
+ +                       real r_buffer,
+ +                       real rlist, real boxvol)
+ +{
+ +    double drift_tot, pot1, pot2, pot;
+ +    int    i, j;
-         s2i = kT_fac/att[i].mass;
-         ti  = att[i].type;
++    real   s2i_2d, s2i_3d, s2j_2d, s2j_3d, s2, s;
+ +    int    ti, tj;
+ +    real   md, dd;
+ +    real   sc_fac, rsh;
+ +    double c_exp, c_erfc;
+ +
+ +    drift_tot = 0;
+ +
+ +    /* Loop over the different atom type pairs */
+ +    for (i = 0; i < natt; i++)
+ +    {
-             s2j = kT_fac/att[j].mass;
-             tj  = att[j].type;
++        get_atom_sigma2(kT_fac, &att[i].prop, &s2i_2d, &s2i_3d);
++        ti = att[i].prop.type;
+ +
+ +        for (j = i; j < natt; j++)
+ +        {
-                 md_el*att[i].q*att[j].q;
++            get_atom_sigma2(kT_fac, &att[j].prop, &s2j_2d, &s2j_3d);
++            tj = att[j].prop.type;
++
++            /* Add up the up to four independent variances */
++            s2 = s2i_2d + s2i_3d + s2j_2d + s2j_3d; 
+ +
+ +            /* Note that attractive and repulsive potentials for individual
+ +             * pairs will partially cancel.
+ +             */
+ +            /* -dV/dr at the cut-off for LJ + Coulomb */
+ +            md =
+ +                md_ljd*ffp->iparams[ti*ffp->atnr+tj].lj.c6 +
+ +                md_ljr*ffp->iparams[ti*ffp->atnr+tj].lj.c12 +
-             dd = dd_el*att[i].q*att[j].q;
- 
-             s2  = s2i + s2j;
++                md_el*att[i].prop.q*att[j].prop.q;
+ +
+ +            /* d2V/dr2 at the cut-off for Coulomb, we neglect LJ */
-             if (att[i].con)
++            dd = dd_el*att[i].prop.q*att[j].prop.q;
+ +
+ +            rsh    = r_buffer;
+ +            sc_fac = 1.0;
+ +            /* For constraints: adapt r and scaling for the Gaussian */
-                 approx_2dof(s2i, r_buffer*s2i/s2, &sh, &sc);
++            if (att[i].prop.bConstr)
+ +            {
+ +                real sh, sc;
-             if (att[j].con)
++
++                approx_2dof(s2i_2d, r_buffer*s2i_2d/s2, &sh, &sc);
+ +                rsh    += sh;
+ +                sc_fac *= sc;
+ +            }
-                 approx_2dof(s2j, r_buffer*s2j/s2, &sh, &sc);
++            if (att[j].prop.bConstr)
+ +            {
+ +                real sh, sc;
-                 fprintf(debug, "n %d %d d s %.3f %.3f con %d md %8.1e dd %8.1e pot1 %8.1e pot2 %8.1e pot %8.1e\n",
-                         att[i].n, att[j].n, sqrt(s2i), sqrt(s2j),
-                         att[i].con+att[j].con,
++
++                approx_2dof(s2j_2d, r_buffer*s2j_2d/s2, &sh, &sc);
+ +                rsh    += sh;
+ +                sc_fac *= sc;
+ +            }
+ +
+ +            /* Exact contribution of an atom pair with Gaussian displacement
+ +             * with sigma s to the energy drift for a potential with
+ +             * derivative -md and second derivative dd at the cut-off.
+ +             * The only catch is that for potentials that change sign
+ +             * near the cut-off there could be an unlucky compensation
+ +             * of positive and negative energy drift.
+ +             * Such potentials are extremely rare though.
+ +             *
+ +             * Note that pot has unit energy*length, as the linear
+ +             * atom density still needs to be put in.
+ +             */
+ +            c_exp  = exp(-rsh*rsh/(2*s2))/sqrt(2*M_PI);
+ +            c_erfc = 0.5*gmx_erfc(rsh/(sqrt(2*s2)));
+ +            s      = sqrt(s2);
+ +
+ +            pot1 = sc_fac*
+ +                md/2*((rsh*rsh + s2)*c_erfc - rsh*s*c_exp);
+ +            pot2 = sc_fac*
+ +                dd/6*(s*(rsh*rsh + 2*s2)*c_exp - rsh*(rsh*rsh + 3*s2)*c_erfc);
+ +            pot = pot1 + pot2;
+ +
+ +            if (gmx_debug_at)
+ +            {
-                 att[i].mass = 1;
++                fprintf(debug, "n %d %d d s %.3f %.3f %.3f %.3f con %d md %8.1e dd %8.1e pot1 %8.1e pot2 %8.1e pot %8.1e\n",
++                        att[i].n, att[j].n,
++                        sqrt(s2i_2d), sqrt(s2i_3d),
++                        sqrt(s2j_2d), sqrt(s2j_3d),
++                        att[i].prop.bConstr+att[j].prop.bConstr,
+ +                        md, dd, pot1, pot2, pot);
+ +            }
+ +
+ +            /* Multiply by the number of atom pairs */
+ +            if (j == i)
+ +            {
+ +                pot *= (double)att[i].n*(att[i].n - 1)/2;
+ +            }
+ +            else
+ +            {
+ +                pot *= (double)att[i].n*att[j].n;
+ +            }
+ +            /* We need the line density to get the energy drift of the system.
+ +             * The effective average r^2 is close to (rlist+sigma)^2.
+ +             */
+ +            pot *= 4*M_PI*sqr(rlist + s)/boxvol;
+ +
+ +            /* Add the unsigned drift to avoid cancellation of errors */
+ +            drift_tot += fabs(pot);
+ +        }
+ +    }
+ +
+ +    return drift_tot;
+ +}
+ +
+ +static real surface_frac(int cluster_size, real particle_distance, real rlist)
+ +{
+ +    real d, area_rel;
+ +
+ +    if (rlist < 0.5*particle_distance)
+ +    {
+ +        /* We have non overlapping spheres */
+ +        return 1.0;
+ +    }
+ +
+ +    /* Half the inter-particle distance relative to rlist */
+ +    d = 0.5*particle_distance/rlist;
+ +
+ +    /* Determine the area of the surface at distance rlist to the closest
+ +     * particle, relative to surface of a sphere of radius rlist.
+ +     * The formulas below assume close to cubic cells for the pair search grid,
+ +     * which the pair search code tries to achieve.
+ +     * Note that in practice particle distances will not be delta distributed,
+ +     * but have some spread, often involving shorter distances,
+ +     * as e.g. O-H bonds in a water molecule. Thus the estimates below will
+ +     * usually be slightly too high and thus conservative.
+ +     */
+ +    switch (cluster_size)
+ +    {
+ +        case 1:
+ +            /* One particle: trivial */
+ +            area_rel = 1.0;
+ +            break;
+ +        case 2:
+ +            /* Two particles: two spheres at fractional distance 2*a */
+ +            area_rel = 1.0 + d;
+ +            break;
+ +        case 4:
+ +            /* We assume a perfect, symmetric tetrahedron geometry.
+ +             * The surface around a tetrahedron is too complex for a full
+ +             * analytical solution, so we use a Taylor expansion.
+ +             */
+ +            area_rel = (1.0 + 1/M_PI*(6*acos(1/sqrt(3))*d +
+ +                                      sqrt(3)*d*d*(1.0 +
+ +                                                   5.0/18.0*d*d +
+ +                                                   7.0/45.0*d*d*d*d +
+ +                                                   83.0/756.0*d*d*d*d*d*d)));
+ +            break;
+ +        default:
+ +            gmx_incons("surface_frac called with unsupported cluster_size");
+ +            area_rel = 1.0;
+ +    }
+ +
+ +    return area_rel/cluster_size;
+ +}
+ +
+ +void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
+ +                             const t_inputrec *ir, real drift_target,
+ +                             const verletbuf_list_setup_t *list_setup,
+ +                             int *n_nonlin_vsite,
+ +                             real *rlist)
+ +{
+ +    double                resolution;
+ +    char                 *env;
+ +
+ +    real                  particle_distance;
+ +    real                  nb_clust_frac_pairs_not_in_list_at_cutoff;
+ +
+ +    verletbuf_atomtype_t *att  = NULL;
+ +    int                   natt = -1, i;
+ +    double                reppow;
+ +    real                  md_ljd, md_ljr, md_el, dd_el;
+ +    real                  elfac;
+ +    real                  kT_fac, mass_min;
+ +    int                   ib0, ib1, ib;
+ +    real                  rb, rl;
+ +    real                  drift;
+ +
+ +    /* Resolution of the buffer size */
+ +    resolution = 0.001;
+ +
+ +    env = getenv("GMX_VERLET_BUFFER_RES");
+ +    if (env != NULL)
+ +    {
+ +        sscanf(env, "%lf", &resolution);
+ +    }
+ +
+ +    /* In an atom wise pair-list there would be no pairs in the list
+ +     * beyond the pair-list cut-off.
+ +     * However, we use a pair-list of groups vs groups of atoms.
+ +     * For groups of 4 atoms, the parallelism of SSE instructions, only
+ +     * 10% of the atoms pairs are not in the list just beyond the cut-off.
+ +     * As this percentage increases slowly compared to the decrease of the
+ +     * Gaussian displacement distribution over this range, we can simply
+ +     * reduce the drift by this fraction.
+ +     * For larger groups, e.g. of 8 atoms, this fraction will be lower,
+ +     * so then buffer size will be on the conservative (large) side.
+ +     *
+ +     * Note that the formulas used here do not take into account
+ +     * cancellation of errors which could occur by missing both
+ +     * attractive and repulsive interactions.
+ +     *
+ +     * The only major assumption is homogeneous particle distribution.
+ +     * For an inhomogeneous system, such as a liquid-vapor system,
+ +     * the buffer will be underestimated. The actual energy drift
+ +     * will be higher by the factor: local/homogeneous particle density.
+ +     *
+ +     * The results of this estimate have been checked againt simulations.
+ +     * In most cases the real drift differs by less than a factor 2.
+ +     */
+ +
+ +    /* Worst case assumption: HCP packing of particles gives largest distance */
+ +    particle_distance = pow(boxvol*sqrt(2)/mtop->natoms, 1.0/3.0);
+ +
+ +    get_verlet_buffer_atomtypes(mtop, &att, &natt, n_nonlin_vsite);
+ +    assert(att != NULL && natt >= 0);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "particle distance assuming HCP packing: %f nm\n",
+ +                particle_distance);
+ +        fprintf(debug, "energy drift atom types: %d\n", natt);
+ +    }
+ +
+ +    reppow = mtop->ffparams.reppow;
+ +    md_ljd = 0;
+ +    md_ljr = 0;
+ +    if (ir->vdwtype == evdwCUT)
+ +    {
+ +        /* -dV/dr of -r^-6 and r^-repporw */
+ +        md_ljd = -6*pow(ir->rvdw, -7.0);
+ +        md_ljr = reppow*pow(ir->rvdw, -(reppow+1));
+ +        /* The contribution of the second derivative is negligible */
+ +    }
+ +    else
+ +    {
+ +        gmx_fatal(FARGS, "Energy drift calculation is only implemented for plain cut-off Lennard-Jones interactions");
+ +    }
+ +
+ +    elfac = ONE_4PI_EPS0/ir->epsilon_r;
+ +
+ +    /* Determine md=-dV/dr and dd=d^2V/dr^2 */
+ +    md_el = 0;
+ +    dd_el = 0;
+ +    if (ir->coulombtype == eelCUT || EEL_RF(ir->coulombtype))
+ +    {
+ +        real eps_rf, k_rf;
+ +
+ +        if (ir->coulombtype == eelCUT)
+ +        {
+ +            eps_rf = 1;
+ +            k_rf   = 0;
+ +        }
+ +        else
+ +        {
+ +            eps_rf = ir->epsilon_rf/ir->epsilon_r;
+ +            if (eps_rf != 0)
+ +            {
+ +                k_rf = pow(ir->rcoulomb, -3.0)*(eps_rf - ir->epsilon_r)/(2*eps_rf + ir->epsilon_r);
+ +            }
+ +            else
+ +            {
+ +                /* epsilon_rf = infinity */
+ +                k_rf = 0.5*pow(ir->rcoulomb, -3.0);
+ +            }
+ +        }
+ +
+ +        if (eps_rf > 0)
+ +        {
+ +            md_el = elfac*(pow(ir->rcoulomb, -2.0) - 2*k_rf*ir->rcoulomb);
+ +        }
+ +        dd_el = elfac*(2*pow(ir->rcoulomb, -3.0) + 2*k_rf);
+ +    }
+ +    else if (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD)
+ +    {
+ +        real b, rc, br;
+ +
+ +        b     = calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
+ +        rc    = ir->rcoulomb;
+ +        br    = b*rc;
+ +        md_el = elfac*(b*exp(-br*br)*M_2_SQRTPI/rc + gmx_erfc(br)/(rc*rc));
+ +        dd_el = elfac/(rc*rc)*(2*b*(1 + br*br)*exp(-br*br)*M_2_SQRTPI + 2*gmx_erfc(br)/rc);
+ +    }
+ +    else
+ +    {
+ +        gmx_fatal(FARGS, "Energy drift calculation is only implemented for Reaction-Field and Ewald electrostatics");
+ +    }
+ +
+ +    /* Determine the variance of the atomic displacement
+ +     * over nstlist-1 steps: kT_fac
+ +     * For inertial dynamics (not Brownian dynamics) the mass factor
+ +     * is not included in kT_fac, it is added later.
+ +     */
+ +    if (ir->eI == eiBD)
+ +    {
+ +        /* Get the displacement distribution from the random component only.
+ +         * With accurate integration the systematic (force) displacement
+ +         * should be negligible (unless nstlist is extremely large, which
+ +         * you wouldn't do anyhow).
+ +         */
+ +        kT_fac = 2*BOLTZ*ir->opts.ref_t[0]*(ir->nstlist-1)*ir->delta_t;
+ +        if (ir->bd_fric > 0)
+ +        {
+ +            /* This is directly sigma^2 of the displacement */
+ +            kT_fac /= ir->bd_fric;
+ +
+ +            /* Set the masses to 1 as kT_fac is the full sigma^2,
+ +             * but we divide by m in ener_drift().
+ +             */
+ +            for (i = 0; i < natt; i++)
+ +            {
-     mass_min = att[0].mass;
++                att[i].prop.mass = 1;
+ +            }
+ +        }
+ +        else
+ +        {
+ +            real tau_t;
+ +
+ +            /* Per group tau_t is not implemented yet, use the maximum */
+ +            tau_t = ir->opts.tau_t[0];
+ +            for (i = 1; i < ir->opts.ngtc; i++)
+ +            {
+ +                tau_t = max(tau_t, ir->opts.tau_t[i]);
+ +            }
+ +
+ +            kT_fac *= tau_t;
+ +            /* This kT_fac needs to be divided by the mass to get sigma^2 */
+ +        }
+ +    }
+ +    else
+ +    {
+ +        kT_fac = BOLTZ*ir->opts.ref_t[0]*sqr((ir->nstlist-1)*ir->delta_t);
+ +    }
+ +
-         mass_min = min(mass_min, att[i].mass);
++    mass_min = att[0].prop.mass;
+ +    for (i = 1; i < natt; i++)
+ +    {
++        mass_min = min(mass_min, att[i].prop.mass);
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "md_ljd %e md_ljr %e\n", md_ljd, md_ljr);
+ +        fprintf(debug, "md_el %e dd_el %e\n", md_el, dd_el);
+ +        fprintf(debug, "sqrt(kT_fac) %f\n", sqrt(kT_fac));
+ +        fprintf(debug, "mass_min %f\n", mass_min);
+ +    }
+ +
+ +    /* Search using bisection */
+ +    ib0 = -1;
+ +    /* The drift will be neglible at 5 times the max sigma */
+ +    ib1 = (int)(5*2*sqrt(kT_fac/mass_min)/resolution) + 1;
+ +    while (ib1 - ib0 > 1)
+ +    {
+ +        ib = (ib0 + ib1)/2;
+ +        rb = ib*resolution;
+ +        rl = max(ir->rvdw, ir->rcoulomb) + rb;
+ +
+ +        /* Calculate the average energy drift at the last step
+ +         * of the nstlist steps at which the pair-list is used.
+ +         */
+ +        drift = ener_drift(att, natt, &mtop->ffparams,
+ +                           kT_fac,
+ +                           md_ljd, md_ljr, md_el, dd_el, rb,
+ +                           rl, boxvol);
+ +
+ +        /* Correct for the fact that we are using a Ni x Nj particle pair list
+ +         * and not a 1 x 1 particle pair list. This reduces the drift.
+ +         */
+ +        /* We don't have a formula for 8 (yet), use 4 which is conservative */
+ +        nb_clust_frac_pairs_not_in_list_at_cutoff =
+ +            surface_frac(min(list_setup->cluster_size_i, 4),
+ +                         particle_distance, rl)*
+ +            surface_frac(min(list_setup->cluster_size_j, 4),
+ +                         particle_distance, rl);
+ +        drift *= nb_clust_frac_pairs_not_in_list_at_cutoff;
+ +
+ +        /* Convert the drift to drift per unit time per atom */
+ +        drift /= ir->nstlist*ir->delta_t*mtop->natoms;
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "ib %3d %3d %3d rb %.3f %dx%d fac %.3f drift %f\n",
+ +                    ib0, ib, ib1, rb,
+ +                    list_setup->cluster_size_i, list_setup->cluster_size_j,
+ +                    nb_clust_frac_pairs_not_in_list_at_cutoff,
+ +                    drift);
+ +        }
+ +
+ +        if (fabs(drift) > drift_target)
+ +        {
+ +            ib0 = ib;
+ +        }
+ +        else
+ +        {
+ +            ib1 = ib;
+ +        }
+ +    }
+ +
+ +    sfree(att);
+ +
+ +    *rlist = max(ir->rvdw, ir->rcoulomb) + ib1*resolution;
+ +}
diff --cc src/gromacs/legacyheaders/gmx_simd_ref.h
Simple merge
diff --cc src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index a68fa392775362ed41961fda506c39744d1362e4,0000000000000000000000000000000000000000..511d6a3f9df4381b39ed49fce9eb703e4a9e13bd

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@@ -1,710 -1,0 +1,710 @@@
- #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ > 300
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + *
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +
+ +#include <stdlib.h>
+ +#include <assert.h>
+ +
+ +#if defined(_MSVC)
+ +#include <limits>
+ +#endif
+ +
+ +#include <cuda.h>
+ +
+ +#include "types/simple.h" 
+ +#include "types/nbnxn_pairlist.h"
+ +#include "types/nb_verlet.h"
+ +#include "types/ishift.h"
+ +#include "types/force_flags.h"
+ +#include "../nbnxn_consts.h"
+ +
+ +#ifdef TMPI_ATOMICS
+ +#include "thread_mpi/atomic.h"
+ +#endif
+ +
+ +#include "nbnxn_cuda_types.h"
+ +#include "../../gmxlib/cuda_tools/cudautils.cuh"
+ +#include "nbnxn_cuda.h"
+ +#include "nbnxn_cuda_data_mgmt.h"
+ +
++#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
+ +#define USE_TEXOBJ
+ +#endif
+ +
+ +/*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
+ +texture<float, 1, cudaReadModeElementType> nbfp_texref;
+ +
+ +/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
+ +texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
+ +
+ +/* Convenience defines */
+ +#define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
+ +#define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
+ +
+ +/***** The kernels come here *****/
+ +#include "nbnxn_cuda_kernel_utils.cuh"
+ +
+ +/* Top-level kernel generation: will generate through multiple inclusion the
+ + * following flavors for all kernels:
+ + * - force-only output;
+ + * - force and energy output;
+ + * - force-only with pair list pruning;
+ + * - force and energy output with pair list pruning.
+ + */
+ +/** Force only **/
+ +#include "nbnxn_cuda_kernels.cuh"
+ +/** Force & energy **/
+ +#define CALC_ENERGIES
+ +#include "nbnxn_cuda_kernels.cuh"
+ +#undef CALC_ENERGIES
+ +
+ +/*** Pair-list pruning kernels ***/
+ +/** Force only **/
+ +#define PRUNE_NBL
+ +#include "nbnxn_cuda_kernels.cuh"
+ +/** Force & energy **/
+ +#define CALC_ENERGIES
+ +#include "nbnxn_cuda_kernels.cuh"
+ +#undef CALC_ENERGIES
+ +#undef PRUNE_NBL
+ +
+ +/*! Nonbonded kernel function pointer type */
+ +typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
+ +                                     const cu_nbparam_t,
+ +                                     const cu_plist_t,
+ +                                     bool);
+ +
+ +/*********************************/
+ +
+ +/* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
+ +static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
+ +static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
+ +static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
+ +
+ +
+ +/* Bit-pattern used for polling-based GPU synchronization. It is used as a float
+ + * and corresponds to having the exponent set to the maximum (127 -- single
+ + * precision) and the mantissa to 0.
+ + */
+ +static unsigned int poll_wait_pattern = (0x7FU << 23);
+ +
+ +/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
+ +static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
+ +{
+ +    int max_grid_x_size;
+ +
+ +    assert(dinfo);
+ +
+ +    max_grid_x_size = dinfo->prop.maxGridSize[0];
+ +
+ +    /* do we exceed the grid x dimension limit? */
+ +    if (nwork_units > max_grid_x_size)
+ +    {
+ +        gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
+ +                  "The number of nonbonded work units (=number of super-clusters) exceeds the"
+ +                  "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
+ +    }
+ +
+ +    return nwork_units;
+ +}
+ +
+ +
+ +/* Constant arrays listing all kernel function pointers and enabling selection
+ +   of a kernel in an elegant manner. */
+ +
+ +static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
+ +static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
+ +
+ +/*! Pointers to the default kernels organized in a 3 dim array by:
+ + *  electrostatics type, energy calculation on/off, and pruning on/off.
+ + *
+ + *  Note that the order of electrostatics (1st dimension) has to match the
+ + *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
+ + */
+ +static const nbnxn_cu_kfunc_ptr_t
+ +nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
+ +{
+ +    { { k_nbnxn_cutoff,                     k_nbnxn_cutoff_prune },
+ +      { k_nbnxn_cutoff_ener,                k_nbnxn_cutoff_ener_prune } },
+ +    { { k_nbnxn_rf,                         k_nbnxn_rf_prune },
+ +      { k_nbnxn_rf_ener,                    k_nbnxn_rf_ener_prune } },
+ +    { { k_nbnxn_ewald_tab,                  k_nbnxn_ewald_tab_prune },
+ +      { k_nbnxn_ewald_tab_ener,             k_nbnxn_ewald_tab_ener_prune } },
+ +    { { k_nbnxn_ewald_tab_twin,             k_nbnxn_ewald_tab_twin_prune },
+ +      { k_nbnxn_ewald_tab_twin_ener,        k_nbnxn_ewald_twin_ener_prune } },
+ +    { { k_nbnxn_ewald,                      k_nbnxn_ewald_prune },
+ +      { k_nbnxn_ewald_ener,                 k_nbnxn_ewald_ener_prune } },
+ +    { { k_nbnxn_ewald_twin,                 k_nbnxn_ewald_twin_prune },
+ +      { k_nbnxn_ewald_twin_ener,            k_nbnxn_ewald_twin_ener_prune } },
+ +};
+ +
+ +/*! Pointers to the legacy kernels organized in a 3 dim array by:
+ + *  electrostatics type, energy calculation on/off, and pruning on/off.
+ + *
+ + *  Note that the order of electrostatics (1st dimension) has to match the
+ + *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
+ + */
+ +static const nbnxn_cu_kfunc_ptr_t
+ +nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
+ +{
+ +    { { k_nbnxn_cutoff_legacy,              k_nbnxn_cutoff_prune_legacy },
+ +      { k_nbnxn_cutoff_ener_legacy,         k_nbnxn_cutoff_ener_prune_legacy } },
+ +    { { k_nbnxn_rf_legacy,                  k_nbnxn_rf_prune_legacy },
+ +      { k_nbnxn_rf_ener_legacy,             k_nbnxn_rf_ener_prune_legacy } },
+ +    { { k_nbnxn_ewald_tab_legacy,           k_nbnxn_ewald_tab_prune_legacy },
+ +      { k_nbnxn_ewald_tab_ener_legacy,      k_nbnxn_ewald_tab_ener_prune_legacy } },
+ +    { { k_nbnxn_ewald_tab_twin_legacy,      k_nbnxn_ewald_tab_twin_prune_legacy },
+ +      { k_nbnxn_ewald_tab_twin_ener_legacy, k_nbnxn_ewald_tab_twin_ener_prune_legacy } },
+ +};
+ +
+ +/*! Return a pointer to the kernel version to be executed at the current step. */
+ +static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
+ +                                                       bool bDoEne, bool bDoPrune)
+ +{
+ +    assert(kver < eNbnxnCuKNR);
+ +    assert(eeltype < eelCuNR);
+ +
+ +    if (NBNXN_KVER_LEGACY(kver))
+ +    {
+ +        /* no analytical Ewald with legacy kernels */
+ +        assert(eeltype <= eelCuEWALD_TAB_TWIN);
+ +
+ +        return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
+ +    }
+ +    else
+ +    {
+ +        return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
+ +    }
+ +}
+ +
+ +/*! Calculates the amount of shared memory required for kernel version in use. */
+ +static inline int calc_shmem_required(int kver)
+ +{
+ +    int shmem;
+ +
+ +    /* size of shmem (force-buffers/xq/atom type preloading) */
+ +    if (NBNXN_KVER_LEGACY(kver))
+ +    {
+ +        /* i-atom x+q in shared memory */
+ +        shmem =  NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
+ +        /* force reduction buffers in shared memory */
+ +        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
+ +    }
+ +    else
+ +    {
+ +        /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
+ +        /* i-atom x+q in shared memory */
+ +        shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
+ +        /* cj in shared memory, for both warps separately */
+ +        shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
+ +#ifdef IATYPE_SHMEM
+ +        /* i-atom types in shared memory */
+ +        shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
+ +#endif
+ +#if __CUDA_ARCH__ < 300
+ +        /* force reduction buffers in shared memory */
+ +        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
+ +#endif
+ +    }
+ +
+ +    return shmem;
+ +}
+ +
+ +/*! As we execute nonbonded workload in separate streams, before launching 
+ +   the kernel we need to make sure that he following operations have completed:
+ +   - atomdata allocation and related H2D transfers (every nstlist step);
+ +   - pair list H2D transfer (every nstlist step);
+ +   - shift vector H2D transfer (every nstlist step);
+ +   - force (+shift force and energy) output clearing (every step).
+ +
+ +   These operations are issued in the local stream at the beginning of the step
+ +   and therefore always complete before the local kernel launch. The non-local
+ +   kernel is launched after the local on the same device/context, so this is
+ +   inherently scheduled after the operations in the local stream (including the
+ +   above "misc_ops").
+ +   However, for the sake of having a future-proof implementation, we use the
+ +   misc_ops_done event to record the point in time when the above  operations
+ +   are finished and synchronize with this event in the non-local stream.
+ +*/
+ +void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
+ +                              const nbnxn_atomdata_t *nbatom,
+ +                              int flags,
+ +                              int iloc)
+ +{
+ +    cudaError_t stat;
+ +    int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
+ +    /* CUDA kernel launch-related stuff */
+ +    int  shmem, nblock;
+ +    dim3 dim_block, dim_grid;
+ +    nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
+ +
+ +    cu_atomdata_t   *adat   = cu_nb->atdat;
+ +    cu_nbparam_t    *nbp    = cu_nb->nbparam;
+ +    cu_plist_t      *plist  = cu_nb->plist[iloc];
+ +    cu_timers_t     *t      = cu_nb->timers;
+ +    cudaStream_t    stream  = cu_nb->stream[iloc];
+ +
+ +    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
+ +    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ +    bool bDoTime     = cu_nb->bDoTime;
+ +
+ +    /* turn energy calculation always on/off (for debugging/testing only) */
+ +    bCalcEner = (bCalcEner || always_ener) && !never_ener;
+ +
+ +    /* don't launch the kernel if there is no work to do */
+ +    if (plist->nsci == 0)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* calculate the atom data index range based on locality */
+ +    if (LOCAL_I(iloc))
+ +    {
+ +        adat_begin  = 0;
+ +        adat_len    = adat->natoms_local;
+ +    }
+ +    else
+ +    {
+ +        adat_begin  = adat->natoms_local;
+ +        adat_len    = adat->natoms - adat->natoms_local;
+ +    }
+ +
+ +    /* When we get here all misc operations issues in the local stream are done,
+ +       so we record that in the local stream and wait for it in the nonlocal one. */
+ +    if (cu_nb->bUseTwoStreams)
+ +    {
+ +        if (iloc == eintLocal)
+ +        {
+ +            stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
+ +            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
+ +        }
+ +        else
+ +        {
+ +            stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
+ +            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
+ +        }
+ +    }
+ +
+ +    /* beginning of timed HtoD section */
+ +    if (bDoTime)
+ +    {
+ +        stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
+ +        CU_RET_ERR(stat, "cudaEventRecord failed");
+ +    }
+ +
+ +    /* HtoD x, q */
+ +    cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
+ +                      adat_len * sizeof(*adat->xq), stream); 
+ +
+ +    if (bDoTime)
+ +    {
+ +        stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
+ +        CU_RET_ERR(stat, "cudaEventRecord failed");
+ +    }
+ +
+ +    /* beginning of timed nonbonded calculation section */
+ +    if (bDoTime)
+ +    {
+ +        stat = cudaEventRecord(t->start_nb_k[iloc], stream);
+ +        CU_RET_ERR(stat, "cudaEventRecord failed");
+ +    }
+ +
+ +    /* get the pointer to the kernel flavor we need to use */
+ +    nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
+ +                                    plist->bDoPrune || always_prune);
+ +
+ +    /* kernel launch config */
+ +    nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
+ +    dim_block = dim3(CL_SIZE, CL_SIZE, 1);
+ +    dim_grid  = dim3(nblock, 1, 1);
+ +    shmem     = calc_shmem_required(cu_nb->kernel_ver);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
+ +                "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+ +                dim_block.x, dim_block.y, dim_block.z,
+ +                dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
+ +                NCL_PER_SUPERCL, plist->na_c);
+ +    }
+ +
+ +    nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
+ +    CU_LAUNCH_ERR("k_calc_nb");
+ +
+ +    if (bDoTime)
+ +    {
+ +        stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
+ +        CU_RET_ERR(stat, "cudaEventRecord failed");
+ +    }
+ +}
+ +
+ +void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
+ +                               const nbnxn_atomdata_t *nbatom,
+ +                               int flags,
+ +                               int aloc)
+ +{
+ +    cudaError_t stat;
+ +    int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
+ +    int iloc = -1;
+ +
+ +    /* determine interaction locality from atom locality */
+ +    if (LOCAL_A(aloc))
+ +    {
+ +        iloc = eintLocal;
+ +    }
+ +    else if (NONLOCAL_A(aloc))
+ +    {
+ +        iloc = eintNonlocal;
+ +    }
+ +    else
+ +    {
+ +        char stmp[STRLEN];
+ +        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
+ +                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
+ +        gmx_incons(stmp);
+ +    }
+ +
+ +    cu_atomdata_t   *adat   = cu_nb->atdat;
+ +    cu_timers_t     *t      = cu_nb->timers;
+ +    bool            bDoTime = cu_nb->bDoTime;
+ +    cudaStream_t    stream  = cu_nb->stream[iloc];
+ +
+ +    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
+ +    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ +
+ +    /* don't launch copy-back if there was no work to do */
+ +    if (cu_nb->plist[iloc]->nsci == 0)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* calculate the atom data index range based on locality */
+ +    if (LOCAL_A(aloc))
+ +    {
+ +        adat_begin  = 0;
+ +        adat_len    = adat->natoms_local;
+ +        adat_end    = cu_nb->atdat->natoms_local;
+ +    }
+ +    else
+ +    {
+ +        adat_begin  = adat->natoms_local;
+ +        adat_len    = adat->natoms - adat->natoms_local;
+ +        adat_end    = cu_nb->atdat->natoms;
+ +    }
+ +
+ +    /* beginning of timed D2H section */
+ +    if (bDoTime)
+ +    {
+ +        stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
+ +        CU_RET_ERR(stat, "cudaEventRecord failed");
+ +    }
+ +
+ +    if (!cu_nb->bUseStreamSync)
+ +    {
+ +        /* For safety reasons set a few (5%) forces to NaN. This way even if the
+ +           polling "hack" fails with some future NVIDIA driver we'll get a crash. */
+ +        for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
+ +        {
+ +#ifdef NAN
+ +            nbatom->out[0].f[i] = NAN;
+ +#else
+ +#  ifdef _MSVC
+ +            if (numeric_limits<float>::has_quiet_NaN)
+ +            {
+ +                nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
+ +            }
+ +            else
+ +#  endif
+ +            {
+ +                nbatom->out[0].f[i] = GMX_REAL_MAX;
+ +            }
+ +#endif
+ +        }
+ +
+ +        /* Set the last four bytes of the force array to a bit pattern
+ +           which can't be the result of the force calculation:
+ +           max exponent (127) and zero mantissa. */
+ +        *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
+ +    }
+ +
+ +    /* With DD the local D2H transfer can only start after the non-local 
+ +       has been launched. */
+ +    if (iloc == eintLocal && cu_nb->bUseTwoStreams)
+ +    {
+ +        stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
+ +        CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
+ +    }
+ +
+ +    /* DtoH f */
+ +    cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin, 
+ +                      (adat_len)*sizeof(*adat->f), stream);
+ +
+ +    /* After the non-local D2H is launched the nonlocal_done event can be
+ +       recorded which signals that the local D2H can proceed. This event is not
+ +       placed after the non-local kernel because we first need the non-local
+ +       data back first. */
+ +    if (iloc == eintNonlocal)
+ +    {
+ +        stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
+ +        CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
+ +    }
+ +
+ +    /* only transfer energies in the local stream */
+ +    if (LOCAL_I(iloc))
+ +    {
+ +        /* DtoH fshift */
+ +        if (bCalcFshift)
+ +        {
+ +            cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
+ +                              SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
+ +        }
+ +
+ +        /* DtoH energies */
+ +        if (bCalcEner)
+ +        {
+ +            cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
+ +                              sizeof(*cu_nb->nbst.e_lj), stream);
+ +            cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
+ +                              sizeof(*cu_nb->nbst.e_el), stream);
+ +        }
+ +    }
+ +
+ +    if (bDoTime)
+ +    {
+ +        stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
+ +        CU_RET_ERR(stat, "cudaEventRecord failed");
+ +    }
+ +}
+ +
+ +/* Atomic compare-exchange operation on unsigned values. It is used in
+ + * polling wait for the GPU.
+ + */
+ +static inline bool atomic_cas(volatile unsigned int *ptr,
+ +                              unsigned int oldval,
+ +                              unsigned int newval)
+ +{
+ +    assert(ptr);
+ +
+ +#ifdef TMPI_ATOMICS
+ +    return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
+ +#else
+ +    gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
+ +    return true;
+ +#endif
+ +}
+ +
+ +void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
+ +                         const nbnxn_atomdata_t *nbatom,
+ +                         int flags, int aloc,
+ +                         real *e_lj, real *e_el, rvec *fshift)
+ +{
+ +    /* NOTE:  only implemented for single-precision at this time */
+ +    cudaError_t stat;
+ +    int i, adat_end, iloc = -1;
+ +    volatile unsigned int *poll_word;
+ +
+ +    /* determine interaction locality from atom locality */
+ +    if (LOCAL_A(aloc))
+ +    {
+ +        iloc = eintLocal;
+ +    }
+ +    else if (NONLOCAL_A(aloc))
+ +    {
+ +        iloc = eintNonlocal;
+ +    }
+ +    else
+ +    {
+ +        char stmp[STRLEN];
+ +        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
+ +                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
+ +        gmx_incons(stmp);
+ +    }
+ +
+ +    cu_plist_t      *plist   = cu_nb->plist[iloc];
+ +    cu_timers_t     *timers  = cu_nb->timers;
+ +    wallclock_gpu_t *timings = cu_nb->timings;
+ +    nb_staging      nbst     = cu_nb->nbst;
+ +
+ +    bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
+ +    bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ +
+ +    /* turn energy calculation always on/off (for debugging/testing only) */
+ +    bCalcEner = (bCalcEner || always_ener) && !never_ener; 
+ +
+ +    /* don't launch wait/update timers & counters if there was no work to do
+ +
+ +       NOTE: if timing with multiple GPUs (streams) becomes possible, the
+ +       counters could end up being inconsistent due to not being incremented
+ +       on some of the nodes! */
+ +    if (cu_nb->plist[iloc]->nsci == 0)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* calculate the atom data index range based on locality */
+ +    if (LOCAL_A(aloc))
+ +    {
+ +        adat_end = cu_nb->atdat->natoms_local;
+ +    }
+ +    else
+ +    {
+ +        adat_end = cu_nb->atdat->natoms;
+ +    }
+ +
+ +    if (cu_nb->bUseStreamSync)
+ +    {
+ +        stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
+ +        CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
+ +    }
+ +    else 
+ +    {
+ +        /* Busy-wait until we get the signal pattern set in last byte
+ +         * of the l/nl float vector. This pattern corresponds to a floating
+ +         * point number which can't be the result of the force calculation
+ +         * (maximum, 127 exponent and 0 mantissa).
+ +         * The polling uses atomic compare-exchange.
+ +         */
+ +        poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
+ +        while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
+ +    }
+ +
+ +    /* timing data accumulation */
+ +    if (cu_nb->bDoTime)
+ +    {
+ +        /* only increase counter once (at local F wait) */
+ +        if (LOCAL_I(iloc))
+ +        {
+ +            timings->nb_c++;
+ +            timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
+ +        }
+ +
+ +        /* kernel timings */
+ +        timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
+ +            cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
+ +
+ +        /* X/q H2D and F D2H timings */
+ +        timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
+ +                                                 timers->stop_nb_h2d[iloc]);
+ +        timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
+ +                                                 timers->stop_nb_d2h[iloc]);
+ +
+ +        /* only count atdat and pair-list H2D at pair-search step */
+ +        if (plist->bDoPrune)
+ +        {
+ +            /* atdat transfer timing (add only once, at local F wait) */
+ +            if (LOCAL_A(aloc))
+ +            {
+ +                timings->pl_h2d_c++;
+ +                timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
+ +                                                         timers->stop_atdat);
+ +            }
+ +
+ +            timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
+ +                                                     timers->stop_pl_h2d[iloc]);
+ +        }
+ +    }
+ +
+ +    /* add up energies and shift forces (only once at local F wait) */
+ +    if (LOCAL_I(iloc))
+ +    {
+ +        if (bCalcEner)
+ +        {
+ +            *e_lj += *nbst.e_lj;
+ +            *e_el += *nbst.e_el;
+ +        }
+ +
+ +        if (bCalcFshift)
+ +        {
+ +            for (i = 0; i < SHIFTS; i++)
+ +            {
+ +                fshift[i][0] += nbst.fshift[i].x;
+ +                fshift[i][1] += nbst.fshift[i].y;
+ +                fshift[i][2] += nbst.fshift[i].z;
+ +            }
+ +        }
+ +    }
+ +
+ +    /* turn off pruning (doesn't matter if this is pair-search step or not) */
+ +    plist->bDoPrune = false;
+ +}
+ +
+ +/*! Return the reference to the nbfp texture. */
+ +const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
+ +{
+ +    return nbfp_texref;
+ +}
+ +
+ +/*! Return the reference to the coulomb_tab. */
+ +const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
+ +{
+ +    return coulomb_tab_texref;
+ +}
+ +
+ +/*! Set up the cache configuration for the non-bonded kernels,
+ + */
+ +void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
+ +{
+ +    cudaError_t stat;
+ +
+ +    for (int i = 0; i < eelCuNR; i++)
+ +    {
+ +        for (int j = 0; j < nEnergyKernelTypes; j++)
+ +        {
+ +            for (int k = 0; k < nPruneKernelTypes; k++)
+ +            {
+ +                /* Legacy kernel 16/48 kB Shared/L1
+ +                 * No analytical Ewald!
+ +                 */
+ +                if (i != eelCuEWALD_ANA && i != eelCuEWALD_ANA_TWIN)
+ +                {
+ +                    stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
+ +                    CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+ +                }
+ +
+ +                if (devinfo->prop.major >= 3)
+ +                {
+ +                    /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
+ +                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
+ +                }
+ +                else
+ +                {
+ +                    /* On Fermi prefer L1 gives 2% higher performance */
+ +                    /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
+ +                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
+ +                }
+ +                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+ +            }
+ +        }
+ +    }
+ +}
diff --cc src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h

index 812e2e06f8db40692ad8323d7ce5507abc95043c,0000000000000000000000000000000000000000..2a46f0381eed4fe36e831b1a596f83868841e7b8

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
@@@ -1,211 -1,0 +1,204 @@@
- /* Set the stride for the lookup of the two LJ parameters from their
-  * (padded) array.
-  * Note that currently only arrays with stride 2 and 4 are available.
-  * Since the reference code does not require alignment, we can always use 2.
-  */
- static const int nbfp_stride = 2;
- 
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS Development Team
+ + * Copyright (c) 2012, by the GROMACS development team, led by
+ + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ + * others, as listed in the AUTHORS file in the top-level source
+ + * directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +#ifndef _nbnxn_kernel_simd_utils_h_
+ +#define _nbnxn_kernel_simd_utils_h_
+ +
+ +/*! \brief Provides hardware-specific utility routines for the SIMD kernels.
+ + *
+ + * Defines all functions, typedefs, constants and macros that have
+ + * explicit dependencies on the j-cluster size, precision, or SIMD
+ + * width. This includes handling diagonal, Newton and topology
+ + * exclusions.
+ + *
+ + * The functionality which depends on the j-cluster size is:
+ + *   LJ-parameter lookup
+ + *   force table lookup
+ + *   energy group pair energy storage
+ + */
+ +
+ +#if !defined GMX_NBNXN_SIMD_2XNN && !defined GMX_NBNXN_SIMD_4XN
+ +#error "Must define an NBNxN kernel flavour before including NBNxN kernel utility functions"
+ +#endif
+ +
+ +#ifdef GMX_SIMD_REFERENCE_PLAIN_C
+ +
+ +/* Align a stack-based thread-local working array. */
+ +static gmx_inline int *
+ +prepare_table_load_buffer(const int *array)
+ +{
+ +    return NULL;
+ +}
+ +
+ +#include "nbnxn_kernel_simd_utils_ref.h"
+ +
+ +#else /* GMX_SIMD_REFERENCE_PLAIN_C */
+ +
+ +#ifdef GMX_X86_SSE2
+ +/* Include x86 SSE2 compatible SIMD functions */
+ +
+ +/* Set the stride for the lookup of the two LJ parameters from their
+ + * (padded) array. We use the minimum supported SIMD memory alignment.
+ + */
+ +#if defined GMX_DOUBLE
+ +static const int nbfp_stride = 2;
+ +#else
+ +static const int nbfp_stride = 4;
+ +#endif
+ +
+ +/* Align a stack-based thread-local working array. Table loads on
+ + * full-width AVX_256 use the array, but other implementations do
+ + * not. */
+ +static gmx_inline int *
+ +prepare_table_load_buffer(const int *array)
+ +{
+ +#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+ +    return gmx_simd_align_int(array);
+ +#else
+ +    return NULL;
+ +#endif
+ +}
+ +
+ +#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+ +
+ +/* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
+ +#if GMX_SIMD_WIDTH_HERE == 8
+ +#define TAB_FDV0
+ +#endif
+ +
+ +/*
+ +Berk, 2xnn.c had the following code, but I think it is safe to remove now, given the code immediately above.
+ +
+ +#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
+ +/ * AVX-256 single precision 2x(4+4) kernel,
+ + * we can do half SIMD-width aligned FDV0 table loads.
+ + * /
+ +#define TAB_FDV0
+ +#endif
+ +*/
+ +
+ +#ifdef GMX_DOUBLE
+ +#include "nbnxn_kernel_simd_utils_x86_256d.h"
+ +#else  /* GMX_DOUBLE */
+ +#include "nbnxn_kernel_simd_utils_x86_256s.h"
+ +#endif /* GMX_DOUBLE */
+ +
+ +#else  /* defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
+ +
+ +/* We use the FDV0 table layout when we can use aligned table loads */
+ +#if GMX_SIMD_WIDTH_HERE == 4
+ +#define TAB_FDV0
+ +#endif
+ +
+ +#ifdef GMX_DOUBLE
+ +#include "nbnxn_kernel_simd_utils_x86_128d.h"
+ +#else  /* GMX_DOUBLE */
+ +#include "nbnxn_kernel_simd_utils_x86_128s.h"
+ +#endif /* GMX_DOUBLE */
+ +
+ +#endif /* defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
+ +
+ +#else  /* GMX_X86_SSE2 */
+ +
+ +#if GMX_SIMD_WIDTH_HERE > 4
+ +static const int nbfp_stride = 4;
+ +#else
+ +static const int nbfp_stride = GMX_SIMD_WIDTH_HERE;
+ +#endif
+ +
+ +/* We use the FDV0 table layout when we can use aligned table loads */
+ +#if GMX_SIMD_WIDTH_HERE == 4
+ +#define TAB_FDV0
+ +#endif
+ +
+ +#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+ +#include "nbnxn_kernel_simd_utils_ibm_qpx.h"
+ +#endif /* GMX_CPU_ACCELERATION_IBM_QPX */
+ +
+ +#endif /* GMX_X86_SSE2 */
+ +#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
+ +
+ +
+ +#ifdef UNROLLJ
+ +/* Add energy register to possibly multiple terms in the energy array */
+ +static inline void add_ener_grp(gmx_mm_pr e_S, real *v, const int *offset_jj)
+ +{
+ +    int jj;
+ +
+ +    /* We need to balance the number of store operations with
+ +     * the rapidly increases number of combinations of energy groups.
+ +     * We add to a temporary buffer for 1 i-group vs 2 j-groups.
+ +     */
+ +    for (jj = 0; jj < (UNROLLJ/2); jj++)
+ +    {
+ +        gmx_mm_pr v_S;
+ +
+ +        v_S = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
+ +        gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE, gmx_add_pr(v_S, e_S));
+ +    }
+ +}
+ +#endif
+ +
+ +#if defined GMX_NBNXN_SIMD_2XNN && defined UNROLLJ
+ +/* As add_ener_grp, but for two groups of UNROLLJ/2 stored in
+ + * a single SIMD register.
+ + */
+ +static inline void
+ +add_ener_grp_halves(gmx_mm_pr e_S, real *v0, real *v1, const int *offset_jj)
+ +{
+ +    gmx_mm_hpr e_S0, e_S1;
+ +    int        jj;
+ +
+ +    gmx_pr_to_2hpr(e_S, &e_S0, &e_S1);
+ +
+ +    for (jj = 0; jj < (UNROLLJ/2); jj++)
+ +    {
+ +        gmx_mm_hpr v_S;
+ +
+ +        gmx_load_hpr(&v_S, v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ +        gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S0));
+ +    }
+ +    for (jj = 0; jj < (UNROLLJ/2); jj++)
+ +    {
+ +        gmx_mm_hpr v_S;
+ +
+ +        gmx_load_hpr(&v_S, v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ +        gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S1));
+ +    }
+ +}
+ +#endif
+ +
+ +#endif /* _nbnxn_kernel_simd_utils_h_ */
diff --cc src/programs/mdrun/repl_ex.c

index 8501a9667696ab0b586cbe48ff48191614ed5dba,0000000000000000000000000000000000000000..2e26c77c369806039e5c65730ec7a8e79a5241d0

mode 100644,000000..100644
--- 1/src/programs/mdrun/repl_ex.c
--- /dev/null
+++ b/src/programs/mdrun/repl_ex.c
@@@ -1,1438 -1,0 +1,1442 @@@
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <math.h>
+ +#include "repl_ex.h"
+ +#include "network.h"
+ +#include "random.h"
+ +#include "smalloc.h"
+ +#include "physics.h"
+ +#include "copyrite.h"
+ +#include "macros.h"
+ +#include "vec.h"
+ +#include "names.h"
+ +#include "mvdata.h"
+ +#include "domdec.h"
+ +#include "partdec.h"
+ +
+ +#define PROBABILITYCUTOFF 100
+ +/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */
+ +
+ +enum {
+ +    ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR
+ +};
+ +const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"};
+ +/* end_single_marker merely notes the end of single variable replica exchange. All types higher than
+ +   it are multiple replica exchange methods */
+ +/* Eventually, should add 'pressure', 'temperature and pressure', 'lambda_and_pressure', 'temperature_lambda_pressure'?;
+ +   Let's wait until we feel better about the pressure control methods giving exact ensembles.  Right now, we assume constant pressure  */
+ +
+ +typedef struct gmx_repl_ex
+ +{
+ +    int      repl;
+ +    int      nrepl;
+ +    real     temp;
+ +    int      type;
+ +    real   **q;
+ +    gmx_bool bNPT;
+ +    real    *pres;
+ +    int     *ind;
+ +    int     *allswaps;
+ +    int      nst;
+ +    int      nex;
+ +    int      seed;
+ +    int      nattempt[2];
+ +    real    *prob_sum;
+ +    int    **nmoves;
+ +    int     *nexchange;
+ +
+ +    /* these are helper arrays for replica exchange; allocated here so they
+ +       don't have to be allocated each time */
+ +    int      *destinations;
+ +    int     **cyclic;
+ +    int     **order;
+ +    int      *tmpswap;
+ +    gmx_bool *incycle;
+ +    gmx_bool *bEx;
+ +
+ +    /* helper arrays to hold the quantities that are exchanged */
+ +    real  *prob;
+ +    real  *Epot;
+ +    real  *beta;
+ +    real  *Vol;
+ +    real **de;
+ +
+ +} t_gmx_repl_ex;
+ +
+ +static gmx_bool repl_quantity(const gmx_multisim_t *ms,
+ +                              struct gmx_repl_ex *re, int ere, real q)
+ +{
+ +    real    *qall;
+ +    gmx_bool bDiff;
+ +    int      i, s;
+ +
+ +    snew(qall, ms->nsim);
+ +    qall[re->repl] = q;
+ +    gmx_sum_sim(ms->nsim, qall, ms);
+ +
+ +    bDiff = FALSE;
+ +    for (s = 1; s < ms->nsim; s++)
+ +    {
+ +        if (qall[s] != qall[0])
+ +        {
+ +            bDiff = TRUE;
+ +        }
+ +    }
+ +
+ +    if (bDiff)
+ +    {
+ +        /* Set the replica exchange type and quantities */
+ +        re->type = ere;
+ +
+ +        snew(re->q[ere], re->nrepl);
+ +        for (s = 0; s < ms->nsim; s++)
+ +        {
+ +            re->q[ere][s] = qall[s];
+ +        }
+ +    }
+ +    sfree(qall);
+ +    return bDiff;
+ +}
+ +
+ +gmx_repl_ex_t init_replica_exchange(FILE *fplog,
+ +                                    const gmx_multisim_t *ms,
+ +                                    const t_state *state,
+ +                                    const t_inputrec *ir,
+ +                                    int nst, int nex, int init_seed)
+ +{
+ +    real                temp, pres;
+ +    int                 i, j, k;
+ +    struct gmx_repl_ex *re;
+ +    gmx_bool            bTemp;
+ +    gmx_bool            bLambda = FALSE;
+ +
+ +    fprintf(fplog, "\nInitializing Replica Exchange\n");
+ +
+ +    if (ms == NULL || ms->nsim == 1)
+ +    {
+ +        gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multi option of mdrun?");
+ +    }
+ +
+ +    snew(re, 1);
+ +
+ +    re->repl     = ms->sim;
+ +    re->nrepl    = ms->nsim;
+ +    snew(re->q, ereENDSINGLE);
+ +
+ +    fprintf(fplog, "Repl  There are %d replicas:\n", re->nrepl);
+ +
+ +    check_multi_int(fplog, ms, state->natoms, "the number of atoms", FALSE);
+ +    check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE);
+ +    check_multi_large_int(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE);
+ +    check_multi_large_int(fplog, ms, (ir->init_step+nst-1)/nst,
+ +                          "first exchange step: init_step/-replex", FALSE);
+ +    check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE);
+ +    check_multi_int(fplog, ms, ir->opts.ngtc,
+ +                    "the number of temperature coupling groups", FALSE);
+ +    check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE);
+ +    check_multi_int(fplog, ms, ir->efep, "free energy", FALSE);
+ +    check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE);
+ +
+ +    re->temp = ir->opts.ref_t[0];
+ +    for (i = 1; (i < ir->opts.ngtc); i++)
+ +    {
+ +        if (ir->opts.ref_t[i] != re->temp)
+ +        {
+ +            fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+ +            fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n");
+ +        }
+ +    }
+ +
+ +    re->type = -1;
+ +    bTemp    = repl_quantity(ms, re, ereTEMP, re->temp);
+ +    if (ir->efep != efepNO)
+ +    {
+ +        bLambda = repl_quantity(ms, re, ereLAMBDA, (real)ir->fepvals->init_fep_state);
+ +    }
+ +    if (re->type == -1)  /* nothing was assigned */
+ +    {
+ +        gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl);
+ +    }
+ +    if (bLambda && bTemp)
+ +    {
+ +        re->type = ereTL;
+ +    }
+ +
+ +    if (bTemp)
+ +    {
+ +        please_cite(fplog, "Sugita1999a");
+ +        if (ir->epc != epcNO)
+ +        {
+ +            re->bNPT = TRUE;
+ +            fprintf(fplog, "Repl  Using Constant Pressure REMD.\n");
+ +            please_cite(fplog, "Okabe2001a");
+ +        }
+ +        if (ir->etc == etcBERENDSEN)
+ +        {
+ +            gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead",
+ +                      ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
+ +        }
+ +    }
+ +    if (bLambda)
+ +    {
+ +        if (ir->fepvals->delta_lambda != 0)   /* check this? */
+ +        {
+ +            gmx_fatal(FARGS, "delta_lambda is not zero");
+ +        }
+ +    }
+ +    if (re->bNPT)
+ +    {
+ +        snew(re->pres, re->nrepl);
+ +        if (ir->epct == epctSURFACETENSION)
+ +        {
+ +            pres = ir->ref_p[ZZ][ZZ];
+ +        }
+ +        else
+ +        {
+ +            pres = 0;
+ +            j    = 0;
+ +            for (i = 0; i < DIM; i++)
+ +            {
+ +                if (ir->compress[i][i] != 0)
+ +                {
+ +                    pres += ir->ref_p[i][i];
+ +                    j++;
+ +                }
+ +            }
+ +            pres /= j;
+ +        }
+ +        re->pres[re->repl] = pres;
+ +        gmx_sum_sim(re->nrepl, re->pres, ms);
+ +    }
+ +
+ +    /* Make an index for increasing replica order */
+ +    /* only makes sense if one or the other is varying, not both!
+ +       if both are varying, we trust the order the person gave. */
+ +    snew(re->ind, re->nrepl);
+ +    for (i = 0; i < re->nrepl; i++)
+ +    {
+ +        re->ind[i] = i;
+ +    }
+ +
+ +    if (re->type < ereENDSINGLE)
+ +    {
+ +
+ +        for (i = 0; i < re->nrepl; i++)
+ +        {
+ +            for (j = i+1; j < re->nrepl; j++)
+ +            {
+ +                if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]])
+ +                {
+ +                    k          = re->ind[i];
+ +                    re->ind[i] = re->ind[j];
+ +                    re->ind[j] = k;
+ +                }
+ +                else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]])
+ +                {
+ +                    gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]);
+ +                }
+ +            }
+ +        }
+ +    }
+ +
+ +    /* keep track of all the swaps, starting with the initial placement. */
+ +    snew(re->allswaps, re->nrepl);
+ +    for (i = 0; i < re->nrepl; i++)
+ +    {
+ +        re->allswaps[i] = re->ind[i];
+ +    }
+ +
+ +    switch (re->type)
+ +    {
+ +        case ereTEMP:
+ +            fprintf(fplog, "\nReplica exchange in temperature\n");
+ +            for (i = 0; i < re->nrepl; i++)
+ +            {
+ +                fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]);
+ +            }
+ +            fprintf(fplog, "\n");
+ +            break;
+ +        case ereLAMBDA:
+ +            fprintf(fplog, "\nReplica exchange in lambda\n");
+ +            for (i = 0; i < re->nrepl; i++)
+ +            {
+ +                fprintf(fplog, " %3d", (int)re->q[re->type][re->ind[i]]);
+ +            }
+ +            fprintf(fplog, "\n");
+ +            break;
+ +        case ereTL:
+ +            fprintf(fplog, "\nReplica exchange in temperature and lambda state\n");
+ +            for (i = 0; i < re->nrepl; i++)
+ +            {
+ +                fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]);
+ +            }
+ +            fprintf(fplog, "\n");
+ +            for (i = 0; i < re->nrepl; i++)
+ +            {
+ +                fprintf(fplog, " %5d", (int)re->q[ereLAMBDA][re->ind[i]]);
+ +            }
+ +            fprintf(fplog, "\n");
+ +            break;
+ +        default:
+ +            gmx_incons("Unknown replica exchange quantity");
+ +    }
+ +    if (re->bNPT)
+ +    {
+ +        fprintf(fplog, "\nRepl  p");
+ +        for (i = 0; i < re->nrepl; i++)
+ +        {
+ +            fprintf(fplog, " %5.2f", re->pres[re->ind[i]]);
+ +        }
+ +
+ +        for (i = 0; i < re->nrepl; i++)
+ +        {
+ +            if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]]))
+ +            {
+ +                fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+ +                fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n");
+ +            }
+ +        }
+ +    }
+ +    re->nst = nst;
+ +    if (init_seed == -1)
+ +    {
+ +        if (MASTERSIM(ms))
+ +        {
+ +            re->seed = make_seed();
+ +        }
+ +        else
+ +        {
+ +            re->seed = 0;
+ +        }
+ +        gmx_sumi_sim(1, &(re->seed), ms);
+ +    }
+ +    else
+ +    {
+ +        re->seed = init_seed;
+ +    }
+ +    fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst);
+ +    fprintf(fplog, "\nReplica random seed: %d\n", re->seed);
+ +
+ +    re->nattempt[0] = 0;
+ +    re->nattempt[1] = 0;
+ +
+ +    snew(re->prob_sum, re->nrepl);
+ +    snew(re->nexchange, re->nrepl);
+ +    snew(re->nmoves, re->nrepl);
+ +    for (i = 0; i < re->nrepl; i++)
+ +    {
+ +        snew(re->nmoves[i], re->nrepl);
+ +    }
+ +    fprintf(fplog, "Replica exchange information below: x=exchange, pr=probability\n");
+ +
+ +    /* generate space for the helper functions so we don't have to snew each time */
+ +
+ +    snew(re->destinations, re->nrepl);
+ +    snew(re->incycle, re->nrepl);
+ +    snew(re->tmpswap, re->nrepl);
+ +    snew(re->cyclic, re->nrepl);
+ +    snew(re->order, re->nrepl);
+ +    for (i = 0; i < re->nrepl; i++)
+ +    {
+ +        snew(re->cyclic[i], re->nrepl);
+ +        snew(re->order[i], re->nrepl);
+ +    }
+ +    /* allocate space for the functions storing the data for the replicas */
+ +    /* not all of these arrays needed in all cases, but they don't take
+ +       up much space, since the max size is nrepl**2 */
+ +    snew(re->prob, re->nrepl);
+ +    snew(re->bEx, re->nrepl);
+ +    snew(re->beta, re->nrepl);
+ +    snew(re->Vol, re->nrepl);
+ +    snew(re->Epot, re->nrepl);
+ +    snew(re->de, re->nrepl);
+ +    for (i = 0; i < re->nrepl; i++)
+ +    {
+ +        snew(re->de[i], re->nrepl);
+ +    }
+ +    re->nex = nex;
+ +    return re;
+ +}
+ +
+ +static void exchange_reals(const gmx_multisim_t *ms, int b, real *v, int n)
+ +{
+ +    real *buf;
+ +    int   i;
+ +
+ +    if (v)
+ +    {
+ +        snew(buf, n);
+ +#ifdef GMX_MPI
+ +        /*
+ +           MPI_Sendrecv(v,  n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
+ +           buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0,
+ +           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+ +         */
+ +        {
+ +            MPI_Request mpi_req;
+ +
+ +            MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+ +                      ms->mpi_comm_masters, &mpi_req);
+ +            MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0,
+ +                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+ +            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+ +        }
+ +#endif
+ +        for (i = 0; i < n; i++)
+ +        {
+ +            v[i] = buf[i];
+ +        }
+ +        sfree(buf);
+ +    }
+ +}
+ +
+ +
+ +static void exchange_ints(const gmx_multisim_t *ms, int b, int *v, int n)
+ +{
+ +    int *buf;
+ +    int  i;
+ +
+ +    if (v)
+ +    {
+ +        snew(buf, n);
+ +#ifdef GMX_MPI
+ +        /*
+ +           MPI_Sendrecv(v,  n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0,
+ +             buf,n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0,
+ +             ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+ +         */
+ +        {
+ +            MPI_Request mpi_req;
+ +
+ +            MPI_Isend(v, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0,
+ +                      ms->mpi_comm_masters, &mpi_req);
+ +            MPI_Recv(buf, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0,
+ +                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+ +            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+ +        }
+ +#endif
+ +        for (i = 0; i < n; i++)
+ +        {
+ +            v[i] = buf[i];
+ +        }
+ +        sfree(buf);
+ +    }
+ +}
+ +
+ +static void exchange_doubles(const gmx_multisim_t *ms, int b, double *v, int n)
+ +{
+ +    double *buf;
+ +    int     i;
+ +
+ +    if (v)
+ +    {
+ +        snew(buf, n);
+ +#ifdef GMX_MPI
+ +        /*
+ +           MPI_Sendrecv(v,  n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
+ +           buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0,
+ +           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+ +         */
+ +        {
+ +            MPI_Request mpi_req;
+ +
+ +            MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+ +                      ms->mpi_comm_masters, &mpi_req);
+ +            MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0,
+ +                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+ +            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+ +        }
+ +#endif
+ +        for (i = 0; i < n; i++)
+ +        {
+ +            v[i] = buf[i];
+ +        }
+ +        sfree(buf);
+ +    }
+ +}
+ +
+ +static void exchange_rvecs(const gmx_multisim_t *ms, int b, rvec *v, int n)
+ +{
+ +    rvec *buf;
+ +    int   i;
+ +
+ +    if (v)
+ +    {
+ +        snew(buf, n);
+ +#ifdef GMX_MPI
+ +        /*
+ +           MPI_Sendrecv(v[0],  n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
+ +           buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0,
+ +           ms->mpi_comm_masters,MPI_STATUS_IGNORE);
+ +         */
+ +        {
+ +            MPI_Request mpi_req;
+ +
+ +            MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+ +                      ms->mpi_comm_masters, &mpi_req);
+ +            MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0,
+ +                     ms->mpi_comm_masters, MPI_STATUS_IGNORE);
+ +            MPI_Wait(&mpi_req, MPI_STATUS_IGNORE);
+ +        }
+ +#endif
+ +        for (i = 0; i < n; i++)
+ +        {
+ +            copy_rvec(buf[i], v[i]);
+ +        }
+ +        sfree(buf);
+ +    }
+ +}
+ +
+ +static void exchange_state(const gmx_multisim_t *ms, int b, t_state *state)
+ +{
+ +    /* When t_state changes, this code should be updated. */
+ +    int ngtc, nnhpres;
+ +    ngtc    = state->ngtc * state->nhchainlength;
+ +    nnhpres = state->nnhpres* state->nhchainlength;
+ +    exchange_rvecs(ms, b, state->box, DIM);
+ +    exchange_rvecs(ms, b, state->box_rel, DIM);
+ +    exchange_rvecs(ms, b, state->boxv, DIM);
+ +    exchange_reals(ms, b, &(state->veta), 1);
+ +    exchange_reals(ms, b, &(state->vol0), 1);
+ +    exchange_rvecs(ms, b, state->svir_prev, DIM);
+ +    exchange_rvecs(ms, b, state->fvir_prev, DIM);
+ +    exchange_rvecs(ms, b, state->pres_prev, DIM);
+ +    exchange_doubles(ms, b, state->nosehoover_xi, ngtc);
+ +    exchange_doubles(ms, b, state->nosehoover_vxi, ngtc);
+ +    exchange_doubles(ms, b, state->nhpres_xi, nnhpres);
+ +    exchange_doubles(ms, b, state->nhpres_vxi, nnhpres);
+ +    exchange_doubles(ms, b, state->therm_integral, state->ngtc);
+ +    exchange_rvecs(ms, b, state->x, state->natoms);
+ +    exchange_rvecs(ms, b, state->v, state->natoms);
+ +    exchange_rvecs(ms, b, state->sd_X, state->natoms);
+ +}
+ +
+ +static void copy_rvecs(rvec *s, rvec *d, int n)
+ +{
+ +    int i;
+ +
+ +    if (d != NULL)
+ +    {
+ +        for (i = 0; i < n; i++)
+ +        {
+ +            copy_rvec(s[i], d[i]);
+ +        }
+ +    }
+ +}
+ +
+ +static void copy_doubles(const double *s, double *d, int n)
+ +{
+ +    int i;
+ +
+ +    if (d != NULL)
+ +    {
+ +        for (i = 0; i < n; i++)
+ +        {
+ +            d[i] = s[i];
+ +        }
+ +    }
+ +}
+ +
+ +static void copy_reals(const real *s, real *d, int n)
+ +{
+ +    int i;
+ +
+ +    if (d != NULL)
+ +    {
+ +        for (i = 0; i < n; i++)
+ +        {
+ +            d[i] = s[i];
+ +        }
+ +    }
+ +}
+ +
+ +static void copy_ints(const int *s, int *d, int n)
+ +{
+ +    int i;
+ +
+ +    if (d != NULL)
+ +    {
+ +        for (i = 0; i < n; i++)
+ +        {
+ +            d[i] = s[i];
+ +        }
+ +    }
+ +}
+ +
+ +#define scopy_rvecs(v, n)   copy_rvecs(state->v, state_local->v, n);
+ +#define scopy_doubles(v, n) copy_doubles(state->v, state_local->v, n);
+ +#define scopy_reals(v, n) copy_reals(state->v, state_local->v, n);
+ +#define scopy_ints(v, n)   copy_ints(state->v, state_local->v, n);
+ +
+ +static void copy_state_nonatomdata(t_state *state, t_state *state_local)
+ +{
+ +    /* When t_state changes, this code should be updated. */
+ +    int ngtc, nnhpres;
+ +    ngtc    = state->ngtc * state->nhchainlength;
+ +    nnhpres = state->nnhpres* state->nhchainlength;
+ +    scopy_rvecs(box, DIM);
+ +    scopy_rvecs(box_rel, DIM);
+ +    scopy_rvecs(boxv, DIM);
+ +    state_local->veta = state->veta;
+ +    state_local->vol0 = state->vol0;
+ +    scopy_rvecs(svir_prev, DIM);
+ +    scopy_rvecs(fvir_prev, DIM);
+ +    scopy_rvecs(pres_prev, DIM);
+ +    scopy_doubles(nosehoover_xi, ngtc);
+ +    scopy_doubles(nosehoover_vxi, ngtc);
+ +    scopy_doubles(nhpres_xi, nnhpres);
+ +    scopy_doubles(nhpres_vxi, nnhpres);
+ +    scopy_doubles(therm_integral, state->ngtc);
+ +    scopy_rvecs(x, state->natoms);
+ +    scopy_rvecs(v, state->natoms);
+ +    scopy_rvecs(sd_X, state->natoms);
+ +    copy_ints(&(state->fep_state), &(state_local->fep_state), 1);
+ +    scopy_reals(lambda, efptNR);
+ +}
+ +
+ +static void scale_velocities(t_state *state, real fac)
+ +{
+ +    int i;
+ +
+ +    if (state->v)
+ +    {
+ +        for (i = 0; i < state->natoms; i++)
+ +        {
+ +            svmul(fac, state->v[i], state->v[i]);
+ +        }
+ +    }
+ +}
+ +
+ +static void pd_collect_state(const t_commrec *cr, t_state *state)
+ +{
+ +    int shift;
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Collecting state before exchange\n");
+ +    }
+ +    shift = cr->nnodes - cr->npmenodes - 1;
+ +    move_rvecs(cr, FALSE, FALSE, state->x, NULL, shift, NULL);
+ +    if (state->v)
+ +    {
+ +        move_rvecs(cr, FALSE, FALSE, state->v, NULL, shift, NULL);
+ +    }
+ +    if (state->sd_X)
+ +    {
+ +        move_rvecs(cr, FALSE, FALSE, state->sd_X, NULL, shift, NULL);
+ +    }
+ +}
+ +
+ +static void print_transition_matrix(FILE *fplog, int n, int **nmoves, int *nattempt)
+ +{
+ +    int   i, j, ntot;
+ +    float Tprint;
+ +
+ +    ntot = nattempt[0] + nattempt[1];
+ +    fprintf(fplog, "\n");
+ +    fprintf(fplog, "Repl");
+ +    for (i = 0; i < n; i++)
+ +    {
+ +        fprintf(fplog, "    ");  /* put the title closer to the center */
+ +    }
+ +    fprintf(fplog, "Empirical Transition Matrix\n");
+ +
+ +    fprintf(fplog, "Repl");
+ +    for (i = 0; i < n; i++)
+ +    {
+ +        fprintf(fplog, "%8d", (i+1));
+ +    }
+ +    fprintf(fplog, "\n");
+ +
+ +    for (i = 0; i < n; i++)
+ +    {
+ +        fprintf(fplog, "Repl");
+ +        for (j = 0; j < n; j++)
+ +        {
+ +            Tprint = 0.0;
+ +            if (nmoves[i][j] > 0)
+ +            {
+ +                Tprint = nmoves[i][j]/(2.0*ntot);
+ +            }
+ +            fprintf(fplog, "%8.4f", Tprint);
+ +        }
+ +        fprintf(fplog, "%3d\n", i);
+ +    }
+ +}
+ +
+ +static void print_ind(FILE *fplog, const char *leg, int n, int *ind, gmx_bool *bEx)
+ +{
+ +    int i;
+ +
+ +    fprintf(fplog, "Repl %2s %2d", leg, ind[0]);
+ +    for (i = 1; i < n; i++)
+ +    {
+ +        fprintf(fplog, " %c %2d", (bEx != 0 && bEx[i]) ? 'x' : ' ', ind[i]);
+ +    }
+ +    fprintf(fplog, "\n");
+ +}
+ +
+ +static void print_allswitchind(FILE *fplog, int n, int *pind, int *allswaps, int *tmpswap)
+ +{
+ +    int i;
+ +
+ +    for (i = 0; i < n; i++)
+ +    {
+ +        tmpswap[i] = allswaps[i];
+ +    }
+ +    for (i = 0; i < n; i++)
+ +    {
+ +        allswaps[i] = tmpswap[pind[i]];
+ +    }
+ +
+ +    fprintf(fplog, "\nAccepted Exchanges:   ");
+ +    for (i = 0; i < n; i++)
+ +    {
+ +        fprintf(fplog, "%d ", pind[i]);
+ +    }
+ +    fprintf(fplog, "\n");
+ +
+ +    /* the "Order After Exchange" is the state label corresponding to the configuration that
+ +       started in state listed in order, i.e.
+ +
+ +       3 0 1 2
+ +
+ +       means that the:
+ +       configuration starting in simulation 3 is now in simulation 0,
+ +       configuration starting in simulation 0 is now in simulation 1,
+ +       configuration starting in simulation 1 is now in simulation 2,
+ +       configuration starting in simulation 2 is now in simulation 3
+ +     */
+ +    fprintf(fplog, "Order After Exchange: ");
+ +    for (i = 0; i < n; i++)
+ +    {
+ +        fprintf(fplog, "%d ", allswaps[i]);
+ +    }
+ +    fprintf(fplog, "\n\n");
+ +}
+ +
+ +static void print_prob(FILE *fplog, const char *leg, int n, real *prob)
+ +{
+ +    int  i;
+ +    char buf[8];
+ +
+ +    fprintf(fplog, "Repl %2s ", leg);
+ +    for (i = 1; i < n; i++)
+ +    {
+ +        if (prob[i] >= 0)
+ +        {
+ +            sprintf(buf, "%4.2f", prob[i]);
+ +            fprintf(fplog, "  %3s", buf[0] == '1' ? "1.0" : buf+1);
+ +        }
+ +        else
+ +        {
+ +            fprintf(fplog, "     ");
+ +        }
+ +    }
+ +    fprintf(fplog, "\n");
+ +}
+ +
+ +static void print_count(FILE *fplog, const char *leg, int n, int *count)
+ +{
+ +    int i;
+ +
+ +    fprintf(fplog, "Repl %2s ", leg);
+ +    for (i = 1; i < n; i++)
+ +    {
+ +        fprintf(fplog, " %4d", count[i]);
+ +    }
+ +    fprintf(fplog, "\n");
+ +}
+ +
+ +static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp)
+ +{
+ +
+ +    real   ediff, dpV, delta = 0;
+ +    real  *Epot = re->Epot;
+ +    real  *Vol  = re->Vol;
+ +    real **de   = re->de;
+ +    real  *beta = re->beta;
+ +
+ +    /* Two cases; we are permuted and not.  In all cases, setting ap = a and bp = b will reduce
+ +       to the non permuted case */
+ +
+ +    switch (re->type)
+ +    {
+ +        case ereTEMP:
+ +            /*
+ +             * Okabe et. al. Chem. Phys. Lett. 335 (2001) 435-439
+ +             */
+ +            ediff = Epot[b] - Epot[a];
+ +            delta = -(beta[bp] - beta[ap])*ediff;
+ +            break;
+ +        case ereLAMBDA:
+ +            /* two cases:  when we are permuted, and not.  */
+ +            /* non-permuted:
+ +               ediff =  E_new - E_old
+ +                     =  [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)]
+ +                     =  [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)]
+ +                     =  de[b][a] + de[a][b] */
+ +
+ +            /* permuted:
+ +               ediff =  E_new - E_old
+ +                     =  [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)]
+ +                     =  [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)]
+ +                     =  [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)]
+ +                     =  [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)]
+ +                     =  (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b])    */
+ +            /* but, in the current code implementation, we flip configurations, not indices . . .
+ +               So let's examine that.
+ +                     =  [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)]
+ +                     =  [H_b(x_ap) - H_a(x_ap)]  + [H_a(x_bp) - H_b(x_pb)]
+ +                     = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp]
+ +                     So, if we exchange b<=> bp and a<=> ap, we return to the same result.
+ +                     So the simple solution is to flip the
+ +                     position of perturbed and original indices in the tests.
+ +             */
+ +
+ +            ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]);
+ +            delta = ediff*beta[a]; /* assume all same temperature in this case */
+ +            break;
+ +        case ereTL:
+ +            /* not permuted:  */
+ +            /* delta =  reduced E_new - reduced E_old
+ +                     =  [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)]
+ +                     =  [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)]
+ +                     =  [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] +
+ +                        [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)]
+ +                     =  [beta_b dH_b(x_a) + [beta_a dH_a(x_b) +
+ +                        beta_b (H_a(x_a) - H_b(x_b)]) - beta_a (H_a(x_a) - H_b(x_b))
+ +                     =  beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a) */
+ +            /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]; */
+ +            /* permuted (big breath!) */
+ +            /*   delta =  reduced E_new - reduced E_old
+ +                     =  [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)]
+ +                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+ +                     =  [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)]
+ +                        - beta_pb H_a(x_a) + beta_ap H_a(x_a) + beta_pb H_a(x_a) - beta_ap H_a(x_a)
+ +                        - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+ +                     =  [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] +
+ +                        [(beta_ap H_ap(x_b)  - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))]
+ +             + beta_pb H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b)
+ +                     =  [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] +
+ +                        [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))]
+ +             + beta_pb (H_a(x_a) - H_b(x_b))  - beta_ap (H_a(x_a) - H_b(x_b))
+ +                     =  ([beta_bp de[bp][a] - beta_ap de[ap][a]) + beta_ap de[ap][b]  - beta_bp de[bp][b])
+ +             + (beta_pb-beta_ap)(H_a(x_a) - H_b(x_b))  */
+ +            delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]);
+ +            break;
+ +        default:
+ +            gmx_incons("Unknown replica exchange quantity");
+ +    }
+ +    if (bPrint)
+ +    {
+ +        fprintf(fplog, "Repl %d <-> %d  dE_term = %10.3e (kT)\n", a, b, delta);
+ +    }
+ +    if (re->bNPT)
+ +    {
+ +        /* revist the calculation for 5.0.  Might be some improvements. */
+ +        dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC;
+ +        if (bPrint)
+ +        {
+ +            fprintf(fplog, "  dpV = %10.3e  d = %10.3e\nb", dpV, delta + dpV);
+ +        }
+ +        delta += dpV;
+ +    }
+ +    return delta;
+ +}
+ +
+ +static void
+ +test_for_replica_exchange(FILE                 *fplog,
+ +                          const gmx_multisim_t *ms,
+ +                          struct gmx_repl_ex   *re,
+ +                          gmx_enerdata_t       *enerd,
+ +                          real                  vol,
+ +                          gmx_large_int_t       step,
+ +                          real                  time)
+ +{
+ +    int       m, i, j, a, b, ap, bp, i0, i1, tmp;
+ +    real      ediff = 0, delta = 0, dpV = 0;
+ +    gmx_bool  bPrint, bMultiEx;
+ +    gmx_bool *bEx      = re->bEx;
+ +    real     *prob     = re->prob;
+ +    int      *pind     = re->destinations; /* permuted index */
+ +    gmx_bool  bEpot    = FALSE;
+ +    gmx_bool  bDLambda = FALSE;
+ +    gmx_bool  bVol     = FALSE;
+ +
+ +    bMultiEx = (re->nex > 1);  /* multiple exchanges at each state */
+ +    fprintf(fplog, "Replica exchange at step " gmx_large_int_pfmt " time %g\n", step, time);
+ +
+ +    if (re->bNPT)
+ +    {
+ +        for (i = 0; i < re->nrepl; i++)
+ +        {
+ +            re->Vol[i] = 0;
+ +        }
+ +        bVol               = TRUE;
+ +        re->Vol[re->repl]  = vol;
+ +    }
+ +    if ((re->type == ereTEMP || re->type == ereTL))
+ +    {
+ +        for (i = 0; i < re->nrepl; i++)
+ +        {
+ +            re->Epot[i] = 0;
+ +        }
+ +        bEpot              = TRUE;
+ +        re->Epot[re->repl] = enerd->term[F_EPOT];
+ +        /* temperatures of different states*/
+ +        for (i = 0; i < re->nrepl; i++)
+ +        {
+ +            re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ);
+ +        }
+ +    }
+ +    else
+ +    {
+ +        for (i = 0; i < re->nrepl; i++)
+ +        {
+ +            re->beta[i] = 1.0/(re->temp*BOLTZ);  /* we have a single temperature */
+ +        }
+ +    }
+ +    if (re->type == ereLAMBDA || re->type == ereTL)
+ +    {
+ +        bDLambda = TRUE;
+ +        /* lambda differences. */
+ +        /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian
+ +           minus the energy of the jth simulation in the jth Hamiltonian */
+ +        for (i = 0; i < re->nrepl; i++)
+ +        {
+ +            for (j = 0; j < re->nrepl; j++)
+ +            {
+ +                re->de[i][j] = 0;
+ +            }
+ +        }
+ +        for (i = 0; i < re->nrepl; i++)
+ +        {
+ +            re->de[i][re->repl] = (enerd->enerpart_lambda[(int)re->q[ereLAMBDA][i]+1]-enerd->enerpart_lambda[0]);
+ +        }
+ +    }
+ +
+ +    /* now actually do the communication */
+ +    if (bVol)
+ +    {
+ +        gmx_sum_sim(re->nrepl, re->Vol, ms);
+ +    }
+ +    if (bEpot)
+ +    {
+ +        gmx_sum_sim(re->nrepl, re->Epot, ms);
+ +    }
+ +    if (bDLambda)
+ +    {
+ +        for (i = 0; i < re->nrepl; i++)
+ +        {
+ +            gmx_sum_sim(re->nrepl, re->de[i], ms);
+ +        }
+ +    }
+ +
+ +    /* make a duplicate set of indices for shuffling */
+ +    for (i = 0; i < re->nrepl; i++)
+ +    {
+ +        pind[i] = re->ind[i];
+ +    }
+ +
+ +    if (bMultiEx)
+ +    {
+ +        /* multiple random switch exchange */
+ +        for (i = 0; i < re->nex; i++)
+ +        {
+ +            /* randomly select a pair  */
+ +            /* in theory, could reduce this by identifying only which switches had a nonneglibible
+ +               probability of occurring (log p > -100) and only operate on those switches */
+ +            /* find out which state it is from, and what label that state currently has. Likely
+ +               more work that useful. */
+ +            i0 = (int)(re->nrepl*rando(&(re->seed)));
+ +            i1 = (int)(re->nrepl*rando(&(re->seed)));
+ +            if (i0 == i1)
+ +            {
+ +                i--;
+ +                continue;  /* self-exchange, back up and do it again */
+ +            }
+ +
+ +            a  = re->ind[i0]; /* what are the indices of these states? */
+ +            b  = re->ind[i1];
+ +            ap = pind[i0];
+ +            bp = pind[i1];
+ +
+ +            bPrint = FALSE; /* too noisy */
+ +            /* calculate the energy difference */
+ +            /* if the code changes to flip the STATES, rather than the configurations,
+ +               use the commented version of the code */
+ +            /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */
+ +            delta = calc_delta(fplog, bPrint, re, ap, bp, a, b);
+ +
+ +            /* we actually only use the first space in the prob and bEx array,
+ +               since there are actually many switches between pairs. */
+ +
+ +            if (delta <= 0)
+ +            {
+ +                /* accepted */
+ +                prob[0] = 1;
+ +                bEx[0]  = TRUE;
+ +            }
+ +            else
+ +            {
+ +                if (delta > PROBABILITYCUTOFF)
+ +                {
+ +                    prob[0] = 0;
+ +                }
+ +                else
+ +                {
+ +                    prob[0] = exp(-delta);
+ +                }
+ +                /* roll a number to determine if accepted */
+ +                bEx[0] = (rando(&(re->seed)) < prob[0]);
+ +            }
+ +            re->prob_sum[0] += prob[0];
+ +
+ +            if (bEx[0])
+ +            {
+ +                /* swap the states */
+ +                tmp      = pind[i0];
+ +                pind[i0] = pind[i1];
+ +                pind[i1] = tmp;
+ +            }
+ +        }
+ +        re->nattempt[0]++;  /* keep track of total permutation trials here */
+ +        print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap);
+ +    }
+ +    else
+ +    {
+ +        /* standard nearest neighbor replica exchange */
+ +        m = (step / re->nst) % 2;
+ +        for (i = 1; i < re->nrepl; i++)
+ +        {
+ +            a = re->ind[i-1];
+ +            b = re->ind[i];
+ +
+ +            bPrint = (re->repl == a || re->repl == b);
+ +            if (i % 2 == m)
+ +            {
+ +                delta = calc_delta(fplog, bPrint, re, a, b, a, b);
+ +                if (delta <= 0)
+ +                {
+ +                    /* accepted */
+ +                    prob[i] = 1;
+ +                    bEx[i]  = TRUE;
+ +                }
+ +                else
+ +                {
+ +                    if (delta > PROBABILITYCUTOFF)
+ +                    {
+ +                        prob[i] = 0;
+ +                    }
+ +                    else
+ +                    {
+ +                        prob[i] = exp(-delta);
+ +                    }
+ +                    /* roll a number to determine if accepted */
+ +                    bEx[i] = (rando(&(re->seed)) < prob[i]);
+ +                }
+ +                re->prob_sum[i] += prob[i];
+ +
+ +                if (bEx[i])
+ +                {
+ +                    /* swap these two */
+ +                    tmp       = pind[i-1];
+ +                    pind[i-1] = pind[i];
+ +                    pind[i]   = tmp;
+ +                    re->nexchange[i]++;  /* statistics for back compatibility */
+ +                }
+ +            }
+ +            else
+ +            {
+ +                prob[i] = -1;
+ +                bEx[i]  = FALSE;
+ +            }
+ +        }
+ +        /* print some statistics */
+ +        print_ind(fplog, "ex", re->nrepl, re->ind, bEx);
+ +        print_prob(fplog, "pr", re->nrepl, prob);
+ +        fprintf(fplog, "\n");
+ +        re->nattempt[m]++;
+ +    }
+ +
+ +    /* record which moves were made and accepted */
+ +    for (i = 0; i < re->nrepl; i++)
+ +    {
+ +        re->nmoves[re->ind[i]][pind[i]] += 1;
+ +        re->nmoves[pind[i]][re->ind[i]] += 1;
+ +    }
+ +    fflush(fplog); /* make sure we can see what the last exchange was */
+ +}
+ +
+ +static void write_debug_x(t_state *state)
+ +{
+ +    int i;
+ +
+ +    if (debug)
+ +    {
+ +        for (i = 0; i < state->natoms; i += 10)
+ +        {
+ +            fprintf(debug, "dx %5d %10.5f %10.5f %10.5f\n", i, state->x[i][XX], state->x[i][YY], state->x[i][ZZ]);
+ +        }
+ +    }
+ +}
+ +
+ +static void
+ +cyclic_decomposition(const int *destinations,
+ +                     int      **cyclic,
+ +                     gmx_bool  *incycle,
+ +                     const int  nrepl,
+ +                     int       *nswap)
+ +{
+ +
+ +    int i, j, c, p;
+ +    int maxlen = 1;
+ +    for (i = 0; i < nrepl; i++)
+ +    {
+ +        incycle[i] = FALSE;
+ +    }
+ +    for (i = 0; i < nrepl; i++)  /* one cycle for each replica */
+ +    {
+ +        if (incycle[i])
+ +        {
+ +            cyclic[i][0] = -1;
+ +            continue;
+ +        }
+ +        cyclic[i][0] = i;
+ +        incycle[i]   = TRUE;
+ +        c            = 1;
+ +        p            = i;
+ +        for (j = 0; j < nrepl; j++) /* potentially all cycles are part, but we will break first */
+ +        {
+ +            p = destinations[p];    /* start permuting */
+ +            if (p == i)
+ +            {
+ +                cyclic[i][c] = -1;
+ +                if (c > maxlen)
+ +                {
+ +                    maxlen = c;
+ +                }
+ +                break; /* we've reached the original element, the cycle is complete, and we marked the end. */
+ +            }
+ +            else
+ +            {
+ +                cyclic[i][c] = p;  /* each permutation gives a new member of the cycle */
+ +                incycle[p]   = TRUE;
+ +                c++;
+ +            }
+ +        }
+ +    }
+ +    *nswap = maxlen - 1;
+ +
+ +    if (debug)
+ +    {
+ +        for (i = 0; i < nrepl; i++)
+ +        {
+ +            fprintf(debug, "Cycle %d:", i);
+ +            for (j = 0; j < nrepl; j++)
+ +            {
+ +                if (cyclic[i][j] < 0)
+ +                {
+ +                    break;
+ +                }
+ +                fprintf(debug, "%2d", cyclic[i][j]);
+ +            }
+ +            fprintf(debug, "\n");
+ +        }
+ +        fflush(debug);
+ +    }
+ +}
+ +
+ +static void
+ +compute_exchange_order(FILE     *fplog,
+ +                       int     **cyclic,
+ +                       int     **order,
+ +                       const int nrepl,
+ +                       const int maxswap)
+ +{
+ +    int i, j;
+ +
+ +    for (j = 0; j < maxswap; j++)
+ +    {
+ +        for (i = 0; i < nrepl; i++)
+ +        {
+ +            if (cyclic[i][j+1] >= 0)
+ +            {
+ +                order[cyclic[i][j+1]][j] = cyclic[i][j];
+ +                order[cyclic[i][j]][j]   = cyclic[i][j+1];
+ +            }
+ +        }
+ +        for (i = 0; i < nrepl; i++)
+ +        {
+ +            if (order[i][j] < 0)
+ +            {
+ +                order[i][j] = i; /* if it's not exchanging, it should stay this round*/
+ +            }
+ +        }
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(fplog, "Replica Exchange Order\n");
+ +        for (i = 0; i < nrepl; i++)
+ +        {
+ +            fprintf(fplog, "Replica %d:", i);
+ +            for (j = 0; j < maxswap; j++)
+ +            {
+ +                if (order[i][j] < 0)
+ +                {
+ +                    break;
+ +                }
+ +                fprintf(debug, "%2d", order[i][j]);
+ +            }
+ +            fprintf(fplog, "\n");
+ +        }
+ +        fflush(fplog);
+ +    }
+ +}
+ +
+ +static void
+ +prepare_to_do_exchange(FILE      *fplog,
+ +                       const int *destinations,
+ +                       const int  replica_id,
+ +                       const int  nrepl,
+ +                       int       *maxswap,
+ +                       int      **order,
+ +                       int      **cyclic,
+ +                       int       *incycle,
+ +                       gmx_bool  *bThisReplicaExchanged)
+ +{
+ +    int i, j;
+ +    /* Hold the cyclic decomposition of the (multiple) replica
+ +     * exchange. */
+ +    gmx_bool bAnyReplicaExchanged = FALSE;
+ +    *bThisReplicaExchanged = FALSE;
+ +
+ +    for (i = 0; i < nrepl; i++)
+ +    {
+ +        if (destinations[i] != i)
+ +        {
+ +            /* only mark as exchanged if the index has been shuffled */
+ +            bAnyReplicaExchanged = TRUE;
+ +            break;
+ +        }
+ +    }
+ +    if (bAnyReplicaExchanged)
+ +    {
+ +        /* reinitialize the placeholder arrays */
+ +        for (i = 0; i < nrepl; i++)
+ +        {
+ +            for (j = 0; j < nrepl; j++)
+ +            {
+ +                cyclic[i][j] = -1;
+ +                order[i][j]  = -1;
+ +            }
+ +        }
+ +
+ +        /* Identify the cyclic decomposition of the permutation (very
+ +         * fast if neighbor replica exchange). */
+ +        cyclic_decomposition(destinations, cyclic, incycle, nrepl, maxswap);
+ +
+ +        /* Now translate the decomposition into a replica exchange
+ +         * order at each step. */
+ +        compute_exchange_order(fplog, cyclic, order, nrepl, *maxswap);
+ +
+ +        /* Did this replica do any exchange at any point? */
+ +        for (j = 0; j < *maxswap; j++)
+ +        {
+ +            if (replica_id != order[replica_id][j])
+ +            {
+ +                *bThisReplicaExchanged = TRUE;
+ +                break;
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *re,
+ +                          t_state *state, gmx_enerdata_t *enerd,
+ +                          t_state *state_local, gmx_large_int_t step, real time)
+ +{
+ +    int i, j;
+ +    int replica_id = 0;
+ +    int exchange_partner;
+ +    int maxswap = 0;
+ +    /* Number of rounds of exchanges needed to deal with any multiple
+ +     * exchanges. */
+ +    /* Where each replica ends up after the exchange attempt(s). */
+ +    /* The order in which multiple exchanges will occur. */
+ +    gmx_bool bThisReplicaExchanged = FALSE;
+ +
+ +    if (MASTER(cr))
+ +    {
+ +        replica_id  = re->repl;
+ +        test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time);
+ +        prepare_to_do_exchange(fplog, re->destinations, replica_id, re->nrepl, &maxswap,
+ +                               re->order, re->cyclic, re->incycle, &bThisReplicaExchanged);
+ +    }
+ +    /* Do intra-simulation broadcast so all processors belonging to
+ +     * each simulation know whether they need to participate in
+ +     * collecting the state. Otherwise, they might as well get on with
+ +     * the next thing to do. */
+ +    if (PAR(cr))
+ +    {
+ +#ifdef GMX_MPI
+ +        MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr),
+ +                  cr->mpi_comm_mygroup);
+ +#endif
+ +    }
+ +
+ +    if (bThisReplicaExchanged)
+ +    {
+ +        /* Exchange the states */
+ +
+ +        if (PAR(cr))
+ +        {
+ +            /* Collect the global state on the master node */
+ +            if (DOMAINDECOMP(cr))
+ +            {
+ +                dd_collect_state(cr->dd, state_local, state);
+ +            }
+ +            else
+ +            {
+ +                pd_collect_state(cr, state);
+ +            }
+ +        }
++        else
++        {
++            copy_state_nonatomdata(state_local, state);
++        }
+ +
+ +        if (MASTER(cr))
+ +        {
+ +            /* There will be only one swap cycle with standard replica
+ +             * exchange, but there may be multiple swap cycles if we
+ +             * allow multiple swaps. */
+ +
+ +            for (j = 0; j < maxswap; j++)
+ +            {
+ +                exchange_partner = re->order[replica_id][j];
+ +
+ +                if (exchange_partner != replica_id)
+ +                {
+ +                    /* Exchange the global states between the master nodes */
+ +                    if (debug)
+ +                    {
+ +                        fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner);
+ +                    }
+ +                    exchange_state(cr->ms, exchange_partner, state);
+ +                }
+ +            }
+ +            /* For temperature-type replica exchange, we need to scale
+ +             * the velocities. */
+ +            if (re->type == ereTEMP || re->type == ereTL)
+ +            {
+ +                scale_velocities(state, sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]]));
+ +            }
+ +
+ +        }
+ +
+ +        /* With domain decomposition the global state is distributed later */
+ +        if (!DOMAINDECOMP(cr))
+ +        {
+ +            /* Copy the global state to the local state data structure */
+ +            copy_state_nonatomdata(state, state_local);
+ +
+ +            if (PAR(cr))
+ +            {
+ +                bcast_state(cr, state, FALSE);
+ +            }
+ +        }
+ +    }
+ +
+ +    return bThisReplicaExchanged;
+ +}
+ +
+ +void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re)
+ +{
+ +    int  i;
+ +
+ +    fprintf(fplog, "\nReplica exchange statistics\n");
+ +
+ +    if (re->nex == 0)
+ +    {
+ +        fprintf(fplog, "Repl  %d attempts, %d odd, %d even\n",
+ +                re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]);
+ +
+ +        fprintf(fplog, "Repl  average probabilities:\n");
+ +        for (i = 1; i < re->nrepl; i++)
+ +        {
+ +            if (re->nattempt[i%2] == 0)
+ +            {
+ +                re->prob[i] = 0;
+ +            }
+ +            else
+ +            {
+ +                re->prob[i] =  re->prob_sum[i]/re->nattempt[i%2];
+ +            }
+ +        }
+ +        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+ +        print_prob(fplog, "", re->nrepl, re->prob);
+ +
+ +        fprintf(fplog, "Repl  number of exchanges:\n");
+ +        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+ +        print_count(fplog, "", re->nrepl, re->nexchange);
+ +
+ +        fprintf(fplog, "Repl  average number of exchanges:\n");
+ +        for (i = 1; i < re->nrepl; i++)
+ +        {
+ +            if (re->nattempt[i%2] == 0)
+ +            {
+ +                re->prob[i] = 0;
+ +            }
+ +            else
+ +            {
+ +                re->prob[i] =  ((real)re->nexchange[i])/re->nattempt[i%2];
+ +            }
+ +        }
+ +        print_ind(fplog, "", re->nrepl, re->ind, NULL);
+ +        print_prob(fplog, "", re->nrepl, re->prob);
+ +
+ +        fprintf(fplog, "\n");
+ +    }
+ +    /* print the transition matrix */
+ +    print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt);
+ +}
diff --cc src/programs/mdrun/runner.c

index 3b3d98356aae458f2dfe78d74ef72c918d952dac,0000000000000000000000000000000000000000..1b24205fd602239d05df3962ae9fc05cb1c13eed

mode 100644,000000..100644
--- 1/src/programs/mdrun/runner.c
--- /dev/null
+++ b/src/programs/mdrun/runner.c
@@@ -1,1665 -1,0 +1,1678 @@@
- static const float  NBNXN_GPU_LIST_OK_FAC   = 1.25;
- /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
- static const float  NBNXN_GPU_LIST_MAX_FAC  = 1.40;
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +#include <signal.h>
+ +#include <stdlib.h>
+ +#ifdef HAVE_UNISTD_H
+ +#include <unistd.h>
+ +#endif
+ +#include <string.h>
+ +#include <assert.h>
+ +
+ +#include "typedefs.h"
+ +#include "smalloc.h"
+ +#include "sysstuff.h"
+ +#include "statutil.h"
+ +#include "force.h"
+ +#include "mdrun.h"
+ +#include "md_logging.h"
+ +#include "md_support.h"
+ +#include "network.h"
+ +#include "pull.h"
+ +#include "pull_rotation.h"
+ +#include "names.h"
+ +#include "disre.h"
+ +#include "orires.h"
+ +#include "pme.h"
+ +#include "mdatoms.h"
+ +#include "repl_ex.h"
+ +#include "qmmm.h"
+ +#include "domdec.h"
+ +#include "partdec.h"
+ +#include "coulomb.h"
+ +#include "constr.h"
+ +#include "mvdata.h"
+ +#include "checkpoint.h"
+ +#include "mtop_util.h"
+ +#include "sighandler.h"
+ +#include "gromacs/fileio/tpxio.h"
+ +#include "txtdump.h"
+ +#include "gmx_detect_hardware.h"
+ +#include "gmx_omp_nthreads.h"
+ +#include "pull_rotation.h"
+ +#include "calc_verletbuf.h"
+ +#include "../mdlib/nbnxn_search.h"
+ +#include "../mdlib/nbnxn_consts.h"
+ +#include "gmx_fatal_collective.h"
+ +#include "membed.h"
+ +#include "macros.h"
+ +#include "gmx_omp.h"
+ +#include "gmx_thread_affinity.h"
+ +
+ +#include "gromacs/utility/gmxmpi.h"
+ +
+ +#ifdef GMX_FAHCORE
+ +#include "corewrap.h"
+ +#endif
+ +
+ +#include "gpu_utils.h"
+ +#include "nbnxn_cuda_data_mgmt.h"
+ +
+ +typedef struct {
+ +    gmx_integrator_t *func;
+ +} gmx_intp_t;
+ +
+ +/* The array should match the eI array in include/types/enums.h */
+ +const gmx_intp_t    integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}};
+ +
+ +gmx_large_int_t     deform_init_init_step_tpx;
+ +matrix              deform_init_box_tpx;
+ +#ifdef GMX_THREAD_MPI
+ +tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+ +#endif
+ +
+ +
+ +#ifdef GMX_THREAD_MPI
+ +struct mdrunner_arglist
+ +{
+ +    gmx_hw_opt_t   *hw_opt;
+ +    FILE           *fplog;
+ +    t_commrec      *cr;
+ +    int             nfile;
+ +    const t_filenm *fnm;
+ +    output_env_t    oenv;
+ +    gmx_bool        bVerbose;
+ +    gmx_bool        bCompact;
+ +    int             nstglobalcomm;
+ +    ivec            ddxyz;
+ +    int             dd_node_order;
+ +    real            rdd;
+ +    real            rconstr;
+ +    const char     *dddlb_opt;
+ +    real            dlb_scale;
+ +    const char     *ddcsx;
+ +    const char     *ddcsy;
+ +    const char     *ddcsz;
+ +    const char     *nbpu_opt;
+ +    gmx_large_int_t nsteps_cmdline;
+ +    int             nstepout;
+ +    int             resetstep;
+ +    int             nmultisim;
+ +    int             repl_ex_nst;
+ +    int             repl_ex_nex;
+ +    int             repl_ex_seed;
+ +    real            pforce;
+ +    real            cpt_period;
+ +    real            max_hours;
+ +    const char     *deviceOptions;
+ +    unsigned long   Flags;
+ +    int             ret; /* return value */
+ +};
+ +
+ +
+ +/* The function used for spawning threads. Extracts the mdrunner()
+ +   arguments from its one argument and calls mdrunner(), after making
+ +   a commrec. */
+ +static void mdrunner_start_fn(void *arg)
+ +{
+ +    struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg;
+ +    struct mdrunner_arglist  mc  = *mda; /* copy the arg list to make sure
+ +                                            that it's thread-local. This doesn't
+ +                                            copy pointed-to items, of course,
+ +                                            but those are all const. */
+ +    t_commrec *cr;                       /* we need a local version of this */
+ +    FILE      *fplog = NULL;
+ +    t_filenm  *fnm;
+ +
+ +    fnm = dup_tfn(mc.nfile, mc.fnm);
+ +
+ +    cr = reinitialize_commrec_for_this_thread(mc.cr);
+ +
+ +    if (MASTER(cr))
+ +    {
+ +        fplog = mc.fplog;
+ +    }
+ +
+ +    mda->ret = mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
+ +                        mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
+ +                        mc.ddxyz, mc.dd_node_order, mc.rdd,
+ +                        mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
+ +                        mc.ddcsx, mc.ddcsy, mc.ddcsz,
+ +                        mc.nbpu_opt,
+ +                        mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
+ +                        mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
+ +                        mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
+ +}
+ +
+ +/* called by mdrunner() to start a specific number of threads (including
+ +   the main thread) for thread-parallel runs. This in turn calls mdrunner()
+ +   for each thread.
+ +   All options besides nthreads are the same as for mdrunner(). */
+ +static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
+ +                                         FILE *fplog, t_commrec *cr, int nfile,
+ +                                         const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+ +                                         gmx_bool bCompact, int nstglobalcomm,
+ +                                         ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+ +                                         const char *dddlb_opt, real dlb_scale,
+ +                                         const char *ddcsx, const char *ddcsy, const char *ddcsz,
+ +                                         const char *nbpu_opt,
+ +                                         gmx_large_int_t nsteps_cmdline,
+ +                                         int nstepout, int resetstep,
+ +                                         int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+ +                                         real pforce, real cpt_period, real max_hours,
+ +                                         const char *deviceOptions, unsigned long Flags)
+ +{
+ +    int                      ret;
+ +    struct mdrunner_arglist *mda;
+ +    t_commrec               *crn; /* the new commrec */
+ +    t_filenm                *fnmn;
+ +
+ +    /* first check whether we even need to start tMPI */
+ +    if (hw_opt->nthreads_tmpi < 2)
+ +    {
+ +        return cr;
+ +    }
+ +
+ +    /* a few small, one-time, almost unavoidable memory leaks: */
+ +    snew(mda, 1);
+ +    fnmn = dup_tfn(nfile, fnm);
+ +
+ +    /* fill the data structure to pass as void pointer to thread start fn */
+ +    mda->hw_opt         = hw_opt;
+ +    mda->fplog          = fplog;
+ +    mda->cr             = cr;
+ +    mda->nfile          = nfile;
+ +    mda->fnm            = fnmn;
+ +    mda->oenv           = oenv;
+ +    mda->bVerbose       = bVerbose;
+ +    mda->bCompact       = bCompact;
+ +    mda->nstglobalcomm  = nstglobalcomm;
+ +    mda->ddxyz[XX]      = ddxyz[XX];
+ +    mda->ddxyz[YY]      = ddxyz[YY];
+ +    mda->ddxyz[ZZ]      = ddxyz[ZZ];
+ +    mda->dd_node_order  = dd_node_order;
+ +    mda->rdd            = rdd;
+ +    mda->rconstr        = rconstr;
+ +    mda->dddlb_opt      = dddlb_opt;
+ +    mda->dlb_scale      = dlb_scale;
+ +    mda->ddcsx          = ddcsx;
+ +    mda->ddcsy          = ddcsy;
+ +    mda->ddcsz          = ddcsz;
+ +    mda->nbpu_opt       = nbpu_opt;
+ +    mda->nsteps_cmdline = nsteps_cmdline;
+ +    mda->nstepout       = nstepout;
+ +    mda->resetstep      = resetstep;
+ +    mda->nmultisim      = nmultisim;
+ +    mda->repl_ex_nst    = repl_ex_nst;
+ +    mda->repl_ex_nex    = repl_ex_nex;
+ +    mda->repl_ex_seed   = repl_ex_seed;
+ +    mda->pforce         = pforce;
+ +    mda->cpt_period     = cpt_period;
+ +    mda->max_hours      = max_hours;
+ +    mda->deviceOptions  = deviceOptions;
+ +    mda->Flags          = Flags;
+ +
+ +    /* now spawn new threads that start mdrunner_start_fn(), while
+ +       the main thread returns, we set thread affinity later */
+ +    ret = tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
+ +                       mdrunner_start_fn, (void*)(mda) );
+ +    if (ret != TMPI_SUCCESS)
+ +    {
+ +        return NULL;
+ +    }
+ +
+ +    crn = reinitialize_commrec_for_this_thread(cr);
+ +    return crn;
+ +}
+ +
+ +
+ +static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+ +                                        const gmx_hw_opt_t  *hw_opt,
+ +                                        int                  nthreads_tot,
+ +                                        int                  ngpu)
+ +{
+ +    int nthreads_tmpi;
+ +
+ +    /* There are no separate PME nodes here, as we ensured in
+ +     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
+ +     * and a conditional ensures we would not have ended up here.
+ +     * Note that separate PME nodes might be switched on later.
+ +     */
+ +    if (ngpu > 0)
+ +    {
+ +        nthreads_tmpi = ngpu;
+ +        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
+ +        {
+ +            nthreads_tmpi = nthreads_tot;
+ +        }
+ +    }
+ +    else if (hw_opt->nthreads_omp > 0)
+ +    {
+ +        /* Here we could oversubscribe, when we do, we issue a warning later */
+ +        nthreads_tmpi = max(1, nthreads_tot/hw_opt->nthreads_omp);
+ +    }
+ +    else
+ +    {
+ +        /* TODO choose nthreads_omp based on hardware topology
+ +           when we have a hardware topology detection library */
+ +        /* In general, when running up to 4 threads, OpenMP should be faster.
+ +         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
+ +         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
+ +         * even on two CPUs it's usually faster (but with many OpenMP threads
+ +         * it could be faster not to use HT, currently we always use HT).
+ +         * On Nehalem/Westmere we want to avoid running 16 threads over
+ +         * two CPUs with HT, so we need a limit<16; thus we use 12.
+ +         * A reasonable limit for Intel Sandy and Ivy bridge,
+ +         * not knowing the topology, is 16 threads.
+ +         */
+ +        const int nthreads_omp_always_faster             =  4;
+ +        const int nthreads_omp_always_faster_Nehalem     = 12;
+ +        const int nthreads_omp_always_faster_SandyBridge = 16;
+ +        const int first_model_Nehalem                    = 0x1A;
+ +        const int first_model_SandyBridge                = 0x2A;
+ +        gmx_bool  bIntel_Family6;
+ +
+ +        bIntel_Family6 =
+ +            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
+ +             gmx_cpuid_family(hwinfo->cpuid_info) == 6);
+ +
+ +        if (nthreads_tot <= nthreads_omp_always_faster ||
+ +            (bIntel_Family6 &&
+ +             ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
+ +              (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
+ +        {
+ +            /* Use pure OpenMP parallelization */
+ +            nthreads_tmpi = 1;
+ +        }
+ +        else
+ +        {
+ +            /* Don't use OpenMP parallelization */
+ +            nthreads_tmpi = nthreads_tot;
+ +        }
+ +    }
+ +
+ +    return nthreads_tmpi;
+ +}
+ +
+ +
+ +/* Get the number of threads to use for thread-MPI based on how many
+ + * were requested, which algorithms we're using,
+ + * and how many particles there are.
+ + * At the point we have already called check_and_update_hw_opt.
+ + * Thus all options should be internally consistent and consistent
+ + * with the hardware, except that ntmpi could be larger than #GPU.
+ + */
+ +static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
+ +                            gmx_hw_opt_t *hw_opt,
+ +                            t_inputrec *inputrec, gmx_mtop_t *mtop,
+ +                            const t_commrec *cr,
+ +                            FILE *fplog)
+ +{
+ +    int      nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu;
+ +    int      min_atoms_per_mpi_thread;
+ +    char    *env;
+ +    char     sbuf[STRLEN];
+ +    gmx_bool bCanUseGPU;
+ +
+ +    if (hw_opt->nthreads_tmpi > 0)
+ +    {
+ +        /* Trivial, return right away */
+ +        return hw_opt->nthreads_tmpi;
+ +    }
+ +
+ +    nthreads_hw = hwinfo->nthreads_hw_avail;
+ +
+ +    /* How many total (#tMPI*#OpenMP) threads can we start? */
+ +    if (hw_opt->nthreads_tot > 0)
+ +    {
+ +        nthreads_tot_max = hw_opt->nthreads_tot;
+ +    }
+ +    else
+ +    {
+ +        nthreads_tot_max = nthreads_hw;
+ +    }
+ +
+ +    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
+ +    if (bCanUseGPU)
+ +    {
+ +        ngpu = hwinfo->gpu_info.ncuda_dev_use;
+ +    }
+ +    else
+ +    {
+ +        ngpu = 0;
+ +    }
+ +
+ +    nthreads_tmpi =
+ +        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);
+ +
+ +    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
+ +    {
+ +        /* Dims/steps are divided over the nodes iso splitting the atoms */
+ +        min_atoms_per_mpi_thread = 0;
+ +    }
+ +    else
+ +    {
+ +        if (bCanUseGPU)
+ +        {
+ +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
+ +        }
+ +        else
+ +        {
+ +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
+ +        }
+ +    }
+ +
+ +    /* Check if an algorithm does not support parallel simulation.  */
+ +    if (nthreads_tmpi != 1 &&
+ +        ( inputrec->eI == eiLBFGS ||
+ +          inputrec->coulombtype == eelEWALD ) )
+ +    {
+ +        nthreads_tmpi = 1;
+ +
+ +        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+ +        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
+ +        {
+ +            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
+ +        }
+ +    }
+ +    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
+ +    {
+ +        /* the thread number was chosen automatically, but there are too many
+ +           threads (too few atoms per thread) */
+ +        nthreads_new = max(1, mtop->natoms/min_atoms_per_mpi_thread);
+ +
+ +        /* Avoid partial use of Hyper-Threading */
+ +        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+ +            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
+ +        {
+ +            nthreads_new = nthreads_hw/2;
+ +        }
+ +
+ +        /* Avoid large prime numbers in the thread count */
+ +        if (nthreads_new >= 6)
+ +        {
+ +            /* Use only 6,8,10 with additional factors of 2 */
+ +            int fac;
+ +
+ +            fac = 2;
+ +            while (3*fac*2 <= nthreads_new)
+ +            {
+ +                fac *= 2;
+ +            }
+ +
+ +            nthreads_new = (nthreads_new/fac)*fac;
+ +        }
+ +        else
+ +        {
+ +            /* Avoid 5 */
+ +            if (nthreads_new == 5)
+ +            {
+ +                nthreads_new = 4;
+ +            }
+ +        }
+ +
+ +        nthreads_tmpi = nthreads_new;
+ +
+ +        fprintf(stderr, "\n");
+ +        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
+ +        fprintf(stderr, "      only starting %d thread-MPI threads.\n", nthreads_tmpi);
+ +        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
+ +    }
+ +
+ +    return nthreads_tmpi;
+ +}
+ +#endif /* GMX_THREAD_MPI */
+ +
+ +
+ +/* Environment variable for setting nstlist */
+ +static const char*  NSTLIST_ENVVAR          =  "GMX_NSTLIST";
+ +/* Try to increase nstlist when using a GPU with nstlist less than this */
+ +static const int    NSTLIST_GPU_ENOUGH      = 20;
+ +/* Increase nstlist until the non-bonded cost increases more than this factor */
-     real                   rlist_inc, rlist_ok, rlist_max, rlist_new, rlist_prev;
++static const float  NBNXN_GPU_LIST_OK_FAC   = 1.20;
++/* Don't increase nstlist beyond a non-bonded cost increases of this factor.
++ * A standard (protein+)water system at 300K with PME ewald_rtol=1e-5
++ * needs 1.28 at rcoulomb=0.9 and 1.24 at rcoulomb=1.0 to get to nstlist=40.
++ */
++static const float  NBNXN_GPU_LIST_MAX_FAC  = 1.30;
+ +
+ +/* Try to increase nstlist when running on a GPU */
+ +static void increase_nstlist(FILE *fp, t_commrec *cr,
+ +                             t_inputrec *ir, const gmx_mtop_t *mtop, matrix box)
+ +{
+ +    char                  *env;
+ +    int                    nstlist_orig, nstlist_prev;
+ +    verletbuf_list_setup_t ls;
-     const int nstl[] = { 20, 25, 40, 50 };
++    real                   rlist_nstlist10, rlist_inc, rlist_ok, rlist_max;
++    real                   rlist_new, rlist_prev;
+ +    int                    i;
+ +    t_state                state_tmp;
+ +    gmx_bool               bBox, bDD, bCont;
+ +    const char            *nstl_fmt = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
+ +    const char            *vbd_err  = "Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
+ +    const char            *box_err  = "Can not increase nstlist for GPU run because the box is too small";
+ +    const char            *dd_err   = "Can not increase nstlist for GPU run because of domain decomposition limitations";
+ +    char                   buf[STRLEN];
+ +
+ +    /* Number of + nstlist alternative values to try when switching  */
-     /* Allow rlist to make the list double the size of the cut-off sphere */
++    const int nstl[] = { 20, 25, 40 };
+ +#define NNSTL  sizeof(nstl)/sizeof(nstl[0])
+ +
+ +    env = getenv(NSTLIST_ENVVAR);
+ +    if (env == NULL)
+ +    {
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp, nstl_fmt, ir->nstlist);
+ +        }
+ +    }
+ +
+ +    if (ir->verletbuf_drift == 0)
+ +    {
+ +        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
+ +    }
+ +
+ +    if (ir->verletbuf_drift < 0)
+ +    {
+ +        if (MASTER(cr))
+ +        {
+ +            fprintf(stderr, "%s\n", vbd_err);
+ +        }
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp, "%s\n", vbd_err);
+ +        }
+ +
+ +        return;
+ +    }
+ +
+ +    nstlist_orig = ir->nstlist;
+ +    if (env != NULL)
+ +    {
+ +        sprintf(buf, "Getting nstlist from environment variable GMX_NSTLIST=%s", env);
+ +        if (MASTER(cr))
+ +        {
+ +            fprintf(stderr, "%s\n", buf);
+ +        }
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp, "%s\n", buf);
+ +        }
+ +        sscanf(env, "%d", &ir->nstlist);
+ +    }
+ +
+ +    verletbuf_get_list_setup(TRUE, &ls);
+ +
-     rlist_ok  = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC, 1.0/3.0) - rlist_inc;
-     rlist_max = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC, 1.0/3.0) - rlist_inc;
++    /* Allow rlist to make the list a given factor larger than the list
++     * would be with nstlist=10.
++     */
++    nstlist_prev = ir->nstlist;
++    ir->nstlist  = 10;
++    calc_verlet_buffer_size(mtop, det(box), ir, ir->verletbuf_drift, &ls,
++                            NULL, &rlist_nstlist10);
++    ir->nstlist  = nstlist_prev;
++
++    /* Determine the pair list size increase due to zero interactions */
+ +    rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE, mtop->natoms/det(box));
++    rlist_ok  = (rlist_nstlist10 + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC, 1.0/3.0) - rlist_inc;
++    rlist_max = (rlist_nstlist10 + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC, 1.0/3.0) - rlist_inc;
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
+ +                rlist_inc, rlist_max);
+ +    }
+ +
+ +    i            = 0;
+ +    nstlist_prev = nstlist_orig;
+ +    rlist_prev   = ir->rlist;
+ +    do
+ +    {
+ +        if (env == NULL)
+ +        {
+ +            ir->nstlist = nstl[i];
+ +        }
+ +
+ +        /* Set the pair-list buffer size in ir */
+ +        calc_verlet_buffer_size(mtop, det(box), ir, ir->verletbuf_drift, &ls,
+ +                                NULL, &rlist_new);
+ +
+ +        /* Does rlist fit in the box? */
+ +        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
+ +        bDD  = TRUE;
+ +        if (bBox && DOMAINDECOMP(cr))
+ +        {
+ +            /* Check if rlist fits in the domain decomposition */
+ +            if (inputrec2nboundeddim(ir) < DIM)
+ +            {
+ +                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
+ +            }
+ +            copy_mat(box, state_tmp.box);
+ +            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
+ +        }
+ +
+ +        bCont = FALSE;
+ +
+ +        if (env == NULL)
+ +        {
+ +            if (bBox && bDD && rlist_new <= rlist_max)
+ +            {
+ +                /* Increase nstlist */
+ +                nstlist_prev = ir->nstlist;
+ +                rlist_prev   = rlist_new;
+ +                bCont        = (i+1 < NNSTL && rlist_new < rlist_ok);
+ +            }
+ +            else
+ +            {
+ +                /* Stick with the previous nstlist */
+ +                ir->nstlist = nstlist_prev;
+ +                rlist_new   = rlist_prev;
+ +                bBox        = TRUE;
+ +                bDD         = TRUE;
+ +            }
+ +        }
+ +
+ +        i++;
+ +    }
+ +    while (bCont);
+ +
+ +    if (!bBox || !bDD)
+ +    {
+ +        gmx_warning(!bBox ? box_err : dd_err);
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
+ +        }
+ +        ir->nstlist = nstlist_orig;
+ +    }
+ +    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
+ +    {
+ +        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
+ +                nstlist_orig, ir->nstlist,
+ +                ir->rlist, rlist_new);
+ +        if (MASTER(cr))
+ +        {
+ +            fprintf(stderr, "%s\n\n", buf);
+ +        }
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp, "%s\n\n", buf);
+ +        }
+ +        ir->rlist     = rlist_new;
+ +        ir->rlistlong = rlist_new;
+ +    }
+ +}
+ +
+ +static void prepare_verlet_scheme(FILE                           *fplog,
+ +                                  const gmx_hw_info_t            *hwinfo,
+ +                                  t_commrec                      *cr,
+ +                                  t_inputrec                     *ir,
+ +                                  const gmx_mtop_t               *mtop,
+ +                                  matrix                          box,
+ +                                  gmx_bool                       *bUseGPU)
+ +{
+ +    /* Here we only check for GPU usage on the MPI master process,
+ +     * as here we don't know how many GPUs we will use yet.
+ +     * We check for a GPU on all processes later.
+ +     */
+ +    *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
+ +
+ +    if (ir->verletbuf_drift > 0)
+ +    {
+ +        /* Update the Verlet buffer size for the current run setup */
+ +        verletbuf_list_setup_t ls;
+ +        real                   rlist_new;
+ +
+ +        /* Here we assume CPU acceleration is on. But as currently
+ +         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
+ +         * and 4x2 gives a larger buffer than 4x4, this is ok.
+ +         */
+ +        verletbuf_get_list_setup(*bUseGPU, &ls);
+ +
+ +        calc_verlet_buffer_size(mtop, det(box), ir,
+ +                                ir->verletbuf_drift, &ls,
+ +                                NULL, &rlist_new);
+ +        if (rlist_new != ir->rlist)
+ +        {
+ +            if (fplog != NULL)
+ +            {
+ +                fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+ +                        ir->rlist, rlist_new,
+ +                        ls.cluster_size_i, ls.cluster_size_j);
+ +            }
+ +            ir->rlist     = rlist_new;
+ +            ir->rlistlong = rlist_new;
+ +        }
+ +    }
+ +
+ +    /* With GPU or emulation we should check nstlist for performance */
+ +    if ((EI_DYNAMICS(ir->eI) &&
+ +         *bUseGPU &&
+ +         ir->nstlist < NSTLIST_GPU_ENOUGH) ||
+ +        getenv(NSTLIST_ENVVAR) != NULL)
+ +    {
+ +        /* Choose a better nstlist */
+ +        increase_nstlist(fplog, cr, ir, mtop, box);
+ +    }
+ +}
+ +
+ +static void convert_to_verlet_scheme(FILE *fplog,
+ +                                     t_inputrec *ir,
+ +                                     gmx_mtop_t *mtop, real box_vol)
+ +{
+ +    char *conv_mesg = "Converting input file with group cut-off scheme to the Verlet cut-off scheme";
+ +
+ +    md_print_warn(NULL, fplog, "%s\n", conv_mesg);
+ +
+ +    ir->cutoff_scheme   = ecutsVERLET;
+ +    ir->verletbuf_drift = 0.005;
+ +
+ +    if (ir->rcoulomb != ir->rvdw)
+ +    {
+ +        gmx_fatal(FARGS, "The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
+ +    }
+ +
+ +    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
+ +    {
+ +        gmx_fatal(FARGS, "User non-bonded potentials are not (yet) supported with the Verlet scheme");
+ +    }
+ +    else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
+ +    {
+ +        md_print_warn(NULL, fplog, "Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
+ +
+ +        if (EVDW_SWITCHED(ir->vdwtype))
+ +        {
+ +            ir->vdwtype = evdwCUT;
+ +        }
+ +        if (EEL_SWITCHED(ir->coulombtype))
+ +        {
+ +            if (EEL_FULL(ir->coulombtype))
+ +            {
+ +                /* With full electrostatic only PME can be switched */
+ +                ir->coulombtype = eelPME;
+ +            }
+ +            else
+ +            {
+ +                md_print_warn(NULL, fplog, "NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n", eel_names[ir->coulombtype]);
+ +                ir->coulombtype = eelRF;
+ +                ir->epsilon_rf  = 0.0;
+ +            }
+ +        }
+ +
+ +        /* We set the target energy drift to a small number.
+ +         * Note that this is only for testing. For production the user
+ +         * should think about this and set the mdp options.
+ +         */
+ +        ir->verletbuf_drift = 1e-4;
+ +    }
+ +
+ +    if (inputrec2nboundeddim(ir) != 3)
+ +    {
+ +        gmx_fatal(FARGS, "Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
+ +    }
+ +
+ +    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
+ +    {
+ +        gmx_fatal(FARGS, "Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
+ +    }
+ +
+ +    if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
+ +    {
+ +        verletbuf_list_setup_t ls;
+ +
+ +        verletbuf_get_list_setup(FALSE, &ls);
+ +        calc_verlet_buffer_size(mtop, box_vol, ir, ir->verletbuf_drift, &ls,
+ +                                NULL, &ir->rlist);
+ +    }
+ +    else
+ +    {
+ +        ir->verletbuf_drift = -1;
+ +        ir->rlist           = 1.05*max(ir->rvdw, ir->rcoulomb);
+ +    }
+ +
+ +    gmx_mtop_remove_chargegroups(mtop);
+ +}
+ +
+ +static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
+ +                                    int           cutoff_scheme,
+ +                                    gmx_bool      bIsSimMaster)
+ +{
+ +    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
+ +
+ +#ifndef GMX_THREAD_MPI
+ +    if (hw_opt->nthreads_tot > 0)
+ +    {
+ +        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+ +    }
+ +    if (hw_opt->nthreads_tmpi > 0)
+ +    {
+ +        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+ +    }
+ +#endif
+ +
+ +#ifndef GMX_OPENMP
+ +    if (hw_opt->nthreads_omp > 1)
+ +    {
+ +        gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but Gromacs was compiled without OpenMP support");
+ +    }
+ +    hw_opt->nthreads_omp = 1;
+ +#endif
+ +
+ +    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
+ +    {
+ +        /* We have the same number of OpenMP threads for PP and PME processes,
+ +         * thus we can perform several consistency checks.
+ +         */
+ +        if (hw_opt->nthreads_tmpi > 0 &&
+ +            hw_opt->nthreads_omp > 0 &&
+ +            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
+ +        {
+ +            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
+ +                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
+ +        }
+ +
+ +        if (hw_opt->nthreads_tmpi > 0 &&
+ +            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
+ +        {
+ +            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
+ +                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
+ +        }
+ +
+ +        if (hw_opt->nthreads_omp > 0 &&
+ +            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
+ +        {
+ +            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
+ +                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
+ +        }
+ +
+ +        if (hw_opt->nthreads_tmpi > 0 &&
+ +            hw_opt->nthreads_omp <= 0)
+ +        {
+ +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+ +        }
+ +    }
+ +
+ +#ifndef GMX_OPENMP
+ +    if (hw_opt->nthreads_omp > 1)
+ +    {
+ +        gmx_fatal(FARGS, "OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
+ +    }
+ +#endif
+ +
+ +    if (cutoff_scheme == ecutsGROUP)
+ +    {
+ +        /* We only have OpenMP support for PME only nodes */
+ +        if (hw_opt->nthreads_omp > 1)
+ +        {
+ +            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
+ +                      ecutscheme_names[cutoff_scheme],
+ +                      ecutscheme_names[ecutsVERLET]);
+ +        }
+ +        hw_opt->nthreads_omp = 1;
+ +    }
+ +
+ +    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
+ +    {
+ +        gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
+ +    }
+ +
+ +    if (hw_opt->nthreads_tot == 1)
+ +    {
+ +        hw_opt->nthreads_tmpi = 1;
+ +
+ +        if (hw_opt->nthreads_omp > 1)
+ +        {
+ +            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
+ +                      hw_opt->nthreads_tmpi, hw_opt->nthreads_tot);
+ +        }
+ +        hw_opt->nthreads_omp = 1;
+ +    }
+ +
+ +    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
+ +    {
+ +        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
+ +                hw_opt->nthreads_tot,
+ +                hw_opt->nthreads_tmpi,
+ +                hw_opt->nthreads_omp,
+ +                hw_opt->nthreads_omp_pme,
+ +                hw_opt->gpu_id != NULL ? hw_opt->gpu_id : "");
+ +
+ +    }
+ +}
+ +
+ +
+ +/* Override the value in inputrec with value passed on the command line (if any) */
+ +static void override_nsteps_cmdline(FILE            *fplog,
+ +                                    gmx_large_int_t  nsteps_cmdline,
+ +                                    t_inputrec      *ir,
+ +                                    const t_commrec *cr)
+ +{
+ +    char sbuf[STEPSTRSIZE];
+ +
+ +    assert(ir);
+ +    assert(cr);
+ +
+ +    /* override with anything else than the default -2 */
+ +    if (nsteps_cmdline > -2)
+ +    {
+ +        char stmp[STRLEN];
+ +
+ +        ir->nsteps = nsteps_cmdline;
+ +        if (EI_DYNAMICS(ir->eI))
+ +        {
+ +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps, %.3f ps",
+ +                    gmx_step_str(nsteps_cmdline, sbuf),
+ +                    nsteps_cmdline*ir->delta_t);
+ +        }
+ +        else
+ +        {
+ +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps",
+ +                    gmx_step_str(nsteps_cmdline, sbuf));
+ +        }
+ +
+ +        md_print_warn(cr, fplog, "%s\n", stmp);
+ +    }
+ +}
+ +
+ +/* Data structure set by SIMMASTER which needs to be passed to all nodes
+ + * before the other nodes have read the tpx file and called gmx_detect_hardware.
+ + */
+ +typedef struct {
+ +    int      cutoff_scheme; /* The cutoff scheme from inputrec_t */
+ +    gmx_bool bUseGPU;       /* Use GPU or GPU emulation          */
+ +} master_inf_t;
+ +
+ +int mdrunner(gmx_hw_opt_t *hw_opt,
+ +             FILE *fplog, t_commrec *cr, int nfile,
+ +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+ +             gmx_bool bCompact, int nstglobalcomm,
+ +             ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+ +             const char *dddlb_opt, real dlb_scale,
+ +             const char *ddcsx, const char *ddcsy, const char *ddcsz,
+ +             const char *nbpu_opt,
+ +             gmx_large_int_t nsteps_cmdline, int nstepout, int resetstep,
+ +             int nmultisim, int repl_ex_nst, int repl_ex_nex,
+ +             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
+ +             const char *deviceOptions, unsigned long Flags)
+ +{
+ +    gmx_bool                  bForceUseGPU, bTryUseGPU;
+ +    double                    nodetime = 0, realtime;
+ +    t_inputrec               *inputrec;
+ +    t_state                  *state = NULL;
+ +    matrix                    box;
+ +    gmx_ddbox_t               ddbox = {0};
+ +    int                       npme_major, npme_minor;
+ +    real                      tmpr1, tmpr2;
+ +    t_nrnb                   *nrnb;
+ +    gmx_mtop_t               *mtop       = NULL;
+ +    t_mdatoms                *mdatoms    = NULL;
+ +    t_forcerec               *fr         = NULL;
+ +    t_fcdata                 *fcd        = NULL;
+ +    real                      ewaldcoeff = 0;
+ +    gmx_pme_t                *pmedata    = NULL;
+ +    gmx_vsite_t              *vsite      = NULL;
+ +    gmx_constr_t              constr;
+ +    int                       i, m, nChargePerturbed = -1, status, nalloc;
+ +    char                     *gro;
+ +    gmx_wallcycle_t           wcycle;
+ +    gmx_bool                  bReadRNG, bReadEkin;
+ +    int                       list;
+ +    gmx_walltime_accounting_t walltime_accounting = NULL;
+ +    int                       rc;
+ +    gmx_large_int_t           reset_counters;
+ +    gmx_edsam_t               ed           = NULL;
+ +    t_commrec                *cr_old       = cr;
+ +    int                       nthreads_pme = 1;
+ +    int                       nthreads_pp  = 1;
+ +    gmx_membed_t              membed       = NULL;
+ +    gmx_hw_info_t            *hwinfo       = NULL;
+ +    master_inf_t              minf         = {-1, FALSE};
+ +
+ +    /* CAUTION: threads may be started later on in this function, so
+ +       cr doesn't reflect the final parallel state right now */
+ +    snew(inputrec, 1);
+ +    snew(mtop, 1);
+ +
+ +    if (Flags & MD_APPENDFILES)
+ +    {
+ +        fplog = NULL;
+ +    }
+ +
+ +    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
+ +    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
+ +
+ +    /* Detect hardware, gather information. This is an operation that is
+ +     * global for this process (MPI rank). */
+ +    hwinfo = gmx_detect_hardware(fplog, cr,
+ +                                 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
+ +
+ +
+ +    snew(state, 1);
+ +    if (SIMMASTER(cr))
+ +    {
+ +        /* Read (nearly) all data required for the simulation */
+ +        read_tpx_state(ftp2fn(efTPX, nfile, fnm), inputrec, state, NULL, mtop);
+ +
+ +        if (inputrec->cutoff_scheme != ecutsVERLET &&
+ +            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
+ +        {
+ +            convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box));
+ +        }
+ +
+ +
+ +        minf.cutoff_scheme = inputrec->cutoff_scheme;
+ +        minf.bUseGPU       = FALSE;
+ +
+ +        if (inputrec->cutoff_scheme == ecutsVERLET)
+ +        {
+ +            prepare_verlet_scheme(fplog, hwinfo, cr,
+ +                                  inputrec, mtop, state->box,
+ +                                  &minf.bUseGPU);
+ +        }
+ +        else if (hwinfo->bCanUseGPU)
+ +        {
+ +            md_print_warn(cr, fplog,
+ +                          "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
+ +                          "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
+ +                          "      (for quick performance testing you can use the -testverlet option)\n");
+ +
+ +            if (bForceUseGPU)
+ +            {
+ +                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
+ +            }
+ +        }
+ +#ifdef GMX_IS_BGQ
+ +        else
+ +        {
+ +            md_print_warn(cr, fplog,
+ +                          "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
+ +                          "      BlueGene/Q. You will observe better performance from using the\n"
+ +                          "      Verlet cut-off scheme.\n");
+ +        }
+ +#endif
+ +    }
+ +#ifndef GMX_THREAD_MPI
+ +    if (PAR(cr))
+ +    {
+ +        gmx_bcast_sim(sizeof(minf), &minf, cr);
+ +    }
+ +#endif
+ +    if (minf.bUseGPU && cr->npmenodes == -1)
+ +    {
+ +        /* Don't automatically use PME-only nodes with GPUs */
+ +        cr->npmenodes = 0;
+ +    }
+ +
+ +    /* Check for externally set OpenMP affinity and turn off internal
+ +     * pinning if any is found. We need to do this check early to tell
+ +     * thread-MPI whether it should do pinning when spawning threads.
+ +     * TODO: the above no longer holds, we should move these checks down
+ +     */
+ +    gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
+ +
+ +#ifdef GMX_THREAD_MPI
+ +    /* With thread-MPI inputrec is only set here on the master thread */
+ +    if (SIMMASTER(cr))
+ +#endif
+ +    {
+ +        check_and_update_hw_opt(hw_opt, minf.cutoff_scheme, SIMMASTER(cr));
+ +
+ +#ifdef GMX_THREAD_MPI
+ +        /* Early check for externally set process affinity. Can't do over all
+ +         * MPI processes because hwinfo is not available everywhere, but with
+ +         * thread-MPI it's needed as pinning might get turned off which needs
+ +         * to be known before starting thread-MPI. */
+ +        gmx_check_thread_affinity_set(fplog,
+ +                                      NULL,
+ +                                      hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+ +#endif
+ +
+ +#ifdef GMX_THREAD_MPI
+ +        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
+ +        {
+ +            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
+ +        }
+ +#endif
+ +
+ +        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
+ +            cr->npmenodes <= 0)
+ +        {
+ +            gmx_fatal(FARGS, "You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
+ +        }
+ +    }
+ +
+ +#ifdef GMX_THREAD_MPI
+ +    if (SIMMASTER(cr))
+ +    {
+ +        /* NOW the threads will be started: */
+ +        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
+ +                                                 hw_opt,
+ +                                                 inputrec, mtop,
+ +                                                 cr, fplog);
+ +        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
+ +        {
+ +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+ +        }
+ +
+ +        if (hw_opt->nthreads_tmpi > 1)
+ +        {
+ +            /* now start the threads. */
+ +            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
+ +                                        oenv, bVerbose, bCompact, nstglobalcomm,
+ +                                        ddxyz, dd_node_order, rdd, rconstr,
+ +                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
+ +                                        nbpu_opt,
+ +                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
+ +                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
+ +                                        cpt_period, max_hours, deviceOptions,
+ +                                        Flags);
+ +            /* the main thread continues here with a new cr. We don't deallocate
+ +               the old cr because other threads may still be reading it. */
+ +            if (cr == NULL)
+ +            {
+ +                gmx_comm("Failed to spawn threads");
+ +            }
+ +        }
+ +    }
+ +#endif
+ +    /* END OF CAUTION: cr is now reliable */
+ +
+ +    /* g_membed initialisation *
+ +     * Because we change the mtop, init_membed is called before the init_parallel *
+ +     * (in case we ever want to make it run in parallel) */
+ +    if (opt2bSet("-membed", nfile, fnm))
+ +    {
+ +        if (MASTER(cr))
+ +        {
+ +            fprintf(stderr, "Initializing membed");
+ +        }
+ +        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
+ +    }
+ +
+ +    if (PAR(cr))
+ +    {
+ +        /* now broadcast everything to the non-master nodes/threads: */
+ +        init_parallel(cr, inputrec, mtop);
+ +
+ +        /* This check needs to happen after get_nthreads_mpi() */
+ +        if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
+ +        {
+ +            gmx_fatal_collective(FARGS, cr, NULL,
+ +                                 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
+ +                                 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
+ +        }
+ +    }
+ +    if (fplog != NULL)
+ +    {
+ +        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+ +    }
+ +
+ +    /* now make sure the state is initialized and propagated */
+ +    set_state_entries(state, inputrec, cr->nnodes);
+ +
+ +    /* A parallel command line option consistency check that we can
+ +       only do after any threads have started. */
+ +    if (!PAR(cr) &&
+ +        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
+ +    {
+ +        gmx_fatal(FARGS,
+ +                  "The -dd or -npme option request a parallel simulation, "
+ +#ifndef GMX_MPI
+ +                  "but %s was compiled without threads or MPI enabled"
+ +#else
+ +#ifdef GMX_THREAD_MPI
+ +                  "but the number of threads (option -nt) is 1"
+ +#else
+ +                  "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
+ +#endif
+ +#endif
+ +                  , ShortProgram()
+ +                  );
+ +    }
+ +
+ +    if ((Flags & MD_RERUN) &&
+ +        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+ +    {
+ +        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+ +    }
+ +
+ +    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && PAR(cr))
+ +    {
+ +        /* Simple neighbour searching and (also?) all-vs-all loops
+ +         * do not work with domain decomposition. */
+ +        Flags |= MD_PARTDEC;
+ +    }
+ +
+ +    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
+ +    {
+ +        if (cr->npmenodes > 0)
+ +        {
+ +            if (!EEL_PME(inputrec->coulombtype))
+ +            {
+ +                gmx_fatal_collective(FARGS, cr, NULL,
+ +                                     "PME nodes are requested, but the system does not use PME electrostatics");
+ +            }
+ +            if (Flags & MD_PARTDEC)
+ +            {
+ +                gmx_fatal_collective(FARGS, cr, NULL,
+ +                                     "PME nodes are requested, but particle decomposition does not support separate PME nodes");
+ +            }
+ +        }
+ +
+ +        cr->npmenodes = 0;
+ +    }
+ +
+ +#ifdef GMX_FAHCORE
+ +    fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
+ +#endif
+ +
+ +    /* NMR restraints must be initialized before load_checkpoint,
+ +     * since with time averaging the history is added to t_state.
+ +     * For proper consistency check we therefore need to extend
+ +     * t_state here.
+ +     * So the PME-only nodes (if present) will also initialize
+ +     * the distance restraints.
+ +     */
+ +    snew(fcd, 1);
+ +
+ +    /* This needs to be called before read_checkpoint to extend the state */
+ +    init_disres(fplog, mtop, inputrec, cr, Flags & MD_PARTDEC, fcd, state, repl_ex_nst > 0);
+ +
+ +    if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0)
+ +    {
+ +        if (PAR(cr) && !(Flags & MD_PARTDEC))
+ +        {
+ +            gmx_fatal(FARGS, "Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
+ +        }
+ +        /* Orientation restraints */
+ +        if (MASTER(cr))
+ +        {
+ +            init_orires(fplog, mtop, state->x, inputrec, cr->ms, &(fcd->orires),
+ +                        state);
+ +        }
+ +    }
+ +
+ +    if (DEFORM(*inputrec))
+ +    {
+ +        /* Store the deform reference box before reading the checkpoint */
+ +        if (SIMMASTER(cr))
+ +        {
+ +            copy_mat(state->box, box);
+ +        }
+ +        if (PAR(cr))
+ +        {
+ +            gmx_bcast(sizeof(box), box, cr);
+ +        }
+ +        /* Because we do not have the update struct available yet
+ +         * in which the reference values should be stored,
+ +         * we store them temporarily in static variables.
+ +         * This should be thread safe, since they are only written once
+ +         * and with identical values.
+ +         */
+ +#ifdef GMX_THREAD_MPI
+ +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+ +#endif
+ +        deform_init_init_step_tpx = inputrec->init_step;
+ +        copy_mat(box, deform_init_box_tpx);
+ +#ifdef GMX_THREAD_MPI
+ +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+ +#endif
+ +    }
+ +
+ +    if (opt2bSet("-cpi", nfile, fnm))
+ +    {
+ +        /* Check if checkpoint file exists before doing continuation.
+ +         * This way we can use identical input options for the first and subsequent runs...
+ +         */
+ +        if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
+ +        {
+ +            load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
+ +                            cr, Flags & MD_PARTDEC, ddxyz,
+ +                            inputrec, state, &bReadRNG, &bReadEkin,
+ +                            (Flags & MD_APPENDFILES),
+ +                            (Flags & MD_APPENDFILESSET));
+ +
+ +            if (bReadRNG)
+ +            {
+ +                Flags |= MD_READ_RNG;
+ +            }
+ +            if (bReadEkin)
+ +            {
+ +                Flags |= MD_READ_EKIN;
+ +            }
+ +        }
+ +    }
+ +
+ +    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
+ +#ifdef GMX_THREAD_MPI
+ +        /* With thread MPI only the master node/thread exists in mdrun.c,
+ +         * therefore non-master nodes need to open the "seppot" log file here.
+ +         */
+ +        || (!MASTER(cr) && (Flags & MD_SEPPOT))
+ +#endif
+ +        )
+ +    {
+ +        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, !(Flags & MD_SEPPOT),
+ +                     Flags, &fplog);
+ +    }
+ +
+ +    /* override nsteps with value from cmdline */
+ +    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
+ +
+ +    if (SIMMASTER(cr))
+ +    {
+ +        copy_mat(state->box, box);
+ +    }
+ +
+ +    if (PAR(cr))
+ +    {
+ +        gmx_bcast(sizeof(box), box, cr);
+ +    }
+ +
+ +    /* Essential dynamics */
+ +    if (opt2bSet("-ei", nfile, fnm))
+ +    {
+ +        /* Open input and output files, allocate space for ED data structure */
+ +        ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
+ +    }
+ +
+ +    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
+ +                     EI_TPI(inputrec->eI) ||
+ +                     inputrec->eI == eiNM))
+ +    {
+ +        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
+ +                                           dddlb_opt, dlb_scale,
+ +                                           ddcsx, ddcsy, ddcsz,
+ +                                           mtop, inputrec,
+ +                                           box, state->x,
+ +                                           &ddbox, &npme_major, &npme_minor);
+ +
+ +        make_dd_communicators(fplog, cr, dd_node_order);
+ +
+ +        /* Set overallocation to avoid frequent reallocation of arrays */
+ +        set_over_alloc_dd(TRUE);
+ +    }
+ +    else
+ +    {
+ +        /* PME, if used, is done on all nodes with 1D decomposition */
+ +        cr->npmenodes = 0;
+ +        cr->duty      = (DUTY_PP | DUTY_PME);
+ +        npme_major    = 1;
+ +        npme_minor    = 1;
+ +        /* NM and TPI perform single node energy calculations in parallel */
+ +        if (!(inputrec->eI == eiNM || EI_TPI(inputrec->eI)))
+ +        {
+ +            npme_major = cr->nnodes;
+ +        }
+ +
+ +        if (inputrec->ePBC == epbcSCREW)
+ +        {
+ +            gmx_fatal(FARGS,
+ +                      "pbc=%s is only implemented with domain decomposition",
+ +                      epbc_names[inputrec->ePBC]);
+ +        }
+ +    }
+ +
+ +    if (PAR(cr))
+ +    {
+ +        /* After possible communicator splitting in make_dd_communicators.
+ +         * we can set up the intra/inter node communication.
+ +         */
+ +        gmx_setup_nodecomm(fplog, cr);
+ +    }
+ +
+ +    /* Initialize per-physical-node MPI process/thread ID and counters. */
+ +    gmx_init_intranode_counters(cr);
+ +
+ +#ifdef GMX_MPI
+ +    md_print_info(cr, fplog, "Using %d MPI %s\n",
+ +                  cr->nnodes,
+ +#ifdef GMX_THREAD_MPI
+ +                  cr->nnodes == 1 ? "thread" : "threads"
+ +#else
+ +                  cr->nnodes == 1 ? "process" : "processes"
+ +#endif
+ +                  );
+ +    fflush(stderr);
+ +#endif
+ +
+ +    gmx_omp_nthreads_init(fplog, cr,
+ +                          hwinfo->nthreads_hw_avail,
+ +                          hw_opt->nthreads_omp,
+ +                          hw_opt->nthreads_omp_pme,
+ +                          (cr->duty & DUTY_PP) == 0,
+ +                          inputrec->cutoff_scheme == ecutsVERLET);
+ +
+ +    /* check consistency and decide on the number of gpus to use. */
+ +    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi,
+ +                                     minf.bUseGPU);
+ +
+ +    /* getting number of PP/PME threads
+ +       PME: env variable should be read only on one node to make sure it is
+ +       identical everywhere;
+ +     */
+ +    /* TODO nthreads_pp is only used for pinning threads.
+ +     * This is a temporary solution until we have a hw topology library.
+ +     */
+ +    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
+ +    nthreads_pme = gmx_omp_nthreads_get(emntPME);
+ +
+ +    wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);
+ +
+ +    if (PAR(cr))
+ +    {
+ +        /* Master synchronizes its value of reset_counters with all nodes
+ +         * including PME only nodes */
+ +        reset_counters = wcycle_get_reset_counters(wcycle);
+ +        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+ +        wcycle_set_reset_counters(wcycle, reset_counters);
+ +    }
+ +
+ +    snew(nrnb, 1);
+ +    if (cr->duty & DUTY_PP)
+ +    {
+ +        /* For domain decomposition we allocate dynamically
+ +         * in dd_partition_system.
+ +         */
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            bcast_state_setup(cr, state);
+ +        }
+ +        else
+ +        {
+ +            if (PAR(cr))
+ +            {
+ +                bcast_state(cr, state, TRUE);
+ +            }
+ +        }
+ +
+ +        /* Initiate forcerecord */
+ +        fr         = mk_forcerec();
+ +        fr->hwinfo = hwinfo;
+ +        init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box,
+ +                      opt2fn("-table", nfile, fnm),
+ +                      opt2fn("-tabletf", nfile, fnm),
+ +                      opt2fn("-tablep", nfile, fnm),
+ +                      opt2fn("-tableb", nfile, fnm),
+ +                      nbpu_opt,
+ +                      FALSE, pforce);
+ +
+ +        /* version for PCA_NOT_READ_NODE (see md.c) */
+ +        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
+ +           "nofile","nofile","nofile","nofile",FALSE,pforce);
+ +         */
+ +        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
+ +
+ +        /* Initialize QM-MM */
+ +        if (fr->bQMMM)
+ +        {
+ +            init_QMMMrec(cr, mtop, inputrec, fr);
+ +        }
+ +
+ +        /* Initialize the mdatoms structure.
+ +         * mdatoms is not filled with atom data,
+ +         * as this can not be done now with domain decomposition.
+ +         */
+ +        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);
+ +
+ +        if (mdatoms->nPerturbed > 0 && inputrec->cutoff_scheme == ecutsVERLET)
+ +        {
+ +            gmx_fatal(FARGS, "The Verlet cut-off scheme does not (yet) support free-energy calculations with perturbed atoms, only perturbed interactions. This will be implemented soon. Use the group scheme for now.");
+ +        }
+ +
+ +        /* Initialize the virtual site communication */
+ +        vsite = init_vsite(mtop, cr, FALSE);
+ +
+ +        calc_shifts(box, fr->shift_vec);
+ +
+ +        /* With periodic molecules the charge groups should be whole at start up
+ +         * and the virtual sites should not be far from their proper positions.
+ +         */
+ +        if (!inputrec->bContinuation && MASTER(cr) &&
+ +            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+ +        {
+ +            /* Make molecules whole at start of run */
+ +            if (fr->ePBC != epbcNONE)
+ +            {
+ +                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
+ +            }
+ +            if (vsite)
+ +            {
+ +                /* Correct initial vsite positions are required
+ +                 * for the initial distribution in the domain decomposition
+ +                 * and for the initial shell prediction.
+ +                 */
+ +                construct_vsites_mtop(vsite, mtop, state->x);
+ +            }
+ +        }
+ +
+ +        if (EEL_PME(fr->eeltype))
+ +        {
+ +            ewaldcoeff = fr->ewaldcoeff;
+ +            pmedata    = &fr->pmedata;
+ +        }
+ +        else
+ +        {
+ +            pmedata = NULL;
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* This is a PME only node */
+ +
+ +        /* We don't need the state */
+ +        done_state(state);
+ +
+ +        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
+ +        snew(pmedata, 1);
+ +    }
+ +
+ +    if (hw_opt->thread_affinity != threadaffOFF)
+ +    {
+ +        /* Before setting affinity, check whether the affinity has changed
+ +         * - which indicates that probably the OpenMP library has changed it
+ +         * since we first checked).
+ +         */
+ +        gmx_check_thread_affinity_set(fplog, cr,
+ +                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+ +
+ +        /* Set the CPU affinity */
+ +        gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
+ +    }
+ +
+ +    /* Initiate PME if necessary,
+ +     * either on all nodes or on dedicated PME nodes only. */
+ +    if (EEL_PME(inputrec->coulombtype))
+ +    {
+ +        if (mdatoms)
+ +        {
+ +            nChargePerturbed = mdatoms->nChargePerturbed;
+ +        }
+ +        if (cr->npmenodes > 0)
+ +        {
+ +            /* The PME only nodes need to know nChargePerturbed */
+ +            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+ +        }
+ +
+ +        if (cr->duty & DUTY_PME)
+ +        {
+ +            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
+ +                                  mtop ? mtop->natoms : 0, nChargePerturbed,
+ +                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
+ +            if (status != 0)
+ +            {
+ +                gmx_fatal(FARGS, "Error %d initializing PME", status);
+ +            }
+ +        }
+ +    }
+ +
+ +
+ +    if (integrator[inputrec->eI].func == do_md)
+ +    {
+ +        /* Turn on signal handling on all nodes */
+ +        /*
+ +         * (A user signal from the PME nodes (if any)
+ +         * is communicated to the PP nodes.
+ +         */
+ +        signal_handler_install();
+ +    }
+ +
+ +    if (cr->duty & DUTY_PP)
+ +    {
+ +        /* Assumes uniform use of the number of OpenMP threads */
+ +        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+ +
+ +        if (inputrec->ePull != epullNO)
+ +        {
+ +            /* Initialize pull code */
+ +            init_pull(fplog, inputrec, nfile, fnm, mtop, cr, oenv, inputrec->fepvals->init_lambda,
+ +                      EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
+ +        }
+ +
+ +        if (inputrec->bRot)
+ +        {
+ +            /* Initialize enforced rotation code */
+ +            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
+ +                     bVerbose, Flags);
+ +        }
+ +
+ +        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);
+ +
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
+ +                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);
+ +
+ +            set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox);
+ +
+ +            setup_dd_grid(fplog, cr->dd);
+ +        }
+ +
+ +        /* Now do whatever the user wants us to do (how flexible...) */
+ +        integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
+ +                                      oenv, bVerbose, bCompact,
+ +                                      nstglobalcomm,
+ +                                      vsite, constr,
+ +                                      nstepout, inputrec, mtop,
+ +                                      fcd, state,
+ +                                      mdatoms, nrnb, wcycle, ed, fr,
+ +                                      repl_ex_nst, repl_ex_nex, repl_ex_seed,
+ +                                      membed,
+ +                                      cpt_period, max_hours,
+ +                                      deviceOptions,
+ +                                      Flags,
+ +                                      walltime_accounting);
+ +
+ +        if (inputrec->ePull != epullNO)
+ +        {
+ +            finish_pull(inputrec->pull);
+ +        }
+ +
+ +        if (inputrec->bRot)
+ +        {
+ +            finish_rot(inputrec->rot);
+ +        }
+ +
+ +    }
+ +    else
+ +    {
+ +        /* do PME only */
+ +        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
+ +        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, ewaldcoeff, inputrec);
+ +    }
+ +
+ +    wallcycle_stop(wcycle, ewcRUN);
+ +
+ +    /* Finish up, write some stuff
+ +     * if rerunMD, don't write last frame again
+ +     */
+ +    finish_run(fplog, cr,
+ +               inputrec, nrnb, wcycle, walltime_accounting,
+ +               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
+ +               nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
+ +               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
+ +
+ +    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
+ +    {
+ +        char gpu_err_str[STRLEN];
+ +
+ +        /* free GPU memory and uninitialize GPU (by destroying the context) */
+ +        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
+ +
+ +        if (!free_gpu(gpu_err_str))
+ +        {
+ +            gmx_warning("On node %d failed to free GPU #%d: %s",
+ +                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+ +        }
+ +    }
+ +
+ +    if (opt2bSet("-membed", nfile, fnm))
+ +    {
+ +        sfree(membed);
+ +    }
+ +
+ +    gmx_hardware_info_free(hwinfo);
+ +
+ +    /* Does what it says */
+ +    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", walltime_accounting);
+ +    walltime_accounting_destroy(walltime_accounting);
+ +
+ +    /* Close logfile already here if we were appending to it */
+ +    if (MASTER(cr) && (Flags & MD_APPENDFILES))
+ +    {
+ +        gmx_log_close(fplog);
+ +    }
+ +
+ +    rc = (int)gmx_get_stop_condition();
+ +
+ +#ifdef GMX_THREAD_MPI
+ +    /* we need to join all threads. The sub-threads join when they
+ +       exit this function, but the master thread needs to be told to
+ +       wait for that. */
+ +    if (PAR(cr) && MASTER(cr))
+ +    {
+ +        tMPI_Finalize();
+ +    }
+ +#endif
+ +
+ +    return rc;
+ +}
author	Roland Schulz <roland@utk.edu>
	Wed, 30 Oct 2013 03:03:45 +0000 (23:03 -0400)
committer	Roland Schulz <roland@utk.edu>
	Wed, 30 Oct 2013 03:04:44 +0000 (23:04 -0400)
		1	2
src/gromacs/gmxpreprocess/calc_verletbuf.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/legacyheaders/gmx_simd_ref.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h	patch \|	diff1 \|	\|	blob \| history
src/programs/mdrun/repl_ex.c	patch \|	diff1 \|	\|	blob \| history
src/programs/mdrun/runner.c	patch \|	diff1 \|	\|	blob \| history